<a href="https://colab.research.google.com/github/gihoonpark/Fake_News_classification-NLP/blob/main/%EA%B0%80%EC%A7%9C%EB%89%B4%EC%8A%A4_ai_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up the environment

In [None]:
# Install MeCab-ko (the morphological analyser imported later through konlpy)
# using the Colab installer script from the cloned repository.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# NOTE: %cd changes the kernel's working directory for all following cells.
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Load data

In [None]:
import pandas as pd
import numpy as np
import urllib.request
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from konlpy.tag import Mecab
from pandas import concat

import warnings 
warnings.filterwarnings(action='ignore')

In [None]:
# Read the raw train and test CSV files from the Drive dataset folder.
train_csv = '/content/drive/MyDrive/dataset/가짜뉴스ai/news_train.csv'
test_csv = '/content/drive/MyDrive/dataset/가짜뉴스ai/news_test.csv'
df1 = pd.read_csv(train_csv)  # training articles (includes the `info` target column)
df2 = pd.read_csv(test_csv)   # articles to predict

In [None]:
# Keep only the columns used downstream; `info` (the target) is selected for train only.
train_cols = ['title', 'content', 'info', 'ord']
test_cols = ['title', 'content', 'ord']
train_df = df1.loc[:, train_cols]
test_df = df2.loc[:, test_cols]

Data cleaning & tokenizing

In [None]:
# Single-character morphemes that are removed from the token lists after
# MeCab tokenisation. Each entry appears exactly once.
stopwords = [
    '도', '는', '다', '의', '가', '이', '은', '한', '에',
    '하', '고', '을', '를', '인', '듯', '과', '와', '네',
    '들', '지', '임', '게', '만', '되', '음', '면',
]

In [None]:
# Strip every character that is not Hangul (jamo or syllable), a digit, '%', '$'
# or a space. `regex=True` is explicit because pandas >= 2.0 treats the pattern
# as a literal string by default, which would leave the text untouched.
keep_pattern = "[^ㄱ-ㅎㅏ-ㅣ가-힣0-9%$ ]"
text_cols = ['title', 'content']

for col in text_cols:
    train_df[col] = train_df[col].str.replace(keep_pattern, "", regex=True)
    test_df[col] = test_df[col].str.replace(keep_pattern, "", regex=True)

# Training rows whose title or content is blank after cleaning are removed.
# Blank (empty or whitespace-only) strings become NaN first, then the filtered
# frame is assigned back to `train_df`: replace()/dropna() on a column selection
# return new objects and never shrink the original frame by themselves.
train_df[text_cols] = train_df[text_cols].replace(r'^\s*$', np.nan, regex=True)
train_df = train_df.dropna(subset=text_cols, how='any').reset_index(drop=True)

# Test rows must all be kept: one prediction per row is written into the
# submission file. Missing text is replaced by an empty string so that the
# tokeniser always receives a str.
test_df[text_cols] = test_df[text_cols].fillna('')

In [None]:
mecab = Mecab()


def _to_morphemes(text):
    """Split `text` into MeCab morphemes and discard those listed in `stopwords`."""
    return [morph for morph in mecab.morphs(text) if morph not in stopwords]


# Replace the cleaned strings in both frames with filtered token lists.
for frame in (train_df, test_df):
    for column in ('title', 'content'):
        frame[column] = frame[column].apply(_to_morphemes)

In [None]:
# One tokenizer per text field, fitted on the training data only.
# `num_words` bounds the vocabulary used when encoding text to ids.
title_tokenizer = Tokenizer(num_words=14000)
content_tokenizer = Tokenizer(num_words=160000)

title_tokenizer.fit_on_texts(train_df['title'])
content_tokenizer.fit_on_texts(train_df['content'])

In [None]:
# Encode the token lists as integer id sequences. The test text is encoded
# with the tokenizers that were fitted on the training text.
encode_title = title_tokenizer.texts_to_sequences
encode_content = content_tokenizer.texts_to_sequences

title_sequences = encode_title(train_df['title'])
test_title_sequences = encode_title(test_df['title'])

content_sequences = encode_content(train_df['content'])
test_content_sequences = encode_content(test_df['content'])

In [None]:
# Number of distinct words recorded by the title and content tokenizers.
tuple(len(tok.word_index) for tok in (title_tokenizer, content_tokenizer))

In [None]:
# Token-count distribution of the training titles.
title_lengths = [len(tokens) for tokens in train_df['title']]
print('리뷰의 최대 길이 :', max(title_lengths))
print('리뷰의 평균 길이 :', sum(title_lengths) / len(title_lengths))
plt.hist(title_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Token-count distribution of the training contents.
content_lengths = [len(tokens) for tokens in train_df['content']]
print('리뷰의 최대 길이 :', max(content_lengths))
print('리뷰의 평균 길이 :', sum(content_lengths) / len(content_lengths))
plt.hist(content_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Token-count distribution of the test titles.
test_title_lengths = [len(tokens) for tokens in test_df['title']]
print('리뷰의 최대 길이 :', max(test_title_lengths))
print('리뷰의 평균 길이 :', sum(test_title_lengths) / len(test_title_lengths))
plt.hist(test_title_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Token-count distribution of the test contents.
test_content_lengths = [len(tokens) for tokens in test_df['content']]
print('리뷰의 최대 길이 :', max(test_content_lengths))
print('리뷰의 평균 길이 :', sum(test_content_lengths) / len(test_content_lengths))
plt.hist(test_content_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

Padding

In [None]:
# Bring every sequence to a fixed length (50 ids per title, 200 per content);
# shorter sequences are padded at the end.
X_train1 = pad_sequences(title_sequences, padding='post', maxlen=50)
X_train2 = pad_sequences(content_sequences, padding='post', maxlen=200)

# Target labels as a column vector of shape (n_samples, 1).
y_train = train_df['info'].values.reshape(-1, 1)

In [None]:
# Test inputs use the same fixed lengths as the training inputs.
X_test1 = pad_sequences(test_title_sequences, padding='post', maxlen=50)
X_test2 = pad_sequences(test_content_sequences, padding='post', maxlen=200)

# Display all array shapes as a quick sanity check.
tuple(arr.shape for arr in (X_train1, X_train2, y_train, X_test1, X_test2))

In [None]:
# Third model input: the `ord` column as an (n_samples, 1) array.
X_train3 = train_df['ord'].to_numpy().reshape(-1, 1)

In [None]:
# `ord` column of the test set as an (n_samples, 1) array.
X_test3 = test_df['ord'].to_numpy().reshape(-1, 1)

Train / validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for validation; all four arrays are split with
# the same shuffled indices (random_state=42).
# NOTE: the outputs overwrite the input names, so running this cell a second
# time splits the remaining 70% again.
split_parts = train_test_split(
    X_train1, X_train2, X_train3, y_train,
    test_size=0.3,
    random_state=42,
)
(X_train1, X_val1,
 X_train2, X_val2,
 X_train3, X_val3,
 y_train, y_val) = split_parts

In [None]:
# Shuffled 5-fold stratified splitter with a fixed seed.
n_fold, seed = 5, 42
cv = StratifiedKFold(shuffle=True, random_state=seed, n_splits=n_fold)

Build model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout, Input, Flatten, BatchNormalization, Reshape, Concatenate, Conv1D,GRU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

In [None]:
def build_model1():
    """Build and compile the two-input (title, content) classifier.

    Each text input goes through Embedding -> Bidirectional LSTM -> Flatten ->
    Dense. The two branch outputs are concatenated, batch-normalised, passed
    through a 128-unit ReLU layer and reduced to one sigmoid unit.

    Returns:
        A Keras ``Model`` with inputs ``[title_ids, content_ids]`` of shapes
        (batch, 50) and (batch, 200), compiled with the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """

    def text_branch(seq_len, vocab_size, out_units):
        # Returns (input tensor, branch output tensor) for one text field.
        token_ids = Input(shape=(seq_len,))
        embedded = Embedding(vocab_size, 516, input_length=seq_len)(token_ids)
        encoded = Bidirectional(LSTM(256))(embedded)
        flattened = Flatten()(encoded)
        projected = Dense(out_units)(flattened)
        return token_ids, projected

    title_input, title_output = text_branch(50, 14000, 32)
    content_input, content_output = text_branch(200, 160000, 256)

    merged = Concatenate()([title_output, content_output])
    normalised = BatchNormalization()(merged)
    hidden = Dense(128, activation='relu')(normalised)
    final_output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[title_input, content_input], outputs=final_output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [None]:
def build_model2():
    """Build and compile the three-input (title, content, ord) classifier.

    Each text input goes through Embedding -> Bidirectional LSTM -> Flatten ->
    Dense -> Dropout(0.4). The single-value ``ord`` input goes through a
    32-unit Dense layer. The three branch outputs are concatenated and passed
    through BatchNorm -> Dense(128, relu) -> BatchNorm -> Dense(64, relu) ->
    Dense(1, sigmoid).

    Returns:
        A Keras ``Model`` with inputs ``[title_ids, content_ids, ord]`` of
        shapes (batch, 50), (batch, 200) and (batch, 1), compiled with the
        Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """

    def text_branch(seq_len, vocab_size, out_units):
        # Returns (input tensor, branch output tensor) for one text field.
        token_ids = Input(shape=(seq_len,))
        embedded = Embedding(vocab_size, 516, input_length=seq_len)(token_ids)
        encoded = Bidirectional(LSTM(256))(embedded)
        flattened = Flatten()(encoded)
        projected = Dense(out_units)(flattened)
        regularised = Dropout(0.4)(projected)
        return token_ids, regularised

    title_input, title_dropout = text_branch(50, 14000, 32)
    content_input, content_dropout = text_branch(200, 160000, 256)

    # Numeric side input.
    ord_input = Input(shape=(1,))
    ord_dense = Dense(32)(ord_input)

    merged = Concatenate()([title_dropout, content_dropout, ord_dense])
    head = BatchNormalization()(merged)
    head = Dense(128, activation='relu')(head)
    head = BatchNormalization()(head)
    head = Dense(64, activation='relu')(head)
    final_output = Dense(1, activation='sigmoid')(head)

    model = Model(inputs=[title_input, content_input, ord_input], outputs=final_output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [None]:
def LSTM_model(title_len=50, title_vocab=14000, content_len=200, content_vocab=160000):
    """Build and compile a two-input classifier with two stacked BiLSTM layers per branch.

    Each text input goes through Embedding(256) -> Bidirectional LSTM(128,
    return_sequences=True) -> Bidirectional LSTM(128) -> Flatten -> Dense(10).
    The two branch outputs are concatenated, batch-normalised and reduced to
    one sigmoid unit.

    The defaults match the padded lengths (50 and 200) and the tokenizer sizes
    (14000 and 160000) used for the title and content inputs in this notebook,
    so the model accepts the same arrays as ``build_model1``.

    Args:
        title_len: Number of token ids per title sample.
        title_vocab: ``input_dim`` of the title embedding; must be larger than
            the largest title token id.
        content_len: Number of token ids per content sample.
        content_vocab: ``input_dim`` of the content embedding; must be larger
            than the largest content token id.

    Returns:
        A Keras ``Model`` with inputs ``[title_ids, content_ids]``, compiled
        with the Adam optimizer, binary cross-entropy loss and an accuracy
        metric.
    """

    title_input = Input(shape=(title_len,))
    title_embed = Embedding(title_vocab, 256, input_length=title_len)(title_input)
    # The first layer returns the full sequence so the second LSTM can consume it.
    title_lstm1 = Bidirectional(LSTM(128, return_sequences=True))(title_embed)
    title_lstm2 = Bidirectional(LSTM(128))(title_lstm1)
    title_flat = Flatten()(title_lstm2)
    title_output = Dense(10)(title_flat)

    content_input = Input(shape=(content_len,))
    content_embed = Embedding(content_vocab, 256, input_length=content_len)(content_input)
    content_lstm1 = Bidirectional(LSTM(128, return_sequences=True))(content_embed)
    content_lstm2 = Bidirectional(LSTM(128))(content_lstm1)
    content_flat = Flatten()(content_lstm2)
    content_output = Dense(10)(content_flat)

    concatenated = Concatenate()([title_output, content_output])
    batchnorm = BatchNormalization()(concatenated)
    final_output = Dense(1, activation='sigmoid')(batchnorm)

    model = Model(inputs=[title_input, content_input], outputs=final_output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    return model

In [None]:
# build_model1 (two inputs: title, content)
# Recorded scores: val_score = 0.9761, private_score = 0.9438878534 (12.02)
# NOTE: those scores were recorded before early stopping was switched to the
# validation metric, so a fresh run may differ.
from keras.utils import plot_model

# Stop when *validation* accuracy has not improved for 3 epochs and roll back
# to the best epoch. Monitoring the training metric ('accuracy') would ignore
# the validation data passed to fit().
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=3, restore_best_weights=True)
# Learning-rate schedule: 1e-2 at epoch 0, multiplied by 0.95 every epoch.
annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)
model1 = build_model1()
# plot_model(model1, show_shapes=True)
model1.fit(
    [X_train1, X_train2], y_train,
    epochs=30, batch_size=256,
    validation_data=([X_val1, X_val2], y_val),
    verbose=1, callbacks=[es, annealer],
)

In [None]:
# build_model2 (three inputs: title, content, ord)
# Recorded scores: val_score = 0.9814, private_score = 0.9648182665 (12.02); this model was selected.
# NOTE: those scores were recorded before early stopping was switched to the
# validation metric, so a fresh run may differ.

# Stop when *validation* accuracy has not improved for 3 epochs and roll back
# to the best epoch. Monitoring the training metric ('accuracy') would ignore
# the validation data passed to fit().
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=3, restore_best_weights=True)
# Defined here but not passed to fit() below (callbacks contains only `es`).
annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)
model2 = build_model2()
model2.fit(
    [X_train1, X_train2, X_train3], y_train,
    epochs=30, batch_size=256,
    validation_data=([X_val1, X_val2, X_val3], y_val),
    verbose=1, callbacks=[es],
)

In [None]:
# LSTM_model (two inputs, with a second stacked LSTM layer per branch)

# Stop when *validation* accuracy has not improved for 3 epochs and roll back
# to the best epoch. Monitoring the training metric ('accuracy') would ignore
# the validation data passed to fit().
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=3, restore_best_weights=True)
# Defined here but not passed to fit() below (callbacks contains only `es`).
annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)
model3 = LSTM_model()
model3.fit(
    [X_train1, X_train2], y_train,
    epochs=30, batch_size=256,
    validation_data=([X_val1, X_val2], y_val),
    verbose=1, callbacks=[es],
)


Predict test data

In [None]:
# Predict with model2 (built by build_model2: title + content + ord inputs).
# No variable named `model` is defined in this notebook, so the trained model
# is referenced by its actual name.
sub_df = model2.predict([X_test1, X_test2, X_test3])

In [None]:
# Display the raw model outputs (one sigmoid value per test row).
sub_df

In [None]:
# Threshold each prediction at 0.5: values below 0.5 become 0, all others 1.
submission = [0 if prob < 0.5 else 1 for prob in sub_df]

In [None]:
# Number of test rows assigned label 1.
sum(submission)

In [None]:
# Load the submission template; its first CSV column is used as the index.
sub = pd.read_csv('/content/drive/MyDrive/dataset/가짜뉴스ai/sample_submission.csv', index_col=0)

In [None]:
# Write the thresholded predictions into the `info` column and preview the result.
sub = sub.assign(info=submission)
sub.head()

In [None]:
# Save the filled-in submission.
# NOTE(review): this is the same path the template was read from, so the
# original sample_submission.csv is overwritten — consider a separate output file.
sub.to_csv('/content/drive/MyDrive/dataset/가짜뉴스ai/sample_submission.csv')