### Setting

In [None]:
# Mount Google Drive at /content/gdrive; the CSV files read later in this
# notebook are addressed through that mount point (under MyDrive).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
pip install konlpy

In [None]:
# Shell command: clone the helper repository whose install script is run in a later cell.
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
cd Mecab-ko-for-Google-Colab/

In [None]:
# Shell command: run the repository's install script (judging by its name, it
# sets up Mecab-ko for Colab — confirm against the repository's README).
! bash install_mecab-ko_on_colab190912.sh

In [None]:
import tensorflow as tf
import numpy as np      
import pandas as pd       
import re
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import konlpy
from konlpy.tag import Mecab
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the train / test CSV files from the mounted Drive folder
train = pd.read_csv("/content/gdrive/MyDrive/NH-FakeNews/news_train.csv")
test = pd.read_csv("/content/gdrive/MyDrive/NH-FakeNews/news_test.csv")

# Sanity check: number of rows in each split
print('train count:', train.shape[0])
print('test count:', test.shape[0])

# Label column that is later passed to model.fit as the target
train_Y = train['info']

# Optional: widen the column display and peek at the first rows
# pd.set_option('display.max_columns', 1000)
# print(train[:10])

### Preprocessing

In [None]:
def preprocess_text(text_list, tokenizer=None):
    """Clean raw texts and split them into morpheme lists without stopwords.

    Each text is lower-cased, every character other than a Hangul syllable
    (가-힣) or a lowercase ASCII letter is replaced by a space, the result is
    passed to the analyzer's ``morphs`` method, and stopword tokens are
    dropped.

    Args:
        text_list: Iterable of raw texts. Entries that are not ``str`` (for
            example the float NaN pandas uses for a missing value) are
            treated as empty and yield an empty token list instead of
            raising on ``.lower()``.
        tokenizer: Optional analyzer exposing ``morphs(str) -> list``. When
            omitted, a new ``Mecab()`` instance is created, which is the
            original behavior.

    Returns:
        tuple: ``(token_list, tokenizer)`` where ``token_list`` holds one
        list of tokens per input text (same order) and ``tokenizer`` is the
        analyzer that was used.
    """
    # Stopword dictionary; a frozenset gives constant-time membership tests
    stopword_set = frozenset(['을', '를', '이', '가', '은', '는', '의', '하', '에'])
    # Morphological analyzer
    if tokenizer is None:
        tokenizer = Mecab()
    token_list = []

    for text in tqdm(text_list):
        # Missing values are not strings and have no .lower(); keep the row
        # (so output stays aligned with the input) but with no tokens.
        if not isinstance(text, str):
            token_list.append([])
            continue
        # Remove everything except Hangul syllables and lowercase English letters
        txt = re.sub('[^가-힣a-z]', ' ', text.lower())
        # Morphological analysis
        token = tokenizer.morphs(txt)
        # Drop stopwords. The previous condition was
        # `t not in stopwords or type(t) != float`; tokens are strings, so the
        # second operand was always true and no stopword was ever removed.
        token = [t for t in token if t not in stopword_set]
        token_list.append(token)

    return token_list, tokenizer

# Tokenize the training articles and keep the analyzer for later reuse;
# the trailing expression displays the new column.
train_tokens, mecab = preprocess_text(train['content'])
train['token'] = train_tokens
train['token']

In [None]:
# Token count of every training article
sentences = train['token']
sentence_len = list(map(len, sentences))

# Histogram of the token counts
fig, ax = plt.subplots()
ax.hist(sentence_len, bins=88)
ax.set_xlabel('length of contents')
ax.set_ylabel('number of contents')
plt.show()

# Max / mean length, then how many articles have at most 50 tokens
# (absolute count, followed by the share of all articles)
num_within_50 = sum(1 for n_tokens in sentence_len if n_tokens <= 50)
print('content 최대 길이 :',max(sentence_len))
print('content 평균 길이 :',sum(sentence_len)/len(sentences))
print(num_within_50)
print(num_within_50/len(sentences))

In [None]:
# Clip every token to 20 characters and keep at most the first 50 tokens per article
sentences_new = [
    [word[:20] for word in sentence][:50]
    for sentence in train['token']
]
sentences = sentences_new

# Uncomment to inspect a few processed articles
# for i in range(5):
#     print(sentences[i])

### Vectorization

In [None]:
# Fixed sequence width. Articles were already cut to 50 tokens and the
# Embedding layer is declared with input_length=50, so pad to exactly that.
MAX_LEN = 50
# Vocabulary cap handed to the Keras Tokenizer
NUM_WORDS = 40000

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(sentences)

train_X = tokenizer.texts_to_sequences(sentences)
# Without maxlen, pad_sequences pads to the longest sequence present in the
# batch it is given, so the matrix width would depend on the data (and could
# differ between the train and test sets). Pin it explicitly.
train_X = pad_sequences(train_X, maxlen=MAX_LEN)

# texts_to_sequences only emits indices below num_words, so the embedding
# never needs more than NUM_WORDS rows even when word_index is larger.
vocab_size = min(len(tokenizer.word_index) + 1, NUM_WORDS)

### Modeling

In [None]:
# Model definition: embedding -> dropout -> LSTM -> single sigmoid unit
# (binary classification, matching the binary_crossentropy loss below)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 128, input_length=50),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.LSTM(units=128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Optimizer. `learning_rate` is the supported argument name; the old `lr`
# alias is deprecated and is rejected by newer Keras releases.
adam = tf.keras.optimizers.Adam(learning_rate=0.0005)

# Compile the model and print its layer summary
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# EarlyStopping against overfitting: stop once val_accuracy has failed to
# improve for 2 consecutive epochs. restore_best_weights=True rolls the model
# back to the best epoch when training is stopped early; without it the model
# would keep the weights from the later, non-improving epochs and those would
# be used for prediction.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
)

# Train the model; validation_split=0.25 holds out a quarter of the samples
history = model.fit(
    train_X,
    train_Y,
    epochs=10,
    batch_size=512,
    validation_split=0.25,
    callbacks=[earlystop_callback],
)

In [None]:
# Visualize the training run: loss on the left panel, accuracy on the right
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))
curves = history.history

ax_loss.plot(curves['loss'], 'b-', label='loss')
ax_loss.plot(curves['val_loss'], 'r--', label='val_loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend()

ax_acc.plot(curves['accuracy'], 'g-', label='accuracy')
ax_acc.plot(curves['val_accuracy'], 'k--', label='val_accuracy')
ax_acc.set_xlabel('Epoch')
# Only the accuracy panel gets a fixed y-range
ax_acc.set_ylim(0.7, 1)
ax_acc.legend()

plt.show()

### Prediction

In [None]:
# Preprocess the test articles (cleaning + morphological analysis)
test['token'], mecab = preprocess_text(test['content'])

# Clip every token to 20 characters and keep at most the first 50 tokens,
# mirroring what was done to the training articles
sentences_new_test = [
    [word[:20] for word in sentence_test][:50]
    for sentence_test in test['token']
]
sentences_test = sentences_new_test

# Uncomment to inspect a few processed test articles
# for i in range(5):
#     print(sentences_test[i])

# Vectorize with the tokenizer that was fitted on the training data
test_X = tokenizer.texts_to_sequences(sentences_test)
# Pad to the fixed width 50 the model was declared with (input_length=50).
# Without maxlen the width follows the longest test sequence, which need not
# match the width of the training matrix.
test_X = pad_sequences(test_X, maxlen=50)

# Predict
test_Y = model.predict(test_X)

In [None]:
# Fill in the submission file with the predicted labels
submission_path = "/content/gdrive/MyDrive/NH-FakeNews/fake_news_submission.csv"
submission = pd.read_csv(submission_path)

# Threshold the model outputs at 0.5 and flatten to one 0/1 label per row
predicted_labels = np.where(test_Y> 0.5, 1,0).reshape(-1)
submission.loc[:,'info'] = predicted_labels

# Save only the id / info columns, overwriting the file that was just read
submission[["id", "info"]].to_csv(submission_path, index=False)