### 라이브러리 설치, 데이터 준비

In [1]:
# %pip install nltk

import nltk
# nltk.download()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import csv
import json
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
csv.field_size_limit(100000000)

def csv_to_json(csvfilename, jsonfilename, fieldnames):
    csvfile = open(csvfilename, 'r')
    jsonfile = open(jsonfilename, 'w')

    reader = csv.DictReader(csvfile, fieldnames)

    for row in reader:
        json.dump(row, jsonfile)
        jsonfile.write('\n')

csv_to_json('news_articles.csv', 'train_data.json', ("author","published","raw_title","raw_content","language","site_url","main_img_url","type","label","title","content","hasImage"))
DB = []
with open('train_data.json') as j:
    DB = list(map(json.loads, j))

csv_to_json('test_data.csv', 'test_data.json', ("unit_id","title","content","source","date","location","is_real"))
T = []
with open('test_data.json') as j:
    T = list(map(json.loads, j))

print(len(DB), len(T))

2096 804


#### 데이터 정제하기

- 데이터 값 전처리 (정규식으로 영어, 특수기호 없애기)
- 데이터 중 빈 값 제거
- 데이터 중복 샘플 제거

In [6]:
DB = list(filter(lambda x: (x["content"] != '' and x["content"] != None), DB))
T = list(filter(lambda x: (x["content"] != '' and x["content"] != None), T))

# print(DB[:3])

In [7]:
for v in DB:
    v["content"] = re.sub(r'[^a-zA-Z0-9 ]', '', v["content"].lower())
    v["content"] = v["content"].strip()

for v in T:
    v["content"] = re.sub(r'[^a-zA-Z0-9 ]', '', v["content"].lower())
    v["content"] = v["content"].strip()

# print(DB[:5])

In [8]:
DB = list(filter(lambda x : x["content"] != '' and x["content"] != None, DB))
T = list(filter(lambda x : x["content"] != '' and x["content"] != None, T))

# print(DB[:5])

In [9]:
DB = [dict(t) for t in {tuple(d.items()) for d in DB}]
T = [dict(t) for t in {tuple(d.items()) for d in T}]

In [10]:
DB_is = []
for v in DB:
    DB_is.append(1 if v['label'] == "Real" else 0)

T_is = []
for v in T:
    T_is.append(int(v['is_real']))

In [13]:
print(DB_is[:5], T_is[:5])

[0, 0, 0, 0, 0] [1, 1, 0, 0, 0]


### 토큰화

In [11]:
from nltk.corpus import stopwords

DB_content = []
for v in DB:
    DB_content.append([w for w in v['content'].split() if w not in stopwords.words("english")])

T_content = []
for v in T:
    T_content.append([w for w in v['content'].split() if w not in stopwords.words("english")])

In [12]:
print(f'{DB_content[0]}\n=================\n{T_content[0]}')

['muslims', 'terrorize', 'hindus', 'wednesday', 'november', 'daniel', 'greenfield', 'official', 'media', 'narrative', 'muslims', 'worlds', 'greatest', 'victims', 'truth', 'especially', 'majority', 'muslim', 'countries', 'rather', 'strikingly', 'different', 'crowds', 'muslims', 'attacked', 'hindu', 'homes', 'temples', 'eastern', 'bangladesh', 'week', 'raising', 'concerns', 'authorities', 'taking', 'steps', 'curb', 'rising', 'religious', 'tensions', 'attacks', 'hindus', 'unusual', 'bangladesh', 'rare', 'see', 'multiple', 'crowds', 'targeting', 'temples', 'organized', 'way', 'sunday', 'monday', 'note', 'casual', 'language', 'times', 'muslim', 'religious', 'violence', 'commonplace', 'bangladesh', 'common', 'happening', 'scale', 'could', 'otherwise', 'islam', 'structurally', 'xenophobic', 'violently', 'bigoted', 'racist', 'origins', 'islams', 'faith', 'expressed', 'violent', 'campaign', 'nonmuslims', 'jihad', 'muslims', 'attacking', 'hindus', 'christians', 'jews', 'yazidis', 'group', 'means

### 정수 인코딩

In [13]:
tok = Tokenizer()
tok.fit_on_texts(DB_content)

threshold = 3
total_cnt = len(tok.word_index)
rare_cnt = 0
total_freq = 0
rare_freq = 0

for key, value in tok.word_counts.items():
    total_freq += value

    if value < threshold:
        rare_cnt += 1
        rare_freq += value

print(f'희귀 단어 등장 비율: {rare_freq / total_freq * 100}')

size = total_cnt - rare_cnt + 1

tok = Tokenizer(size)
tok.fit_on_texts(DB_content)
DB_content = tok.texts_to_sequences(DB_content)
T_content = tok.texts_to_sequences(T_content)

희귀 단어 등장 비율: 6.050867768882943


### 빈 샘플 제거

In [14]:
drop_indexes = [i for i in range(len(DB_content)) if len(DB_content[i]) == 0]

for i in drop_indexes:
    DB_content.pop(i)
    DB_is.pop(i)

### 패딩

In [15]:
pad_len = max(len(content) for content in DB_content)

DB_content = pad_sequences(DB_content, maxlen=pad_len)
T_content = pad_sequences(T_content, maxlen=pad_len)

In [19]:
DB_content[:2]

array([[  0,   0,   0, ...,   4, 337, 204],
       [  0,   0,   0, ...,  71,   3, 148]])

In [30]:
DB_is[:2]

[0, 0]

In [31]:
T_content[:2]

array([[   0,    0,    0, ...,  983,  285,    9],
       [   0,    0,    0, ..., 3767,  626,    6]])

In [32]:
T_is[:2]

[0, 1]

### LSTM

In [20]:
DB_is = np.array(DB_is)
T_is = np.array(T_is)

embedding_dim = 100
hidden_units = 128

model = Sequential()
model.add(Embedding(size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(DB_content, DB_is, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.65602, saving model to best_model.h5
Epoch 2/15
Epoch 2: val_acc improved from 0.65602 to 0.67568, saving model to best_model.h5
Epoch 3/15
Epoch 3: val_acc improved from 0.67568 to 0.69779, saving model to best_model.h5
Epoch 4/15
Epoch 4: val_acc did not improve from 0.69779
Epoch 5/15
Epoch 5: val_acc did not improve from 0.69779
Epoch 6/15
Epoch 6: val_acc improved from 0.69779 to 0.70762, saving model to best_model.h5
Epoch 6: early stopping


### 모델 사용

In [21]:
loaded_model = load_model('best_model.h5')
print(f'accuracy: {loaded_model.evaluate(T_content, T_is)[1]}')

accuracy: 0.5124688148498535


In [22]:
def predict(string):
    string = re.sub(r'[^a-zA-Z0-9 ]', '', string.lower())
    string = [w for w in string.split() if w not in stopwords.words("english")]
    tknized = tok.texts_to_sequences([string])
    pad_new = pad_sequences(tknized, maxlen = size)
    score = float(loaded_model.predict(pad_new))
    print(f'{score} positive')

In [23]:
predict("is this fake news?")

0.10758146643638611 positive


In [24]:
predict("Trump got his stock")

0.3122520446777344 positive


In [25]:
predict('how can i get high score')

0.3235936462879181 positive
