In [1]:
import pandas as pd
from konlpy.tag import Okt
import numpy as np
import boto3
import sys
from gensim.models import FastText
import os, csv, math, codecs
from tqdm import tqdm
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


MAX_NB_WORDS = 100000

# =========================================================================
# Vectorizer의 argument인 tokenizer에 KoNLPy의 pos 함수로 대체.
class MyTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger

    def __call__(self, sent):
        pos = self.tagger.pos(sent)
        clean_words = []  # 정제된 단어 리스트
        for word in pos:
            # word[1]은 품사를 의미하며, 여기서는 조사, 문장기호, 접두사, Foreign('\n'을 빼주기 위함)인 것은 제외시킴.
            if word[1] not in ['Josa', 'Punctuation', 'Suffix', 'Foreign']:
                if len(word[0]) >= 2:  # 한 글자인 단어들도 의미가 없는 경우가 많으므로 일단 제외.
                    #if word[0] not in ['있다', '했다', '한다', '없다', '된다']:
                    clean_words.append(word[0])
        return clean_words

# =========================================================================

embeddings_index = {}
f = codecs.open('/home/ubuntu/FastText/wiki.ko.vec', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('found %s word vectors' % len(embeddings_index))

# ==============================================================
if sys.version_info[0] < 3:
    from io import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# get your credentials from environment variables
aws_id = 'AKIA2EDEFCNPX2G7FWNY'
aws_secret = 'Xt1EJXPsRdI27VI7TBSCsRMNJWsewq9FY0g4vDU7'

client = boto3.client('s3', aws_access_key_id=aws_id,
        aws_secret_access_key=aws_secret)

bucket_name = 'snucsv'

object_key = 'SNU_Data_1200.csv'
csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('cp949')

# load data
train_df = pd.read_csv(StringIO(csv_string))
label_names = ["label"]
y_train = train_df[label_names].values

# test_df = pd.read_csv('D:\pythonProject\FakeNewsFiltering\SNU_Validation_200.csv', encoding='CP949')
# test_df = test_df.fillna('_NA_')

# print(train_df)
train_df['doc_len'] = train_df['document'].apply(lambda words: len(words.split(" ")))
max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)
# ==================================================================================
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

my_Tokenizer = MyTokenizer(Okt())

raw_docs_train = train_df['document'].tolist()
# raw_docs_test = test_df['document'].tolist()
# print(raw_docs_test)
num_classes = len(label_names)

processed_docs_train = []
# processed_docs_test = []

for doc in tqdm(raw_docs_train):
    tokens = my_Tokenizer(doc)
    processed_docs_train.append(tokens)

# for doc in tqdm(raw_docs_test):
#     tokens = my_Tokenizer(doc)
#     processed_docs_test.append(tokens)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
# tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)
tokenizer.fit_on_texts(processed_docs_train)
# tokenizer.fit_on_texts(processed_docs_train+proccessed_docs_test)
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
# word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size : ", len(word_index))

word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
# word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)
# ====================================================================================
# training params
batch_size = 256
num_epochs = 40

# model parameters
num_filters = 64
embed_dim = 300
weight_decay = 1e-4

# embedding matrix

words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((nb_words, embed_dim))

# print(word_index)
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
        # cc.ko.300.vec에서 찾지 못한 단어들의 리스트.
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
print("sample words not found: ", np.random.choice(words_not_found, 10))



879131it [01:46, 8250.41it/s]


found 879130 word vectors


100%|██████████| 1224/1224 [00:09<00:00, 125.62it/s]


dictionary size :  3999
number of null word embeddings: 354
sample words not found:  ['기본소득제' '30년' '파인을' '비분' '13일' '382' '5.5' '걸쳐도' '8.5' '연애횟수']


In [4]:
# ========================================================================================
from keras.layers import BatchNormalization
import tensorflow as tf

model = tf.keras.Sequential()

model.add(Embedding(nb_words, embed_dim, input_length=max_seq_len, weights=[embedding_matrix], trainable=False))

model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))
model.summary()
# ================================================================================
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

es_callback = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(word_seq_train_shuffle, y_train_shuffle, batch_size=128,
          epochs=num_epochs, validation_split=0.3, callbacks=[es_callback], shuffle=False)

# predictions = model.predict_classes(word_seq_test)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 300)           1200000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 300)           0         
_________________________________________________________________
dense_3 (Dense)              (None, 10, 32)            9632      
_________________________________________________________________
bidirectional_5 (Bidirection (None, 10, 128)           49664     
_________________________________________________________________
bidirectional_6 (Bidirection (None, 10, 128)           98816     
_________________________________________________________________
bidirectional_7 (Bidirection (None, 10, 128)           98816     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 10, 128)          

In [3]:
word_seq_all_shuffle = train_df.sample(frac=1).reset_index(drop=True)
word_seq_train_shuffle = word_seq_all_shuffle['document'].tolist()
y_train_shuffle = word_seq_all_shuffle['label'].values

processed_docs_train_shuffle = []

for doc in tqdm(word_seq_train_shuffle):
    tokens = my_Tokenizer(doc)
    processed_docs_train_shuffle.append(tokens)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train_shuffle)
word_seq_train_shuffle = tokenizer.texts_to_sequences(processed_docs_train_shuffle)
word_seq_train_shuffle = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)

100%|██████████| 1224/1224 [00:02<00:00, 485.36it/s]


In [4]:
model.save('SNU_LSTM_Model.h5')

In [4]:
model.save('SNU_LSTM_Model_shuffle.h5')

In [6]:
from tensorflow.keras.models import load_model

SNU_model_shuffle = load_model('SNU_LSTM_Model_shuffle.h5')

In [9]:
response = ['현재 AI기술은 불완전하기 때문에 가짜뉴스 완전히 걸러낼 수 없다', '카카오톡 유료화 서비스 시행하나', '왜이러는거야 도대체']
tokens_response = []

for i in range(len(response)):
    tokens = my_Tokenizer(response[i])
    tokens_response.append(tokens)
#     print(tokens_response)

word_seq_response = tokenizer.texts_to_sequences(tokens_response)
print(word_seq_response[0])
word_seq_response = sequence.pad_sequences(word_seq_response, maxlen=max_seq_len)
print(word_seq_response.shape)

score_list = []
for sco in range(len(word_seq_response)):
    word_seq_response_to_score = word_seq_response[sco].reshape(1,max_seq_len)
    print(word_seq_response_to_score)
    score_list.append(float(SNU_model_shuffle.predict(word_seq_response_to_score)))
    print(score_list)

[594, 2165, 663, 2166, 22, 76, 126, 2167, 2168, 8]
(3, 10)
[[ 594 2165  663 2166   22   76  126 2167 2168    8]]
[0.47938355803489685]
[[   0    0    0    0    0 2157 1842  236  243 1331]]
[0.47938355803489685, 0.5113765001296997]
[[0 0 0 0 0 0 0 0 0 0]]
[0.47938355803489685, 0.5113765001296997, 0.5069465041160583]


In [9]:
response = ['현재 AI기술은 불완전하기 때문에 가짜뉴스 완전히 걸러낼 수 없다']
tokens_response = []

for i in response:
    tokens = my_Tokenizer(response[0])
    tokens_response.append(tokens)
    print(tokens_response)

word_seq_response = tokenizer.texts_to_sequences(tokens_response)
word_seq_response = sequence.pad_sequences(word_seq_response, maxlen=max_seq_len)
score = model.predict(word_seq_response)
score

[['현재', 'AI', '기술', '불완전하기', '때문', '가짜', '뉴스', '완전히', '걸러', '없다']]


array([[0.5054651]], dtype=float32)

In [None]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)