In [3]:
import pandas as pd
from konlpy.tag import Okt
import numpy as np
import boto3
import sys
from gensim.models import FastText
import os, csv, math, codecs
from tqdm import tqdm
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.utils import shuffle

# Vocabulary cap: passed to the Keras Tokenizer as `num_words` and used as the
# upper bound on the number of rows of the embedding matrix.
MAX_NB_WORDS = 100000

# =========================================================================
# Vectorizer의 argument인 tokenizer에 KoNLPy의 pos 함수로 대체.
class MyTokenizer:
    """Callable tokenizer that keeps only the informative words of a sentence.

    Wraps a POS tagger object exposing ``pos(sentence)``, whose result is
    iterated as ``(word, tag)`` pairs (e.g. a KoNLPy tagger).
    """

    def __init__(self, tagger):
        self.tagger = tagger

    def __call__(self, sent):
        """Return the words of *sent* that pass the tag and length filters."""
        # Tags that are discarded: particles, punctuation, suffixes and
        # 'Foreign' (the latter is what removes tokens such as '\n').
        excluded_tags = ['Josa', 'Punctuation', 'Suffix', 'Foreign']
        tagged = self.tagger.pos(sent)
        # Single-character words are dropped as well, since they are often
        # not meaningful on their own.
        return [
            pair[0]
            for pair in tagged
            if pair[1] not in excluded_tags and len(pair[0]) >= 2
        ]

# =========================================================================

# Load the pre-trained FastText vectors into a {word: float32 vector} dict.
embeddings_index = {}

# `with` guarantees the file handle is released even if a line fails to parse.
with codecs.open('/home/ubuntu/FastText/wiki.ko.vec', encoding='utf-8') as f:
    for line_no, line in enumerate(tqdm(f)):
        values = line.rstrip().rsplit(' ')
        # fastText .vec files begin with a "<vocab_size> <dim>" header line.
        # It is metadata, not a word vector, so it must not be indexed as a
        # word with a 1-element vector.
        if (line_no == 0 and len(values) == 2
                and values[0].isdigit() and values[1].isdigit()):
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('found %s word vectors' % len(embeddings_index))

# ==========================================================================

# Load the labelled comment CSV files from Seungho's S3 bucket and merge them.
# `io.StringIO` exists on both Python 2.6+ and 3.x, so no version switch is needed.
from io import StringIO

# Credentials come from the environment rather than being hard-coded in the
# notebook. If a variable is unset, None is passed and boto3 resolves
# credentials through its own default lookup chain.
aws_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY')

client = boto3.client('s3', aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)

bucket_name = 'snucsv'

object_key1 = 'Comments_false_1200_label.csv'
object_key2 = 'Comments_true_1200_label.csv'


def _read_csv_from_s3(s3_client, bucket, key):
    """Download the UTF-8 CSV object `key` from `bucket` and parse it into a DataFrame."""
    body = s3_client.get_object(Bucket=bucket, Key=key)['Body']
    return pd.read_csv(StringIO(body.read().decode('utf-8')))


train_df_fake = _read_csv_from_s3(client, bucket_name, object_key1)
print(train_df_fake)

train_df_true = _read_csv_from_s3(client, bucket_name, object_key2)

# Merge the fake-comment and true-comment frames, then shuffle the rows.
train_df_merged = pd.concat([train_df_fake, train_df_true])
train_df_merged = train_df_merged.sample(frac=1).reset_index(drop=True)

# ====================================================================
# The merged frame is used directly as the training set; the "label" column
# is extracted as the target array.
label_names = ["label"]
train_df = train_df_merged
y_train = train_df[label_names].values

# NOTE: loading of a separate test CSV is currently disabled.

# Per-comment length in space-separated tokens, stored as a new column.
doc_lengths = train_df['Comments'].apply(lambda text: len(text.split(" ")))
train_df['doc_len'] = doc_lengths

# Padding length: mean length plus one standard deviation, rounded to an int.
max_seq_len = np.round(doc_lengths.mean() + doc_lengths.std()).astype(int)
# ==================================================================================

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# Tokenize every training comment with the Okt-backed MyTokenizer.
my_Tokenizer = MyTokenizer(Okt())

raw_docs_train = train_df['Comments'].tolist()
num_classes = len(label_names)

# NOTE: the equivalent processing of a test set is currently disabled.
processed_docs_train = [my_Tokenizer(doc) for doc in tqdm(raw_docs_train)]
print(processed_docs_train)

# Fit the Keras vocabulary on the token lists and map them to index sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train)
word_index = tokenizer.word_index
print("dictionary size : ", len(word_index))

# Convert to index sequences and pad/truncate them to max_seq_len.
word_seq_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(processed_docs_train),
    maxlen=max_seq_len,
)
# word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)
# ====================================================================================
# training params
batch_size = 256
num_epochs = 40

# model parameters
num_filters = 64
embed_dim = 300
weight_decay = 1e-4

# Embedding matrix: row i receives the pre-trained vector of the word whose
# tokenizer index is i; rows with no usable vector stay all-zero.
words_not_found = []  # words with no usable vector in the loaded FastText file
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Require exactly embed_dim components. A shorter vector (for example a
    # 1-element entry produced by a header or malformed line) would otherwise
    # be silently broadcast across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_dim:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
# np.random.choice raises ValueError on an empty sequence, so only sample
# when at least one word was missing.
sample_not_found = np.random.choice(words_not_found, 10) if words_not_found else []
print("sample words not found: ", sample_not_found)


879131it [02:00, 7308.68it/s]


found 879130 word vectors


  0%|          | 0/8023 [00:00<?, ?it/s]

      Unnamed: 0                                           Comments  label
0              0                  "네이버가 더 악질이지 광고는 ㅋㅋ 기본 15초 혈압 ㅋㅋ"      0
1              1                                   "이래서 경쟁자가 있어야한다"      0
2              2                    "처음에만 광고붙여라. 중간에 나오는 광고가 짜증난다."      0
3              3                             "중간 광고 없는 넷플릭스나 봐야겠네."      0
4              4                            "지상파는 중간광고없어서 좋았는데 ㅠㅠㅠ"      0
...          ...                                                ...    ...
4381        4381              "제발 이젠 내탓이요좀 해라 남의탓부터하지말고 정신세계가맑지못하다"      0
4382        4382                 "대통령임기가 1 2년도아니고 .. 4년됐으면 알아서좀 해라"      0
4383        4383  "NLL포기란 표현은 어디에??내가 보기에는 오히려 북쪽이 양보한다고 되있고 경제적...      0
4384        4384                      "이게 NLL포기냐????  일베 똥누리 OOO들아"      0
4385        4385  "이걸 공개하다니 중간중간 미국일본 중국 내용 나오던데 보면 난리가 나겠구만 대한민...      0

[4386 rows x 3 columns]


100%|██████████| 8023/8023 [01:34<00:00, 84.92it/s] 


[['80만원', 'ㅋㅋ', '참나', '의원직', '유지', '시키겠다네', '얼굴', '봐라', '좋단다', 'ㅋㅋㅋ'], ['정부', '하지', '들이', '원칙', '한다', '그냥', '국민', '원칙', '지켜라', '하고', '2.75', '단계', '정부', '원칙', '무시라는데', '국민', '지켜야', '되는지', '알수가', '없네', '자식'], ['멋져', '보였는데', '추하네', 'ㅉㅉ'], ['마무리', '멘트', '진행자', '한당', '의원', '버리네', 'ㅋㅋㅋ'], ['유아', '아동', '접종', '하는데', '어쩌라고', 'ㅠㅠ'], ['힐러리', '진건', '해커', '때문', '아니라', '힐러리', '같아서', '진거잖아', '세상', '퍼플', '스테이', '그렇게', '신경안', '쓴건', '마치', '수학', '포기', '과대', '가려는거랑', '똑같은', '거지', '그리고', '동맹', '해킹', '하면서', '러시아', '보고', '해커', '트집', '잡는건', '웃긴거고', '차라리', '인권', '우크라이나', '집중', '했어야지'], ['빨리', '강화', '시켜라', '지금', '늦었다', '썩을'], ['인천', '정신', '나간', '소리', '하고', '있네'], ['역시', '지사', '그릇', '신분', '입니다', '경청', '하시고', '대통령', '원희룡', '입니다'], ['성소수자', '남자', '기만', '자르고', '여자', '해달라는면다', '여자', '본인', '좋을지', '몰라도', '그걸', '지켜봐', '야하는', '사람', '입장', '생각', '안해', '봤냐'], ['이런', '안습', '하나', '하도', '쓰레기', '같아서', '대강', '훓어보', '말았지만', '결론', '이제', '마지막', '이용', '먹을건', '없는', '빠순이', '한겨레', '자기', '고백', '이군', '하긴', '여론', '관심', '기용', '그만한', '없지'

In [4]:
from keras.layers import BatchNormalization
import tensorflow as tf

# Binary classifier: frozen pre-trained embeddings -> Dropout -> Dense ->
# two stacked bidirectional LSTMs -> Dense -> Dropout -> sigmoid output.
model = tf.keras.Sequential()

# The embedding layer is initialised from embedding_matrix and kept frozen.
model.add(Embedding(nb_words, embed_dim, input_length=max_seq_len, weights=[embedding_matrix], trainable=False))

model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))
model.summary()
# ================================================================================
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs. restore_best_weights puts
# the model back at its best epoch; without it the model keeps the weights of
# the last (worse) epoch that ran.
es_callback = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Use the batch_size hyper-parameter declared with the other training params
# instead of repeating the literal.
# NOTE(review): validation_split is documented to take the trailing fraction of
# the arrays; the rows were already shuffled when the frames were merged.
history = model.fit(word_seq_train, y_train, batch_size=batch_size,
          epochs=num_epochs, validation_split=0.3, callbacks=[es_callback], shuffle=False)

# predictions = model.predict_classes(word_seq_test)

Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/tensorflow2_latest_p37/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2021-09-16 12:15:52.327 ip-172-31-32-76:2770 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-09-16 12:15:53.372 ip-172-31-32-76:2770 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 300)           8650800   
_________________________________________________________________
dropout (Dropout)            (None, 32, 300)           0         
_________________________________________________________________
dense (Dense)       

In [None]:
# Persist the trained model to the path 'Naver_Comments_Model'.
model.save('Naver_Comments_Model')