In [1]:
import pandas as pd
import numpy as np
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.preprocessing import LabelEncoder

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and data-handling
# warnings raised by any library used below are hidden as well — consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [3]:
# Load the labelled comment CSV, keep one row per distinct value of the
# tokenised-comment column, renumber the index, and narrow the frame to the
# two columns used downstream (tokenised comment text and its label).
df = (
    pd.read_csv('./src/네이버종토방댓글_카카오_15000_태그토큰화_라벨.csv')
    .drop_duplicates('토큰화댓글')
    .reset_index(drop=True)
    .loc[:, ['토큰화댓글', '공포탐욕']]
    # Explicit copy: a later cell assigns a new column onto `df`, so the
    # frame should own its data instead of being a selection of the raw load.
    .copy()
)

# Persist the de-duplicated two-column table. The path is a plain string
# literal; the original f-string prefix had no placeholders to format.
df.to_csv('./src/train.csv', index=False)
df

Unnamed: 0,토큰화댓글,공포탐욕
0,"['개미', '조련', '하다', '법']",m
1,"['카카오', '끝물', '이다', '사실']",m
2,"['안티', '살발']",0
3,"['이렇다', '호로', '개미', '지옥탕', '만들다']",m
4,"['곡', '소리', '나다']",m
...,...,...
207270,"['여기', '털리다', '빠가사리', '나가다', '죽다']",0
207271,"['카카오', '네이버', '제휴']",m
207272,"['어차피', '또', '폭등', '하다', '떨다', '필요', '있다', '회주']",1
207273,"['오늘', '물리다', '생각', '하다', '사람', '도대체']",0


In [3]:
# Fit a word-index vocabulary on the tokenised comments, capped at 30000 ids.
# The out-of-vocabulary placeholder must be a string: Keras inserts it into
# `word_index` as a regular word, so a non-string value such as `True` becomes
# a vocabulary key and cannot be joined back into text when decoding
# sequences. The OOV entry still occupies the first vocabulary slot, so the
# integer sequences produced below are unaffected by this change.
tokenizer = Tokenizer(num_words=30000, oov_token='<OOV>')

# NOTE(review): this column was loaded with read_csv, so each entry is
# presumably the *string* form of a token list (e.g. "['a', 'b']") rather than
# a real list. Keras' default filters strip brackets and commas but keep the
# apostrophe, so vocabulary entries likely carry the surrounding quotes —
# confirm, and consider parsing the column (ast.literal_eval) before fitting.
tokenizer.fit_on_texts(df['토큰화댓글'])

# Encode every comment as a list of integer word ids and keep it beside the text.
df['토큰'] = tokenizer.texts_to_sequences(df['토큰화댓글'])

# Save the fitted tokenizer so the same vocabulary can be reused later.
with open('./src/mytokenizer1.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# Training subset: every row whose label is something other than 'm',
# with the index renumbered from zero.
df_train = df.loc[df['공포탐욕'].ne('m')].reset_index(drop=True)
df_train

Unnamed: 0,날짜,댓글,조회수,좋아요,싫어요,한글댓글,토큰화댓글,공포탐욕,토큰
0,2022-05-03,안티 살발하다,34,2,0,안티 살발하다,"['안티', '살발']",0,"[46, 9126]"
1,2022-05-03,카카오는 왜 이렇게 국민들의 안티기업이 ...,81,3,2,카카오는 왜 이렇게 국민들의 안티기업이,"['카카오', '왜', '국민', '안티', '기업']",0,"[2, 30, 143, 46, 63]"
2,2022-05-03,팔아처묵은 개미새0 말참많은내,96,4,4,팔아처묵은 개미새 말참많은내,"['팔', '처', '묵다', '개미', '새', '말', '차다', '많다']",0,"[226, 421, 1066, 7, 965, 50, 219, 71]"
3,2022-05-03,내일 카카오 -14프로 내리는날,588,7,6,내일 카카오 프로 내리는날,"['내일', '카카오', '프로', '내리다', '날']",0,"[24, 2, 81, 151, 99]"
4,2022-05-03,버핏은 애플 더 담았단다,128,1,1,버핏은 애플 더 담았단다,"['버핏', '애플', '더', '담다']",1,"[2776, 999, 35, 182]"
...,...,...,...,...,...,...,...,...,...
74493,2017-10-18,■낼 셀트로 가삐까?,480,3,0,낼 셀트로 가삐까,"['낼', '셀트', '가다', '삐', '끄다']",1,"[124, 595, 9, 3387, 159]"
74494,2017-10-18,■셀트는 낼도 폭등이다,623,0,4,셀트는 낼도 폭등이다,"['셀', '틀다', '낼', '폭등']",1,"[1473, 1099, 124, 213]"
74495,2017-10-18,여기서 털리는 빠가사리들은 나가죽어라,693,6,6,여기서 털리는 빠가사리들은 나가죽어라,"['여기', '털리다', '빠가사리', '나가다', '죽다']",0,"[79, 310, 12007, 207, 312]"
74496,2017-10-18,어차피 또 폭등할텐데 떨필요있나 기회주면...,795,15,3,어차피 또 폭등할텐데 떨필요있나 기회주면,"['어차피', '또', '폭등', '하다', '떨다', '필요', '있다', '회주']",1,"[348, 60, 213, 3, 495, 1360, 14, 4709]"


In [6]:
# Bring every id sequence to a fixed length of 15 (padding/truncation side is
# left at the Keras defaults), producing one 2-D array for the model input.
train = pad_sequences(sequences=df_train['토큰'], maxlen=15)
print(np.shape(train))

In [16]:
# Encode the label column as integers, then shape it as a column vector
# with one row per training sample.
encoder = LabelEncoder()
label = encoder.fit_transform(df_train['공포탐욕'])

# `batch_size` here is the number of labelled rows and `input_dim` the width
# of the target (a single value per row).
batch_size, input_dim = len(label), 1
label = label.reshape(batch_size, input_dim)

label.shape

(74498,)

In [20]:
# Binary classifier: word-id embedding -> single LSTM layer -> sigmoid unit.
# The embedding's input size (30000) matches the tokenizer's `num_words` cap.
model = Sequential([
    Embedding(30000, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Train on the padded sequences for 5 epochs in mini-batches of 32; the
# returned History object is kept in `hist`.
# NOTE(review): no validation data is passed, so only training metrics are recorded.
hist = model.fit(
    x=train,
    y=label,
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Write the trained model to disk as a single HDF5 (.h5) file.
model.save(filepath='./src/mymodel1.h5')