## requirements

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev python3-dev
!pip3 install JPype1-py3 konlpy pandas tqdm
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"

## read data

In [None]:
import pandas as pd

# Load the tab-separated ratings training file into a DataFrame.
df = pd.read_csv('https://raw.githubusercontent.com/fllff/nlp_basic_lec/master/ratings_train.txt',sep='\t')

# Drop rows containing NaN
df = df.dropna()

# Shuffle all rows; the fixed seed makes the row order identical on every run
# so downstream results are reproducible.
df = df.sample(frac=1, random_state=42)

print(df)

## tokenization

In [None]:
from konlpy.tag import Mecab
# Create the Mecab tagger once; later cells reuse this `mecab` object.
mecab = Mecab()

# Quick check: tokenize a sample Korean sentence and display the result.
mecab.morphs('한국어 토큰 분리기 사용방법')


In [None]:
from tqdm import tqdm

# Replace each raw review string in df['document'] with its Mecab token list.
# The column is rebuilt in one pass instead of writing cell-by-cell through
# iterrows()/df.at, and tqdm is handed the Series so it can show a total.
# Values that are no longer strings are passed through unchanged, so re-running
# this cell does not fail on already-tokenized rows.
tokenized_docs = [
    mecab.morphs(text) if isinstance(text, str) else text
    for text in tqdm(df['document'], total=len(df))
]
# Wrap in a Series aligned on df's index so each cell holds one token list.
df['document'] = pd.Series(tokenized_docs, index=df.index)

print(df)

In [None]:
doc_list = df['document']
label_list = df['label']

doc_len_list = [len(doc) for doc in doc_list]

# Check the document-length distribution: bucket k counts documents whose
# length is in [10*k, 10*k + 9]. Anything too long for the table is clamped
# into the final bucket instead of raising IndexError.
distribution = [0]*20
last_bucket = len(distribution) - 1
for doc_len in doc_len_list:
    distribution[min(doc_len // 10, last_bucket)] += 1
print(distribution)

# Check label balance
label_list.value_counts()

In [None]:
# Keep at most the first 50 tokens of every document
doc_list = [tokens[:50] for tokens in doc_list]

# Re-check the length distribution after trimming (buckets of 10 tokens)
distribution = [0 for _ in range(20)]
for length in map(len, doc_list):
    distribution[length // 10] += 1
print(distribution)

## vectorization


In [None]:
from gensim.models import Word2Vec

# Train a 16-dimensional Word2Vec model on the trimmed token lists.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` — confirm the installed version.
word_model = Word2Vec(
    doc_list,
    size=16,
    window=5,
    min_count=50,
    workers=2,
    iter=3,
)


In [None]:
# Sanity check of the trained embeddings: display the entries most similar to the query token.
word_model.wv.most_similar('배우')

In [None]:
# Display the learned embedding vector stored for the same query token.
word_model.wv['배우']

In [None]:
# Convert each token to its embedding vector (token -> vector).
# Width of the zero vector used for out-of-vocabulary tokens; read from the
# trained model so it always matches the real embeddings.
embedding_dim = word_model.wv.vector_size

vector_list = []
for doc in tqdm(doc_list):
    doc_vectors = []
    for token in doc:
        try:
            doc_vectors.append(word_model.wv[token].tolist())
        except KeyError:
            # Token is not in the Word2Vec vocabulary: use an all-zero vector.
            # Only KeyError is caught so that real failures (and Ctrl-C) are
            # not silently swallowed.
            doc_vectors.append([0]*embedding_dim)

    vector_list.append(doc_vectors)

print(vector_list[0])


In [None]:
MAX_LEN = 50

# Pad every document in place with all-zero vectors up to MAX_LEN entries.
# Documents that already have MAX_LEN (or more) entries are left untouched
# because range() of a non-positive number is empty.
# tqdm wraps the list itself so the progress bar knows the total.
for vector in tqdm(vector_list):
    vector.extend([0]*16 for _ in range(MAX_LEN - len(vector)))

print(len(vector_list[0]))

## Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [None]:
# Build the classifier: (50, 16) sequence input -> LSTM(16) -> Dense(1) -> sigmoid
inputs = keras.Input(shape=(50, 16))
# Alternative recurrent layer kept for comparison: layers.SimpleRNN(16)
recurrent_out = layers.LSTM(16)(inputs)
logit = layers.Dense(1)(recurrent_out)
outputs = keras.activations.sigmoid(logit)

model = keras.Model(inputs=inputs, outputs=outputs)

# Show the layer table and render the model graph
model.summary()
keras.utils.plot_model(model)

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
import datetime

# Each run logs to its own timestamped directory under logs/.
log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, update_freq='batch')

# The callback must be passed through the `callbacks` keyword: a bare positional
# argument after keyword arguments is a SyntaxError.
model.fit(np.asarray(vector_list),
          np.asarray(label_list),
          batch_size=32,
          epochs=3,
          validation_split=0.01,
          callbacks=[tensorboard_callback])


## predict

In [None]:
sentence = '난 좀 지루한 느낌'

# Vectorize the sentence the same way as the training data: tokenize, look up
# each token's embedding, then zero-pad to MAX_LEN.
test_vector = []
# Truncate to MAX_LEN tokens so a long sentence cannot produce an input longer
# than the sequence length the model was built for.
for token in mecab.morphs(sentence)[:MAX_LEN]:
    try:
        test_vector.append(word_model.wv[token].tolist())
    except KeyError:
        # Out-of-vocabulary token: use an all-zero vector. Only KeyError is
        # caught so unrelated errors are not hidden.
        test_vector.append([0]*16)

test_vector.extend([[0]*16 for x in range(MAX_LEN-len(test_vector))])
print(test_vector)
# Wrap in a list to form a batch of one sample.
model.predict(np.asarray([test_vector]))


In [None]:
# Load the TensorBoard notebook extension and open it on the `logs` directory,
# which is where the training cell creates its timestamped run directories.
%load_ext tensorboard
%tensorboard --logdir logs