# 한글 단어열 예측 

In [1]:
import pandas as pd
import numpy as np  
import re
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers

import codecs
from tqdm import tqdm
import shutil

# 데이터

## Tokenizer 설치

https://github.com/kakao/khaiii

In [2]:
import os
home_path = os.getcwd()
print(home_path)

/content


In [3]:
!git clone https://github.com/kakao/khaiii.git

Cloning into 'khaiii'...
remote: Enumerating objects: 1016, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 1016 (delta 46), reused 76 (delta 29), pack-reused 877[K
Receiving objects: 100% (1016/1016), 33.06 MiB | 30.12 MiB/s, done.
Resolving deltas: 100% (404/404), done.


In [4]:
!pip install cmake



In [5]:
%cd khaiii
!mkdir build
%cd build

!cmake ..

!make all
!make resource
!make install
!make package_python

%cd package_python
!pip install .

/content/khaiii
/content/khaiii/build
-- [hunter] Initializing Hunter workspace (70287b1ffa810ee4e952052a9adff9b4856d0d54)
-- [hunter]   https://github.com/ruslo/hunter/archive/v0.23.34.tar.gz
-- [hunter]   -> /root/.hunter/_Base/Download/Hunter/0.23.34/70287b1
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test fma_compiles
-- Performing Test fma_compiles - Success
-- Performing Test fma_runs
-- Performing Test fma_runs - Success
-- [

In [6]:
%cd {home_path}

/content


In [7]:
from khaiii import KhaiiiApi
api = KhaiiiApi()
for word in api.analyze('토크나이징이 잘 되나요?'):
	print(word)

토크나이징이	토크나이/NNP + 징/NNG + 이/JKS
잘	잘/MAG
되나요?	되/VV + 나요/EF + ?/SF


## 데이터 다운로드

In [8]:
!wget https://github.com/dhrim/deep_learning_data/raw/master/movie_ratings.txt

--2021-11-10 02:02:12--  https://github.com/dhrim/deep_learning_data/raw/master/movie_ratings.txt
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dhrim/deep_learning_data/master/movie_ratings.txt [following]
--2021-11-10 02:02:12--  https://raw.githubusercontent.com/dhrim/deep_learning_data/master/movie_ratings.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19515078 (19M) [text/plain]
Saving to: ‘movie_ratings.txt’


2021-11-10 02:02:13 (124 MB/s) - ‘movie_ratings.txt’ saved [19515078/19515078]



## 데이터 로딩

In [9]:
df = pd.read_table("movie_ratings.txt")

In [10]:
df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


## 데이터 섞기

In [11]:
df = df.sample(frac=1).reset_index(drop=True) 

df.head()

Unnamed: 0,id,document,label
0,3505106,정말 재미 있네요.,1
1,3996727,"귤이 회수를 건너니,탱자가 됐다.이걸 리메이크 하느니 차라리 독수리오형제가 어때?",0
2,7092100,촌스럽고 지루해여 근데 유동근아내가 선생님팰땐 존나웃겨ㅛ음ㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0
3,8622367,단디하세여~~~282828,1
4,6214038,너무 한 출연자에게만 어필하고...송은희씨는 너무 끼어들고 그치만 신동엽씨는 역시 ...,0


## 입력과 출력 데이터 분리

In [12]:
reviews = df.document.values.copy()
labels = df.label.values.copy()

In [13]:
print(reviews.shape)
print(labels.shape)

(200000,)
(200000,)


## 토큰나이징

In [15]:
PAD = "[PAD]"
UNK = "[UNK]"

In [19]:
VOCA_SIZE = 4000 # 어휘 사전의 크기
SEQ_LENGTH = 128 # 리뷰 최대 길이

In [43]:
from khaiii import KhaiiiApi
from tqdm import tqdm


def tokenize(reviews):
  api = KhaiiiApi()
  tokenized = []
  # for review in tqdm(reviews):
  for review in reviews:
    review = str(review)
    # review = '어릴때보고 지금다시봐도 재밌어요ㅋㅋ'
    tokens = []
    words = api.analyze(review)
    # words = ['어릴때보고' '지금다시봐도' '재밌어요ㅋㅋ']

    for word in words:
      # word = '어릴때보고'  
      # word.morphs = ['어리', 'ㄹ', '때', '보', '고']
      for i in range(len(word.morphs)):
        token = word.morphs[i].lex
        tokens.append(token)
      # tokens = ['어리', 'ㄹ', '때', '보', '고', '지금다시', '보', '아도', '재미있', '어요', 'ㅋㅋ']
    tokens = tokens[:SEQ_LENGTH]
    tokenized.append(tokens)
  return tokenized

In [44]:
tokenized_reviews = tokenize(reviews)

## 인코딩

In [46]:
import collections
from collections import OrderedDict

def build_index(tokenized_reviews):

  all_tokens = []
  for tokens in tokenized_reviews:
    all_tokens.extend(tokens)

 # 빈도 순으로 정열
  counts = collections.Counter(all_tokens)
  sorted_tokens = sorted(all_tokens, key=counts.get, reverse=True)

  # 단어 중복 삭제
  sorted_tokens = list(OrderedDict.fromkeys(sorted_tokens))

  # voca 크기 이상의 단어는 버린다.
  print("len(sorted_tokens) =", len(sorted_tokens))
  sorted_tokens = sorted_tokens[:VOCA_SIZE-2] # PAD, UNK 2개
  sorted_tokens.insert(0, UNK)
  sorted_tokens.insert(0, PAD)
  print("len(sorted_tokens) =", len(sorted_tokens))

  word2index = { word:index for index,word in enumerate(sorted_tokens)}
  index2word = { index:word for index,word in enumerate(sorted_tokens)}

  return word2index, index2word


def encode(tokenized_reviews, word2index):

  encoded_reviews = []
  for tokens in tokenized_reviews:
    # tokens = ['아', '더', '빙', '..', '진짜', '짜증', '나네', '요', '목소리']

    encoded = []
    for token in tokens:
      token = token if token in word2index else UNK
      encoded.append(word2index[token])

    # encoding = [2, 3, 4, 1, 5, 6, 7, 8, 9]

    # 길이가 모자라면 PAD(0)으로 채운다.
    padding_length = SEQ_LENGTH - len(encoded)
    encoded.extend([word2index[PAD]]*padding_length)
    # encoded = [2, 3, 4, 1, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, ... ]
    encoded_reviews.append(encoded)

  return encoded_reviews

word2index, index2word = build_index(tokenized_reviews)
encoded_reviews = encode(tokenized_reviews, word2index)


len(sorted_tokens) = 108036
len(sorted_tokens) = 4000


In [47]:
print(word2index)
print(index2word)

{'[PAD]': 0, '[UNK]': 1, '이': 2, '.': 3, '하': 4, 'ㄴ': 5, '는': 6, '다': 7, '영화': 8, '고': 9, '보': 10, '가': 11, '의': 12, '도': 13, '에': 14, '은': 15, '을': 16, '지': 17, '었': 18, '!': 19, '어': 20, 'ㄹ': 21, '들': 22, '게': 23, '았': 24, '..': 25, '나': 26, ',': 27, '것': 28, '있': 29, '를': 30, '없': 31, '...': 32, '아': 33, '?': 34, '만': 35, '되': 36, '좋': 37, '는데': 38, '기': 39, '로': 40, '주': 41, '적': 42, '너무': 43, '였': 44, '여': 45, '네': 46, '으로': 47, '음': 48, '정말': 49, 'ㅁ': 50, '같': 51, 'ㄴ다': 52, '에서': 53, '~': 54, '어요': 55, 'ㅋ': 56, '점': 57, '지만': 58, '않': 59, '안': 60, '말': 61, '수': 62, '면': 63, '아니': 64, '과': 65, '거': 66, '시': 67, '그': 68, '만들': 69, '네요': 70, '재미있': 71, '뭐': 72, '연기': 73, '던': 74, '평점': 75, 'ㅂ니다': 76, '진짜': 77, '잘': 78, 'ㅠ': 79, '라': 80, '나오': 81, '겠': 82, '재밌': 83, '1': 84, '이런': 85, '와': 86, '최고': 87, '요': 88, '습니다': 89, '이것': 90, '듯': 91, 'ㅋㅋ': 92, '왜': 93, '생각': 94, '싶': 95, '더': 96, '내': 97, '어서': 98, '스토리': 99, '사람': 100, '아서': 101, '까지': 102, '감동': 103, '오': 104, '한': 105, '^'

In [49]:
x = np.array(encoded_reviews)
print(x.shape)

y = labels
print(y.shape)

(200000, 128)
(200000,)


# 모델 학습

## 데이터 준비

In [50]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, shuffle=True)

In [51]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(160000, 128)
(160000,)
(40000, 128)
(40000,)


## 학습 실행

In [52]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

EMBEDDING_SIZE = 64

model = Sequential()
model.add(Input(SEQ_LENGTH))
model.add(Embedding(VOCA_SIZE, EMBEDDING_SIZE))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64)))
model.add(Dense(250, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(train_x, train_y, batch_size=32, epochs=5, validation_split=0.1)

# Evaluation
loss, acc = model.evaluate(test_x, test_y)
print("loss =", loss)
print("acc =", acc)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 64)           256000    
                                                                 
 batch_normalization_2 (Batc  (None, 128, 64)          256       
 hNormalization)                                                 
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 250)               32250     
                                                                 
 batch_normalization_3 (Batc  (None, 250)              1000      
 hNormalization)                                                 
                                                      

## 분류 실행

In [95]:
test_text = "흥미롭기는 했는데 전개가 뻔함. 그래도 재밌음. 추추추"

def do_classify(test_text):
  tokenized_text = tokenize([test_text])
  encoded_text = encode(tokenized_text, word2index)
  model_input = np.array(encoded_text)
  y_ = model.predict(model_input)
  predicted = "긍정" if y_>0.5 else "부정"

  print(test_text, "-->", predicted, ",score :",y_[0][0])

do_classify("여운이 많이 남는 영화")
do_classify("여운이 많이 남는 영화. 스토리 전개는 뻔함.")
do_classify("여운이 많이 남는 영화. 스토리 전개는 뻔함. 시간 때우기 용")
do_classify("여운이 많이 남는 영화. 스토리 전개는 뻔함. 시간 때우기 용, 비추.")

여운이 많이 남는 영화 --> 긍정 ,score : 0.98757905
여운이 많이 남는 영화. 스토리 전개는 뻔함. --> 긍정 ,score : 0.87313455
여운이 많이 남는 영화. 스토리 전개는 뻔함. 시간 때우기 용 --> 긍정 ,score : 0.60144633
여운이 많이 남는 영화. 스토리 전개는 뻔함. 시간 때우기 용, 비추. --> 부정 ,score : 0.42041746
