In [1]:
import multiprocessing
import numpy as np
import os

In [2]:
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary

In [3]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout

Using TensorFlow backend.


In [4]:
#한글 토큰화를 위한 API들
from konlpy.tag import Okt
from pprint import pprint
import nltk

In [5]:
np.random.seed(1337)

In [6]:
vocab_dim = 300 #특징 Vector의 차원 설정
maxlen = 100 #최대 시퀀스의 길이.
n_iterations = 10  # 반복 수, 많을수록 word2vec의 품질 향상.
n_exposures = 30 # word 빈도 최소 기준 수
window_size = 7 #target word를 기준으로 예측 단어와의 최대 거리
batch_size = 32 #한 번에 학습할 데이터의 크기
n_epoch = 2 #학습 반복 수
input_length = 100 # 상수 일 때 입력 시퀀스의 길이. Maxlen과 같아야 함.
cpu_count = multiprocessing.cpu_count() #학습시킬 컴퓨터 사양 확인

In [7]:
def read_data(filename):
    with open(filename, 'r', encoding='utf8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:] #header 제외
    return data

In [8]:
train_data = read_data('./ratings_train.txt') 
test_data = read_data('./ratings_test.txt')

In [9]:
print(len(train_data)) # train_data : 150,000
print(len(train_data[0]))
print (train_data[0])

print(len(test_data)) # test_data : 50,000
print(len(test_data[0]))
print(test_data[0])

150000
3
['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']
50000
3
['6270596', '굳 ㅋ', '1']


In [10]:
train = {}
for i in range(0, len(train_data)):
    train[i] = train_data[i][1]

In [11]:
test = {}
for i in range(0, len(test_data)):
    test[i] = test_data[i][1]

In [12]:
pos_tagger = Okt()

In [13]:
pos_tagger.morphs(train[0])

['아', '더빙', '..', '진짜', '짜증나네요', '목소리']

In [14]:
def tokenizer(text):
    text = [pos_tagger.morphs(document) for document in text]
    return text

In [15]:
combined = train.copy()
combined.update(test)

In [16]:
combined = tokenizer(combined.values())

In [17]:
model = Word2Vec(size = vocab_dim,
                 min_count = n_exposures,
                 window = window_size,
                 workers = cpu_count,
                 iter = n_iterations)

In [18]:
model.build_vocab(combined)

In [19]:
model.train(combined, total_examples=model.corpus_count, epochs=model.iter)

  """Entry point for launching an IPython kernel.


(14830623, 21536280)

In [20]:
def create_dictionaries(train = None, test = None, model = None):
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #단어 목록을 (token_id, token_count)인 2-tuples 생성
        w2indx = {v: k+1 for k, v in gensim_dict.items()} # (key, value)인 2-tuples 생성
        w2vec = {word: model[word] for word in w2indx.keys()}
        #각 단어의 word2vec array 생성

        #word의 value를 기준으로 수치화 된 train, test데이터 생성. 
        def parse_dataset(data):
            for i in range(0, len(data)):
                txt = pos_tagger.morphs(data[i])
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[i] = new_txt
            return data
        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')

In [21]:
index_dict, word_vectors, train, test = create_dictionaries(train = train,
                                                            test = test,
                                                            model = model)

  import sys


In [22]:
n_symbols = len(index_dict) + 1  # adding 1 to account for 0th index
embedding_weights = np.zeros((n_symbols, vocab_dim))
for word, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[word]

In [23]:
X_train = train.values()
y_train = []
for z in train_data:
    y_train.append(z[2])
y_train = y_train

X_test = test.values()
y_test = []
for zz in test_data:
    y_test.append(zz[2])
y_test = y_test

In [24]:
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (150000, 100)
X_test shape: (50000, 100)


In [25]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [26]:
model = Sequential()
model.add(Embedding(output_dim = vocab_dim,
                    input_dim = n_symbols,
                    mask_zero = True,
                    weights = [embedding_weights],
                    input_length = input_length))

model.add(LSTM(vocab_dim))
model.add(Dropout(0.3)) #한번에 학습시킬 학습률
model.add(Dense(1, activation = 'sigmoid')) #output이 0과 1이므로 sigmoid로 설정
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          1741500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 2,463,001
Trainable params: 2,463,001
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy', #output이 0과 1이므로 binary
              metrics = ['accuracy'])

In [29]:
model.fit(X_train, y_train,
          batch_size = batch_size,
          nb_epoch = n_epoch,
          validation_data = (X_test, y_test),
          shuffle = True)

  """


Train on 150000 samples, validate on 50000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1e3925b3630>

In [30]:
score = model.evaluate(X_test, y_test, batch_size = batch_size)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.32008245854854583
Test accuracy: 0.8579800128936768
