# Preprocessing

In [254]:
import os
import sys
import numpy as np
import gensim

In [255]:
file_path = os.path.join('review_data','ratings.txt')

In [256]:
sentences = []
labels = []
with open(file_path, 'r') as f:
    next(f) # header skip
    for line in f.readlines():
        _, doc, label = line.strip().split('\t')
        sentences.append(doc.strip())
        labels.append(label.strip())

In [257]:
len(sentences) # 전체 문장

200000

In [258]:
sentences[:5]

['어릴때보고 지금다시봐도 재밌어요ㅋㅋ',
 '디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.',
 '폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.',
 '와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지',
 '안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.']

In [259]:
from konlpy.tag import Twitter
twi_tagger = Twitter()

In [260]:
# 품사 중 명사, 동사, 형용사, 부사, 감탄사(헐, 어머나), 한국어약어(ㅋㅋㅋ) 만 추출해봄.
Non_Stop_words = set(["Noun", "Verb","Adjective","Adverb","Exclamation","KoreanParticle"])

# (돈, Noun) -> "돈/Noun" 형태로 형태소 분석.
def tokenizer_twit(doc, remove_stopwords=True):
    if remove_stopwords:
        word_list = ['/'.join(t) for t in twi_tagger.pos(doc, norm=True, stem=True) if t[-1] in Non_Stop_words]
    else:
        word_list = ['/'.join(t) for t in twi_tagger.pos(doc, norm=True, stem=True)]
    return word_list

In [261]:
tokenizer_twit(sentences[1], remove_stopwords=True)

['디자인/Noun',
 '배우다/Verb',
 '학생/Noun',
 '외국/Noun',
 '디자이너/Noun',
 '그/Noun',
 '일군/Noun',
 '전통/Noun',
 '통해/Noun',
 '발전/Noun',
 '하다/Verb',
 '문화/Noun',
 '산업/Noun',
 '부럽다/Adjective',
 '사실/Noun',
 '우리나라/Noun',
 '그/Noun',
 '어렵다/Adjective',
 '시절/Noun',
 '끝/Noun',
 '열정/Noun',
 '지키다/Verb',
 '노라노/Noun',
 '같다/Adjective',
 '전통/Noun',
 '있다/Adjective',
 '저/Noun',
 '같다/Adjective',
 '사람/Noun',
 '꿈/Noun',
 '꾸다/Verb',
 '이루다/Verb',
 '나가다/Verb',
 '수/Noun',
 '있다/Adjective',
 '것/Noun',
 '감사/Noun',
 '하다/Verb']

In [262]:
words_list=[]
statement_num=0
for sent in sentences:
    words_list.append(tokenizer_twit(sent, remove_stopwords=False))
    statement_num += 1
    if statement_num%10000 == 0:
        print("statement_num : %d" % statement_num)

statement_num : 10000
statement_num : 20000
statement_num : 30000
statement_num : 40000
statement_num : 50000
statement_num : 60000
statement_num : 70000
statement_num : 80000
statement_num : 90000
statement_num : 100000
statement_num : 110000
statement_num : 120000
statement_num : 130000
statement_num : 140000
statement_num : 150000
statement_num : 160000
statement_num : 170000
statement_num : 180000
statement_num : 190000
statement_num : 200000


# Word2vec Modeling

In [263]:
from gensim.models import word2vec
import multiprocessing
import time
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import ColumnDataSource, LabelSet
from bokeh import palettes

In [37]:
# Set parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = multiprocessing.cpu_count() # Number of threads to run in parallel
context = 3 # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
negative = 20
iter_ = 20 # 얼마나 반복할지.
sg=1

In [38]:
start = time.time() # 작업시간 검사용!
# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(words_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, negative=negative, iter=iter_, sg=sg)
end = time.time()
print(end-start)

Training model...
151.4017848968506


In [43]:
len(model.wv.index2word) # 전체 단어

10327

In [95]:
# 모델 저장
model_name = os.path.join('w2v_file','300features_10minwords_3context_twi')
model.save(model_name)

INFO:gensim.utils:saving Word2Vec object under w2v_file/300features_10minwords_3context_twi, separately None
INFO:gensim.utils:not storing attribute syn0norm
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved w2v_file/300features_10minwords_3context_twi


In [203]:
# # 모델 로드.
# model_name = os.path.join('w2v_file','300features_10minwords_3context_twi')
# model = word2vec.Word2Vec.load(model_name)

INFO:gensim.utils:loading Word2Vec object from w2v_file/300features_10minwords_3context_twi
INFO:gensim.utils:loading wv recursively from w2v_file/300features_10minwords_3context_twi.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded w2v_file/300features_10minwords_3context_twi


In [57]:
def vis_top_n(model, filepath, top, lr=500, n_iter=1000, perplexity=10):
    """
    전체 단어 중 특정 빈도 이상의 단어만 시각화.
    :param model: gensim word2vec model
    :param filepath: 시각화파일(html) 저장할 파일 경로
    :param top: 빈도 상위 몇개의 단어까지 시각화할지
    """
    vectors = model.wv.syn0[:top]
    labels = model.wv.index2word[:top]
    counts = [model.wv.vocab[label].count for label in labels]  # _labels의 빈도수 (빈도수별 사이즈 다르게 하기 위해)

    if np.shape(vectors)[1] > 2:  # 입력값이 2차원일경우에는 차원축소 하지 않음.
        print('tsne....')
        start = time.time()
        tsne = TSNE(perplexity=perplexity, n_components=2, init='random', n_iter=n_iter, verbose=1, learning_rate=lr, method='exact')  # tsne를 이용한 차원 축소 (n차원 -> 2차원)
        vectors = tsne.fit_transform(vectors)
        print(time.time()-start)

    _filepath = '%s[top_%i].html' % (filepath, top)

    source = ColumnDataSource(
        data=dict(
            x=vectors.T[0],
            y=vectors.T[1],
            size=(np.log1p(np.array(counts))) * 1.2 + 2,
            word=labels,
            color=['#0099ff'] * len(labels),
        )
    )
    label_set = LabelSet(x='x', y='y', text='word', level='glyph', text_color="#111111", text_alpha=0.6, text_font_size='8pt', x_offset=5, y_offset=5, source=source, render_mode='canvas')
    tools = "pan,wheel_zoom,box_zoom,reset,resize"
    p = figure(plot_width=900, plot_height=900, tools=[tools], title='word2vec vis top %i' % top)
    p.circle('x', 'y', size='size', source=source, alpha=0.6, fill_color='color', line_color='#eeeeee')
    p.add_layout(label_set)

    output_file(_filepath, title=_filepath)
    show(p)

In [64]:
# vis_top_n(model=model, filepath=os.path.join('w2v_file','300features_10minwords_3context_twi'), top=2000, lr=500, n_iter=1000, perplexity=10)

tsne....
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 0.954084
[t-SNE] KL divergence after 100 iterations with early exaggeration: 35.931646


INFO:bokeh.core.state:Session output file 'w2v_file/300features_10minwords_3context_twi[top_2000].html' already exists, will be overwritten.


[t-SNE] Error after 200 iterations: 35.931646
32.547072887420654


# CNN

In [253]:
from bs4 import BeautifulSoup

import re
import sys
import os
import numpy as np
import pandas as pd
from nltk import tokenize

from sklearn.metrics import classification_report

In [198]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import losses

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model

In [99]:
words_list[:5]

[['어리다/Verb',
  '때/Noun',
  '보고/Noun',
  '지금/Noun',
  '다시/Noun',
  '봐/Noun',
  '재밌다/Adjective',
  'ㅋㅋ/KoreanParticle'],
 ['디자인/Noun',
  '배우다/Verb',
  '학생/Noun',
  '외국/Noun',
  '디자이너/Noun',
  '그/Noun',
  '일군/Noun',
  '전통/Noun',
  '통해/Noun',
  '발전/Noun',
  '하다/Verb',
  '문화/Noun',
  '산업/Noun',
  '부럽다/Adjective',
  '사실/Noun',
  '우리나라/Noun',
  '그/Noun',
  '어렵다/Adjective',
  '시절/Noun',
  '끝/Noun',
  '열정/Noun',
  '지키다/Verb',
  '노라노/Noun',
  '같다/Adjective',
  '전통/Noun',
  '있다/Adjective',
  '저/Noun',
  '같다/Adjective',
  '사람/Noun',
  '꿈/Noun',
  '꾸다/Verb',
  '이루다/Verb',
  '나가다/Verb',
  '수/Noun',
  '있다/Adjective',
  '것/Noun',
  '감사/Noun',
  '하다/Verb'],
 ['폴리스스토리/Noun',
  '시리즈/Noun',
  '부터/Noun',
  '뉴/Noun',
  '버리다/Verb',
  '하나/Noun',
  '없다/Adjective',
  '최고/Noun'],
 ['와/Noun',
  '연기/Noun',
  '진짜/Noun',
  '개/Noun',
  '쩔다/Verb',
  '지루하다/Adjective',
  '생각/Noun',
  '하다/Verb',
  '몰입/Noun',
  '보다/Verb',
  '그렇다/Adjective',
  '이렇다/Adjective',
  '진짜/Noun',
  '영화/Noun'],
 ['안개/Noun',
  '자욱/Noun',
  '밤하늘/No

In [100]:
labels[:5]

['1', '1', '1', '1', '1']

In [92]:
w2v_dics = set(model.wv.index2word)

In [104]:
model.wv.index2word

['영화/Noun',
 '하다/Verb',
 '보다/Verb',
 '없다/Adjective',
 '있다/Adjective',
 '좋다/Adjective',
 '너무/Noun',
 '되다/Verb',
 '재밌다/Adjective',
 '정말/Noun',
 'ㅋㅋ/KoreanParticle',
 '것/Noun',
 '이/Noun',
 '진짜/Noun',
 '같다/Adjective',
 '아니다/Adjective',
 '않다/Verb',
 '점/Noun',
 '연기/Noun',
 '나오다/Verb',
 '만들다/Verb',
 '이렇다/Adjective',
 '평점/Noun',
 '최고/Noun',
 '이다/Verb',
 '왜/Noun',
 '생각/Noun',
 '스토리/Noun',
 '드라마/Noun',
 '감동/Noun',
 '사람/Noun',
 '보고/Noun',
 '말/Noun',
 '더/Noun',
 '아깝다/Adjective',
 '내/Noun',
 '다/Adverb',
 '때/Noun',
 '배우/Noun',
 '안/Noun',
 '감독/Noun',
 '재미있다/Adjective',
 '그냥/Noun',
 '거/Noun',
 '뭐/Noun',
 '내용/Noun',
 '봐/Noun',
 '재미/Noun',
 '시간/Noun',
 '아/Exclamation',
 '그/Noun',
 '들다/Verb',
 '재미없다/Adjective',
 '자다/Verb',
 '가다/Verb',
 '좀/Noun',
 '그렇다/Adjective',
 '지루하다/Adjective',
 '쓰레기/Noun',
 '주다/Verb',
 '나/Noun',
 '수/Noun',
 '싶다/Verb',
 '사랑/Noun',
 '알다/Verb',
 '작품/Noun',
 '하나/Noun',
 '다시/Noun',
 'ㅠㅠ/KoreanParticle',
 '마지막/Noun',
 '볼/Noun',
 'ㅋ/KoreanParticle',
 '이다/Adjective',
 '이렇게/Adverb',
 '모르다/Ve

In [110]:
sequences = [[model.wv.index2word.index(word) for word in wl if word in model.wv.index2word] for wl in words_list]

In [111]:
sequences[:10]

[[144, 37, 31, 83, 67, 46, 8, 10],
 [3429,
  419,
  1335,
  1262,
  6823,
  50,
  4428,
  1093,
  1243,
  1,
  1013,
  5607,
  1603,
  353,
  337,
  50,
  533,
  526,
  92,
  1225,
  1041,
  14,
  4428,
  4,
  169,
  14,
  30,
  559,
  1353,
  1317,
  380,
  61,
  4,
  11,
  511,
  1],
 [6598, 232, 1002, 4429, 115, 66, 3, 23],
 [126, 18, 13, 100, 887, 57, 26, 1, 256, 2, 56, 21, 13, 0],
 [5608, 527, 4, 14, 0],
 [63, 352, 30, 406, 75, 92, 272, 4, 0],
 [77, 29, 72, 67, 46, 29],
 [100, 442, 19, 19, 2751, 31, 62],
 [226],
 [804, 15, 1131, 5471, 24]]

In [120]:
MAX_SEQUENCE_LENGTH = 50
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # MAX_SEQUENCE_LENGTH만큼 뒤 기준으로 잘림.

In [121]:
labels = to_categorical(np.asarray(labels)) # label one-hot encoding
labels

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [122]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (200000, 50)
Shape of label tensor: (200000, 2)


In [123]:
indices = np.arange(data.shape[0])
indices[:20]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [125]:
np.random.shuffle(indices) # 데이터 셔플링

In [126]:
indices[:20]

array([131525, 105165, 188297,  25203, 161328,  27851,  81846, 118409,
       142287,  32300, 160441, 134339, 172886, 121093,  41662, 104376,
         7027, 112700,  26236,  32529])

In [127]:
data = data[indices] # 데이터 셔플링
labels = labels[indices]

In [185]:
VALIDATION_SPLIT = 0.3
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

X_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
X_test = data[-nb_validation_samples:]
y_test = labels[-nb_validation_samples:]

In [186]:
print('Number of positive and negative reviews in traing and validation set ')
print(y_train.sum(axis=0))
print(y_test.sum(axis=0))

Number of positive and negative reviews in traing and validation set 
[ 70115.  69885.]
[ 29885.  30115.]


In [131]:
EMBEDDING_DIM = 300

In [160]:
embedding_matrix = np.random.random((len(model.wv.index2word) + 1, EMBEDDING_DIM)) # bias term 때문에 +1 해준듯.. 
embedding_matrix # 랜덤값으로 초기화

array([[ 0.19821844,  0.8549015 ,  0.83916541, ...,  0.78965441,
         0.6625393 ,  0.40503732],
       [ 0.63219422,  0.32725754,  0.47736539, ...,  0.78555428,
         0.58840358,  0.7010478 ],
       [ 0.75109203,  0.2466851 ,  0.29521671, ...,  0.39468461,
         0.85269475,  0.31744998],
       ..., 
       [ 0.94201106,  0.57715724,  0.85454293, ...,  0.8685932 ,
         0.86615721,  0.19788842],
       [ 0.26090202,  0.21765173,  0.97367337, ...,  0.68190049,
         0.5219335 ,  0.68640958],
       [ 0.8431265 ,  0.1606079 ,  0.05116419, ...,  0.12646525,
         0.48029007,  0.56875867]])

In [161]:
## TODO : 랜덤값 -1~1로 하기

In [162]:
embedding_matrix.shape

(10328, 300)

In [163]:
for i, word in enumerate(model.wv.index2word):
    embedding_matrix[i] = model[word]

## 1) A simplified Convolutional
- Simply use total 128 filters with size 5 and max pooling of 5 and 35

In [204]:
embedding_layer = Embedding(len(model.wv.index2word) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [205]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
# l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

In [206]:
model = Model(sequence_input, preds)
model.compile(loss=losses.categorical_crossentropy, optimizer='rmsprop', metrics=['acc'])

In [207]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 50)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 50, 300)           3098400   
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 46, 128)           192128    
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 9, 128)            0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 5, 128)            82048     
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 1, 128)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 128)               0         
__________

In [208]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=4096)

Train on 140000 samples, validate on 60000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14ffccf60>

In [264]:
# preds = model.predict_classes(X_test, batch_size=4096)

In [241]:
preds = model.predict(X_test, batch_size=1024)
preds = [1 if p[1]>p[0] else 0 for p in preds]

In [242]:
actuals = [1 if a[1]>a[0] else 0 for a in y_test]

In [243]:
print('* 정확도 : %.5f' % (np.sum(np.array(actuals) == np.array(preds)) / float(len(actuals))))

* 정확도 : 0.80422


In [244]:
grade = [0,1]

In [247]:
preds = pd.Categorical(preds, categories=grade)
actuals = pd.Categorical(actuals, categories=grade)

In [248]:
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))

preds        0      1
actuals              
0        22461   7424
1         4323  25792


In [252]:
print(classification_report(actuals, preds))

             precision    recall  f1-score   support

          0       0.84      0.75      0.79     29885
          1       0.78      0.86      0.81     30115

avg / total       0.81      0.80      0.80     60000

