# Preprocessing

In [66]:
import os
import sys
import numpy as np
import gensim

import re

In [69]:
file_path = os.path.join('data','ratings.txt')

In [70]:
sentences = []
labels = []
with open(file_path, 'r') as f:
    next(f) # header skip
    for line in f.readlines():
        _, doc, label = line.strip().split('\t')
        sentences.append(doc.strip())
        labels.append(label.strip())

In [71]:
len(sentences) # 전체 문장

200000

In [72]:
sentences[:5]

['어릴때보고 지금다시봐도 재밌어요ㅋㅋ',
 '디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.',
 '폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.',
 '와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지',
 '안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.']

In [73]:
from konlpy.tag import Twitter
twi_tagger = Twitter()

In [74]:
# 품사 중 명사, 동사, 형용사, 부사, 감탄사(헐, 어머나), 한국어약어(ㅋㅋㅋ) 만 추출해봄.
Non_Stop_words = set(["Noun", "Verb","Adjective","Adverb","Exclamation","KoreanParticle"])

# (돈, Noun) -> "돈/Noun" 형태로 형태소 분석.
def tokenizer_twit(doc, remove_stopwords=True):
    if remove_stopwords:
        word_list = ['/'.join(t) for t in twi_tagger.pos(doc, norm=True, stem=True) if t[-1] in Non_Stop_words]
    else:
        word_list = ['/'.join(t) for t in twi_tagger.pos(doc, norm=True, stem=True)]
    return word_list

def tokenizer(sentence):
    tokens = re.findall(r"[\w]+|[^\s\w]", sentence)
    return tokens

In [75]:
tokenizer(sentences[1])

['디자인을',
 '배우는',
 '학생으로',
 ',',
 '외국디자이너와',
 '그들이',
 '일군',
 '전통을',
 '통해',
 '발전해가는',
 '문화산업이',
 '부러웠는데',
 '.',
 '사실',
 '우리나라에서도',
 '그',
 '어려운시절에',
 '끝까지',
 '열정을',
 '지킨',
 '노라노',
 '같은',
 '전통이있어',
 '저와',
 '같은',
 '사람들이',
 '꿈을',
 '꾸고',
 '이뤄나갈',
 '수',
 '있다는',
 '것에',
 '감사합니다',
 '.']

In [76]:
tokenizer_twit(sentences[1], remove_stopwords=True)

['디자인/Noun',
 '배우다/Verb',
 '학생/Noun',
 '외국/Noun',
 '디자이너/Noun',
 '그/Noun',
 '일군/Noun',
 '전통/Noun',
 '통해/Noun',
 '발전/Noun',
 '하다/Verb',
 '문화/Noun',
 '산업/Noun',
 '부럽다/Adjective',
 '사실/Noun',
 '우리나라/Noun',
 '그/Noun',
 '어렵다/Adjective',
 '시절/Noun',
 '끝/Noun',
 '열정/Noun',
 '지키다/Verb',
 '노라노/Noun',
 '같다/Adjective',
 '전통/Noun',
 '있다/Adjective',
 '저/Noun',
 '같다/Adjective',
 '사람/Noun',
 '꿈/Noun',
 '꾸다/Verb',
 '이루다/Verb',
 '나가다/Verb',
 '수/Noun',
 '있다/Adjective',
 '것/Noun',
 '감사/Noun',
 '하다/Verb']

In [22]:
words_list=[]
statement_num=0
for sent in sentences:
#     words_list.append(tokenizer_twit(sent, remove_stopwords=False))
    words_list.append(tokenizer(sent))
    statement_num += 1
    if statement_num%10000 == 0:
        print("statement_num : %d" % statement_num)

statement_num : 10000
statement_num : 20000
statement_num : 30000
statement_num : 40000
statement_num : 50000
statement_num : 60000
statement_num : 70000
statement_num : 80000
statement_num : 90000
statement_num : 100000
statement_num : 110000
statement_num : 120000
statement_num : 130000
statement_num : 140000
statement_num : 150000
statement_num : 160000
statement_num : 170000
statement_num : 180000
statement_num : 190000
statement_num : 200000


# Word2vec Modeling

In [23]:
from gensim.models import word2vec
import multiprocessing
import time
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import ColumnDataSource, LabelSet
from bokeh import palettes

In [24]:
# Set parameters
num_features = 500    # Word vector dimensionality                      
min_word_count = 4   # Minimum word count                        
num_workers = multiprocessing.cpu_count() # Number of threads to run in parallel
context = 3 # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
negative = 20
iter_ = 20 # 얼마나 반복할지.
sg=1

In [25]:
start = time.time() # 작업시간 검사용!
# Initialize and train the model (this will take some time)
print("Training model...")
model = word2vec.Word2Vec(words_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, negative=negative, iter=iter_, sg=sg)
end = time.time()
print(end-start)

Training model...
215.6655411720276


In [26]:
len(model.wv.index2word) # 전체 단어

40109

In [27]:
# 모델 저장
model_name = os.path.join('w2v_file','500features_4minwords_3context')
model.save(model_name)

In [78]:
# # 모델 로드.
# model_name = os.path.join('w2v_file','300features_10minwords_3context_twi')
# model = word2vec.Word2Vec.load(model_name)

In [127]:
def vis_top_n(model, filepath, top, lr=500, n_iter=1000, perplexity=10):
    """
    전체 단어 중 특정 빈도 이상의 단어만 시각화.
    :param model: gensim word2vec model
    :param filepath: 시각화파일(html) 저장할 파일 경로
    :param top: 빈도 상위 몇개의 단어까지 시각화할지
    """
    vectors = model.wv.syn0[:top]
    labels = model.wv.index2word[:top]
    counts = [model.wv.vocab[label].count for label in labels]  # _labels의 빈도수 (빈도수별 사이즈 다르게 하기 위해)

    if np.shape(vectors)[1] > 2:  # 입력값이 2차원일경우에는 차원축소 하지 않음.
        print('tsne....')
        start = time.time()
        tsne = TSNE(perplexity=perplexity, n_components=2, init='random', n_iter=n_iter, verbose=1, learning_rate=lr, method='exact')  # tsne를 이용한 차원 축소 (n차원 -> 2차원)
        vectors = tsne.fit_transform(vectors)
        print(time.time()-start)

    _filepath = '%s[top_%i].html' % (filepath, top)

    source = ColumnDataSource(
        data=dict(
            x=vectors.T[0],
            y=vectors.T[1],
            size=(np.log1p(np.array(counts))) * 1.2 + 2,
            word=labels,
            color=['#0099ff'] * len(labels),
        )
    )
    label_set = LabelSet(x='x', y='y', text='word', level='glyph', text_color="#111111", text_alpha=0.6, text_font_size='8pt', x_offset=5, y_offset=5, source=source, render_mode='canvas')
    tools = "pan,wheel_zoom,box_zoom,reset,resize"
    p = figure(plot_width=900, plot_height=900, tools=[tools], title='word2vec vis top %i' % top)
    p.circle('x', 'y', size='size', source=source, alpha=0.6, fill_color='color', line_color='#eeeeee')
    p.add_layout(label_set)

    output_file(_filepath, title=_filepath)
    show(p)

In [128]:
# vis_top_n(model=model, filepath=os.path.join('w2v_file','300features_10minwords_3context_twi'), top=2000, lr=500, n_iter=1000, perplexity=10)

# CNN

In [79]:
from bs4 import BeautifulSoup

import re
import sys
import os
import numpy as np
import pandas as pd
from nltk import tokenize

from sklearn.metrics import classification_report

In [80]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import losses

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model

In [81]:
words_list[:5]

[['어릴때보고', '지금다시봐도', '재밌어요ㅋㅋ'],
 ['디자인을',
  '배우는',
  '학생으로',
  ',',
  '외국디자이너와',
  '그들이',
  '일군',
  '전통을',
  '통해',
  '발전해가는',
  '문화산업이',
  '부러웠는데',
  '.',
  '사실',
  '우리나라에서도',
  '그',
  '어려운시절에',
  '끝까지',
  '열정을',
  '지킨',
  '노라노',
  '같은',
  '전통이있어',
  '저와',
  '같은',
  '사람들이',
  '꿈을',
  '꾸고',
  '이뤄나갈',
  '수',
  '있다는',
  '것에',
  '감사합니다',
  '.'],
 ['폴리스스토리', '시리즈는', '1부터', '뉴까지', '버릴께', '하나도', '없음', '.', '.', '최고', '.'],
 ['와',
  '.',
  '.',
  '연기가',
  '진짜',
  '개쩔구나',
  '.',
  '.',
  '지루할거라고',
  '생각했는데',
  '몰입해서',
  '봤다',
  '.',
  '.',
  '그래',
  '이런게',
  '진짜',
  '영화지'],
 ['안개', '자욱한', '밤하늘에', '떠', '있는', '초승달', '같은', '영화', '.']]

In [82]:
labels[:5]

['1', '1', '1', '1', '1']

In [83]:
w2v_dics = set(model.wv.index2word)

In [84]:
model.wv.index2word

['영화/Noun',
 '하다/Verb',
 '보다/Verb',
 '없다/Adjective',
 '있다/Adjective',
 '좋다/Adjective',
 '너무/Noun',
 '되다/Verb',
 '재밌다/Adjective',
 '정말/Noun',
 'ㅋㅋ/KoreanParticle',
 '것/Noun',
 '이/Noun',
 '진짜/Noun',
 '같다/Adjective',
 '아니다/Adjective',
 '않다/Verb',
 '점/Noun',
 '연기/Noun',
 '나오다/Verb',
 '만들다/Verb',
 '이렇다/Adjective',
 '평점/Noun',
 '최고/Noun',
 '이다/Verb',
 '왜/Noun',
 '생각/Noun',
 '스토리/Noun',
 '드라마/Noun',
 '감동/Noun',
 '사람/Noun',
 '보고/Noun',
 '말/Noun',
 '더/Noun',
 '아깝다/Adjective',
 '내/Noun',
 '다/Adverb',
 '때/Noun',
 '배우/Noun',
 '안/Noun',
 '감독/Noun',
 '재미있다/Adjective',
 '그냥/Noun',
 '거/Noun',
 '뭐/Noun',
 '내용/Noun',
 '봐/Noun',
 '재미/Noun',
 '시간/Noun',
 '아/Exclamation',
 '그/Noun',
 '들다/Verb',
 '재미없다/Adjective',
 '자다/Verb',
 '가다/Verb',
 '좀/Noun',
 '그렇다/Adjective',
 '지루하다/Adjective',
 '쓰레기/Noun',
 '주다/Verb',
 '나/Noun',
 '수/Noun',
 '싶다/Verb',
 '사랑/Noun',
 '알다/Verb',
 '작품/Noun',
 '하나/Noun',
 '다시/Noun',
 'ㅠㅠ/KoreanParticle',
 '마지막/Noun',
 '볼/Noun',
 'ㅋ/KoreanParticle',
 '이다/Adjective',
 '이렇게/Adverb',
 '모르다/Ve

In [85]:
sequences = [[model.wv.index2word.index(word) for word in wl if word in model.wv.index2word] for wl in words_list]

KeyboardInterrupt: 

In [86]:
sequences[:10]

[[22866, 31821],
 [1644,
  2,
  2096,
  20049,
  743,
  0,
  574,
  11693,
  24,
  87,
  6737,
  26645,
  85,
  14693,
  85,
  333,
  1153,
  20050,
  18,
  683,
  1770,
  617,
  0],
 [17860, 1492, 22867, 262, 200, 0, 0, 53, 0],
 [91, 0, 0, 118, 9, 0, 0, 1479, 1221, 117, 0, 0, 954, 612, 9, 2482],
 [13495, 9721, 45, 85, 3, 0],
 [470, 419, 87, 10940, 5693],
 [42, 2443, 457, 120],
 [22868, 11694, 4, 1195, 23, 3118],
 [113],
 [5196, 133, 5345, 31822, 5694]]

In [87]:
MAX_SEQUENCE_LENGTH = 50
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) # MAX_SEQUENCE_LENGTH만큼 뒤 기준으로 잘림.

In [89]:
data[:10]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 22866, 31821],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
         1644,     2,  2096, 20049,   743,     0,   574, 11693,    24,
           87,  6737, 26645,    85, 14693,    85,   333,  1153, 20050,
           18,   683,  1770,   617,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    

In [37]:
labels = to_categorical(np.asarray(labels)) # label one-hot encoding
labels

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [38]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (200000, 50)
Shape of label tensor: (200000, 2)


In [39]:
indices = np.arange(data.shape[0])
indices[:20]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [40]:
np.random.shuffle(indices) # 데이터 셔플링

In [41]:
indices[:20]

array([189292,  49433, 149627,  74340,  32661,  53611,  74196, 122769,
       181070, 194968,   9034, 171999, 135883, 142516, 180531, 185325,
       180853, 156681,  69943, 167383])

In [42]:
data = data[indices] # 데이터 셔플링
labels = labels[indices]

In [43]:
VALIDATION_SPLIT = 0.3
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

X_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
X_test = data[-nb_validation_samples:]
y_test = labels[-nb_validation_samples:]

In [44]:
print('Number of positive and negative reviews in traing and validation set ')
print(y_train.sum(axis=0))
print(y_test.sum(axis=0))

Number of positive and negative reviews in traing and validation set 
[ 69881.  70119.]
[ 30119.  29881.]


In [45]:
EMBEDDING_DIM = 500

In [46]:
embedding_matrix = np.random.random((len(model.wv.index2word) + 1, EMBEDDING_DIM)) # bias term 때문에 +1 해준듯.. 
embedding_matrix # 랜덤값으로 초기화

array([[ 0.25110671,  0.74958261,  0.48287228, ...,  0.29680174,
         0.13636745,  0.82997856],
       [ 0.71810511,  0.7693661 ,  0.78176874, ...,  0.35410389,
         0.72157469,  0.10957847],
       [ 0.5162775 ,  0.27797974,  0.43872767, ...,  0.5107279 ,
         0.14181544,  0.79095857],
       ..., 
       [ 0.43403742,  0.14534533,  0.83694272, ...,  0.44914193,
         0.62685315,  0.85047853],
       [ 0.56578023,  0.24007962,  0.7752408 , ...,  0.50057457,
         0.12875998,  0.85488561],
       [ 0.9580615 ,  0.20605413,  0.7281029 , ...,  0.32208812,
         0.72357084,  0.78160447]])

In [47]:
## TODO : 랜덤값 -1~1로 하기

In [48]:
embedding_matrix.shape

(40110, 500)

In [49]:
for i, word in enumerate(model.wv.index2word):
    embedding_matrix[i] = model[word]

## 1) A simplified Convolutional
- Simply use total 128 filters with size 5 and max pooling of 5 and 35

In [51]:
embedding_layer = Embedding(len(model.wv.index2word) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [52]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
# l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

In [53]:
model = Model(sequence_input, preds)
model.compile(loss=losses.categorical_crossentropy, optimizer='rmsprop', metrics=['acc'])

In [54]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 500)           20055000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 46, 128)           320128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5, 128)            82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
__________

In [55]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=4096)

Train on 140000 samples, validate on 60000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13b8bd6d8>

In [56]:
# preds = model.predict_classes(X_test, batch_size=4096)

In [57]:
preds = model.predict(X_test, batch_size=1024)
preds = [1 if p[1]>p[0] else 0 for p in preds]

In [58]:
actuals = [1 if a[1]>a[0] else 0 for a in y_test]

In [59]:
print('* 정확도 : %.5f' % (np.sum(np.array(actuals) == np.array(preds)) / float(len(actuals))))

* 정확도 : 0.78082


In [60]:
grade = [0,1]

In [61]:
preds = pd.Categorical(preds, categories=grade)
actuals = pd.Categorical(actuals, categories=grade)

In [62]:
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))

preds        0      1
actuals              
0        19624  10495
1         2656  27225


In [63]:
print(classification_report(actuals, preds))

             precision    recall  f1-score   support

          0       0.88      0.65      0.75     30119
          1       0.72      0.91      0.81     29881

avg / total       0.80      0.78      0.78     60000

