### 네이버 영화리뷰 데이터 불러오기

In [32]:
import json
import numpy as np
from pandas import DataFrame

# 영화 한줄평 데이터 불러오기
with open('train_data.json', encoding="utf-8") as data_file:    
    data = json.load(data_file)
data = DataFrame(data)

#### class 3개로 나누기 

In [33]:
# 영화 리뷰 3개의 class로 나눈다 ("POS", "NEU", "NEG")
data['rate'] = np.where(data['rating']>=8, 1,
                       np.where(data['rating']>=4, 2, 0))

In [34]:
# one-hot-encoding
import keras
label = keras.utils.to_categorical(data["rate"], num_classes=3)

In [35]:
del data['date']
del data['movie_id']
del data['rating']

In [36]:
data.head()

Unnamed: 0,review,rate
0,종합 평점은 4점 드립니다.,2
1,원작이 칭송받는 이유는 웹툰 계 자체의 질적 저하가 심각하기 때문. 원작이나 영화...,0
2,나름의 감동도 있고 안타까운 마음에 가슴도 먹먹 배우들의 연기가 good 김수현...,1
3,이런걸 돈주고 본 내자신이 후회스럽다 최악의 쓰레기 영화 김수현 밖에없는 저질 삼류영화,0
4,"초반엔 코미디, 후반엔 액션, 결론은 코미디.",2


### train / test 나누기

In [37]:
from sklearn.cross_validation import train_test_split 

train, test, label_train, label_test = train_test_split(data['review'], label, random_state = 0, test_size=0.1)

print(len(train), len(test))   
print(len(label_train), len(label_test)) 

630000 70000
630000 70000


## 데이터 전처리(형태소분석)

In [None]:
import re

# 필요없는 숫자 등 제거
text1 = [re.sub('\d+',' ',tmp) for tmp in train]
train_text = [re.sub('\W+',' ',tmp) for tmp in text1]

text1 = [re.sub('\d+',' ',tmp) for tmp in test]
test_text = [re.sub('\W+',' ',tmp) for tmp in text1]

train_text[0]

'올해 최고의 영화'

In [None]:
from konlpy.tag import Twitter
twitter = Twitter()

train_tagger = [twitter.pos(line) for line in train_text]
train_tagger[0]

In [None]:
test_tagger = [twitter.pos(line) for line in test_text]
test_tagger[0]

In [None]:
def pumsa(doc):    
    token = []
    rate = []
    
    for items in doc:        
        words = []
        for item in items: 
            if (item[1] in ['Noun', 'Verb', 'Adjective'])&(len(item[0])>1):
                words.append(item[0])
        token.append(words)
    
    return token

In [None]:
train_pumsa = pumsa(train_tagger)
test_pumsa = pumsa(test_tagger)

## word2vec

In [13]:
from gensim.models import Word2Vec

model = Word2Vec(train_pumsa, size=100, window=10, min_count=10, workers=4, sg=1) # Skip-gram

In [14]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.


In [15]:
max_features = len(model.wv.vocab)

## Input embedding(mean, tf-idf)

In [16]:
def mean_embedding(word2vec, text):
    return np.array([np.mean([word2vec[w] for w in words if w in word2vec] or [np.zeros(100)], axis=0) for words in text])

train_m = mean_embedding(w2v, train_pumsa)
test_m = mean_embedding(w2v, test_pumsa)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# word2vec embedding vector와 word에 대한 tfidf 가중치를 이용한 vectorizer 함수
class TfidfEmbeddingVectorizer:
    def __init__(self, word2vec):
        self.word2vec = word2vec
        
    def transform(self, X):
        tfidf = TfidfVectorizer(analyzer = lambda x : x) 
        tfidf.fit(X)
        max_idf = max(tfidf.idf_) 
        word2weight = defaultdict(lambda : max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()]) 
        
        array_list =[]
        for words in X:
            array_list.append(np.array(np.mean([self.word2vec[w]*word2weight[w] for w in words if w in self.word2vec] or [np.zeros(100)], axis = 0)))
        return(array_list)

In [18]:
vec_tf = TfidfEmbeddingVectorizer(w2v)
train_tf = vec_tf.transform(train_pumsa)
test_tf = vec_tf.transform(test_pumsa)

In [19]:
train_tf = np.array(train_tf)
test_tf = np.array(test_tf)

## DNN

In [23]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, RNN, Flatten
from keras.engine.topology import Input 
from keras.optimizers import Adagrad, SGD
from keras.layers import Conv2D, MaxPooling2D

In [266]:
model = Sequential()

# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
model.add(Dense(64, activation='relu', input_dim=100))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
ada = Adagrad(lr=0.01, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=ada,
              metrics=['accuracy'])

In [267]:
model.fit(train_tf, label_train,
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1259c6ac8>

In [268]:
score = model.evaluate(test_tf, label_test, batch_size=128)



In [269]:
score #[loss, accuracy]

[0.6700169602530344, 0.7175785714285714]

### RNN을 위한 input embedding

In [20]:
def seq_embedding(text):
    return np.array([w2v[w] for w in text if w in w2v])

In [21]:
train_emb = list(map(seq_embedding,train_pumsa))
test_emb = list(map(seq_embedding,test_pumsa))

In [24]:
# 문장 길이 맞춰주기(반복 패딩)
train_pd = sequence.pad_sequences(train_emb, maxlen=10, dtype='float64')
test_pd = sequence.pad_sequences(test_emb, maxlen=10, dtype='float64')

## RNN

In [581]:
model = Sequential()

#model.add(Embedding()) # input_dim / output_dim / input_length
model.add(LSTM(128, input_shape=(10, 100)))
model.add(Dense(3, activation='softmax'))
          
# try using different optimizers and different optimizer configs
model.compile(optimizer=ada, 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(train_pd, label_train,
          batch_size=128,
          epochs=20)

SyntaxError: positional argument follows keyword argument (<ipython-input-581-22eba94cfd82>, line 8)

In [543]:
score = model.evaluate(test_pd, label_test, batch_size=128)



In [544]:
score

[0.7147932805470057, 0.7174142857006618]

### CNN (미완성)

In [29]:
def reshape(x):
    return np.reshape(x,(-1,10,100,1))

train_re = reshape(train_pd)
test_re = reshape(test_pd)
train_re.shape

(9000, 10, 100, 1)

In [28]:
model = Sequential()

# input: 100x100 images with 3 channels -> (100, 100, 3) tensors.
# this applies 32 convolution filters of size 3x3 each.
model.add(Conv2D(32, (2, 100), activation='relu', input_shape=(10,100,1)))
#model.add(MaxPooling2D(pool_size=(2, 2)))
#model.add(Dropout(0.25))

#model.add(Conv2D(64, (2, 100), activation='relu'))
#model.add(Conv2D(64, (2, 100), activation='relu'))
#model.add(MaxPooling2D(pool_size=(2, 2)))
#model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

model.fit(train_re, label_train, batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x110632ef0>

In [30]:
score = model.evaluate(test_re, label_test, batch_size=128)



In [31]:
score

[0.9214001455307007, 0.6249999990463256]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
text_clf = clf.fit(train_m, label_train)
predicted_kkn = clf.predict(test_m)
np.mean(predicted_knn == label_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
text_clf = clf.fit(train_tf, label_train)
predicted_kkn = clf.predict(test_tf)
np.mean(predicted_knn == label_test)

In [100]:
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [104]:
x_train.shape

(25000, 100)

In [105]:
train_tf.shape

(8000, 100)