## Sentiment Analysis for Korean Movie Review Data

## 1. Load Pickle File

In [1]:
import pickle

# Load the preprocessed dataset bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only open pickle files that come from a trusted source.
with open("movie_data.pickle", "rb") as f:
    movie_data = pickle.load(f)

In [2]:
# Show which entries the loaded bundle contains.
movie_data.keys()

dict_keys(['reviews', 'scores', 'reviews_ix', 'word2ix', 'ix2word', 'max_seq_length'])

In [3]:
# Unpack every entry of the loaded bundle into its own notebook-level name.
(reviews, scores, reviews_ix,
 word2ix, ix2word, max_seq_length) = (
    movie_data[key]
    for key in ("reviews", "scores", "reviews_ix",
                "word2ix", "ix2word", "max_seq_length")
)

In [4]:
# Binarize the review scores into two classes: a score of 6 or lower becomes 0
# and a score above 6 becomes 1 (presumably negative / positive sentiment).
# The labels are rebuilt from movie_data["scores"] into a NEW list instead of
# being overwritten in place. This keeps the raw scores intact and makes the
# cell safe to re-run: binarizing already-binarized values (all <= 6) would
# otherwise collapse every label to 0.
scores = [0 if score <= 6 else 1 for score in movie_data["scores"]]

In [5]:
from collections import Counter
# Count how many reviews carry each label value.
Counter(scores)

Counter({1: 1138, 0: 689})

## 2. Load Word2Vec Model

In [6]:
from gensim.models import Word2Vec
# Load a previously saved gensim Word2Vec model from disk.
# NOTE(review): the name `model` is rebound to the CNN classifier further
# down, so any cell that needs the word vectors must run before that point.
model = Word2Vec.load('word2vec.model')

## 3. Prepare Train / Test Datasets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the indexed reviews (and their labels) as the test set.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_ix, scores, test_size=0.3, random_state=777
)

In [8]:
# Split 10% of the remaining training data off as a development set.
# NOTE(review): X_train / y_train are overwritten with a subset of themselves,
# so re-running this cell on its own shrinks the training set again; re-run
# the train/test split first.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.1, random_state=777
)

In [9]:
# Total number of indexed reviews.
len(reviews_ix)

1827

In [10]:
# Number of training samples.
len(X_train)

1150

In [11]:
# Number of development samples.
len(X_dev)

128

In [12]:
# Number of test samples.
len(X_test)

549

In [13]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np


class Dataset(Dataset):
    """Map-style dataset that pairs each input with its label.

    Note: this class reuses the name of the imported torch ``Dataset`` base
    class. The base is resolved before the name is rebound, so the definition
    works, but ``Dataset`` refers to this class afterwards.

    Args:
        X_data: Sized, indexable collection of inputs.
        y_data: Sized, indexable collection of labels aligned with ``X_data``.

    Raises:
        ValueError: If ``X_data`` and ``y_data`` differ in length.
    """

    def __init__(self, X_data, y_data):
        # Fail fast on misaligned inputs instead of raising an IndexError (or
        # silently ignoring trailing labels) in the middle of an epoch.
        if len(X_data) != len(y_data):
            raise ValueError(
                "X_data and y_data must have the same length, got {} and {}".format(
                    len(X_data), len(y_data)
                )
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.X_data)

# Wrap the train and dev splits; the inputs are converted to NumPy arrays,
# the labels are passed through unchanged.
train_data = Dataset(X_data=np.array(X_train), y_data=y_train)
dev_data = Dataset(X_data=np.array(X_dev), y_data=y_dev)

## 4. Prepare Embedding Matrix

In [14]:
import torch

# Run on the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# `model` must still be the gensim Word2Vec model here. Vectors are looked up
# through `model.wv`, which works across gensim versions; indexing the
# Word2Vec object itself is not supported by gensim 4.
word_vectors = model.wv
embed_dim = word_vectors.vector_size

# One row per vocabulary entry, in the iteration order of word2ix.
# NOTE(review): assumes that iteration order matches the index stored for each
# word (row i belongs to the word whose index is i) — confirm against the code
# that built word2ix.
embedding_rows = []
for word in word2ix.keys():
    try:
        embedding_rows.append(word_vectors[word])
    except KeyError:
        # Only "word is missing from the Word2Vec vocabulary" maps to a zero
        # vector; any other error must surface instead of silently producing
        # an all-zero embedding matrix.
        embedding_rows.append(np.zeros(embed_dim))

print(len(word2ix))
print(len(embedding_rows))

# Stack into one array first: converting a Python list of many small arrays
# straight into a tensor is slow. float32 matches what torch.Tensor produced.
embedding_matrix = torch.tensor(np.asarray(embedding_rows), dtype=torch.float32)

5982
5982




## 5. Load Trained Model

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
# Seed PyTorch's random number generator; the trailing ';' suppresses the
# cell's output.
torch.manual_seed(777);

from classifier import CNN_Clf

In [16]:
# Check whether a CUDA device can be used in this environment.
torch.cuda.is_available()

True

In [17]:
# Load the saved training checkpoint. map_location remaps the stored tensors
# onto `device`, so a checkpoint written on a GPU machine also loads when only
# the CPU is available.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load("./log_files/model_save/EPOCH_999.pt",
                        map_location=device)

# TODO: consider declaring these hyperparameters as global variables.
# NOTE(review): this rebinds `model`, which held the Word2Vec model until now.
model = CNN_Clf(vocab_size=len(word2ix),
                embed_size=100,
                output_size=2,
                embedding_matrix=embedding_matrix,
                out_chs=100,
                DR_rate=0.5,
                filter_sizes=[3, 4, 5]).to(device)

# Restore the trained weights stored under "model_state_dict".
model.load_state_dict(checkpoint["model_state_dict"])

In [18]:
# Evaluate the restored classifier on the held-out test set, one review at a
# time, and print a sample prediction for every 100th review.
test_correct = 0
test_count = len(X_test)
filter_sizes = [3, 4, 5]

# Inputs shorter than the widest filter are right-padded up to this length.
min_seq_len = max(filter_sizes)
# Padding index. Falls back to 0 when "<PAD>" is missing from the vocabulary,
# so a None can never end up inside the padded sequence.
# NOTE(review): 0 mirrors the `ix != 0` display filter below — confirm that
# index 0 really is the padding index.
pad_ix = word2ix.get("<PAD>", 0)

model = model.eval()
# Inference only: no_grad avoids tracking gradients for every forward pass.
with torch.no_grad():
    for i, seq in enumerate(X_test):
        if len(seq) < min_seq_len:
            seq = seq + [pad_ix] * (min_seq_len - len(seq))
        # Named `input_tensor` so the builtin `input` is not shadowed for the
        # rest of the notebook; shape is (1, sequence_length).
        input_tensor = torch.LongTensor(seq).view(1, -1).to(device)
        output = model.predict(input_tensor, test_batch_size=1)
        # Index of the largest output along dim 1 is the predicted label.
        pred = torch.max(output, 1)[1]
        pred_label = pred.item()

        true = y_test[i]
        if true == pred_label:
            test_correct += 1

        if i % 100 == 0:
            # Skip index 0 when turning the indices back into tokens.
            input_seq = [ix2word.get(ix) for ix in seq if ix != 0]
            print("Input :", input_seq)
            print("Prediction :", pred_label)
            print("Truth :", true)
            print("\n")

# Guard against an empty test set instead of raising ZeroDivisionError.
test_acc = test_correct / test_count if test_count else 0.0

print('test_acc: {:.3f} ({:d}/{:d})'.format(test_acc, test_correct, test_count))

Input : ['후반/Noun', '쫄렸다/Noun']
Prediction : 0
Truth : 1


Input : ['감동/Noun', '영화/Noun', '보고/Noun', '운/Noun', '거의/Noun', '애니메이션/Noun', '보면서/Verb', '울줄/Verb', '몰랐네요/Verb', 'ㅜ/KoreanParticle', '감동/Noun', 'ㅜㅜ/KoreanParticle']
Prediction : 0
Truth : 1


Input : ['스토리/Noun', '별로/Noun', '노래/Noun']
Prediction : 0
Truth : 1


Input : ['히어로/Noun', '물/Noun', '찍어도/Verb', '될/Verb', '정도/Noun', '그래픽/Noun', '본/Verb', '한국영/Noun', '화의/Noun', '희망/Noun', '이야기/Noun', '거기/Noun']
Prediction : 0
Truth : 0


Input : ['보통/Noun', '서로/Noun', '아는/Verb', '상황/Noun', '주먹/Noun', '메/Noun', '쳐서/Verb', '다른/Noun', '격방/Noun', '시도/Noun', '하는게/Verb', '정상/Noun', '로메/Noun', '쳐/Verb', '대는게/Verb', '인상/Noun', '또/Noun', '와칸/Noun', '다인/Noun', '가에서/Verb', '개때/Noun', '닥치는데/Verb', '굳이/Noun', '칼/Noun', '빼/Noun', '일일이/Noun', '상대/Noun', '하는것도/Verb', '졸/Noun', '인상/Noun', '과거/Noun', '마징/Noun', '가가/Noun', '싸우다가/Verb', '죽기/Verb', '직전/Noun', '가슴/Noun', '원자력/Noun', '빔/Noun', '쏴서/Verb', '이기는거/Verb', '배운듯/Verb']
Prediction : 0
Truth : 0


Inpu