## Sentiment Analysis for Korean Movie Review Data

## 1. Load Pickle File

In [1]:
import pickle

# Load the preprocessed review payload.
# NOTE: pickle can execute arbitrary code while loading; only open files
# from a trusted source.
with open("movie_data.pickle", "rb") as pickle_file:
    movie_data = pickle.load(pickle_file)

In [2]:
# Show which top-level entries the loaded pickle payload contains.
movie_data.keys()

dict_keys(['reviews', 'scores', 'reviews_ix', 'word2ix', 'ix2word', 'max_seq_length'])

In [3]:
# Unpack every entry of the pickled payload into its own notebook-level name.
(reviews,
 scores,
 reviews_ix,
 word2ix,
 ix2word,
 max_seq_length) = (
    movie_data[key]
    for key in ("reviews", "scores", "reviews_ix", "word2ix", "ix2word", "max_seq_length")
)

In [4]:
# Turn the raw review scores into binary sentiment labels:
# a score above POSITIVE_THRESHOLD becomes 1 (positive), anything else 0.
#
# The labels are rebuilt from the raw values in `movie_data` rather than
# overwritten in place, so re-running this cell is idempotent. With in-place
# overwriting, a second run would turn every label into 0, because the
# already-binarized 0/1 values all satisfy `<= 6`.
POSITIVE_THRESHOLD = 6
scores = [0 if score <= POSITIVE_THRESHOLD else 1 for score in movie_data["scores"]]

In [5]:
from collections import Counter
# Count how many reviews carry each label value in `scores`.
Counter(scores)

Counter({1: 1138, 0: 689})

## 2. Load Word2Vec Model

In [6]:
from gensim.models import Word2Vec
# Load the Word2Vec model from disk.
# NOTE: the name `model` is rebound to the CNN classifier further down in
# this notebook, so this object is only available until that cell runs.
model = Word2Vec.load('word2vec.model')

## 3. Prepare Train / Dev / Test Datasets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the indexed reviews as the test set; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_ix,
    scores,
    test_size=0.3,
    random_state=777,
)

In [8]:
# Carve 10% of the training portion out as a development (validation) set.
# NOTE: this cell rebinds X_train / y_train from themselves, so running it
# twice without re-running the previous split shrinks the training set again.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=777,
)

In [9]:
# Total number of indexed reviews before any splitting.
len(reviews_ix)

1827

In [10]:
# Size of the training split (after the dev set was carved out).
len(X_train)

1150

In [11]:
# Size of the development (validation) split.
len(X_dev)

128

In [12]:
# Size of the held-out test split.
len(X_test)

549

In [13]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np


class Dataset(Dataset):
    """Pair each input sequence with its label for use with a DataLoader.

    Note: this class deliberately keeps the name ``Dataset`` (shadowing the
    imported torch base class) because the rest of the notebook refers to it
    by that name.
    """

    def __init__(self, X_data, y_data):
        """Store the inputs and labels; both must be indexable by position."""
        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        """Number of samples, taken from the input container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair stored at ``index``."""
        sample = self.X_data[index]
        label = self.y_data[index]
        return sample, label

# Wrap the splits for use with DataLoader.
# The token-index sequences have different lengths, and np.array() on such
# ragged nested lists raises ValueError on NumPy >= 1.24 (older versions
# silently produced an object array of lists). The Dataset only needs
# positional indexing and len(), so the sequences are passed through as-is.
train_data = Dataset(X_train, y_train)
dev_data = Dataset(X_dev, y_dev)

## 4. Prepare Embedding Matrix

In [14]:
import torch

# Run on the first GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Build one 100-dimensional row per vocabulary entry, in the iteration order
# of `word2ix`.
# NOTE(review): this assumes that iteration order matches the index values
# stored in `word2ix` — confirm against the preprocessing code.
embedding_matrix = []

for word in word2ix.keys():
    try:
        # Look vectors up through `.wv`: indexing the Word2Vec object itself is
        # not supported by gensim 4, and with a bare `except` that failure
        # would silently turn every row into zeros.
        embedding_matrix.append(model.wv[word])
    except KeyError:
        # Word is not in the Word2Vec vocabulary: fall back to a zero vector.
        embedding_matrix.append(np.zeros(100))

# Sanity check: both numbers should be identical.
print(len(word2ix))
print(len(embedding_matrix))

# Stack into a single array first; converting a Python list of arrays
# element by element is much slower.
embedding_matrix = torch.Tensor(np.array(embedding_matrix))

5982
5982




## 5. Model Retrain

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
# Fix the PyTorch RNG seed; the trailing `;` suppresses the cell output.
torch.manual_seed(777);
from classifier import CNN_Clf

In [16]:
# Report whether a CUDA device is usable in this environment.
torch.cuda.is_available()

True

In [17]:
 def pad_sequence(batch):
     """Collate function: zero-pad a batch of token-index sequences.

     Args:
         batch: iterable of ``(sequence, label)`` pairs, as handed over by a
             DataLoader.

     Returns:
         A tuple ``(X, y)`` where ``X`` is a LongTensor of shape
         ``(len(batch), padded_length)`` and ``y`` is a LongTensor holding
         the labels. ``padded_length`` is the longest sequence in the batch,
         but never less than ``max(filter_sizes)``.

     Reads the notebook-level ``filter_sizes``. The batch dimension is taken
     from the batch itself rather than the global ``batch_size``, so a final
     batch smaller than ``batch_size`` is handled correctly.
     """
     X_batch, y_batch = zip(*batch)

     # Every sequence must be at least as long as the widest filter.
     target_length = max(max(len(seq) for seq in X_batch), max(filter_sizes))

     padded = []
     for seq in X_batch:
         tokens = list(seq)
         # Index 0 is used as the padding value.
         tokens.extend([0] * (target_length - len(tokens)))
         padded.append(torch.LongTensor(tokens))

     return torch.stack(padded), torch.LongTensor(y_batch)

In [18]:
import copy

def model_save(epoch, model, optimizer, train_loss, train_acc, val_loss, val_acc, PATH):
    """Write a training checkpoint to ``PATH`` with ``torch.save``.

    The checkpoint is a dict holding the epoch number, a deep copy of the
    model's state dict, the optimizer's state dict, and the given train /
    validation loss and accuracy values.
    """
    # Original author's note (translated): a deep copy supposedly lets the
    # saved weights survive normalization (?) — unverified.
    weights_snapshot = copy.deepcopy(model.state_dict())
    optimizer_snapshot = optimizer.state_dict()

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': weights_snapshot,
        'optimizer_state_dict': optimizer_snapshot,
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc,
    }
    torch.save(checkpoint, PATH)

In [19]:
# Load the checkpoint to resume from. `map_location` remaps the stored
# tensors onto the active device, so a checkpoint written on a GPU can
# still be loaded on a CPU-only machine.
checkpoint = torch.load("./log_files/model_save/EPOCH_999", map_location=device)

# Rebuild the classifier with the hyperparameters given here, then restore
# its weights from the checkpoint.
# NOTE(review): these values presumably have to match the ones used when the
# checkpoint was written — confirm against the original training run.
# NOTE: this rebinds `model`, which held the Word2Vec model until now.
model = CNN_Clf(vocab_size=len(word2ix),
                embed_size=100,
                output_size = 2,
                embedding_matrix = embedding_matrix,
                out_chs = 100,
                DR_rate = 0.5,
                filter_sizes = [3, 4, 5]).to(device)

model.load_state_dict(checkpoint["model_state_dict"])

In [20]:
%%time
from tensorboardX import SummaryWriter

EPOCHS = 1000
LR = 0.01 # Adadelta default learning_rate is 1.0
batch_size = 50
filter_sizes = [3, 4, 5]
DR_rate = 0.5
out_chs = 100

writer = SummaryWriter('./log_files/model_retrain/')

# Add_Graph to Tensorboard
dummy_input = Variable(torch.zeros(batch_size, max_seq_length).long()).to(device)
writer.add_graph(model, dummy_input)

# Add_Embedding to Tensorboard
word_labels = [ix2word[i] for i in range(len(ix2word))]
writer.add_embedding(model.embed.weight.data, metadata=word_labels)    

# weighted CrossEntropyLoss
# Approximately 1/2 negative_data per 1 positive data
# criterion = nn.CrossEntropyLoss(weight=torch.Tensor([2.0, 1.0]).to(device))
criterion = nn.CrossEntropyLoss(weight=torch.Tensor([1.3, 1.0]).to(device))
# criterion = nn.CrossEntropyLoss()

optimizer = optim.Adadelta(model.parameters(),lr=LR, weight_decay=1e-5) # use L2-Norm
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# optimizer = optim.SGD(model.parameters(),lr=LR, momentum=0.9, weight_decay=1e-5) # use L2-Norm

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence)
dev_loader = DataLoader(dataset=dev_data, batch_size=1, shuffle=False)

for epoch in range(EPOCHS):
    train_correct = 0
    train_count = 0
    train_loss = 0
    val_correct = 0
    val_count = 0
    val_loss = 0
    
    
    # Training
    model = model.train()
    
    for X_batch, y_batch in train_loader:
        model.zero_grad()
        inputs = Variable(X_batch).to(device)
        targets = Variable(y_batch).to(device)
        
        preds = model(inputs)
     
        loss = criterion(preds, targets)
        train_loss += loss.item()
        
        compare = torch.max(preds, 1)[1]*targets
        train_correct += torch.sum(compare).item()
        train_count += X_batch.size(0)

        
        loss.backward()
        optimizer.step()
    
    train_acc = train_correct/train_count
    avg_train_loss = train_loss/(train_count/batch_size)
    
    # Validation

    model = model.eval()
    with torch.no_grad():          
        for X_batch, y_batch in dev_loader:
            val_count += 1         
            if len(X_batch) < max(filter_sizes):
                X_batch = torch.LongTensor(X_batch + [word2ix.get("<PAD>")]*(max(filter_sizes)-len(X_batch))).to(device)
            else:
                X_batch = torch.LongTensor(X_batch).to(device)
            input = Variable(X_batch).to(device)
            target = Variable(torch.LongTensor(y_batch)).to(device)
            pred = model.predict(input, test_batch_size=1)

            loss = criterion(pred, target)
            val_loss += loss.item()

            _, pred = torch.max(pred, 1)
            true = y_batch.item()
            if true == pred.item():
                val_correct +=1
                          
        val_acc = val_correct/val_count
        avg_val_loss = val_loss/val_count

    writer.add_scalars('Compare/train-val acc', {'Train Acc': train_acc,
                                                    'Val Acc': val_acc}, epoch+1000)
    
    writer.add_scalars('Compare/train-val losses', {'Train Loss': avg_train_loss,
                                                    'Val Loss': avg_val_loss}, epoch+1000)
    
    model_save(epoch+1000, model, optimizer, 
               train_loss, train_acc, 
               val_loss, val_acc, PATH="./log_files/model_retrain/EPOCH_"+str(epoch+1000))
    
    if epoch % 100 == 0 or epoch == (EPOCHS-1):
        print('epoch: {:d}'.format(epoch+1000))
        print('train_acc: {:.3f} ({:d}/{:d})'.format(train_acc, train_correct, train_count))
        print('val_acc: {:.3f} ({:d}/{:d})'.format(val_acc, val_correct, val_count))
        print('avg_train_loss: {:.3f}'.format(avg_train_loss))        
        print('avg_val_loss: {:.3f}'.format(avg_val_loss))
        print("==========================================================")

writer.close()



epoch: 1000
train_acc: 0.558 (642/1150)
val_acc: 0.758 (97/128)
avg_train_loss: 0.486
avg_val_loss: 0.604
epoch: 1100
train_acc: 0.562 (646/1150)
val_acc: 0.758 (97/128)
avg_train_loss: 0.463
avg_val_loss: 0.599
epoch: 1200
train_acc: 0.573 (659/1150)
val_acc: 0.750 (96/128)
avg_train_loss: 0.454
avg_val_loss: 0.597
epoch: 1300
train_acc: 0.570 (655/1150)
val_acc: 0.750 (96/128)
avg_train_loss: 0.441
avg_val_loss: 0.593
epoch: 1400
train_acc: 0.577 (664/1150)
val_acc: 0.734 (94/128)
avg_train_loss: 0.420
avg_val_loss: 0.589
epoch: 1500
train_acc: 0.577 (663/1150)
val_acc: 0.727 (93/128)
avg_train_loss: 0.419
avg_val_loss: 0.587
epoch: 1600
train_acc: 0.581 (668/1150)
val_acc: 0.734 (94/128)
avg_train_loss: 0.407
avg_val_loss: 0.585
epoch: 1700
train_acc: 0.583 (670/1150)
val_acc: 0.734 (94/128)
avg_train_loss: 0.399
avg_val_loss: 0.583
epoch: 1800
train_acc: 0.593 (682/1150)
val_acc: 0.742 (95/128)
avg_train_loss: 0.392
avg_val_loss: 0.581
epoch: 1900
train_acc: 0.594 (683/1150)
val_ac

In [21]:
# Evaluate the retrained classifier on the held-out test split, one review
# at a time, and print every 100th example for a quick qualitative check.
test_correct = 0
test_count = len(X_test)
model = model.eval()

# Inference only: skip gradient tracking, as the validation loop does.
with torch.no_grad():
    for i, seq in enumerate(X_test):
        # Sequences shorter than the widest filter are padded.
        if len(seq) < max(filter_sizes):
            seq = seq + [word2ix.get("<PAD>")]*(max(filter_sizes)-len(seq))
        # Named `test_input` so the builtin `input` is not overwritten.
        test_input = torch.LongTensor(seq).view(1,-1).to(device)
        pred = model.predict(test_input, test_batch_size=1)
        _, pred = torch.max(pred, 1)

        true = y_test[i]
        if true == pred.item():
            test_correct +=1

        if i%100 == 0:
            # Index 0 entries are left out of the printed tokens.
            input_seq = [ix2word.get(ix) for ix in seq if ix != 0]
            print("Input :", input_seq)
            print("Prediction :", pred.item())
            print("Truth :",y_test[i])
            print("\n")

test_acc = test_correct / test_count

print('test_acc: {:.3f} ({:d}/{:d})'.format(test_acc, test_correct, test_count))

Input : ['후반/Noun', '쫄렸다/Noun']
Prediction : 0
Truth : 1


Input : ['감동/Noun', '영화/Noun', '보고/Noun', '운/Noun', '거의/Noun', '애니메이션/Noun', '보면서/Verb', '울줄/Verb', '몰랐네요/Verb', 'ㅜ/KoreanParticle', '감동/Noun', 'ㅜㅜ/KoreanParticle']
Prediction : 1
Truth : 1


Input : ['스토리/Noun', '별로/Noun', '노래/Noun']
Prediction : 1
Truth : 1


Input : ['히어로/Noun', '물/Noun', '찍어도/Verb', '될/Verb', '정도/Noun', '그래픽/Noun', '본/Verb', '한국영/Noun', '화의/Noun', '희망/Noun', '이야기/Noun', '거기/Noun']
Prediction : 0
Truth : 0


Input : ['보통/Noun', '서로/Noun', '아는/Verb', '상황/Noun', '주먹/Noun', '메/Noun', '쳐서/Verb', '다른/Noun', '격방/Noun', '시도/Noun', '하는게/Verb', '정상/Noun', '로메/Noun', '쳐/Verb', '대는게/Verb', '인상/Noun', '또/Noun', '와칸/Noun', '다인/Noun', '가에서/Verb', '개때/Noun', '닥치는데/Verb', '굳이/Noun', '칼/Noun', '빼/Noun', '일일이/Noun', '상대/Noun', '하는것도/Verb', '졸/Noun', '인상/Noun', '과거/Noun', '마징/Noun', '가가/Noun', '싸우다가/Verb', '죽기/Verb', '직전/Noun', '가슴/Noun', '원자력/Noun', '빔/Noun', '쏴서/Verb', '이기는거/Verb', '배운듯/Verb']
Prediction : 1
Truth : 0


Inpu