## Sentiment Analysis for Korean Movie Review Data

## 1. Load Pickle File

In [1]:
import pickle

with open("movie_data.pickle", "rb") as f:
    movie_data = pickle.load(f)

In [2]:
movie_data.keys()

dict_keys(['reviews', 'scores', 'reviews_ix', 'word2ix', 'ix2word', 'max_seq_length'])

In [3]:
reviews = movie_data["reviews"]
scores = movie_data["scores"]
reviews_ix = movie_data["reviews_ix"]
word2ix = movie_data["word2ix"]
ix2word = movie_data["ix2word"]
max_seq_length = movie_data["max_seq_length"]

In [4]:
for i, score in enumerate(scores):
    if score <= 6:
        scores[i] = 0
    else:
        scores[i] = 1

In [5]:
from collections import Counter
Counter(scores)

Counter({1: 1138, 0: 689})

## 2. load Word2Vec model

In [6]:
from gensim.models import Word2Vec
model = Word2Vec.load('word2vec.model')

## 3. Prepare Trian / Test Datasets

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews_ix, 
                                                    scores, 
                                                    test_size=0.3, 
                                                    random_state=777)

In [8]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, 
                                                  y_train, 
                                                  test_size=0.1, 
                                                  random_state=777)

In [9]:
len(reviews_ix)

1827

In [10]:
len(X_train)

1150

In [11]:
len(X_dev)

128

In [12]:
len(X_test)

549

In [13]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np


class Dataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = Dataset(np.array(X_train), y_train)
dev_data = Dataset(np.array(X_dev), y_dev)

## 4. Prepare Embedding Matrix

In [14]:
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

embedding_matrix = []

for word in word2ix.keys():
    try:
        embedding_matrix.append(model[word])
    except:
        embedding_matrix.append(np.zeros(100))

print(len(word2ix))
print(len(embedding_matrix))

embedding_matrix = torch.Tensor(embedding_matrix)

5982
5982




In [15]:
embedding_matrix[0] #<PAD>

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
index = 12
print(ix2word[index])
print(embedding_matrix[index])
print(model[ix2word[index]])

자가/Noun
tensor([ 0.0457,  0.2863,  0.1584, -0.2280,  0.3399,  0.5188, -0.1892,  0.0012,
         0.0902,  0.0804,  0.0565,  0.0179,  0.1807,  0.2784, -0.0129,  0.0499,
         0.1685,  0.0106,  0.1513, -0.0043, -0.4650,  0.1250,  0.4412, -0.1720,
        -0.1823,  0.5343, -0.0155,  0.4647, -0.0782, -0.1674,  0.0878,  0.2256,
        -0.3060, -0.2599, -0.3809,  0.0017, -0.2124, -0.1787, -0.3202, -0.1626,
         0.0439, -0.0524,  0.1458, -0.2895,  0.3095, -0.4761,  0.1152, -0.2603,
        -0.1557,  0.0177, -0.1418, -0.0305,  0.0217,  0.5578,  0.1623, -0.0064,
         0.2204, -0.3902, -0.0899, -0.0673,  0.0064, -0.3541,  0.1090,  0.2966,
        -0.0477,  0.2387, -0.4078, -0.0144,  0.0955, -0.3711,  0.1420,  0.2721,
         0.0743, -0.1678, -0.0891, -0.0016,  0.3909, -0.2036,  0.0517,  0.1920,
         0.0964,  0.1253,  0.1749, -0.2563, -0.1837, -0.2182,  0.2514, -0.2555,
         0.1615,  0.1022,  0.1005,  0.2121, -0.4298, -0.0241,  0.1238,  0.0650,
        -0.0466, -0.1136, -0.107



## 5. Modeling

#### 5. CNN + nn.Embedding + word2vec

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
torch.manual_seed(777);

In [18]:
torch.cuda.is_available()

True

In [19]:
class CNN_Clf(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, embedding_matrix):
        super(CNN_Clf, self).__init__()
#         self.embed = nn.Embedding.from_pretrained(embedding_matrix, freeze=False) # True is Default
        self.embed = nn.Embedding.from_pretrained(embedding_matrix)
#         torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
        
        self.conv_1d_1 = nn.Sequential(
                            nn.Conv1d(1, out_chs, embed_size*filter_sizes[0]),
                            nn.Tanh()
                            #, nn.MaxPool1d((max_seq_length-filter_sizes[0])*batch_size + 1)
                        )
        
        self.conv_1d_2 = nn.Sequential(
                            nn.Conv1d(1, out_chs, embed_size*filter_sizes[1]),
                            nn.Tanh()
                            #, nn.MaxPool1d((max_seq_length-filter_sizes[1])*batch_size + 1)
                        ) 
        self.conv_1d_3 = nn.Sequential(
                            nn.Conv1d(1, out_chs, embed_size*filter_sizes[2]),
                            nn.Tanh()
                            #, nn.MaxPool1d((max_seq_length-filter_sizes[2])*batch_size + 1)
                        )
        
#         self.conv_layer = nn.ModuleList([nn.Conv1d(1, out_chs, embed_size*fs) for fs in filter_sizes]) 

        self.dropout = nn.Dropout(DR_rate)
        
        self.fc_layer = nn.Linear(out_chs*len(filter_sizes), output_size)
            
    def forward(self, inputs):
        embed = self.embed(inputs)
#         print(embed.size()) => (batch_size, seq_len, embed_size) => (50, 113, 100)

        embed_cat = embed.reshape(batch_size, 1, -1)
#         print(embed_cat.size()) => (batch_size, 1, seq_len*embed_size) => (50, 1, 11300)

        x = [self.conv_1d_1(embed_cat), self.conv_1d_2(embed_cat), self.conv_1d_3(embed_cat)]
#         print(x[0].size()) => (batch_size, out_chs, feature_map_size)
#                            => feature_map_size = (seq_len - filter_size)*embed_size + 1 
#                            => (113-3)*100 + 1 = 11001
#                            => (50, 100, 11001)
#                            

#         print(x[1].size()) => (batch_size, out_chs, feature_map_size)
#                            => feature_map_size = (seq_len - filter_size)*embed_size + 1 
#                            => (113-4)*100 + 1 = 10901
#                            => (50, 100, 10901)
#                           

#         print(x[2].size()) => (batch_size, out_chs, feature_map_size)
#                            => feature_map_size = (seq_len - filter_size)*embed_size + 1 
#                            => (113-5)*100 + 1 = 10801
#                            => (50, 100, 10801)
#   
        x = [F.max_pool1d(conv, (conv.size(2), )).squeeze(2) for conv in x]
#         print(x[0].size()) => (50, 100)
#         print(x[1].size()) => (50, 100)
#         print(x[2].size()) => (50, 100)

        x = torch.cat(x, 1)
#         print(x.size()) => (50, 300)
        

        x = self.dropout(x)
        x = self.fc_layer(x)
#         print(x.size()) => (50, 2)

        x = F.softmax(x ,dim=1)
#         print(x.size()) => (50, 2)

        return x
      
    def predict(self, inputs, test_batch_size):
        embed = self.embed(inputs)
        embed_cat = embed.reshape(test_batch_size, 1, -1)        
        x = [self.conv_1d_1(embed_cat), self.conv_1d_2(embed_cat), self.conv_1d_3(embed_cat)]
        x = [F.max_pool1d(conv, (conv.size(2), )).squeeze(2) for conv in x]

        x = torch.cat(x, 1)
        x = self.fc_layer(x)
        x = F.softmax(x, dim=1)
        
        return x

In [20]:
 def pad_sequence(batch):
        X_batch, y_batch = zip(*batch)
        max_seq_length = max([len(x) for x in X_batch])
        if max_seq_length < max(filter_sizes):
            max_seq_length = max(filter_sizes)

        res = []
        for seq in X_batch:
            if len(seq) < max_seq_length:
                pad_seq = torch.LongTensor(seq + [0]*(max_seq_length-len(seq)))
                res.append(pad_seq)
            else:
                res.append(torch.LongTensor(seq))
        return torch.cat(res).reshape(batch_size, max_seq_length), torch.LongTensor(y_batch)

https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [21]:
import copy

def model_save(epoch, model, optimizer, train_loss, train_acc, val_loss, val_acc, PATH):
    torch.save({
            'epoch': epoch,
            'model_state_dict': copy.deepcopy(model.state_dict()),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc
    }, PATH)

In [22]:
%%time
from tensorboardX import SummaryWriter

EPOCHS = 1000
LR = 0.01 # Adadelta default learning_rate is 1.0
batch_size = 50
filter_sizes = [3, 4, 5]
DR_rate = 0.5
out_chs = 100

writer = SummaryWriter('./log_files/model_save/')

model = CNN_Clf(len(word2ix), 100, 2, embedding_matrix).to(device)

# Add_Graph to Tensorboard
dummy_input = Variable(torch.zeros(batch_size, max_seq_length).long()).to(device)
writer.add_graph(model, dummy_input)

# Add_Embedding to Tensorboard
word_labels = [ix2word[i] for i in range(len(ix2word))]
writer.add_embedding(model.embed.weight.data, metadata=word_labels)    

# weighted CrossEntropyLoss
# Approximately 1/2 negative_data per 1 positive data
# criterion = nn.CrossEntropyLoss(weight=torch.Tensor([2.0, 1.0]).to(device))
criterion = nn.CrossEntropyLoss(weight=torch.Tensor([1.3, 1.0]).to(device))
# criterion = nn.CrossEntropyLoss()

optimizer = optim.Adadelta(model.parameters(),lr=LR, weight_decay=1e-5) # use L2-Norm
# optimizer = optim.SGD(model.parameters(),lr=LR, momentum=0.9, weight_decay=1e-5) # use L2-Norm

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence)
dev_loader = DataLoader(dataset=dev_data, batch_size=1, shuffle=False)

for epoch in range(EPOCHS):
    train_correct = 0
    train_count = 0
    train_loss = 0
    val_correct = 0
    val_count = 0
    val_loss = 0
    
    
    # Training
    model = model.train()
    
    for X_batch, y_batch in train_loader:
        model.zero_grad()
        inputs = Variable(X_batch).to(device)
        targets = Variable(y_batch).to(device)
        
        preds = model(inputs)
     
        loss = criterion(preds, targets)
        train_loss += loss.item()
        
        compare = torch.max(preds, 1)[1]*targets
        train_correct += torch.sum(compare).item()
        train_count += X_batch.size(0)

        
        loss.backward()
        optimizer.step()
    
    train_acc = train_correct/train_count
    avg_train_loss = train_loss/(train_count/batch_size)
    
    # Validation

    model = model.eval()
    with torch.no_grad():          
        for X_batch, y_batch in dev_loader:
            val_count += 1         
            if len(X_batch) < max(filter_sizes):
                X_batch = torch.LongTensor(X_batch + [word2ix.get("<PAD>")]*(max(filter_sizes)-len(X_batch))).to(device)
            else:
                X_batch = torch.LongTensor(X_batch).to(device)
            input = Variable(X_batch).to(device)
            target = Variable(torch.LongTensor(y_batch)).to(device)
            pred = model.predict(input, test_batch_size=1)

            loss = criterion(pred, target)
            val_loss += loss.item()

            _, pred = torch.max(pred, 1)
            true = y_batch.item()
            if true == pred.item():
                val_correct +=1
                          
        val_acc = val_correct/val_count
        avg_val_loss = val_loss/val_count

    writer.add_scalars('Compare/train-val acc', {'Train Acc': train_acc,
                                                    'Val Acc': val_acc}, epoch)
    
    writer.add_scalars('Compare/train-val losses', {'Train Loss': avg_train_loss,
                                                    'Val Loss': avg_val_loss}, epoch)
    
    model_save(epoch, model, optimizer, 
               train_loss, train_acc, 
               val_loss, val_acc, PATH="./log_files/model_save/EPOCH_"+str(epoch)+".pt") # pt는 확장자명
    
    if epoch % 100 == 0 or epoch == (EPOCHS-1):
        print('epoch: {:d}'.format(epoch))
        print('train_acc: {:.3f} ({:d}/{:d})'.format(train_acc, train_correct, train_count))
        print('val_acc: {:.3f} ({:d}/{:d})'.format(val_acc, val_correct, val_count))
        print('avg_train_loss: {:.3f}'.format(avg_train_loss))        
        print('avg_val_loss: {:.3f}'.format(avg_val_loss))
        print("==========================================================")

writer.close()



epoch: 0
train_acc: 0.097 (112/1150)
val_acc: 0.328 (42/128)
avg_train_loss: 0.705
avg_val_loss: 0.715
epoch: 100
train_acc: 0.510 (587/1150)
val_acc: 0.672 (86/128)
avg_train_loss: 0.676
avg_val_loss: 0.673
epoch: 200
train_acc: 0.523 (602/1150)
val_acc: 0.688 (88/128)
avg_train_loss: 0.663
avg_val_loss: 0.666
epoch: 300
train_acc: 0.510 (586/1150)
val_acc: 0.688 (88/128)
avg_train_loss: 0.641
avg_val_loss: 0.657
epoch: 400
train_acc: 0.517 (594/1150)
val_acc: 0.680 (87/128)
avg_train_loss: 0.606
avg_val_loss: 0.647
epoch: 500
train_acc: 0.518 (596/1150)
val_acc: 0.680 (87/128)
avg_train_loss: 0.591
avg_val_loss: 0.637
epoch: 600
train_acc: 0.538 (619/1150)
val_acc: 0.711 (91/128)
avg_train_loss: 0.563
avg_val_loss: 0.628
epoch: 700
train_acc: 0.532 (612/1150)
val_acc: 0.727 (93/128)
avg_train_loss: 0.541
avg_val_loss: 0.622
epoch: 800
train_acc: 0.542 (623/1150)
val_acc: 0.734 (94/128)
avg_train_loss: 0.517
avg_val_loss: 0.614
epoch: 900
train_acc: 0.555 (638/1150)
val_acc: 0.742 (95

In [23]:
test_correct = 0
test_count = len(X_test)
model = model.eval()
for i, seq in enumerate(X_test):
    if len(seq) < max(filter_sizes):
        seq = seq + [word2ix.get("<PAD>")]*(max(filter_sizes)-len(seq))
    input = Variable(torch.LongTensor(seq).view(1,-1)).to(device)
    pred = model.predict(input, test_batch_size=1)
    _, pred = torch.max(pred, 1)
    
    true = y_test[i]
    if true == pred.item():
        test_correct +=1
    
    if i%100 == 0:
        input_seq = [ix2word.get(ix) for ix in seq if ix != 0]
        print("Input :", input_seq)
        print("Prediction :", pred.item())
        print("Truth :",y_test[i])
        print("\n")

test_acc = test_correct / test_count

print('test_acc: {:.3f} ({:d}/{:d})'.format(test_acc, test_correct, test_count))

Input : ['후반/Noun', '쫄렸다/Noun']
Prediction : 0
Truth : 1


Input : ['감동/Noun', '영화/Noun', '보고/Noun', '운/Noun', '거의/Noun', '애니메이션/Noun', '보면서/Verb', '울줄/Verb', '몰랐네요/Verb', 'ㅜ/KoreanParticle', '감동/Noun', 'ㅜㅜ/KoreanParticle']
Prediction : 1
Truth : 1


Input : ['스토리/Noun', '별로/Noun', '노래/Noun']
Prediction : 1
Truth : 1


Input : ['히어로/Noun', '물/Noun', '찍어도/Verb', '될/Verb', '정도/Noun', '그래픽/Noun', '본/Verb', '한국영/Noun', '화의/Noun', '희망/Noun', '이야기/Noun', '거기/Noun']
Prediction : 0
Truth : 0


Input : ['보통/Noun', '서로/Noun', '아는/Verb', '상황/Noun', '주먹/Noun', '메/Noun', '쳐서/Verb', '다른/Noun', '격방/Noun', '시도/Noun', '하는게/Verb', '정상/Noun', '로메/Noun', '쳐/Verb', '대는게/Verb', '인상/Noun', '또/Noun', '와칸/Noun', '다인/Noun', '가에서/Verb', '개때/Noun', '닥치는데/Verb', '굳이/Noun', '칼/Noun', '빼/Noun', '일일이/Noun', '상대/Noun', '하는것도/Verb', '졸/Noun', '인상/Noun', '과거/Noun', '마징/Noun', '가가/Noun', '싸우다가/Verb', '죽기/Verb', '직전/Noun', '가슴/Noun', '원자력/Noun', '빔/Noun', '쏴서/Verb', '이기는거/Verb', '배운듯/Verb']
Prediction : 1
Truth : 0


Inpu

In [24]:
model.state_dict().keys()

odict_keys(['embed.weight', 'conv_1d_1.0.weight', 'conv_1d_1.0.bias', 'conv_1d_2.0.weight', 'conv_1d_2.0.bias', 'conv_1d_3.0.weight', 'conv_1d_3.0.bias', 'fc_layer.weight', 'fc_layer.bias'])

In [25]:
model.state_dict()

OrderedDict([('embed.weight',
              tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      ...,
                      [-0.4206, -0.3258, -0.0437,  ..., -0.0978, -0.1347,  0.0543],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                      [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
                     device='cuda:0')),
             ('conv_1d_1.0.weight',
              tensor([[[-0.0483, -0.0010, -0.0112,  ..., -0.0376,  0.0209,  0.0545]],
              
                      [[ 0.0564, -0.0210, -0.0373,  ...,  0.0514,  0.0521, -0.0202]],
              
                      [[ 0.0461,  0.0403,  0.0070,  ..., -0.0032, -0.0398, -0.0090]],
              
                      ...,
              
                 