## Sentiment Analysis for Korean Movie Review Data

- 이번에는 같은 Bag of Words 모델이지만, 단어를 100차원에 embedding한 뒤 풀어보도록 하겠습니다.

## 1. Load Pickle File

In [1]:
import pickle

with open("movie_data.pickle", "rb") as f:
    movie_data = pickle.load(f)

In [2]:
movie_data.keys()

dict_keys(['reviews', 'scores', 'reviews_ix', 'word2ix', 'ix2word', 'max_seq_length'])

In [3]:
reviews = movie_data["reviews"]
scores = movie_data["scores"]
reviews_ix = movie_data["reviews_ix"]
word2ix = movie_data["word2ix"]
ix2word = movie_data["ix2word"]
max_seq_length = movie_data["max_seq_length"]

## 2. Prepare Trian / Test Datasets

In [4]:
# 리뷰 문장의 길이를 max_seq_length에 맞춰주고 빈 공간에는 <PAD>를 넣어 줍니다.
def pad_sequence(seq, max_seq_length):
    if len(seq) < max_seq_length:
        seq += [word2ix.get("<PAD>")]*(max_seq_length-len(seq))
    return seq

print(len(reviews_ix[0]))
print(len(pad_sequence(reviews_ix[0], max_seq_length)))

reviews_ix = [pad_sequence(reviews_ix[i], max_seq_length) for i in range(len(reviews_ix))] 

4
113


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews_ix, 
                                                    scores, 
                                                    test_size=0.3, 
                                                    random_state=777)

In [6]:
len(reviews_ix)

1827

In [7]:
len(X_train)

1278

In [8]:
len(X_test)

549

## 3. Modeling

#### 2. BoW + nn.Embedding
https://github.com/Chogyuwon/PyTorch_Fast_Campus_2018/blob/master/week6/2_Embedding_basic.ipynb

In [9]:
import torch
import torch.nn as nn
#import torch.nn.functional as F
#from torch.autograd import Variable
import torch.optim as optim
torch.manual_seed(101)

<torch._C.Generator at 0x223db7d8710>

In [10]:
torch.cuda.is_available()

True

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [11]:
reviews_ix[0]

[1512,
 2279,
 1491,
 4296,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [12]:
class BoW_Clf_Embed_mean(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size):
        super(BoW_Clf_Embed_mean, self).__init__()
        
        self.embed = nn.Embedding(vocab_size,
                                  embedding_size,
                                  padding_idx=word2ix.get("<PAD>"))  
        
        self.linear = nn.Linear(embedding_size, output_size)
    
    def forward(self,inputs):
        
        embed = self.embed(inputs)
        # 각 임베딩 차원의 평균 값을 가져옵니다. 
        # ex) torch.mean(torch.Tensor([[0,0,1,0],[0,1,0,0]]),1) = tensor([0.25,0.25])
        # 100차원의 값 113개를 각각 평균해서 113차원의 값 1개로 가져옵니다. 
        # 해당되는 각 단어를 100차원으로 embedding하고 그 값을 평균내서 1개의 값으로 만듭니다.
        # 최대 문장의 길이가 113개 이므로 이런 값이 113개 나오고 이 값으로 분류를 합니다.
        embed_mean = torch.mean(embed, 1)
        
        out = self.linear(embed_mean)
        
        return out

In [18]:
torch.mean(torch.Tensor([[0,0,1,0],[0,1,0,0]]),1)

tensor([0.2500, 0.2500])

In [20]:
EPOCHS = 5000
LR = 0.01

model = BoW_Clf_Embed_mean(len(word2ix),100, 11).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=LR, momentum=0.9)

for epoch in range(EPOCHS):

    model.zero_grad()
    inputs = torch.LongTensor(X_train).to(device)
    targets = torch.LongTensor(y_train).to(device)
    
    preds = model(inputs)
    
    loss = loss_function(preds, targets)

    if epoch % 500 == 0:
        print(loss.item())
    
    loss.backward()
    optimizer.step()

2.386758804321289
2.0446617603302
2.0384373664855957
2.033135175704956
2.027993679046631
2.0229556560516357
2.018003463745117
2.0131256580352783
2.0083093643188477
2.0035412311553955


In [21]:
correct = 0
model.eval()
for i, seq in enumerate(X_test):
    
    input = torch.LongTensor(seq).view(1,-1).to(device)
    pred = model(input)
    _, pred = torch.max(pred, 1)
    true = y_test[i]
    
    if true == pred.item():
        correct +=1
    
    if i%100 == 0:
        input_seq = [ix2word.get(ix) for ix in seq if ix != 0]
        print("Input :", input_seq)
        print("Prediction :", pred.item())
        print("Truth :", y_test[i])
        print("\n")

print("Accuracy :", (correct/len(X_test)*100))

Input : ['후반/Noun', '쫄렸다/Noun']
Prediction : 10
Truth : 9


Input : ['감동/Noun', '영화/Noun', '보고/Noun', '운/Noun', '거의/Noun', '애니메이션/Noun', '보면서/Verb', '울줄/Verb', '몰랐네요/Verb', 'ㅜ/KoreanParticle', '감동/Noun', 'ㅜㅜ/KoreanParticle']
Prediction : 10
Truth : 10


Input : ['스토리/Noun', '별로/Noun', '노래/Noun']
Prediction : 10
Truth : 8


Input : ['히어로/Noun', '물/Noun', '찍어도/Verb', '될/Verb', '정도/Noun', '그래픽/Noun', '본/Verb', '한국영/Noun', '화의/Noun', '희망/Noun', '이야기/Noun', '거기/Noun']
Prediction : 10
Truth : 5


Input : ['보통/Noun', '서로/Noun', '아는/Verb', '상황/Noun', '주먹/Noun', '메/Noun', '쳐서/Verb', '다른/Noun', '격방/Noun', '시도/Noun', '하는게/Verb', '정상/Noun', '로메/Noun', '쳐/Verb', '대는게/Verb', '인상/Noun', '또/Noun', '와칸/Noun', '다인/Noun', '가에서/Verb', '개때/Noun', '닥치는데/Verb', '굳이/Noun', '칼/Noun', '빼/Noun', '일일이/Noun', '상대/Noun', '하는것도/Verb', '졸/Noun', '인상/Noun', '과거/Noun', '마징/Noun', '가가/Noun', '싸우다가/Verb', '죽기/Verb', '직전/Noun', '가슴/Noun', '원자력/Noun', '빔/Noun', '쏴서/Verb', '이기는거/Verb', '배운듯/Verb']
Prediction : 10
Truth : 0
