## Sentiment Analysis for Korean Movie Review Data

- 가장 기본적인 Bag of Words를 이용해 1~10점을 분류하는 문제를 풀어보도록 하겠습니다.

## 1. Load Pickle File

In [1]:
# 저장해둔 moive.pickle data를 import합니다.
import pickle

with open("movie_data.pickle", "rb") as f:
    movie_data = pickle.load(f)

In [2]:
movie_data.keys()

dict_keys(['reviews', 'scores', 'reviews_ix', 'word2ix', 'ix2word', 'max_seq_length'])

In [3]:
reviews = movie_data["reviews"]
scores = movie_data["scores"]
reviews_ix = movie_data["reviews_ix"]
word2ix = movie_data["word2ix"]
ix2word = movie_data["ix2word"]
max_seq_length = movie_data["max_seq_length"]

In [4]:
vocab_size = len(word2ix)
vocab_size

5982

In [5]:
padding_idx = word2ix["<PAD>"]
padding_idx

0

In [6]:
reviews[0]

['오/Noun', '종합/Noun', '선물/Noun', '셋트/Noun']

In [7]:
scores[0]

9

In [8]:
reviews_ix[0]

[1512, 2279, 1491, 4296]

In [9]:
# Honey Tip

a = ["1","2","3"]
b = a[::-1]
print(b)

['3', '2', '1']


## 2. Prepare Trian / Test Datasets

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews_ix, 
                                                    scores, 
                                                    test_size=0.3, 
                                                    random_state=101)

In [11]:
len(reviews_ix)

1827

In [12]:
len(X_train)

1278

In [13]:
len(X_test)

549

## 3. Modeling

#### 1. Simple way - Bag of Words Model
https://github.com/Chogyuwon/PyTorch_Fast_Campus_2018/blob/master/week6/1_Bag_of_Words.ipynb

In [1]:
import torch
import torch.nn as nn
# import torch.nn.functional as F
# from torch.autograd import Variable
import torch.optim as optim
torch.manual_seed(101)

<torch._C.Generator at 0x1e20deffe30>

In [2]:
torch.cuda.is_available()

False

In [16]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [17]:
# vocab_size 길이의 zero 벡터를 만들고 
# 문장에서 해당 인덱스 단어가 등장하는 빈도를 체크
# 가장 단순한 Text Classification 방법

def make_BoW(seq, word2ix):
    tensor = torch.zeros(len(word2ix))
    for w in seq:
        index = word2ix.get(w)
        if index!=None:
            tensor[index]+=1.
        else:
            index = word2ix['<UNK>']
            tensor[index]+=1.
    
    return tensor

In [18]:
X_train = torch.cat([make_BoW(x, word2ix).view(1,-1).to(device) for x in X_train]) # 차원을 맞추기 위해 view(1,-1)
y_train = torch.cat([torch.LongTensor([y]).to(device) for y in y_train]) # classification을 위해 longtensor로 생성합니다.
print(X_train.size()) # (num_sentences, vocab_size)

torch.Size([1278, 5982])


In [19]:
class BoW_Clf(nn.Module):
    def __init__(self, vocab_size, output_size):
        super(BoW_Clf, self).__init__()
        
        self.linear = nn.Linear(vocab_size, output_size)
    
    def forward(self,inputs):
        
        x = self.linear(inputs)
        
        return x

In [20]:
EPOCHS = 5000
LR = 0.01

model = BoW_Clf(len(word2ix), 10).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=LR, momentum=0.9)

for epoch in range(EPOCHS):

    model.zero_grad()
    preds = model(X_train)
    
    loss = loss_function(preds, y_train)

    if epoch % 500 == 0:
        print(loss.item())
    
    loss.backward()
    optimizer.step()

2.407938003540039
2.041318893432617
2.038267135620117
2.0378644466400146
2.0377914905548096
2.037775754928589
2.037771701812744
2.037770986557007
2.037771224975586
2.0377705097198486


In [21]:
correct = 0

for i, seq in enumerate(X_test):
    input = torch.tensor(make_BoW(seq, word2ix)).to(device).view(1,-1)
    pred = model(input)
    _, pred = torch.max(pred, 1)
    true = y_test[i]
    if true == pred.item():
        correct +=1
    
    if i%100 == 0:
        input_seq = [ix2word.get(ix) for ix in seq]
        print("Input :", input_seq)
        print("Prediction :", pred.item())
        print("Truth :", y_test[i])
        print("\n")

print("Accuracy :", (correct/len(X_test)*100))

Input : ['후반/Noun', '쫄렸다/Noun']
Prediction : 10
Truth : 9


Input : ['감동/Noun', '영화/Noun', '보고/Noun', '운/Noun', '거의/Noun', '애니메이션/Noun', '보면서/Verb', '울줄/Verb', '몰랐네요/Verb', 'ㅜ/KoreanParticle', '감동/Noun', 'ㅜㅜ/KoreanParticle']
Prediction : 10
Truth : 10


Input : ['스토리/Noun', '별로/Noun', '노래/Noun']
Prediction : 10
Truth : 8


Input : ['히어로/Noun', '물/Noun', '찍어도/Verb', '될/Verb', '정도/Noun', '그래픽/Noun', '본/Verb', '한국영/Noun', '화의/Noun', '희망/Noun', '이야기/Noun', '거기/Noun']
Prediction : 10
Truth : 5


Input : ['보통/Noun', '서로/Noun', '아는/Verb', '상황/Noun', '주먹/Noun', '메/Noun', '쳐서/Verb', '다른/Noun', '격방/Noun', '시도/Noun', '하는게/Verb', '정상/Noun', '로메/Noun', '쳐/Verb', '대는게/Verb', '인상/Noun', '또/Noun', '와칸/Noun', '다인/Noun', '가에서/Verb', '개때/Noun', '닥치는데/Verb', '굳이/Noun', '칼/Noun', '빼/Noun', '일일이/Noun', '상대/Noun', '하는것도/Verb', '졸/Noun', '인상/Noun', '과거/Noun', '마징/Noun', '가가/Noun', '싸우다가/Verb', '죽기/Verb', '직전/Noun', '가슴/Noun', '원자력/Noun', '빔/Noun', '쏴서/Verb', '이기는거/Verb', '배운듯/Verb']
Prediction : 10
Truth : 0


In [2]:
import torch
a = torch.randn(4,4)

In [3]:
a

tensor([[-1.3187,  0.9413, -2.1196,  0.1565],
        [-0.4245,  0.6083, -0.0161, -0.5470],
        [-1.0243, -0.0547,  1.0429,  2.1947],
        [-0.2953, -0.4888, -0.1113, -0.1537]])

In [4]:
torch.mean(a,1)

tensor([-0.5851, -0.0948,  0.5396, -0.2623])

In [5]:
torch.mean(a,2)

RuntimeError: Dimension out of range (expected to be in range of [-2, 1], but got 2)