# Word2Vec

In [12]:
import pandas as pd

from konlpy.tag import Okt
from gensim.models import word2vec

import torch
import torch.nn as nn
import torch.utils.data as Data

import numpy as np
import random

from sklearn.model_selection import train_test_split

In [2]:
# Load the review spreadsheet; row 0 of the sheet supplies the column names.
filename = 'data/score_발열.xlsx'
sheet_name = 'Sheet1'
data = pd.read_excel(filename, sheet_name=sheet_name, header=0)

# Labels are taken as-is from the 'Score' column.
csv_label = data['Score']

# Review text: drop every '#' character, then trim surrounding whitespace.
csv_data = list(map(lambda review: review.replace('#', '').strip(), data['Review']))

In [3]:
# Preview the first five cleaned review strings.
csv_data[:5]

['발열히 심한거 같은데 여름이라 그런가?..',
 '발열이좀 심한거 같아서 걱정이에요',
 '발열이심하더라구요',
 '발열이너무심한게 제일큰 단점인것 같고 그외에 불편한점은',
 '발열이...정말...심합니다']

In [10]:
# Okt morphological analyser; `size` is the embedding width used by later cells.
twitter = Okt()
size = 500

# One token list per review. Tokens are normalised and stemmed by Okt, and
# tokens tagged as particles, endings or punctuation are discarded.
doc = []
for sentence in csv_data:
    tagged = twitter.pos(sentence, norm=True, stem=True)
    doc.append(
        [word for word, tag in tagged if tag not in ("Josa", "Eomi", "Punctuation")]
    )

In [11]:
# Preview the token lists produced for the first five reviews.
doc[:5]

[['발열', '히', '심하다', '같다', '여름', '그', '런가'],
 ['발열', '이', '좀', '심하다', '같다', '걱정'],
 ['발열', '심하다'],
 ['발열', '이', '너', '무심하다', '제일', '크다', '단점', '것', '같다', '그', '외', '불편하다', '점'],
 ['발열', '정말', '심하다']]

In [14]:
# Train word embeddings on the tokenised reviews.
#   sg=0        -> CBOW architecture
#   hs=0        -> hierarchical softmax disabled
#   min_count=3 -> words seen fewer than 3 times get no vector
# NOTE: `size=` and `wv.index2word` are the gensim 3.x names; gensim 4 renamed
# them to `vector_size=` and `wv.index_to_key`.
model = word2vec.Word2Vec(
    doc,
    size=size,
    window=2,
    min_count=3,
    sg=0,
    hs=0,
)

# Keep only a plain {word: vector} lookup so the gensim model can be released.
w2v = {word: vector for word, vector in zip(model.wv.index2word, model.wv.vectors)}

del model

In [16]:
# Show every word that has an embedding in `w2v`.
w2v.keys()

dict_keys(['발열', '없다', '있다', '하다', '소음', '좋다', '심하다', '같다', '않다', '거의', '좀', '적다', '자다', '것', '잡다', '문제', '조금', '되다', '정도', '생각', '만족하다', '괜찮다', '못', '노트북', '사용', '이', '느끼다', '잘', '너무', '부분', '이다', '팬', '성능', '쿨러', '정말', '돌아가다', '제품', '별로', '걱정', '크다', '배터리', '수', '많이', '더', '전혀', '게임', '보다', '속도', '도', '아직', '매우', '아주', '가볍다', '소리', '안', '쓸다', '어쩔', '느껴지다', '다', '때문', '적', '나다', '그렇다', '키', '편이', '관리', '아니다', '신경', '크게', '조용하다', '약간', '때', '많다', '하', '및', '네', '보이다', '만족', '그', '쓰다', '상당하다', '들다', '없이', '거', '꽤', '보드', '오래', '쿨링', '무게', '아쉽다', '디자인', '제어', '점', '심해', '높다', '빠르다', '양호', '또한', '대', '모두', '빼다', '돌리다', '모르다', '발생', '다른', '맘', '감다', '잡히다', '되어다', '확실하다', '요', '진짜', '들', '삼성', '시간', '해보다', '사양', '가격', '감', '편', '모델', '안나', '가다', '비', '개선', '단점', '사', '느낌', '하지만', '제', '뜨겁다', '펜', '하나', '시', '듯', '사은', '써다', '수준', '최고', '만족스럽다', '은', '구매', '습', '메탈', '약하다', '상태', '충전', '판', '안되다', '굉장하다', '훨씬', '품', '작업', '화면', '발', '열량', '그렇게', '장시간', '생기다', '지다', '중', '무엇', '걸리다', '받침', '구

In [17]:
# Replace each token by its embedding; tokens without an entry in `w2v` are dropped.
doc2vec = [
    [w2v[word] for word in sentence if word in w2v]
    for sentence in doc
]

# Longest embedded sentence (0 when there are no sentences); used as the padding target.
max_length = max((len(vectors) for vectors in doc2vec), default=0)

In [18]:
# Right-pad every sentence with zero vectors until it holds `max_length` rows.
for vectors in doc2vec:
    missing = max_length - len(vectors)
    if missing > 0:
        vectors.extend(np.zeros(size) for _ in range(missing))

# All rows now share one length, so the nested lists stack into a single array.
doc2vec = np.array(doc2vec)

In [19]:
# Display the padded embedding array (NumPy abbreviates the printed output).
doc2vec

array([[[-1.60265770e-02,  5.03251748e-03,  2.23066099e-02, ...,
          6.12424174e-03,  6.98140776e-03,  2.67418269e-02],
        [-1.97218289e-03,  5.79213200e-04,  2.39920197e-03, ...,
          7.89380923e-04, -8.01704009e-05,  2.63706618e-03],
        [-4.46797255e-03,  1.79528794e-03,  6.11944683e-03, ...,
          2.59042811e-03,  2.68662092e-03,  6.86611468e-03],
        ...,
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

       [[-1.60265770e-02,  5.03251748e-03,  2.23066099e-02, ...,
          6.12424174e-03,  6.98140776e-03,  2.67418269e-02],
        [-5.93850017e-03,  8.62687302e-04,  7.74806459e-03, ...,
          3.07461247e-03,  2.85412045e

In [20]:
# Array dimensions: (number of reviews, max_length, size).
doc2vec.shape

(1211, 12, 500)

In [21]:
# NOTE: `data` is rebound here; it previously held the DataFrame returned by
# pd.read_excel and now refers to the padded embedding array.
data = doc2vec
# Labels as a NumPy array taken from the 'Score' column.
label = csv_label.values
label

array([2, 2, 2, ..., 0, 0, 0])

# Train Test Split

In [22]:
# Hold out a test set. No sizes are given, so scikit-learn's default split
# (25% test) applies. `random_state` pins the shuffle so that
# Restart Kernel -> Run All reproduces the same train/test partition.
train_data, test_data, train_label, test_label = train_test_split(
    data, label, random_state=42
)

print(len(train_data))
print(len(test_data))

908
303


In [23]:
# Training features as float32 and labels as int64 tensors.
x = torch.from_numpy(train_data).float()
y = torch.from_numpy(train_label).long()

In [24]:
# Sanity check: feature and label tensors must agree on the first dimension.
x.size(), y.size()

(torch.Size([908, 12, 500]), torch.Size([908]))

In [26]:
# Insert a single channel axis for Conv2d: (N, H, W) -> (N, 1, H, W).
# H and W are read from the tensor itself instead of repeating the literals
# 12 and 500, so this stays correct if the padding length or embedding width
# changes. Re-running the cell on an already 4-D tensor gives the same shape.
x = x.view(-1, 1, *x.shape[-2:])
x.shape

torch.Size([908, 1, 12, 500])

In [27]:
batch_size = 10

# NOTE: `train_data` is rebound from the NumPy feature array to a TensorDataset.
train_data = Data.TensorDataset(x, y)

# Shuffled mini-batches; the final incomplete batch is discarded (drop_last=True).
train_loader = Data.DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    drop_last=True,
)

In [28]:
# Pull one batch to check its shape. The built-in next() is the Python 3
# iterator protocol; the `.next()` method is not part of it and is missing
# from DataLoader iterators in recent PyTorch releases.
# NOTE: this rebinds `label` (previously the full label array) to one batch of labels.
text, label = next(iter(train_loader))
text.shape

torch.Size([10, 1, 12, 500])

# Define Model for sentiment analysis

In [29]:
class CNN(nn.Module):
    """Small 2-D CNN that scores a padded sentence-embedding matrix.

    Expects a float tensor of shape (batch, 1, seq_len, embed_dim) and returns
    a (batch, num_classes) tensor of unnormalised class scores (no softmax is
    applied inside the model).

    Args:
        seq_len: Number of token rows per sample. Defaults to 12.
        embed_dim: Embedding width per token. Defaults to 500.
        num_classes: Number of scores produced per sample. Defaults to 3.

    Raises:
        ValueError: If ``seq_len`` or ``embed_dim`` is too small to survive
            the three 3x3 convolutions and two 2x2 poolings.
    """

    def __init__(self, seq_len=12, embed_dim=500, num_classes=3):
        super().__init__()

        # Shape comments below are for the default 12 x 500 input.
        self.layer = nn.Sequential(
            nn.Conv2d(1, 16, 3),   # 1 * 12 * 500 -> 16 * 10 * 498
            nn.ReLU(),
            nn.Conv2d(16, 32, 3),  # 16 * 10 * 498 -> 32 * 8 * 496
            nn.ReLU(),
            nn.MaxPool2d(2, 2),    # 32 * 8 * 496 -> 32 * 4 * 248
            nn.Conv2d(32, 64, 3),  # 32 * 4 * 248 -> 64 * 2 * 246
            nn.ReLU(),
            nn.MaxPool2d(2, 2)     # 64 * 2 * 246 -> 64 * 1 * 123
        )

        out_h = self._reduced(seq_len)
        out_w = self._reduced(embed_dim)
        if out_h < 1 or out_w < 1:
            raise ValueError(
                "input of %d x %d is too small for this network" % (seq_len, embed_dim)
            )
        # Flattened feature count fed to the classifier (64 * 1 * 123 by default).
        self.flat_features = 64 * out_h * out_w

        self.fc_layer = nn.Sequential(
            nn.Linear(self.flat_features, 100),
            nn.ReLU(),
            nn.Linear(100, num_classes)
        )

    @staticmethod
    def _reduced(extent):
        """Spatial extent left after conv3 -> conv3 -> pool2 -> conv3 -> pool2."""
        return ((extent - 4) // 2 - 2) // 2

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for input ``x``."""
        out = self.layer(x)
        # Flatten per sample. Keeping the batch dimension fixed means an input
        # of unexpected size fails in the Linear layer instead of being
        # silently reshaped into a different number of rows.
        out = out.view(out.size(0), -1)
        out = self.fc_layer(out)

        return out

In [31]:
# Instantiate the classifier. The name `model` was used earlier for the gensim
# Word2Vec object, which has since been deleted.
model = CNN()
# if cuda -> model = CNN().cuda()

# Train Model

In [32]:
# Cross-entropy over the raw scores returned by the model (the network ends in
# a plain Linear layer, with no softmax of its own).
loss = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [33]:
# Number of full passes over the training loader.
num_epochs = 50

In [34]:
# Mini-batch training: one optimizer step per batch, `num_epochs` passes over
# `train_loader`.

# Switch to training mode explicitly; the evaluation cell calls model.eval(),
# so re-running this cell afterwards would otherwise train in eval mode.
model.train()

# Batches per epoch, used only for the progress message. It does not change
# between epochs, so it is computed once.
total_batch = len(train_data) // batch_size

for epoch in range(num_epochs):
    for i, (batch_text, batch_labels) in enumerate(train_loader):
        X = batch_text  # if cuda -> X = batch_text.cuda()
        Y = batch_labels  # if cuda -> Y = batch_labels.cuda()

        pre = model(X)
        cost = loss(pre, Y)

        # Clear gradients left from the previous step before back-propagating.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Report the current batch loss every 20 iterations.
        if (i + 1) % 20 == 0:
            print('Epoch [%d/%d], lter [%d/%d], Loss: %.4f'
                 %(epoch+1, num_epochs, i+1, total_batch, cost.item()))
print("Learning Finished")

Epoch [1/50], lter [20/90], Loss: 0.9991
Epoch [1/50], lter [40/90], Loss: 0.8754
Epoch [1/50], lter [60/90], Loss: 0.7098
Epoch [1/50], lter [80/90], Loss: 1.0356
Epoch [2/50], lter [20/90], Loss: 1.0631
Epoch [2/50], lter [40/90], Loss: 0.9328
Epoch [2/50], lter [60/90], Loss: 1.1051
Epoch [2/50], lter [80/90], Loss: 0.9744
Epoch [3/50], lter [20/90], Loss: 0.9786
Epoch [3/50], lter [40/90], Loss: 1.0117
Epoch [3/50], lter [60/90], Loss: 0.8976
Epoch [3/50], lter [80/90], Loss: 0.6685
Epoch [4/50], lter [20/90], Loss: 0.8985
Epoch [4/50], lter [40/90], Loss: 0.9027
Epoch [4/50], lter [60/90], Loss: 0.8532
Epoch [4/50], lter [80/90], Loss: 0.7124
Epoch [5/50], lter [20/90], Loss: 0.7895
Epoch [5/50], lter [40/90], Loss: 0.9662
Epoch [5/50], lter [60/90], Loss: 1.0996
Epoch [5/50], lter [80/90], Loss: 0.9959
Epoch [6/50], lter [20/90], Loss: 0.7937
Epoch [6/50], lter [40/90], Loss: 0.7738
Epoch [6/50], lter [60/90], Loss: 0.6531
Epoch [6/50], lter [80/90], Loss: 0.8387
Epoch [7/50], lt

Epoch [50/50], lter [20/90], Loss: 0.9618
Epoch [50/50], lter [40/90], Loss: 0.9584
Epoch [50/50], lter [60/90], Loss: 0.8582
Epoch [50/50], lter [80/90], Loss: 0.5748
Learning Finished


# Test Model

In [35]:
# Test features as float32 with a channel axis: (N, H, W) -> (N, 1, H, W).
# H and W are taken from the tensor rather than repeating the literals 12 and 500.
x_test = torch.from_numpy(test_data).type(torch.FloatTensor)
x_test = x_test.view(-1, 1, *x_test.shape[-2:])

y_test = torch.from_numpy(test_label).type(torch.LongTensor)

# NOTE: `test_data` is rebound from the NumPy array to a TensorDataset, so this
# cell cannot be re-run without first re-running the train/test split.
test_data = Data.TensorDataset(x_test, y_test)

# Accuracy does not depend on sample order, so the evaluation loader is not shuffled.
test_loader = Data.DataLoader(dataset=test_data, batch_size=1, shuffle=False)

In [37]:
# Measure classification accuracy on the held-out test set.
model.eval()

correct = 0
total = 0

# Gradients are not needed for inference; disabling them avoids building the
# autograd graph for every test sample.
with torch.no_grad():
    for text, labels in test_loader:
        # if cuda -> text = text.cuda()

        outputs = model(text)

        # Index of the highest score per sample is the predicted class.
        _, predicted = torch.max(outputs, 1)

        # Count samples, not batches, so the result stays right for any batch size.
        total += labels.size(0)
        # .item() keeps `correct` a plain Python int.
        correct += (predicted == labels).sum().item()  # if cuda -> compare with labels.cuda()

print('Accuracy of test text: %f %%' % (100 * float(correct) / total))

Accuracy of test text: 55.445545 %
