# 한국어 Sentiment Classification

- 영어(IMDb 데이터셋) Sentiment Classification을 참고하여 NSMC 네이버 영화 리뷰 감정분석 모델링하기

In [1]:
import os, random
import pandas as pd
import re

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data, datasets
from torchtext.legacy.data import BucketIterator
from torchtext.legacy import data
from torchtext.legacy.data import TabularDataset

In [2]:
path = "/content/drive/MyDrive/3.AI_트랜스포머_이재원 강사님(3.24~3.26)/data"

In [6]:
train_df = pd.read_csv(os.path.join(path, "ratings_train.txt"), sep='\t', encoding='utf-8')
test_df = pd.read_csv(os.path.join(path, "ratings_test.txt"), sep='\t', encoding='utf-8')

train_df[:1000].to_csv(os.path.join(path, "ratings_train_small.txt"), sep='\t', index=False)
test_df[:100].to_csv(os.path.join(path, "ratings_test_small.txt"), sep='\t', index=False)

In [None]:
# [1, 3, 6, 17, 0, 0, 0] 이 영화는 너무 재미없다

In [5]:
print(train_df.shape)
print(test_df.shape)

(150000, 3)
(50000, 3)


- 한국어 전처리 및 토크나이징

In [7]:
!python3 -m pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 11.9MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 40.4MB/s 
Installing collected packages: beautifulsoup4

In [12]:
from konlpy.tag import Okt, Komoran, Hannanum, Kkma
tokenizer = Okt()
tokenizer.morphs('안녕하세요. 오늘 날씨가 참 좋습니다!')

['안녕하세요', '.', '오늘', '날씨', '가', '참', '좋습니다', '!']

In [11]:
def preprocess_sent(sentence):
  # sentence = re.sub("[^가-힣0-9a-zA-Z\\s]", " ", x)
  sentence = tokenizer.morphs(sentence)
  return sentence

In [9]:
train_df[:10]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
5,5403919,막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.,0
6,7797314,원작의 긴장감을 제대로 살려내지못했다.,0
7,9443947,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...,0
8,7156791,액션이 없는데도 재미 있는 몇안되는 영화,1
9,5912145,왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?,1


In [10]:
test_df[:10] ' '

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
5,7898805,"음악이 주가 된, 최고의 음악영화",1
6,6315043,진정한 쓰레기,0
7,6097171,"마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다",0
8,8932678,갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한c...,0
9,6242223,"이별의 아픔뒤에 찾아오는 새로운 인연의 기쁨 But, 모든 사람이 그렇지는 않네..",1


In [13]:
# 필드 정의
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=preprocess_sent,
                  lower=True,
                  batch_first=True,
                  include_lengths=True)

LABEL = data.LabelField(dtype = torch.float)

In [14]:
train_data, test_data = TabularDataset.splits(
    path=path,
    train=os.path.join(path, "ratings_train_small.txt"),
    test=os.path.join(path, "ratings_test_small.txt"), format='tsv',
    fields=[(id, None), ('text', TEXT), ('label', LABEL)], skip_header=True)

train_data, valid_data = train_data.split(random_state = random.seed(1234))

In [15]:
print(vars(test_data[9]))

{'text': '이별 의 아픔 뒤 에 찾아오는 새로운 인연 의 기쁨 but , 모든 사람 이 그렇지는 않네 ..', 'label': '1'}


In [16]:
print(train_data.fields.items())

dict_items([(<built-in function id>, None), ('text', <torchtext.legacy.data.field.Field object at 0x7ff5aa683b90>), ('label', <torchtext.legacy.data.field.LabelField object at 0x7ff5aa683810>)])


In [17]:
TEXT.build_vocab(train_data,
                min_freq=10, 
                max_size=1000)

LABEL.build_vocab(train_data)

In [18]:
print(len(TEXT.vocab))

333


In [19]:
print(TEXT.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7ff5aac315d0>>, {'<unk>': 0, '<pad>': 1, ' ': 2, '.': 3, '이': 4, '다': 5, '는': 6, '고': 7, '화': 8, '영': 9, '지': 10, '하': 11, '도': 12, '가': 13, '나': 14, '아': 15, '기': 16, '에': 17, 'ㅋ': 18, '어': 19, '은': 20, '한': 21, '만': 22, '의': 23, '보': 24, '서': 25, '인': 26, '들': 27, '요': 28, '게': 29, '로': 30, '니': 31, '라': 32, '!': 33, '재': 34, ',': 35, '리': 36, '을': 37, '스': 38, '시': 39, '정': 40, '그': 41, '없': 42, '미': 43, '데': 44, '진': 45, '사': 46, '음': 47, '있': 48, '마': 49, '무': 50, '말': 51, '연': 52, '해': 53, '네': 54, '대': 55, '전': 56, '면': 57, '자': 58, '감': 59, '내': 60, '?': 61, '점': 62, '수': 63, '를': 64, '주': 65, '짜': 66, '작': 67, '거': 68, '까': 69, '안': 70, '좋': 71, '으': 72, '드': 73, '우': 74, '너': 75, '상': 76, '일': 77, '적': 78, '었': 79, '제': 80, '간': 81, '여': 82, '오': 83, '개': 84, '생': 85, '더': 86, '부': 87, '최': 88, '했': 89, '0': 90, '1': 91, '본': 92, '야': 93, '던': 94, '히': 95, '신': 96, '장': 97, '중': 98, '구': 

In [20]:
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device)

In [21]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))        
        #embedded = [sent len, batch size, emb dim]
        
        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), batch_first=True)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [23]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,343,957 trainable parameters


In [25]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-1.8409,  2.7472,  0.2431,  ...,  1.0215,  0.9549, -1.3657],
        ...,
        [-0.6160,  1.7185,  0.6662,  ...,  0.9717,  1.4248,  0.6425],
        [-2.4122,  1.4499, -1.3026,  ...,  1.4633, -2.2898, -2.1753],
        [ 0.4361, -0.6469,  0.4701,  ...,  0.4114,  0.3125,  0.1226]])


In [26]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [27]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [29]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text       
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
N_EPOCHS = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(path, 'rnn-nsmc-sentiment.pt'))
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.646 | Train Acc: 64.54%
	 Val. Loss: 0.666 |  Val. Acc: 55.59%


In [34]:
model.load_state_dict(torch.load(os.path.join(path, 'rnn-imdb-sentiment.pt')))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.682 | Test Acc: 55.36%


In [35]:
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = preprocess_sent(sentence).split()
    indexed = [[TEXT.vocab.stoi[t] for t in tokenized]]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    
    return prediction.item()

In [38]:
predict_sentiment(model, '날씨가 참 좋다')

0.4943055212497711