In [82]:
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [83]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [84]:
from Korpora import Korpora
from konlpy.tag import Okt
from gensim.models import Word2Vec

import torch
from torch import optim
from torch import nn
from torch.utils.data import TensorDataset,DataLoader

from collections import Counter
import pandas as pd
import numpy as np

In [85]:
# Sentence classification model

class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: number of rows in the embedding table.
        hidden_dim: hidden size of the recurrent encoder (per direction).
        embedding_dim: width of each token embedding.
        n_layers: number of stacked recurrent layers.
        dropout: probability used between recurrent layers and before the linear head.
        bidirectional: whether the recurrent encoder runs in both directions.
        model_type: 'rnn' or 'lstm'.

    Raises:
        ValueError: if model_type is neither 'rnn' nor 'lstm'.
    """
    def __init__(self,n_vocab,hidden_dim,embedding_dim,n_layers,dropout=0.5,bidirectional=True,model_type='lstm'):
        super().__init__()

        # Index 0 is registered as the padding index of the embedding table.
        self.embedding=nn.Embedding(num_embeddings=n_vocab,embedding_dim=embedding_dim,padding_idx=0)

        # Both encoder types take exactly the same configuration.
        recurrent_kwargs=dict(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )
        if model_type=='rnn':
            self.model=nn.RNN(**recurrent_kwargs)
        elif model_type=='lstm':
            self.model=nn.LSTM(**recurrent_kwargs)
        else:
            # Fail at construction time instead of leaving self.model undefined
            # and crashing later inside forward().
            raise ValueError(f"model_type must be 'rnn' or 'lstm', got {model_type!r}")

        # A bidirectional encoder emits both directions concatenated, doubling the feature width.
        self.classifier=nn.Linear(hidden_dim*2 if bidirectional else hidden_dim,1)

        self.dropout=nn.Dropout(dropout)

    def forward(self,inputs):
        """Return one unnormalised logit per sequence.

        Args:
            inputs: integer token ids, indexed as (batch, sequence) since the
                encoder is built with batch_first=True.

        Returns:
            Tensor with a single logit per row of `inputs`.
        """
        embeddings=self.embedding(inputs)
        output,_=self.model(embeddings)
        # Only the encoder output at the final timestep feeds the classifier.
        last_output=output[:,-1,:]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [86]:
# Load the dataset

# Load the 'nsmc' corpus through Korpora; only its `test` portion is turned into the working DataFrame.
corpus=Korpora.load('nsmc')
corpus_df=pd.DataFrame(corpus.test)

# 90% of the rows are sampled (fixed seed) for training; the rows left over form the held-out split.
# NOTE(review): `train` and `test` are rebound to functions by the later `def train` / `def test`
# cells, so every cell that reads `train.text` / `train.label` has to run before those definitions.
train=corpus_df.sample(frac=0.9,random_state=42)
test=corpus_df.drop(train.index)

print(train.head(5).to_markdown())
print('Training Data Size:',len(train))
print('Testing Data Size:',len(test))


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\MSI\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\MSI\K

In [87]:
# Tokenise the data and build the vocabulary

def build_vocab(corpus,n_vocab,special_tokens):
    """Build a vocabulary list: special tokens first, then the most frequent tokens.

    Args:
        corpus: iterable of token sequences.
        n_vocab: maximum number of corpus tokens to keep (most frequent first).
        special_tokens: tokens placed at the start of the vocabulary, in order.

    Returns:
        A new list; `special_tokens` itself is left unmodified.
    """
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy so that appending below does not mutate the caller's list.
    vocab=list(special_tokens)
    vocab.extend(token for token,_ in counter.most_common(n_vocab))
    return vocab

In [88]:
# Split every review of both splits into tokens with Okt's `morphs`.
tokenizer=Okt()
train_tokens=[tokenizer.morphs(review) for review in train.text]
test_tokens=[tokenizer.morphs(review) for review in test.text]

# The vocabulary is built from the training tokens only: two special tokens plus up to 5000 frequent tokens.
vocab=build_vocab(corpus=train_tokens,n_vocab=5000,special_tokens=['<pad>','<unk>'])
# Lookup tables in both directions; a token's id is its position in `vocab`.
token_to_id={token:idx for idx,token in enumerate(vocab)}
id_to_token={idx:token for idx,token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [89]:
# Integer encoding and padding

def pad_sequences(sequences,max_length,pad_value):
    """Force every sequence to exactly `max_length` items.

    Longer sequences are cut off after `max_length` items; shorter ones are
    extended at the end with `pad_value`.

    Args:
        sequences: iterable of lists.
        max_length: target length of each row.
        pad_value: filler appended to short sequences.

    Returns:
        The fitted sequences stacked with np.asarray.
    """
    def _fit(seq):
        head=seq[:max_length]
        return head+[pad_value]*(max_length-len(head))

    return np.asarray([_fit(seq) for seq in sequences])

In [90]:
# Map tokens to ids; tokens missing from the vocabulary fall back to the '<unk>' id.
unk_id=token_to_id['<unk>']
train_ids=[[token_to_id.get(token,unk_id) for token in review] for review in train_tokens]
test_ids=[[token_to_id.get(token,unk_id) for token in review] for review in test_tokens]

# Every review is truncated / padded to 32 ids; after this `train_ids` and `test_ids` are arrays, not lists.
max_length=32
pad_id=token_to_id['<pad>']
train_ids=pad_sequences(train_ids,max_length,pad_id)
test_ids=pad_sequences(test_ids,max_length,pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [91]:
# Inspect the tokens of the first training review.
train_tokens[0]

['모든',
 '편견',
 '을',
 '날려',
 '버리는',
 '가슴',
 '따뜻한',
 '영화',
 '.',
 '로버트',
 '드',
 '니',
 '로',
 ',',
 '필립',
 '세이모어',
 '호프만',
 '영원하라',
 '.']

In [92]:
# Inspect the tokens of the first held-out review.
test_tokens[0]

['이별',
 '의',
 '아픔',
 '뒤',
 '에',
 '찾아오는',
 '새로운',
 '인연',
 '의',
 '기쁨',
 'But',
 ',',
 '모든',
 '사람',
 '이',
 '그렇지는',
 '않네',
 '..']

In [93]:
# Convert the padded id arrays to tensors (rebinding the same names).
train_ids=torch.tensor(train_ids)
test_ids=torch.tensor(test_ids)

# Labels are stored as float32 because they are later compared against logits by BCEWithLogitsLoss.
train_labels=torch.tensor(train.label.values,dtype=torch.float32)
test_labels=torch.tensor(test.label.values,dtype=torch.float32)

train_dataset=TensorDataset(train_ids,train_labels)
test_dataset=TensorDataset(test_ids,test_labels)

# Only the training loader is shuffled; the held-out loader keeps its order.
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)

In [94]:
# Define the loss function and the optimizer

# The embedding table gets one row per vocabulary entry (special tokens included).
n_vocab=len(token_to_id)
hidden_dim=64
embedding_dim=128
n_layers=2

# Use the GPU when one is available, otherwise fall back to the CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'

# model_type / bidirectional / dropout are left at the class defaults ('lstm', True, 0.5).
classifier=SentenceClassifier(n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(),lr=0.001)

In [95]:
# Model training and evaluation
def train(model,datasets, criterion,optimizer,device,interval):
    """Run one pass over `datasets`, taking an optimizer step after every batch.

    Args:
        model: module called on each batch of inputs to produce logits.
        datasets: iterable yielding (inputs, labels) pairs.
        criterion: loss called as criterion(logits, labels).
        optimizer: optimizer whose gradients are zeroed and stepped per batch.
        device: device the inputs and labels are moved to.
        interval: the running mean loss is printed whenever step % interval == 0.

    Returns:
        None; progress is reported through print only.
    """
    model.train()
    batch_losses=[]

    for batch_idx,batch in enumerate(datasets):
        token_ids,targets=batch
        token_ids=token_ids.to(device)
        # Labels gain a trailing axis before being passed to the criterion with the logits.
        targets=targets.to(device).unsqueeze(1)

        batch_loss=criterion(model(token_ids),targets)
        batch_losses.append(batch_loss.item())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if batch_idx%interval!=0:
            continue
        # Mean of all batch losses seen so far in this pass.
        print(f'Trainn Loss {batch_idx}: {np.mean(batch_losses)}')

In [96]:
def test(model, datasets,criterion,device):
    model.eval()
    losses=list()
    corrects=list()

    for step,(input_ids,labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits,labels)
        losses.append(loss.item())
        yhat=torch.sigmoid(logits)>.5
        corrects.extend(torch.eq(yhat,labels).cpu().tolist())
    print(f'Val Loss: {np.mean(losses)}, Val Accuracy: {np.mean(corrects)}')

In [97]:
# Number of passes over train_loader, and how many steps lie between progress prints.
epochs=5
interval=500

# Each epoch trains on train_loader and then reports the "Val" metrics, which are computed on test_loader.
for epoch in range(epochs):
    train(classifier,train_loader,criterion,optimizer,device,interval)
    test(classifier,test_loader,criterion,device)

Trainn Loss 0: 0.6923755407333374
Trainn Loss 500: 0.693011573093856
Trainn Loss 1000: 0.692117984299655
Trainn Loss 1500: 0.6798009841661307
Trainn Loss 2000: 0.6656260613886372
Trainn Loss 2500: 0.6550038043664294
Val Loss: 0.6041284230189582, Val Accuracy: 0.676
Trainn Loss 0: 0.44447994232177734
Trainn Loss 500: 0.5811614636294619
Trainn Loss 1000: 0.5763473912076159
Trainn Loss 1500: 0.5726397355185756
Trainn Loss 2000: 0.570101874223773
Trainn Loss 2500: 0.5587293476652785
Val Loss: 0.4782313286020352, Val Accuracy: 0.7752
Trainn Loss 0: 0.3493023216724396
Trainn Loss 500: 0.4706128910332621
Trainn Loss 1000: 0.457756325796053
Trainn Loss 1500: 0.4493404351060665
Trainn Loss 2000: 0.44373881154093725
Trainn Loss 2500: 0.43817531561455886
Val Loss: 0.4332203692711961, Val Accuracy: 0.802
Trainn Loss 0: 0.4356110692024231
Trainn Loss 500: 0.39436917744591804
Trainn Loss 1000: 0.38678825131945915
Trainn Loss 1500: 0.3846074265615889
Trainn Loss 2000: 0.3822411625017261
Trainn Loss 2

In [98]:
# Extract the embeddings from the trained model
token_to_embedding=dict()
# Copy of the embedding weights as a NumPy array, detached from autograd and moved to the CPU.
embedding_metrix=classifier.embedding.weight.detach().cpu().numpy()

# Row i of the weight matrix belongs to vocab[i], since token ids are positions in `vocab`.
for word,emb in zip(vocab,embedding_metrix):
    token_to_embedding[word]=emb

# Show one learned vector as a spot check.
token=vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [ 0.93049943 -0.4485399   1.4206077  -0.27747744 -1.1286433  -0.09754803
 -0.5689745  -1.3475473   0.75959384 -0.20456786 -0.66203004 -1.3312459
  0.5068424  -0.332373    0.5859555   0.36862454 -0.4461149   0.6443309
 -0.8920234  -0.545456    1.2616271   0.3397996   1.5604184  -0.658941
  0.68777114 -0.86183417  0.85466355 -0.32745874  1.0780392  -0.80722123
 -0.37858653 -1.8825042  -0.2599015   0.41788548 -2.049854    0.03660351
 -1.986395   -0.6441814  -0.5370713   0.30739018  2.1693003   0.8093875
 -1.8111361  -0.425444   -1.3921982   0.09051845 -1.5179828   1.8653108
 -1.2423953  -0.766455    0.5589561  -0.6365321   1.001903   -1.3328335
 -1.2263488  -0.31271574  0.13457614  0.42126116 -2.0591753   0.8467508
  1.8132298   0.63114214  0.1685237   1.0012673   1.1539768  -0.32910252
 -0.79264176 -0.27544707 -0.25350335 -0.5939801  -1.0158545  -2.4215739
 -0.77588964  0.43241242 -2.4584782   0.43895575  0.3485663  -1.1287143
 -0.62887126 -1.1895298  -0.55301994  2.1574662   1.1629

In [99]:
# Tokenise every review in corpus_df (both splits) and train Word2Vec on the result.
tokens=[tokenizer.morphs(review) for review in corpus_df.text]
# vector_size=128 matches the embedding_dim used for the classifier; sg=1 selects
# skip-gram according to gensim's Word2Vec documentation.
word2vec=Word2Vec(sentences=tokens, vector_size=128, window=5, min_count=1, sg=1, epochs=3, max_final_vocab=10000)

# NOTE(review): assumes the ../models directory already exists — confirm before running on a fresh checkout.
word2vec.save('../models/word2vec.model')

In [100]:
# Initialise the embedding layer from the pre-trained Word2Vec model
word2vec=Word2Vec.load('../models/word2vec.model')
init_embeddings=np.zeros((n_vocab,embedding_dim))

# Rows stay all-zero for the special tokens and for any vocabulary token that
# Word2Vec did not keep (its vocabulary is capped by max_final_vocab), instead of
# raising KeyError on the lookup.
for index,token in id_to_token.items():
    if token not in ['<pad>','<unk>'] and token in word2vec.wv.key_to_index:
        init_embeddings[index]=word2vec.wv[token]

embedding_layer=nn.Embedding.from_pretrained(torch.tensor(init_embeddings,dtype=torch.float32))

In [104]:
# Sentence classification model (redefinition that accepts pre-trained embeddings)

class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: number of rows in the embedding table (ignored when
            `pretrained_embedding` is given; the table then takes its shape).
        hidden_dim: hidden size of the recurrent encoder (per direction).
        embedding_dim: width of each token embedding; must equal the width of
            `pretrained_embedding` when one is supplied.
        n_layers: number of stacked recurrent layers.
        dropout: probability used between recurrent layers and before the linear head.
        bidirectional: whether the recurrent encoder runs in both directions.
        model_type: 'rnn' or 'lstm'.
        pretrained_embedding: optional array-like of embedding weights; when
            given, the embedding layer is created from it and kept frozen.

    Raises:
        ValueError: if model_type is neither 'rnn' nor 'lstm'.
    """
    def __init__(self,n_vocab,hidden_dim,embedding_dim,n_layers,dropout=0.5,bidirectional=True,model_type='lstm',pretrained_embedding=None):
        super().__init__()

        # Build the embedding exactly once; index 0 is the padding index in both branches.
        if pretrained_embedding is not None:
            self.embedding=nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding,dtype=torch.float32),
                freeze=True,
                padding_idx=0,
            )
        else:
            self.embedding=nn.Embedding(num_embeddings=n_vocab,embedding_dim=embedding_dim,padding_idx=0)

        # Both encoder types take exactly the same configuration.
        recurrent_kwargs=dict(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )
        if model_type=='rnn':
            self.model=nn.RNN(**recurrent_kwargs)
        elif model_type=='lstm':
            self.model=nn.LSTM(**recurrent_kwargs)
        else:
            # Fail at construction time instead of leaving self.model undefined
            # and crashing later inside forward().
            raise ValueError(f"model_type must be 'rnn' or 'lstm', got {model_type!r}")

        # A bidirectional encoder emits both directions concatenated, doubling the feature width.
        self.classifier=nn.Linear(hidden_dim*2 if bidirectional else hidden_dim,1)

        self.dropout=nn.Dropout(dropout)

    def forward(self,inputs):
        """Return one unnormalised logit per sequence.

        Args:
            inputs: integer token ids, indexed as (batch, sequence) since the
                encoder is built with batch_first=True.

        Returns:
            Tensor with a single logit per row of `inputs`.
        """
        embeddings=self.embedding(inputs)
        output,_=self.model(embeddings)
        # Only the encoder output at the final timestep feeds the classifier.
        last_output=output[:,-1,:]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [105]:
# Rebuild the classifier, this time seeding its embedding layer with the Word2Vec-derived matrix,
# and create a fresh loss and optimizer for it (the previous `classifier` is replaced).
classifier=SentenceClassifier(n_vocab=n_vocab, hidden_dim=hidden_dim,embedding_dim=embedding_dim,n_layers=n_layers, pretrained_embedding=init_embeddings).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(),lr=0.001)

# Same schedule as the first run, so the two sets of printed metrics can be compared.
epochs=5
interval=500

for epoch in range(epochs):
    train(classifier,train_loader,criterion,optimizer,device,interval)
    test(classifier,test_loader,criterion,device)

Trainn Loss 0: 0.6932138800621033
Trainn Loss 500: 0.6806622262367469
Trainn Loss 1000: 0.6459312205250327
Trainn Loss 1500: 0.6303231384379319
Trainn Loss 2000: 0.6119799771379197
Trainn Loss 2500: 0.5942872163177919
Val Loss: 0.47723259041294125, Val Accuracy: 0.7694
Trainn Loss 0: 0.5867852568626404
Trainn Loss 500: 0.48554271587473663
Trainn Loss 1000: 0.48511802724429537
Trainn Loss 1500: 0.4801067741591481
Trainn Loss 2000: 0.48137028840766555
Trainn Loss 2500: 0.4789809270626733
Val Loss: 0.4617430915752539, Val Accuracy: 0.7746
Trainn Loss 0: 0.47082436084747314
Trainn Loss 500: 0.46430438349465886
Trainn Loss 1000: 0.4568564516383332
Trainn Loss 1500: 0.4555144006613491
Trainn Loss 2000: 0.45327121367876316
Trainn Loss 2500: 0.4525008214408996
Val Loss: 0.43470091636950214, Val Accuracy: 0.797
Trainn Loss 0: 0.4236885905265808
Trainn Loss 500: 0.43712030105783556
Trainn Loss 1000: 0.4409991873504518
Trainn Loss 1500: 0.44162708560102865
Trainn Loss 2000: 0.43961852168229626
Tr