In [1]:
import torch
import Korpora
from Korpora import Korpora
import numpy as np
import pandas as pd
import gensim

In [2]:
from gensim.models import FastText

In [3]:
# Load the KorNLI corpus and join the two sequences returned by
# get_all_texts() and get_all_pairs() into a single collection of sentences.
corpus = Korpora.load('kornli')
corpus_texts = corpus.get_all_texts()
corpus_texts = corpus_texts + corpus.get_all_pairs()



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : KakaoBrain
    Repository : https://github.com/kakaobrain/KorNLUDatasets
    References :
        - Ham, J., Choe, Y. J., Park, K., Choi, I., & Soh, H. (2020). KorNLI and KorSTS: New Benchmark
           Datasets for Korean Natural Language Understanding. arXiv preprint arXiv:2004.03289.
           (https://arxiv.org/abs/2004.03289)

    This is the dataset repository for our paper
    "KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding."
    (https://arxiv.org/abs/2004.03289)
    We introduce KorNLI and KorSTS, which are NLI and STS datasets in Korean.

    # License
    Creative Commons Attribution-ShareAlike license (CC BY-SA 4.0)
    Details in https://creativecommons.org/licenses

In [4]:
# Whitespace-tokenise every sentence; each entry becomes a list of words.
tokens = list(map(lambda sentence: sentence.split(), corpus_texts))

In [5]:
# Train FastText embeddings on the whitespace-tokenised corpus.
fasttext = FastText(
    sentences=tokens,
    sg=1,             # 1 selects the skip-gram objective
    vector_size=128,  # dimensionality of each word vector
    window=5,
    min_count=5,
    min_n=2,          # shortest character n-gram
    max_n=6,          # longest character n-gram
    epochs=3,
)

In [6]:
# A word that is reported as missing from the trained vocabulary in the next
# cell; the model still produces a vector for it.
oov_token = '사랑해요'
oov_vector = fasttext.wv.get_vector(oov_token)

In [7]:
# Check vocabulary membership against the key -> index dict (constant-time
# lookup) rather than scanning the index_to_key list.
print(oov_token in fasttext.wv.key_to_index)
# Nearest in-vocabulary neighbours of the vector obtained for the OOV word.
print(fasttext.wv.most_similar(oov_vector, topn=5))

False
[('사랑해', 0.911003828048706), ('사랑', 0.8714408874511719), ('사랑한', 0.8656883239746094), ('사랑해서', 0.8580443263053894), ('사랑해.', 0.8488666415214539)]


# RNN

```
rnn = torch.nn.RNN(
    input_size,
    hidden_size,
    num_layers = 1,
    nonlinearity='tanh',
    bias=False,
    batch_first = True,
    dropout = 0,
    bidirectional = False
)
```

In [8]:
import torch
from torch import nn

In [9]:
# Sizes used by the RNN demo: feature width in, hidden width out, stack depth,
# and whether the sequence is read in both directions.
input_size, output_size = 128, 256
num_layers, bidirectional = 3, True

In [10]:
# Stacked tanh RNN that takes batch-first input; sizes come from the
# configuration cell above.
model = nn.RNN(
    input_size,
    output_size,
    num_layers,
    nonlinearity='tanh',
    bidirectional=bidirectional,
    batch_first=True,
)

In [11]:
# Shape of the random demo batch: number of sequences and steps per sequence.
batch_size, sequence_len = 4, 6

In [12]:
inputs = torch.randn(batch_size, sequence_len, input_size)

In [13]:
# Initial hidden state: one slice per layer and per direction.
h0 = torch.rand(
    (2 if bidirectional else 1) * num_layers,
    batch_size,
    output_size,
)

In [14]:
outputs, hidden = model(inputs, h0)

In [15]:
print(outputs.shape)
print(hidden.shape)

torch.Size([4, 6, 512])
torch.Size([6, 4, 256])


# LSTM

Long Short Term Memory: RNN 모델이 갖던 기억력 부족과 Gradient Vanishing 문제를 해결

RNN 모델은 장기 의존성(Long-Term Dependencies) 문제가 발생할 수 있음. 활성화 함수로 사용되는 tanh 함수나 ReLU 함수의 특성으로 인해 역전파 과정에서 기울기 소실이나 폭주도 발생할 수 있음.

LSTM 모델은 순환 신경망과 비슷한 구조를 가지나, Memory cell과 Gate 구조의 도입으로 상기한 문제를 해결

```
lstm = torch.nn.LSTM(
    input_size,
    hidden_size,
    num_layers=1,
    bias=True,
    batch_first=True,
    dropout=0,
    bidirectional=False,
    proj_size=0
)
```

In [16]:
import torch
from torch import nn

In [17]:
# Configuration for the LSTM demo: a 3-layer bidirectional LSTM whose hidden
# state is projected down to `proj_size` features.
input_size, output_size = 128, 256
num_layers, bidirectional, proj_size = 3, True, 64

model = nn.LSTM(
    input_size,
    output_size,
    num_layers,
    batch_first=True,
    bidirectional=bidirectional,
    proj_size=proj_size,
)

batch_size, sequence_len = 4, 6

inputs = torch.randn(batch_size, sequence_len, input_size)
# With a projection the hidden state has `proj_size` features; the cell state
# always keeps the full hidden width.
h0 = torch.rand(
    (2 if bidirectional else 1) * num_layers,
    batch_size,
    output_size if proj_size <= 0 else proj_size,
)
c0 = torch.rand((2 if bidirectional else 1) * num_layers, batch_size, output_size)

initial_state = (h0, c0)
outputs, (hn, cn) = model(inputs, initial_state)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


In [18]:
print(outputs.shape)

torch.Size([4, 6, 128])


In [19]:
print(hn.shape)
print(cn.shape)

torch.Size([6, 4, 64])
torch.Size([6, 4, 256])


# P/N classification model by using RNN and LSTM

In [20]:
class sentence_classifier(nn.Module):
    """Binary sentence classifier: embedding -> stacked RNN/LSTM -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent layer (per direction).
        embedding_dim: Width of each token embedding.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, used both between recurrent layers and
            on the feature vector fed to the linear head.
        bidirectional: Whether the recurrent layer reads in both directions.
        model_type: ``'rnn'`` or ``'lstm'``.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type='lstm'
    ):
        super().__init__()

        # Fail fast on an unsupported type; otherwise `self.model` would never
        # be created and the error would only surface inside forward().
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            raise ValueError(
                f"model_type must be one of {sorted(recurrent_layers)}, "
                f"got {model_type!r}"
            )

        # Index 0 is reserved for padding.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True
        )

        # A bidirectional layer concatenates both directions, doubling the
        # feature width seen by the linear head.
        self.classifier = nn.Linear(hidden_dim * (2 if bidirectional else 1), 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per sequence, shape ``(batch, 1)``.

        ``inputs`` is a batch-first tensor of token ids.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # NOTE(review): the feature vector is the output at the final time
        # step. Sequences in this notebook are right-padded by pad_sequences,
        # so for short reviews this is the state after reading padding —
        # consider packing the sequences if accuracy is poor.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [21]:
import pandas as pd
from Korpora import Korpora

In [22]:
# Load the NSMC corpus; only its `test` split is turned into a DataFrame and
# used for the experiments below.
corpus = Korpora.load('nsmc')
corpus_df = pd.DataFrame(data=corpus.test)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\dohyeong\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\

In [23]:
# 90/10 split: draw a reproducible 90% sample for training and keep every
# remaining row (by index label) for evaluation.
train = corpus_df.sample(frac=0.9, random_state=42)
test = corpus_df[~corpus_df.index.isin(train.index)]

In [24]:
print(train.head(5).to_markdown())
print(f'train_size: {len(train)}')
print(f'test_size: {len(test)}')

|       | text                                                                                     |   label |
|------:|:-----------------------------------------------------------------------------------------|--------:|
| 33553 | 모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. |       1 |
|  9427 | 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...                                    |       0 |
|   199 | 신날 것 없는 애니.                                                                       |       0 |
| 12447 | 잔잔 격동                                                                                |       1 |
| 39489 | 오랜만에 찾은 주말의 명화의 보석                                                         |       1 |
train_size: 45000
test_size: 5000


In [25]:
from konlpy.tag import Okt
from collections import Counter

In [26]:
def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenised sentences.

    Args:
        corpus: Iterable of token sequences.
        n_vocab: Maximum number of corpus tokens to keep, most frequent first.
        special_tokens: Tokens placed at the start of the vocabulary, in the
            given order. The argument itself is not modified.

    Returns:
        A new list: the special tokens followed by up to ``n_vocab`` of the
        most frequent corpus tokens.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy so the caller's list is left untouched, and build it outside the
    # loop so an empty corpus still yields a valid vocabulary.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Morphologically tokenise every review with Okt.
tokenizer = Okt()
print('okt done')
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# The vocabulary is built from the training reviews only; the two lookup
# tables map token -> id and id -> token.
vocab = build_vocab(
    corpus=train_tokens,
    n_vocab=5000,
    special_tokens=['<pad>', '<unk>'],
)
token_to_id = dict(zip(vocab, range(len(vocab))))
id_to_token = dict(enumerate(vocab))

okt done


In [27]:
print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


# int encoding and padding

In [28]:
import numpy as np

In [29]:
def pad_sequences(sequences, max_length, pad_value):
    """Clip or right-pad each list in ``sequences`` to ``max_length`` items.

    Longer sequences are cut after ``max_length`` items; shorter ones are
    filled at the end with ``pad_value``. The result is returned as a NumPy
    array.
    """
    def _fit(sequence):
        clipped = sequence[:max_length]
        missing = max_length - len(clipped)
        return clipped + missing * [pad_value]

    return np.asarray([_fit(sequence) for sequence in sequences])

In [30]:
# Ids of the two special tokens and the fixed sequence length.
unk_id = token_to_id['<unk>']
pad_id = token_to_id['<pad>']
max_length = 32

# Map every token to its id (unknown tokens fall back to <unk>), then clip or
# pad each review to `max_length`.
train_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens],
    max_length,
    pad_id,
)
test_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens],
    max_length,
    pad_id,
)

In [31]:
print(train_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [32]:
print(test_ids[0])

[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [33]:
from torch.utils.data import TensorDataset, DataLoader

# Token ids are embedding indices, so pin them to int64 regardless of the
# platform's default NumPy integer width. as_tensor also accepts an existing
# tensor, which keeps this cell safe to re-run.
train_ids = torch.as_tensor(train_ids, dtype=torch.long)
test_ids = torch.as_tensor(test_ids, dtype=torch.long)

# Labels are floats because the loss used below (BCEWithLogitsLoss) expects
# float targets.
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Shuffle only the training data; evaluation order does not matter, and a
# fixed order keeps evaluation runs reproducible.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [34]:
from torch import optim

In [35]:
n_vocab = len(token_to_id)

In [64]:
# Model hyper-parameters for the sentiment classifier.
hidden_dim, embedding_dim, n_layers = 64, 128, 2

# Remaining constructor arguments keep their defaults.
classifier = sentence_classifier(
    n_vocab=n_vocab,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)

In [112]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [113]:
# Binary cross-entropy on raw logits, placed on the training device.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
# RMSprop over the parameters of the classifier object that exists right now.
optimizer = optim.RMSprop(params=classifier.parameters(), lr=0.001)

In [142]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [143]:
device

device(type='cuda')

In [223]:
def train(model, datasets, criterion, device, optimizer, interval):
    """Run one training pass over ``datasets``.

    Each batch yields ``(input_ids, labels)``. Both are moved to ``device``
    and the labels gain a trailing dimension before the loss is computed.
    Every ``interval`` steps the mean of all batch losses seen so far is
    printed. Nothing is returned.
    """
    model.train()
    running_losses = []

    for step, batch in enumerate(datasets):
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        targets = labels.to(device).unsqueeze(1)

        # Clear old gradients, then do forward, backward and the update.
        optimizer.zero_grad()
        loss = criterion(model(input_ids), targets)
        running_losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if not step % interval:
            print(f'train_loss {step}: {np.mean(running_losses)}')
            

In [224]:
def test(model, datasets, criterion, device):
    model.eval()
    losses = []
    corrects = []
    
    with torch.no_grad():
        for step, (input_ids, labels) in enumerate(datasets):
            input_ids = input_ids.to(device)
            labels = labels.to(device).unsqueeze(1)
            
            logits = model(input_ids)
            loss = criterion(logits, labels)
            losses.append(loss.item())
            
            predictions = (logits > 0.5).float()
            correct = (predictions == labels).float().sum()
            corrects.append(correct)
    
    avg_loss = sum(losses) / len(losses)
    accuracy = sum(corrects) / len(datasets.dataset)
    print(f'Test Loss: {avg_loss}, Test Accuracy: {accuracy}')

In [225]:
epochs = 5
interval = 500

In [226]:
# Keep the device selection availability-aware instead of forcing CUDA, so
# this cell does not break the notebook on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [227]:
model.to(device)

LSTM(128, 256, proj_size=64, num_layers=3, batch_first=True, bidirectional=True)

In [228]:
classifier.to(device)

sentence_classifier(
  (embedding): Embedding(5002, 128)
  (model): LSTM(128, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (classifier): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [236]:
# Each epoch makes one optimisation pass over train_loader and then prints
# loss and accuracy measured on test_loader.
# NOTE(review): `optimizer` must have been created from the parameters of the
# current `classifier` object; re-create it whenever the classifier is rebuilt.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, device, optimizer, interval)
    test(classifier, test_loader, criterion, device)

train_loss 0: 0.6870390772819519
train_loss 500: 0.6931035059416841
train_loss 1000: 0.6931176014713474
train_loss 1500: 0.6932658474894859
train_loss 2000: 0.6932994445701172
train_loss 2500: 0.6933506678124038
Test Loss: 0.693649560308304, Test Accuracy: 0.4821999967098236
train_loss 0: 0.7011062502861023
train_loss 500: 0.6938849688766008
train_loss 1000: 0.6936074681096263
train_loss 1500: 0.6934359823759996
train_loss 2000: 0.693416163779568
train_loss 2500: 0.69341177191557
Test Loss: 0.6936506246225521, Test Accuracy: 0.4821999967098236
train_loss 0: 0.7008789777755737
train_loss 500: 0.6933555244923589
train_loss 1000: 0.6934144872051853
train_loss 1500: 0.6936174673211328
train_loss 2000: 0.6935650970505691
train_loss 2500: 0.6935454028122714
Test Loss: 0.6936476064947086, Test Accuracy: 0.4821999967098236
train_loss 0: 0.6898178458213806
train_loss 500: 0.6935004901267335
train_loss 1000: 0.6934609407192462
train_loss 1500: 0.6933279882106361
train_loss 2000: 0.69322231857255

In [222]:
# Pull the embedding table out of the classifier as a NumPy array (detached
# from autograd and copied to host memory first).
token_to_embedding = {}
embedding_weights = classifier.embedding.weight
embedding_matrix = embedding_weights.detach().cpu().numpy()

In [200]:
# Pair each vocabulary token with the matching row of the embedding matrix
# (row i belongs to vocab[i]).
token_to_embedding.update(zip(vocab, embedding_matrix))

In [201]:
token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [ 0.03330366 -0.3556527   0.01270325 -0.20829733  0.13543186 -0.18150993
  0.02951619 -0.09317388 -0.18373103  0.16290817 -0.05611473 -0.03719699
  0.02471198 -0.16446424  0.20844965  0.365992    0.07688164  0.01230337
 -0.10401959  0.2306495  -0.05232944  0.24145994 -0.1746113  -0.37058467
 -0.05984326  0.1193573  -0.0958266   0.29869407  0.21386899 -0.33867097
 -0.08592527  0.14896092 -0.04156579  0.20819351 -0.07515699  0.36699715
  0.22735111 -0.20565431 -0.00274597 -0.01631356 -0.04651385  0.25305504
 -0.03152147 -0.19845273  0.37523302  0.05237744 -0.07598199 -0.06152836
  0.06555579 -0.04907126  0.08199442 -0.09313392  0.20899668  0.01285784
 -0.02268853 -0.07997185  0.3502871   0.13160667 -0.074122   -0.15173548
  0.4365486   0.0189922   0.14980258 -0.04991355  0.23081478 -0.11140083
  0.19742823  0.09540525 -0.17929368 -0.10804195  0.04769334 -0.10192904
 -0.45491502 -0.19848992  0.25304458 -0.22504833  0.01785926 -0.06307909
 -0.12844144  0.39623684 -0.12601593 -0.148210

In [202]:
len(token_to_embedding['보고싶다'])

128

In [203]:
token_to_embedding['보고싶다'].sum()

-0.12029147

In [204]:
from gensim.models import Word2Vec

In [205]:
# Load a previously saved Word2Vec model and allocate an all-zero
# (n_vocab, embedding_dim) matrix to be filled with its vectors.
word2vec = Word2Vec.load('./word2vec.model')
init_embeddings = np.zeros(shape=(n_vocab, embedding_dim))

In [206]:
embedding_dim

128

In [207]:
len(init_embeddings)

5002

In [208]:
id_to_token

{0: '<pad>',
 1: '<unk>',
 2: '.',
 3: '이',
 4: '영화',
 5: '의',
 6: '..',
 7: '가',
 8: '에',
 9: '...',
 10: '을',
 11: '도',
 12: '들',
 13: ',',
 14: '는',
 15: '를',
 16: '은',
 17: '?',
 18: '너무',
 19: '한',
 20: '다',
 21: '정말',
 22: '만',
 23: '진짜',
 24: '적',
 25: '!',
 26: '로',
 27: '점',
 28: '으로',
 29: '에서',
 30: '연기',
 31: '평점',
 32: '과',
 33: '것',
 34: '~',
 35: '최고',
 36: '내',
 37: '그',
 38: '나',
 39: '안',
 40: '잘',
 41: '와',
 42: '인',
 43: '생각',
 44: '게',
 45: '못',
 46: '이런',
 47: '왜',
 48: '....',
 49: '스토리',
 50: '드라마',
 51: '사람',
 52: '이다',
 53: '감동',
 54: '1',
 55: '보고',
 56: '하는',
 57: '때',
 58: '더',
 59: '하고',
 60: '고',
 61: '아',
 62: '말',
 63: '감독',
 64: 'ㅋㅋ',
 65: '그냥',
 66: '배우',
 67: '내용',
 68: '거',
 69: '중',
 70: '재미',
 71: '까지',
 72: '본',
 73: '보다',
 74: '요',
 75: '!!',
 76: '없는',
 77: '좀',
 78: '뭐',
 79: '시간',
 80: '지',
 81: '수',
 82: '쓰레기',
 83: '사랑',
 84: '봤는데',
 85: '볼',
 86: '네',
 87: '작품',
 88: '다시',
 89: '하나',
 90: '10',
 91: '없다',
 92: '할',
 93: '이건',
 94: '마지막',
 

In [209]:
# Copy the pretrained vector of every vocabulary token into its row.
# The special tokens, and any token the Word2Vec model does not know, keep
# their all-zero row instead of raising KeyError.
for index, token in id_to_token.items():
    if token in ('<pad>', '<unk>'):
        continue
    if token in word2vec.wv.key_to_index:
        init_embeddings[index] = word2vec.wv[token]

In [210]:
# Wrap the pretrained matrix in an embedding layer (float32 weights).
pretrained_weights = torch.tensor(init_embeddings, dtype=torch.float32)
embedding_layer = nn.Embedding.from_pretrained(embeddings=pretrained_weights)

In [230]:
class sentence_classifier(nn.Module):
    """Binary sentence classifier with optional pretrained embeddings.

    Pipeline: embedding -> stacked RNN/LSTM -> dropout -> linear head that
    emits one raw logit per sequence.

    Args:
        n_vocab: Vocabulary size (used when no pretrained matrix is given).
        hidden_dim: Hidden size of the recurrent layer (per direction).
        embedding_dim: Embedding width (used when no pretrained matrix is
            given; otherwise the width of the matrix is used).
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, used both between recurrent layers and
            on the feature vector fed to the linear head.
        bidirectional: Whether the recurrent layer reads in both directions.
        model_type: ``'rnn'`` or ``'lstm'``.
        pretrained_embedding: Optional 2-D array or tensor of shape
            ``(vocab, dim)`` used to initialise the embedding table. The
            table is loaded with ``from_pretrained`` defaults, i.e. frozen.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type='lstm',
        pretrained_embedding=None  # optional pretrained embedding matrix
    ):
        super().__init__()

        # Fail fast on an unsupported type; otherwise `self.model` would never
        # be created and the error would only surface inside forward().
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            raise ValueError(
                f"model_type must be one of {sorted(recurrent_layers)}, "
                f"got {model_type!r}"
            )

        if pretrained_embedding is not None:
            # as_tensor accepts both NumPy arrays and tensors; clone() makes
            # sure the layer never shares memory with the caller's data.
            weights = torch.as_tensor(
                pretrained_embedding, dtype=torch.float32
            ).clone()
            # Index 0 is the padding token here as well, matching the
            # randomly initialised branch below.
            self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
            # The recurrent layer must match the real width of the matrix.
            embedding_dim = self.embedding.embedding_dim
        else:
            self.embedding = nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0
            )

        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True
        )

        # A bidirectional layer concatenates both directions, doubling the
        # feature width seen by the linear head.
        self.classifier = nn.Linear(hidden_dim * (2 if bidirectional else 1), 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per sequence, shape ``(batch, 1)``.

        ``inputs`` is a batch-first tensor of token ids.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # NOTE(review): the feature vector is the output at the final time
        # step. Sequences in this notebook are right-padded by pad_sequences,
        # so for short reviews this is the state after reading padding.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [231]:
# Rebuild the classifier, this time initialised from the pretrained matrix.
classifier = sentence_classifier(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
    pretrained_embedding=init_embeddings,
).to(device)

# The optimizer created earlier still holds the parameters of the previous
# `classifier` object. Re-create it for the new model; otherwise training
# would keep updating the old network and leave this one unchanged.
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)


In [232]:
epochs = 5
interval = 500

In [237]:
# Train and evaluate the classifier for `epochs` rounds.
# NOTE(review): `optimizer` has to be built from the parameters of the
# `classifier` object used here. An optimizer created for an earlier model
# instance would leave this one untrained — the flat ~0.693 loss and constant
# accuracy in the recorded output are consistent with that; confirm.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, device, optimizer, interval)
    test(classifier, test_loader, criterion, device)

train_loss 0: 0.7015191316604614
train_loss 500: 0.6934962734252869
train_loss 1000: 0.6934195609835835
train_loss 1500: 0.6934081540752617
train_loss 2000: 0.6934300783215493
train_loss 2500: 0.6934670106929
Test Loss: 0.6936484344851095, Test Accuracy: 0.4821999967098236
train_loss 0: 0.6969184875488281
train_loss 500: 0.6935072561462007
train_loss 1000: 0.6934360885596299
train_loss 1500: 0.6932757778377393
train_loss 2000: 0.6932530745096889
train_loss 2500: 0.6932658483294762
Test Loss: 0.6936503962967724, Test Accuracy: 0.4821999967098236
train_loss 0: 0.6907771229743958
train_loss 500: 0.6932430875277567
train_loss 1000: 0.6932838079812643
train_loss 1500: 0.6933793173004674
train_loss 2000: 0.6933697291102069
train_loss 2500: 0.6933468011845021
Test Loss: 0.6936381263093064, Test Accuracy: 0.4821999967098236
train_loss 0: 0.6913106441497803
train_loss 500: 0.6931516592374105
train_loss 1000: 0.6932619132481136
train_loss 1500: 0.6934792202842148
train_loss 2000: 0.6934826141712