In [1]:
import torch.nn as nn
class SentenceClassifier(nn.Module):
    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type='lstm'
    ):
        super().__init__()

        self.embedding=nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        if model_type=='rnn':
            self.model=nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,

            )
        elif model_type=='lstm':
            self.model=nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,

            )
        if bidirectional:
            self.classifier=nn.Linear(hidden_dim*2,1)
        else:
            self.classifier=nn.Linear(hidden_dim*1)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,inputs):
        embeddings=self.embedding(inputs)
        output,_=self.model(embeddings)
        last_output=output[:,-1,:]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [2]:
import pandas as pd
from Korpora import Korpora

corpus=Korpora.load('nsmc')
corpus_df=pd.DataFrame(corpus.test)

train=corpus_df.sample(frac=0.9,random_state=42)
test=corpus_df.drop(train.index)

print(train.head().to_markdown())
print(f'Training Data Size : {len(train)}')
print(f'Testing Data Size : {len(test)}')


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-48\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [3]:
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus,n_vocab,special_tokens):
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    vocab=special_tokens
    for token,count in counter.most_common(n_vocab):
        vocab.append(token)
    return vocab

tokenizer=Okt()
train_tokens=[tokenizer.morphs(review) for review in train.text]
test_tokens=[tokenizer.morphs(review) for review in test.text]

vocab=build_vocab(corpus=train_tokens,n_vocab=5000,special_tokens=['<pad>','<unk>'])
token_to_id={token:idx for idx,token in enumerate(vocab)}
id_to_token={token:idx for idx,token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [6]:
import numpy as np
def pad_sequences(sequences,max_length,pad_value):
    result=list()
    for sequence in sequences:
        sequence=sequence[:max_length]
        pad_length=max_length-len(sequence)
        padded_sequence=sequence+[pad_value]*pad_length
        result.append(padded_sequence)
    return np.array(result)

unk_id=token_to_id['<unk>']
train_ids=[[token_to_id.get(token,unk_id) for token in review] for review in train_tokens]
test_ids=[[token_to_id.get(token,unk_id) for token in review] for review in test_tokens]

max_length=32
pad_id=token_to_id['<pad>']
train_ids=pad_sequences(train_ids,max_length,pad_id)
test_ids=pad_sequences(test_ids,max_length,pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [8]:
import torch
from torch.utils.data import TensorDataset,DataLoader

train_ids=torch.tensor(train_ids)
test_ids=torch.tensor(test_ids)

train_labels=torch.tensor(train.label.values,dtype=torch.float32)
test_labels=torch.tensor(test.label.values,dtype=torch.float32)

train_dataset=TensorDataset(train_ids,train_labels)
test_dataset=TensorDataset(test_ids,test_labels)

train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)

  train_ids=torch.tensor(train_ids)
  test_ids=torch.tensor(test_ids)


In [9]:
from torch import optim

n_vocab=len(token_to_id)
hidden_dim=64
embedding_dim=128
n_layer=2

device='cuda' if torch.cuda.is_available() else 'cpu'
classifier=SentenceClassifier(n_vocab=n_vocab,hidden_dim=hidden_dim,embedding_dim=embedding_dim,n_layers=n_layer).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(),lr=0.001)

In [10]:
def train(model, datasets, criterion, optimizer, device, interval):
    model.train()
    losses = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval == 0:
            print(f"Train Loss {step} : {np.mean(losses)}")


def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")


epochs = 5
interval = 500

for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.7053079605102539
Train Loss 500 : 0.6940382004022122
Train Loss 1000 : 0.6918097991090674
Train Loss 1500 : 0.6760961227342337
Train Loss 2000 : 0.6632575097976476
Train Loss 2500 : 0.6533547847068868
Val Loss : 0.591471304813513, Val Accuracy : 0.6904
Train Loss 0 : 0.3950698673725128
Train Loss 500 : 0.5772097259462474
Train Loss 1000 : 0.5633098314573001
Train Loss 1500 : 0.5500267355700956
Train Loss 2000 : 0.5377030125830783
Train Loss 2500 : 0.5225558206945455
Val Loss : 0.45505432165659276, Val Accuracy : 0.7902
Train Loss 0 : 0.5615537762641907
Train Loss 500 : 0.4238381556973248
Train Loss 1000 : 0.4194644880104256
Train Loss 1500 : 0.41523321999918217
Train Loss 2000 : 0.41394984914266364
Train Loss 2500 : 0.4120623234276627
Val Loss : 0.419062857144176, Val Accuracy : 0.8056
Train Loss 0 : 0.3411809206008911
Train Loss 500 : 0.36751887841733866
Train Loss 1000 : 0.36638016069119983
Train Loss 1500 : 0.3658599347561141
Train Loss 2000 : 0.367274000734344
Trai

In [11]:
token_to_embedding = dict()
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()

for word, emb in zip(vocab, embedding_matrix):
    token_to_embedding[word] = emb

token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [ 2.6081896e-01  8.5469204e-01  6.4059186e-01  5.1611364e-01
  8.9949590e-01  8.6668253e-01 -1.1371868e+00  2.6883996e+00
  8.2390442e-02  9.7361249e-01  7.6103401e-01 -1.1177559e+00
 -8.0383009e-01  4.0308902e-01 -9.3939967e-02  5.7157969e-01
  1.0288697e+00  1.0952957e+00 -7.3642898e-01  9.5771170e-01
  3.2925922e-01  2.5472185e-01 -7.2439611e-01  8.5415013e-02
 -5.6072164e-01 -3.4131128e-02 -2.7754493e+00 -4.9875382e-01
 -3.6690271e-01  1.1390558e+00  8.0363810e-01  5.3548390e-01
  3.6653644e-01  8.4745169e-01  2.5196271e+00  7.6317692e-01
  5.8053356e-01  2.8422257e-01 -2.2621222e-01 -7.8375906e-01
 -4.3386695e-01 -1.7232476e-01  2.0026830e-01  6.3932067e-01
  4.9875519e-01  2.2426569e-01  6.9094472e-02 -2.0751071e+00
 -9.0699661e-01  3.3767518e-01  2.5023568e+00 -8.8527930e-01
 -6.9362736e-01 -9.5260102e-01  1.3477335e+00 -6.7588037e-01
 -1.6966709e-01 -1.1982466e+00  6.3574862e-01  5.2357918e-01
 -1.6391348e+00 -7.8939593e-01  5.1783490e-01 -9.6359563e-01
 -1.2824370e+00  4.