In [1]:
import pandas as pd
import hgtk
from tqdm import tqdm
import fasttext

In [2]:
def decompose(forms:list):
    word = ''
    for form in forms:
        try:
            if hgtk.checker.is_hangul(form):
                for s in form:
                    a, b, c = hgtk.letter.decompose(s)
                    if not a:
                        a = '-'
                    if not b:
                        b = '-'
                    if not c:
                        c = '-'
                    word = word + a + b + c
        except TypeError as e:
            print(form)
    return word

In [3]:
fast_model = fasttext.load_model("fasttext_jn.bin") # 모델 로드

In [4]:
def transform(word_sequence):
    return [(compose(word), similarity) for (similarity, word) in word_sequence]

In [5]:
from torch.utils.data import random_split, DataLoader, Dataset, TensorDataset
from torch import nn
import torch.nn.functional as F
import torch
import numpy as np
import matplotlib.pyplot as plt

In [6]:
class CustomDataset(Dataset):
    def __init__(self, csv_dir, num_word, transform = None, target_transform=None):
        self.df = pd.read_csv(csv_dir).sample(frac=1)[:5120]  #
        self.transform = transform                 #
        self.target_transform = target_transform     #
        self.num_word = num_word          #
        #self.labels = torch.form_numpy( ... )...
            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, i):
        sent = self.df['morphologized_sent'].iloc[i]
        label = self.df['label'].iloc[i]
        label = torch.tensor(label, dtype=torch.float32)
        padded_vec = torch.zeros((self.num_word, fast_model.get_dimension()), dtype = torch.float32)
        
        sent2vec = [] 
        for w in sent:
            if w.rstrip():
                sent2vec.append(fast_model.get_word_vector(decompose(w)))
        sent2vec = np.array(sent2vec)
        len_sent = len(sent2vec)
        if len_sent > self.num_word:
            len_sent = self.num_word
        padded_vec[(self.num_word - len_sent):] = torch.from_numpy(sent2vec[:len_sent])
        
        return (padded_vec, label)

In [7]:
dataset = CustomDataset('./morphologized_ratings.csv', num_word=32)

In [8]:
train_size = int(len(dataset) * 0.8)
valid_size = len(dataset) - train_size
batch_size = 32

In [9]:
train_data, valid_data = random_split(dataset, [train_size, valid_size])

In [10]:
train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size = batch_size, shuffle=True)

In [11]:
train_sent, train_label = next(iter(train_dataloader))
print(train_sent.size())

torch.Size([32, 32, 100])


In [12]:
class SentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_dim):
        super(SentimentLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_dim = output_dim
        
        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=num_layers,batch_first = True)
        
        self.linear = nn.Linear(self.hidden_size, self.output_dim)
        self.dropout = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        lstm_out, (hn, cn) = self.lstm(x)
        
        drop_out = self.dropout(lstm_out)
        re_drop_out = drop_out.reshape([-1, self.hidden_size])
        
        linear_out = self.linear(re_drop_out)
        
        sig_out = self.sig(linear_out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        
        return sig_out, (hn, cn)
        
    def init_hidden(self, batch_size, device):
        h0 = torch.zeros((self.num_layers, batch_size, self.hidden_size), dtype=torch.float32).to(device)
        c0 = torch.zeros((self.num_layers, batch_size, self.hidden_size), dtype = torch.float32).to(device)
        hidden = (h0, c0)
        
        return hidden

In [13]:
num_layers = 2
input_size = 100
hidden_size = 128
output_dim = 1

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [15]:
lstm_model = SentimentLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_dim=output_dim)

In [16]:
lstm_model.to(device)

SentimentLSTM(
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True)
  (linear): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (sig): Sigmoid()
)

In [17]:
lr = 0.001
clip = 5
epochs = 5

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr = lr)

def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [18]:
def model_train(dataloader, model):
    
    train_losses = []
    train_acc = 0.0
    model.train()
    hidden = model.init_hidden(batch_size, device)

    for inputs, labels in tqdm(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        hidden = tuple([each.data for each in hidden])
        pred, h = model(inputs, hidden)
        
        loss = loss_func(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        train_losses.append(loss.item())

        accuracy = acc(pred, labels)

        train_acc += accuracy
        optimizer.step()

    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = train_acc/len(train_dataloader.dataset)
        
    return epoch_train_loss, epoch_train_acc

In [19]:
def model_valid(dataloader, model):
    val_losses = []
    val_acc = 0.0
    model.eval()
    hidden = model.init_hidden(batch_size, device)

    for inputs, labels in tqdm(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        hidden = tuple([each.data for each in hidden])
        
        pred, h = model(inputs, hidden)

        val_loss = loss_func(pred.squeeze(), labels.float())
        val_losses.append(val_loss.item())
        accuracy = acc(pred, labels)

        val_acc += accuracy

    epoch_val_loss = np.mean(val_losses)
    epoch_val_acc = val_acc/len(valid_dataloader.dataset)
        
    return epoch_val_acc, epoch_val_loss

In [20]:
epochs = 5

epoch_tr_acc, epoch_tr_loss = [], []
epoch_vl_acc, epoch_vl_loss = [],[]
for epoch in range(epochs):
    epoch_train_loss, epoch_train_acc = model_train(train_dataloader, lstm_model)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_tr_acc.append(epoch_train_acc)
    
    epoch_val_loss, epoch_val_acc = model_valid(valid_dataloader, lstm_model)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_vl_acc.append(epoch_val_acc)
    
    
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    print(25*'==')

100%|██████████| 128/128 [03:40<00:00,  1.72s/it] 
100%|██████████| 32/32 [00:03<00:00,  9.49it/s]


Epoch 1
train_loss : 0.689701902680099 val_loss : 0.5517578125
train_accuracy : 53.22265625 val_accuracy : 68.67739334702492


100%|██████████| 128/128 [00:14<00:00,  9.13it/s]
100%|██████████| 32/32 [00:02<00:00, 11.37it/s]


Epoch 2
train_loss : 0.6637472449801862 val_loss : 0.6220703125
train_accuracy : 60.009765625 val_accuracy : 65.1546711102128


100%|██████████| 128/128 [00:14<00:00,  9.11it/s]
100%|██████████| 32/32 [00:02<00:00, 11.37it/s]


Epoch 3
train_loss : 0.6347051730845124 val_loss : 0.65234375
train_accuracy : 63.57421875 val_accuracy : 63.573151640594006


100%|██████████| 128/128 [00:14<00:00,  9.09it/s]
100%|██████████| 32/32 [00:02<00:00, 11.36it/s]


Epoch 4
train_loss : 0.5989367868751287 val_loss : 0.6533203125
train_accuracy : 67.9931640625 val_accuracy : 61.98324151337147


100%|██████████| 128/128 [00:14<00:00,  9.11it/s]
100%|██████████| 32/32 [00:02<00:00, 11.47it/s]

Epoch 5
train_loss : 0.57958365813829 val_loss : 0.66015625
train_accuracy : 69.62890625 val_accuracy : 61.19313444942236





In [None]:
fig = plt.figure(figsize = (20, 6))
plt.subplot(1, 2, 1)
plt.plot(epoch_tr_acc, label='Train Acc')
#plt.plot(epoch_vl_acc, label='Validation Acc')
plt.title("Accuracy")
plt.legend()
plt.grid()
    
plt.subplot(1, 2, 2)
plt.plot(epoch_tr_loss, label='Train loss')
#plt.plot(epoch_vl_loss, label='Validation loss')
plt.title("Loss")
plt.legend()
plt.grid()

plt.show()

In [None]:
torch.save(lstm_model, './lstm_model.bin')

In [None]:
tensor([[ 0.0146,  0.0404, -0.0509,  ..., -0.0385,  0.0166, -0.0370],
        [-0.0597,  0.0739, -0.0145,  ...,  0.0483,  0.0412, -0.0676],
        [-0.0344,  0.0557, -0.0689,  ..., -0.0809, -0.0663, -0.0484],
        ...,
        [ 0.0731,  0.0145,  0.0480,  ...,  0.0547,  0.0167,  0.0485],
        [-0.0729,  0.0058,  0.0433,  ...,  0.0633, -0.0016,  0.0611],
        [ 0.0038,  0.0102,  0.0623,  ..., -0.0873, -0.0795, -0.0269]],
       device='cuda:0')
tensor([[-0.0645, -0.0076,  0.0543,  ..., -0.0806, -0.0366,  0.0484],
        [-0.0302,  0.0088, -0.0471,  ..., -0.0544, -0.0489, -0.0127],
        [ 0.0427, -0.0663,  0.0533,  ...,  0.0839, -0.0501,  0.0093],
        ...,
        [ 0.0835, -0.0768, -0.0571,  ..., -0.0208,  0.0873,  0.0026],
        [-0.0453,  0.0513,  0.0046,  ...,  0.0104, -0.0859,  0.0382],
        [-0.0277,  0.0273, -0.0566,  ..., -0.0058, -0.0035,  0.0348]],
       device='cuda:0')

In [None]:
tensor([[ 0.0146,  0.0404, -0.0509,  ..., -0.0385,  0.0166, -0.0370],
        [-0.0597,  0.0739, -0.0145,  ...,  0.0483,  0.0412, -0.0676],
        [-0.0344,  0.0557, -0.0689,  ..., -0.0809, -0.0663, -0.0484],
        ...,
        [ 0.0731,  0.0145,  0.0480,  ...,  0.0547,  0.0167,  0.0485],
        [-0.0729,  0.0058,  0.0433,  ...,  0.0633, -0.0016,  0.0611],
        [ 0.0038,  0.0102,  0.0623,  ..., -0.0873, -0.0795, -0.0269]],
       device='cuda:0')
tensor([[-0.0645, -0.0076,  0.0543,  ..., -0.0806, -0.0366,  0.0484],
        [-0.0302,  0.0088, -0.0471,  ..., -0.0544, -0.0489, -0.0127],
        [ 0.0427, -0.0663,  0.0533,  ...,  0.0839, -0.0501,  0.0093],
        ...,
        [ 0.0835, -0.0768, -0.0571,  ..., -0.0208,  0.0873,  0.0026],
        [-0.0453,  0.0513,  0.0046,  ...,  0.0104, -0.0859,  0.0382],
        [-0.0277,  0.0273, -0.0566,  ..., -0.0058, -0.0035,  0.0348]],
       device='cuda:0')