In [1]:
import pandas as pd
import hgtk
from tqdm import tqdm
import fasttext

In [2]:
def decompose(forms:list):
    word = ''
    for form in forms:
        try:
            if hgtk.checker.is_hangul(form):
                for s in form:
                    a, b, c = hgtk.letter.decompose(s)
                    if not a:
                        a = '-'
                    if not b:
                        b = '-'
                    if not c:
                        c = '-'
                    word = word + a + b + c
        except TypeError as e:
            print(form)
    return word

In [3]:
def compose(jamo_sequence:list):
    tokenized_jamo = []
    index = 0

    while index < len(jamo_sequence):
        if not hgtk.checker.is_hangul(jamo_sequence[index]):
            tokenized_jamo.append(jamo_sequence[index])
            index = index + 1

        else:
            tokenized_jamo.append(jamo_sequence[index:index + 3])
            index = index + 3

    word = ''
    try:
        for jamo in tokenized_jamo:

            if len(jamo) == 3:
                if jamo[2] == "-":
                    word = word + hgtk.letter.compose(jamo[0], jamo[1])
                else:
                    word = word + hgtk.letter.compose(jamo[0], jamo[1], jamo[2])
            else:
                word = word + jamo

    except Exception as exception:
        if type(exception).__name__ == 'NotHangulException':
            return jamo_sequence

    return word

In [4]:
fast_model = fasttext.load_model("fasttext_jn.bin") # 모델 로드

In [5]:
def transform(word_sequence):
    return [(compose(word), similarity) for (similarity, word) in word_sequence]

In [6]:
fast_model.get_word_vector(decompose(['제주도']))

array([-0.23002268,  0.39805633, -0.595214  , -0.4201808 ,  0.06841939,
       -0.23568943,  0.33469296,  0.09796971,  0.10487619,  0.17936371,
       -0.04858068,  0.11436544, -0.18841325, -0.1625356 ,  0.03736197,
        0.23201014, -0.30488858, -0.11117312, -0.21404381, -0.01474717,
        0.13891964, -0.6124201 , -0.3584761 , -0.41327453, -0.21037365,
        0.30367267, -0.06418942,  0.20489423, -0.15140772,  0.34683898,
       -0.08985352,  0.21257268, -0.07074974,  0.20376125, -0.09280302,
       -0.33641914, -0.32314423, -0.19665879, -0.25710115,  0.04477051,
       -0.0592188 , -0.04977519,  0.60128856, -0.15175302, -0.6904205 ,
       -0.4166109 ,  0.2812419 ,  0.1937623 , -0.70656985, -0.4241435 ,
        0.8790712 , -0.02911854, -0.02107281,  0.7127032 ,  0.61294305,
       -0.10936469,  0.17948501, -0.3176769 ,  0.06604326,  0.24650899,
       -0.4777825 ,  0.52821845, -0.32548603, -0.3195687 ,  0.32848126,
        0.50072765,  0.13473417,  0.10986972, -0.09630211,  0.52

In [7]:
from torch.utils.data import random_split, DataLoader, Dataset, TensorDataset
from torch import nn
import torch.nn.functional as F
import torch
import numpy as np
import matplotlib.pyplot as plt

In [50]:
class CustomDataset(Dataset):
    def __init__(self, csv_dir, num_word, transform = None, target_transform=None):
        self.df = pd.read_csv(csv_dir).sample(frac=1)[:5120]
        self.transform = transform
        self.target_transform = target_transform
        self.num_word = num_word
            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, i):
        sent = self.df['morphologized_sent'].iloc[i]
        label = self.df['label'].iloc[i]
        label = torch.tensor(label, dtype=torch.float32)
        padded_vec = torch.zeros((self.num_word, fast_model.get_dimension()), dtype = torch.float32)
        
        sent2vec = [] 
        for w in sent:
            if w.rstrip():
                sent2vec.append(fast_model.get_word_vector(decompose(w)))
        sent2vec = np.array(sent2vec)
        len_sent = len(sent2vec)
        if len_sent > self.num_word:
            len_sent = self.num_word
        padded_vec[(self.num_word - len_sent):] = torch.from_numpy(sent2vec[:len_sent])
        
        return (padded_vec, label)

In [51]:
dataset = CustomDataset('./morphologized_ratings.csv', num_word=32)

In [None]:
sent_len = [len(s.split()) for s in dataset.df['morphologized_sent']]
pd.Series(sent_len).hist()
plt.show()
pd.Series(sent_len).describe()

In [10]:
sr = pd.Series(sent_len)

NameError: name 'sent_len' is not defined

In [None]:
from collections import defaultdict

In [None]:
dic = defaultdict(int)
for i in sr:
    dic[i] += 1

In [None]:
dic

In [None]:
dic
i = 0
t = 1
for k, v in dic.items():
    i += v
    x = i / 5120 * 100
    if k == 32:
        print(x)
    
    if x//10 == t:
        print(k, v, i, x)
        t += 1
    

In [52]:
train_size = 4096
valid_size = 1024
batch_size = 16

In [53]:
train_data, valid_data = random_split(dataset, [train_size, valid_size])

In [54]:
train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size = batch_size, shuffle=True)

In [65]:
train_sent, train_label = next(iter(train_dataloader))
print(train_sent.size())

torch.Size([16, 32, 100])


In [57]:
class SentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_dim):
        super(SentimentLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_dim = output_dim
        
        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=num_layers,batch_first = True)
        
        self.linear = nn.Linear(self.hidden_size, self.output_dim)
        self.dropout = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        lstm_out, (hn, cn) = self.lstm(x, hidden)
        
        drop_out = self.dropout(lstm_out)
        re_drop_out = drop_out.reshape([-1, self.hidden_size])
        
        linear_out = self.linear(re_drop_out)
        
        sig_out = self.sig(linear_out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        
        return sig_out
        
    def init_hidden(self, batch_size, device):
        h0 = torch.zeros((self.num_layers, batch_size, self.hidden_size), dtype=torch.float32).to(device)
        c0 = torch.zeros((self.num_layers, batch_size, self.hidden_size), dtype = torch.float32).to(device)
        hidden = (h0, c0)
        
        return hidden

In [58]:
num_layers = 1
input_size = 100
hidden_size = 128
output_dim = 1

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [59]:
lstm_model = SentimentLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_dim=output_dim)

In [60]:
lstm_model.to(device)

SentimentLSTM(
  (lstm): LSTM(100, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (sig): Sigmoid()
)

In [61]:
lr = 0.001
clip = 5
epochs = 5

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr = lr)

def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [62]:
def model_train(dataloader, model):
    
    train_losses = []
    train_acc = 0.0
    model.train()
    hidden = model.init_hidden(batch_size, device)

    for inputs, labels in tqdm(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        lstm_model.zero_grad()

        pred = model(inputs, hidden)
        #print(pred.shape, labels.shape)
        loss = loss_func(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        train_losses.append(loss.item())

        accuracy = acc(pred, labels)

        train_acc += accuracy


    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = train_acc/len(train_dataloader.dataset)
        
    return epoch_train_loss, epoch_train_acc

In [63]:
def model_valid(dataloader, model):
    val_losses = []
    val_acc = 0.0
    model.eval()
    hidden = model.init_hidden(batch_size, device)

    for inputs, labels in tqdm(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        pred = model(inputs, hidden)

        val_loss = loss_func(pred.squeeze(), labels.float())
        val_losses.append(val_loss.item())
        accuracy = acc(pred, labels)

        val_acc += accuracy

    epoch_val_loss = np.mean(val_losses)
    epoch_val_acc = val_acc/len(valid_dataloader.dataset)
        
    return epoch_val_acc, epoch_val_loss

In [64]:
epochs = 5

epoch_tr_acc, epoch_tr_loss = [], []
epoch_vl_acc, epoch_vl_loss = [],[]
for epoch in range(epochs):
    epoch_train_loss, epoch_train_acc = model_train(train_dataloader, lstm_model)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_tr_acc.append(epoch_train_acc)
    
    epoch_val_loss, epoch_val_acc = model_valid(valid_dataloader, lstm_model)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_vl_acc.append(epoch_val_acc)
    
    
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    print(25*'==')

100%|██████████| 256/256 [00:18<00:00, 13.71it/s]
100%|██████████| 64/64 [00:03<00:00, 17.31it/s]


Epoch 1
train_loss : 0.6929674204438925 val_loss : 0.5185546875
train_accuracy : 50.8056640625 val_accuracy : 69.23658372834325


100%|██████████| 256/256 [00:16<00:00, 15.88it/s]
100%|██████████| 64/64 [00:03<00:00, 18.74it/s]


Epoch 2
train_loss : 0.6931896188762039 val_loss : 0.5185546875
train_accuracy : 50.5859375 val_accuracy : 69.23658307641745


 57%|█████▋    | 145/256 [00:09<00:07, 15.80it/s]


KeyboardInterrupt: 

In [None]:
fig = plt.figure(figsize = (20, 6))
plt.subplot(1, 2, 1)
plt.plot(epoch_tr_acc, label='Train Acc')
#plt.plot(epoch_vl_acc, label='Validation Acc')
plt.title("Accuracy")
plt.legend()
plt.grid()
    
plt.subplot(1, 2, 2)
plt.plot(epoch_tr_loss, label='Train loss')
#plt.plot(epoch_vl_loss, label='Validation loss')
plt.title("Loss")
plt.legend()
plt.grid()

plt.show()

In [None]:
torch.save(lstm_model, './lstm_model.bin')