# Training fasttext model

Corpus used: https://81675795.ucloudcdnglobal.com/122/NIKL_MP_v1.1.pdf
Requested via https://kli.korean.go.kr/corpus/main/requestMain.do -> 형태 분석 말뭉치 (morphological analysis corpus)

In [None]:
import pandas as pd 

In [1]:
import fasttext

In [2]:
import hgtk
def decompose(form):
    """Decompose the Hangul characters of ``form`` into a flat jamo string.

    Each character accepted by ``hgtk.checker.is_hangul`` is split with
    ``hgtk.letter.decompose`` into three components; any empty component is
    written as ``'-'``.  Spaces and non-Hangul characters are dropped.

    Args:
        form: Text to decompose, iterated character by character.

    Returns:
        The concatenated components.  If an error occurs part-way through
        (for example ``form`` is not iterable), the error and the offending
        input are printed and whatever was built so far is returned.
    """
    pieces = []
    try:
        for s in form:
            if s == ' ' or not hgtk.checker.is_hangul(s):
                continue
            # Substitute '-' for every missing component.
            pieces.extend(part or '-' for part in hgtk.letter.decompose(s))
    except Exception as e:
        # Best effort: report the problem and fall through with the partial result.
        print(e)
        print(f'except: {form}')
    return ''.join(pieces)

In [None]:
from tqdm import tqdm  # imported here so this cell does not depend on a later cell having run

# The sentence is stored under `sentence`, and its morpheme-level analysis under `form`.
df_name = '../NIKL_MP_CSV/NXMP1902008040_{}.csv'

# fastText reads its training file as UTF-8, so write it that way regardless of the locale.
with open('decomposed_sent.txt', 'w', encoding='utf-8') as f:
    for i in range(5):
        df = pd.read_csv(df_name.format(i + 1), skipinitialspace=True, usecols=('sentence_id', 'form'))
        sent_form = {}
        pre_sent = None

        # Group the morphemes of each sentence; the id is only a grouping key and is used as-is.
        # Columns are addressed by name so the result does not depend on their order in the file.
        for sent, form in tqdm(zip(df['sentence_id'], df['form']), total=len(df)):
            if pre_sent != sent:
                sent_form[sent] = []
                pre_sent = sent
            sent_form[sent].append(form)

        # One line per sentence: every morpheme decomposed on its own, separated by spaces.
        # decompose('안녕하세요') == 'ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔ-ㅇㅛ'
        for forms in tqdm(sent_form.values()):
            words = [decompose(form) for form in forms if isinstance(form, str)]
            words = [word for word in words if word]  # drop morphemes with no Hangul at all
            if words:
                f.write(' '.join(words) + '\n')

In [None]:
# Train an unsupervised fastText model (dim=100, 10 epochs) on the decomposed corpus file.
fasttext_model = fasttext.train_unsupervised('decomposed_sent.txt', dim = 100, epoch = 10)

In [None]:
# Persist the trained fastText model to disk.
fasttext_model.save_model('fasttext_with_NIKL_MP_CSV.bin')

# preprocessing training data with Khaiii

In [3]:
from khaiii import KhaiiiApi

In [None]:
from tqdm import tqdm

In [None]:
import matplotlib.pyplot as plt

In [4]:
# Analyzer instance reused by the cells that call `khaiii_api.analyze(...)`.
khaiii_api = KhaiiiApi()

In [None]:
import urllib.request

In [None]:
# Download the NSMC ratings file from GitHub and save it as ./ratings.txt.
urllib.request.urlretrieve('https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt', filename = 'ratings.txt')

In [None]:
# Load the ratings table and discard every row that has a missing value in any column.
df = pd.read_table('./ratings.txt').dropna(how='any')

In [None]:
# Keep only one row for each distinct `document` text.
df = df.drop_duplicates(subset=['document'])

In [None]:
# Replace every character outside ㄱ-ㅎ / 가-힣 with a space.
# regex=True must be explicit: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave the text untouched.
df['document'] = df['document'].str.replace('[^ㄱ-ㅎ가-힣]', ' ', regex=True)

In [None]:
# Discard rows whose document is empty once trailing whitespace is stripped.
is_blank = df['document'].str.rstrip() == ''
df = df[~is_blank]

In [None]:
# Analyse every review with Khaiii and write one "<morphemes>\t<label>" line per review.
with open('morphs_label.txt', 'w') as out:
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Collapse runs of whitespace and strip both ends before analysis.
        sent = ' '.join(row['document'].split())
        label = row['label']
        try:
            morphs = [m.lex for word in khaiii_api.analyze(sent) for m in word.morphs]
        except Exception as exc:
            # Report the row that failed and move on, so one bad review
            # does not silently truncate the whole output file.
            print(idx, row, exc)
            continue
        out.write(' '.join(morphs) + '\t' + str(label) + '\n')

In [None]:
# Read the "<morphemes>\t<label>" lines back as (morphemes, int label) pairs.
ori_data = []
with open('./morphs_label.txt', 'r') as file:  # context manager closes the handle
    for line in file:
        # Split on the last tab only: everything before it is the morpheme text.
        morphs, label = line.rstrip('\n').rsplit('\t', 1)
        ori_data.append((morphs, int(label)))

In [None]:
# Count samples with a truthy label (t) and with a falsy label (f).
t = sum(1 for _, y in ori_data if y)
f = len(ori_data) - t

print(f'label \'1\' : {t} / label \'0\' : {f}')

In [None]:
# Number of whitespace-separated morphemes in each sample.
sent_len = [len(morphs.split()) for morphs, _ in ori_data]

# Histogram first, then the summary statistics as the cell's displayed value.
length_series = pd.Series(sent_len)
length_series.hist()
plt.show()
length_series.describe()

In [None]:
# Map each sentence length to the number of samples having that length.
dic = {}

for n in sent_len:
    dic[n] = dic.get(n, 0) + 1

In [None]:
# Walk the lengths in ascending order while accumulating the number of samples seen so far.
total_cnt = 0
reported_90 = False
for length in sorted(dic):
    total_cnt += dic[length]
    coverage = total_cnt / len(ori_data) * 100

    # At the first length >= 64, print the cumulative percentage and stop.
    if length >= 64:
        print(coverage)
        break

    # Print, once only, the first length at which more than 90% of samples are covered.
    if coverage > 90 and not reported_90:
        print(length, total_cnt)
        reported_90 = True

# Writing Custom Dataset

In [5]:
fast_model = fasttext.load_model("fasttext_with_NIKL_MP_CSV.bin") # load the model

In [6]:
from torch.utils.data import random_split, DataLoader, Dataset
from torch import nn
import torch
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class CustomDataset(Dataset):
    """Dataset of (padded word-vector matrix, label) pairs read from a text file.

    Every line of the file is ``<space-separated morphemes><TAB><int label>``.
    Morphemes are embedded with the module-level ``fast_model`` after being
    passed through ``decompose``.

    Args:
        data_dir: Path of the text file to read.
        num_word: Number of rows of every returned matrix; longer sentences
            are cut to their first ``num_word`` morphemes, shorter ones are
            zero-padded at the front.
        transform: Optional callable applied to the matrix before returning.
        target_transform: Optional callable applied to the label tensor.
    """

    def __init__(self, data_dir, num_word, transform = None, target_transform=None):
        self.ori_data = []
        with open(data_dir, 'r') as file:  # context manager closes the handle
            for line in file:
                # Split on the last tab only: everything before it is the morpheme text.
                morphs, label = line.rstrip('\n').rsplit('\t', 1)
                self.ori_data.append((morphs, int(label)))
        self.data = self.ori_data
        self.transform = transform
        self.target_transform = target_transform
        self.num_word = num_word

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return ``(matrix, label)`` for sample ``i``.

        ``matrix`` is a float32 tensor of shape
        ``(num_word, fast_model.get_dimension())`` and ``label`` a float32
        scalar tensor.
        """
        sent, target = self.data[i]
        padded_vec = torch.zeros((self.num_word, fast_model.get_dimension()), dtype = torch.float32)

        # Only the first num_word morphemes are ever used, so embed just those.
        morphs = sent.split()[:self.num_word]
        if morphs:  # a sentence without morphemes stays all-zero
            sent2vec = np.array([fast_model.get_word_vector(decompose(m)) for m in morphs])
            # Right-align the sentence: padding rows come first.
            padded_vec[(self.num_word - len(morphs)):] = torch.from_numpy(sent2vec)

        label = torch.tensor(target, dtype = torch.float32)

        if self.transform is not None:
            padded_vec = self.transform(padded_vec)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return (padded_vec, label)

In [None]:
# Build the dataset; every sample is cut or padded to 64 morphemes.
dataset = CustomDataset('./morphs_label.txt', num_word = 64)

In [None]:
# 80% of the samples go to training; the remainder is used for validation.
train_size = int(len(dataset) * 0.8)
valid_size = len(dataset) - train_size
batch_size = 32

In [None]:
# Randomly partition the dataset into the two subsets sized above.
train_data, valid_data = random_split(dataset, [train_size, valid_size])

In [None]:
# Batch both subsets; both loaders reshuffle their samples on every pass.
train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size = batch_size, shuffle=True)

In [None]:
# Pull a single batch from the training loader to inspect its values and size.
train_sent, train_label = next(iter(train_dataloader))
print(train_sent)
print(train_label)
print(train_sent.size())

# Building Model

In [10]:
class SentimentLSTM(nn.Module):
    """LSTM classifier that scores a sequence of pre-computed word vectors.

    Args:
        input_size: Size of each input vector.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_dim):
        super(SentimentLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_dim = output_dim

        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=num_layers,batch_first = True)

        self.linear = nn.Linear(self.hidden_size, self.output_dim)
        self.dropout = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch,)`` holding the sigmoid of the last
            output unit at the last timestep.
        """
        lstm_out, _ = self.lstm(x)

        # Only the final timestep feeds the prediction, so project just that
        # one instead of running the linear layer over every timestep.
        last_step = lstm_out[:, -1, :]
        linear_out = self.linear(self.dropout(last_step))

        return self.sig(linear_out)[:, -1]

In [None]:
# Model hyperparameters.
input_size = 100
hidden_size = 128
num_layers = 2
output_dim = 1

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Instantiate the classifier with the hyperparameters defined above.
lstm_model = SentimentLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_dim=output_dim)

In [None]:
# Move the model's parameters to the selected device.
lstm_model.to(device)

In [None]:
lr = 0.001  # learning rate handed to Adam below
clip = 5  # maximum gradient norm used by clip_grad_norm_ in the training loop
epochs = 5  # number of passes over the training data

# Binary cross-entropy on the model's sigmoid outputs.
loss_func = nn.BCELoss()#.to(device)
optimizer = torch.optim.Adam(lstm_model.parameters(), lr = lr)

def acc(pred, label):
    """Return how many predictions, once rounded, equal their label.

    Args:
        pred: Tensor of predicted values.
        label: Tensor of target values, comparable element-wise with ``pred``.

    Returns:
        The number of matching elements as a Python number.
    """
    matches = pred.round() == label
    return matches.sum().item()

In [None]:
# Per-epoch history, consumed by the plotting cell.
epoch_tr_acc, epoch_tr_loss = [], []
epoch_vl_acc, epoch_vl_loss = [], []
for epoch in range(epochs):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    lstm_model.train()

    for inputs, labels in tqdm(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        pred = lstm_model(inputs)

        loss = loss_func(pred, labels)
        loss.backward()
        # Clip the gradient norm before the update.
        nn.utils.clip_grad_norm_(lstm_model.parameters(), clip)
        optimizer.step()

        train_losses.append(loss.item())
        train_acc += acc(pred, labels)

    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = train_acc/len(train_dataloader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_tr_acc.append(epoch_train_acc)

    # ---- validation pass ----
    val_losses = []
    val_acc = 0.0
    lstm_model.eval()

    # No parameter update happens here, so skip building autograd graphs.
    with torch.no_grad():
        for inputs, labels in tqdm(valid_dataloader):
            inputs, labels = inputs.to(device), labels.to(device)
            pred = lstm_model(inputs)

            val_loss = loss_func(pred, labels)
            val_losses.append(val_loss.item())
            val_acc += acc(pred, labels)

    epoch_val_loss = np.mean(val_losses)
    epoch_val_acc = val_acc/len(valid_dataloader.dataset)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_vl_acc.append(epoch_val_acc)

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    print(25*'==')

In [None]:
# Serialize the model object itself (not just its state_dict) to disk.
torch.save(lstm_model, f='./k_sentiment_Fasttext_LSTM.bin')

In [11]:
# The file holds a whole pickled model object, which PyTorch >= 2.6 refuses to
# load by default (weights_only=True), so full unpickling is requested explicitly.
# NOTE: full unpickling can execute arbitrary code; only load checkpoints you created yourself.
# The SentimentLSTM class must already be defined when this runs.
model_x = torch.load('./k_sentiment_Fasttext_LSTM.bin', weights_only=False)

In [12]:
# Display the loaded model's module structure.
model_x

SentimentLSTM(
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True)
  (linear): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (sig): Sigmoid()
)

In [None]:
# Side-by-side panels: accuracy on the left, loss on the right, one point per epoch.
fig = plt.figure(figsize = (20, 6))

panels = (
    ("Accuracy", (epoch_tr_acc, 'Train Acc'), (epoch_vl_acc, 'Validation Acc')),
    ("Loss", (epoch_tr_loss, 'Train loss'), (epoch_vl_loss, 'Validation loss')),
)
for position, (title, train_curve, valid_curve) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for history, legend_label in (train_curve, valid_curve):
        plt.plot(history, label=legend_label)
    plt.title(title)
    plt.legend()
    plt.grid()

plt.show()

#  Test

In [None]:
def analyze_sent(sent, fast_model, lstm_model, khaiii, num_word):
    """Score one raw sentence with the trained classifier.

    The sentence is split into morphemes with ``khaiii.analyze``, each
    morpheme is embedded with ``fast_model`` after ``decompose``, and the
    resulting matrix is fed to ``lstm_model`` as a batch of one.

    Args:
        sent: Sentence to score.
        fast_model: Embedding model providing ``get_dimension()`` and
            ``get_word_vector()``.
        lstm_model: Classifier to evaluate; it is switched to eval mode.
        khaiii: Analyzer providing ``analyze()``.
        num_word: Number of rows of the input matrix; longer sentences are
            cut to their first ``num_word`` morphemes.

    Returns:
        The model output for the sentence, or ``-1`` when the analyzer
        raised an error.
    """
    try:
        morphs = [m.lex for word in khaiii.analyze(sent) for m in word.morphs]
    except Exception:
        print('Can\'t analyze sentence')
        return -1

    morphs = morphs[:num_word]

    sent_vec = np.zeros((num_word, fast_model.get_dimension()), dtype=np.float32)

    # Same layout as CustomDataset.__getitem__: morphemes keep their order and
    # are right-aligned, with the zero padding in front.
    for row, m in enumerate(morphs, start=num_word - len(morphs)):
        sent_vec[row] = fast_model.get_word_vector(decompose(m)).astype(np.float32)

    # Add the batch dimension: (1, num_word, dim).
    sent_tensor = torch.from_numpy(sent_vec).unsqueeze(0)

    # Run on whatever device the model lives on (CUDA, MPS or CPU).
    first_param = next(lstm_model.parameters(), None)
    if first_param is not None:
        sent_tensor = sent_tensor.to(first_param.device)

    lstm_model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred = lstm_model(sent_tensor)

    return pred

In [None]:
# Put the sentence to classify in the first argument.
pred = analyze_sent('', fast_model, model_x, khaiii_api, 64)
print(pred)

# analyze_sent returns -1 instead of a tensor when the sentence could not be
# analysed; in that case there is nothing to classify.
if torch.is_tensor(pred):
    if pred.item() > 0.5:
        print('긍정')
    else:
        print('부정')