# 코랩 환경설정

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the working directory to the project folder on Drive so that the
# relative paths used further down (data/..., checkpoints/...) resolve there.
import os
notebook_path = '/content/drive/MyDrive/project'
os.chdir(notebook_path)

# Mecab Download (For Colab)

In [None]:
!bash install_mecab-ko_on_colab_light_220429.sh # 1~3분정도 소요

In [None]:
from konlpy.tag import Mecab
# If this fails, run the directory-change cell above first and retry:
# the dictionary path below is relative to the current working directory.
# `tokenizer` is the bound `nouns` method of the Mecab instance.
tokenizer = Mecab(dicpath='mecab-ko-dic-2.1.1-20180720').nouns

# 0. Utils

In [None]:
import sys
import csv
csv.field_size_limit(sys.maxsize)
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data.dataset import Dataset
import numpy as np
import nltk # default word/sentence tokenizer
nltk.download('punkt')
from torch.utils.data import DataLoader
import shutil
# from konlpy.tag import Mecab

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def get_evaluation(y_true, y_prob, list_metrics):
    """Compute the classification metrics named in ``list_metrics``.

    Args:
        y_true: Ground-truth class labels.
        y_prob: Per-class scores; the predicted class is the argmax over the
            last axis.
        list_metrics: Container of metric names. Recognised names are
            ``'accuracy'``, ``'loss'`` and ``'confusion_matrix'``; anything
            else is ignored.

    Returns:
        dict mapping each requested metric name to its value. ``'loss'`` is
        ``-1`` when ``log_loss`` raises ``ValueError``; ``'confusion_matrix'``
        is the string form of the matrix.
    """
    predicted = np.argmax(y_prob, axis=-1)
    results = {}
    # Fixed iteration order keeps the result keys in the same order as before.
    for name in ('accuracy', 'loss', 'confusion_matrix'):
        if name not in list_metrics:
            continue
        if name == 'accuracy':
            results[name] = metrics.accuracy_score(y_true, predicted)
        elif name == 'loss':
            try:
                results[name] = metrics.log_loss(y_true, y_prob)
            except ValueError:
                results[name] = -1
        else:
            results[name] = str(metrics.confusion_matrix(y_true, predicted))
    return results

def matrix_mul(input, weight, bias=False):
    """Apply ``tanh(x @ weight [+ bias])`` to every slice along dim 0 of ``input``.

    Args:
        input: Tensor whose slices ``input[i]`` are matrices with
            ``weight.size(0)`` columns (in this file: ``(seq, batch, features)``).
        weight: 2-D tensor of shape ``(features, out_features)``.
        bias: Added only when it is an ``nn.Parameter`` (broadcast over the
            leading dims); the default ``False`` means "no bias".

    Returns:
        Tensor of shape ``(seq, batch, out_features)``. When ``out_features``
        is 1 the trailing dim is dropped, giving ``(seq, batch)``.

    Only the trailing size-1 dim is squeezed. The previous bare ``squeeze()``
    also removed a batch or sequence dim of size 1, which broke the callers'
    subsequent ``matrix_mul`` / ``permute(1, 0)`` for single-sample batches.
    """
    # One batched matmul replaces the per-slice torch.mm loop.
    projected = torch.matmul(input, weight)
    if isinstance(bias, torch.nn.parameter.Parameter):
        projected = projected + bias
    return torch.tanh(projected).squeeze(-1)

def element_wise_mul(input1, input2):
    """Weight each slice of ``input1`` by ``input2`` and sum over dim 0.

    For every pair ``(values, weights)`` taken in lockstep from the two
    inputs, ``weights`` (one scalar per row of ``values``) is broadcast across
    the columns of ``values`` and multiplied in. The weighted slices are then
    summed.

    Returns:
        Tensor with a leading dim of size 1 followed by the shape of one
        slice of ``input1``.
    """
    weighted = [
        values * weights.unsqueeze(1).expand_as(values)
        for values, weights in zip(input1, input2)
    ]
    return torch.stack(weighted, 0).sum(0).unsqueeze(0)

def get_max_lengths(data_path):
    """Return 90th-percentile length caps for a labelled CSV corpus.

    Every row is read as ``label, text_field, ...``: all fields after the
    first are lower-cased and joined with spaces, the result is split into
    sentences and each sentence into words.

    Args:
        data_path: Path to the CSV file. It is opened as ``utf-8-sig``, the
            same encoding ``MyDataset`` uses for this file, so the result does
            not depend on the platform's default encoding.

    Returns:
        Tuple ``(word_length, sent_length)``: the value at the 90% position
        of the sorted words-per-sentence counts, and of the sorted
        sentences-per-document counts.

    Raises:
        IndexError: If the file yields no documents or no sentences.
    """
    word_lengths = []
    sent_lengths = []
    with open(data_path, encoding='utf-8-sig') as csv_file:
        for row in csv.reader(csv_file, quotechar='"'):
            # Same text construction as MyDataset: each field is followed by a space.
            text = "".join(field.lower() + " " for field in row[1:])
            sentences = sent_tokenize(text)
            sent_lengths.append(len(sentences))
            word_lengths.extend(len(word_tokenize(sentence)) for sentence in sentences)

    word_lengths.sort()
    sent_lengths.sort()
    return word_lengths[int(0.9 * len(word_lengths))], sent_lengths[int(0.9 * len(sent_lengths))]

# 1. Dataset

In [None]:
class MyDataset(Dataset):
    """Labelled documents encoded as fixed-size grids of vocabulary indices.

    The CSV at ``data_path`` is read row by row: column 0 is the integer
    label, the remaining columns are lower-cased and joined with spaces to
    form the document text. The vocabulary is the first column of the
    space-separated file at ``dict_path``.

    Each item is ``(document, label)`` where ``document`` is an ``int64``
    array of shape ``(max_length_sentences, max_length_word)``. Index 0 means
    padding or an out-of-vocabulary token; index ``i + 1`` is the word on
    row ``i`` of the dictionary file.
    """

    def __init__(self, data_path, dict_path, max_length_sentences, max_length_word, tokenizer=word_tokenize):
        """Load texts, labels and the vocabulary.

        Args:
            data_path: CSV file with the label in column 0 and text after it.
            dict_path: Space-separated word-vector file; only column 0 is read.
            max_length_sentences: Number of sentences kept per document.
            max_length_word: Number of tokens kept per sentence.
            tokenizer: Callable mapping a sentence string to a list of tokens.
        """
        super(MyDataset, self).__init__()

        texts, labels = [], []
        with open(data_path, encoding='utf-8-sig') as csv_file:
            reader = csv.reader(csv_file, quotechar='"')
            for line in reader:
                if not line:
                    # csv.reader yields [] for blank lines; there is no label to parse.
                    continue
                labels.append(int(line[0]))
                texts.append("".join(tx.lower() + " " for tx in line[1:]))

        self.texts = texts
        self.labels = labels
        self.dict = pd.read_csv(filepath_or_buffer=dict_path, header=None, sep=" ", quoting=csv.QUOTE_NONE,
                                usecols=[0]).values
        self.dict = [word[0] for word in self.dict]
        # Constant-time lookup table. setdefault keeps the FIRST row of a
        # duplicated word, matching what list.index() returned before.
        self.word_to_index = {}
        for position, word in enumerate(self.dict):
            self.word_to_index.setdefault(word, position)
        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.num_classes = len(set(self.labels))  # number of distinct labels

        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of documents."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(encoded_document, label)`` for the document at ``index``."""
        label = self.labels[index]
        text = self.texts[index]
        lookup = self.word_to_index

        # Dictionary row for every token; -1 marks out-of-vocabulary tokens.
        document_encode = [
            [lookup.get(word, -1) for word in self.tokenizer(sentence)]
            for sentence in sent_tokenize(text=text)
        ]

        # Pad short sentences with -1, then pad the document with all -1 sentences.
        for sentence in document_encode:
            sentence.extend([-1] * (self.max_length_word - len(sentence)))
        missing_sentences = self.max_length_sentences - len(document_encode)
        document_encode.extend([-1] * self.max_length_word for _ in range(missing_sentences))

        # Truncate anything longer than the configured caps.
        document_encode = [sentence[:self.max_length_word] for sentence in document_encode][
                          :self.max_length_sentences]

        document_encode = np.stack(arrays=document_encode, axis=0)  # stack rows into one 2-D array
        document_encode += 1  # shift so that -1 (padding / unknown) becomes 0

        return document_encode.astype(np.int64), label

# 2. Word Attention

In [None]:
class WordAttNet(nn.Module):
    """Word-level encoder: frozen pretrained embeddings, bidirectional GRU, attention.

    Args:
        dictionary_path: Space-separated word-vector file. Column 0 is the
            word and the remaining columns are its vector.
        hidden_size: GRU hidden size per direction.
    """

    def __init__(self, dictionary_path, hidden_size=50):
        super(WordAttNet, self).__init__()
        # Skip column 0 (the word itself); keep only the vector columns.
        vectors = pd.read_csv(filepath_or_buffer=dictionary_path, header=None, sep=" ",
                              quoting=csv.QUOTE_NONE).values[:, 1:]
        embed_size = vectors.shape[1]

        # Row 0 is an all-zero vector for padding / unknown words, so the
        # indices produced by the dataset (dictionary row + 1) line up.
        unknown_word = np.zeros((1, embed_size))
        embeddings = torch.from_numpy(np.concatenate([unknown_word, vectors], axis=0).astype(float))

        # Registration order is kept unchanged so that existing state dicts
        # and optimizer states still line up.
        self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 2 * hidden_size))
        self.word_bias = nn.Parameter(torch.Tensor(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 1))

        # from_pretrained is a classmethod: call it directly instead of first
        # allocating a throwaway randomly-initialised embedding table.
        self.lookup = nn.Embedding.from_pretrained(embeddings)

        self.gru = nn.GRU(embed_size, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters."""
        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # torch.Tensor(...) returns uninitialised memory; without this the
        # bias would start from arbitrary (possibly non-finite) values.
        self.word_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Encode one sentence position for the whole batch.

        Args:
            input: Word indices. HierAttNet passes them as ``(words, batch)``.
            hidden_state: Initial GRU hidden state.

        Returns:
            ``(sentence_vector, hidden_state)``: the attention-weighted sum of
            the GRU outputs (with a leading dim of size 1) and the final GRU
            hidden state.
        """
        output = self.lookup(input)
        # The embedding table is float64; the GRU weights are float32.
        f_output, h_output = self.gru(output.float(), hidden_state)  # feature output and hidden state output
        output = matrix_mul(f_output, self.word_weight, self.word_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        output = F.softmax(output, dim=1)  # normalise attention scores over the word axis
        output = element_wise_mul(f_output, output.permute(1, 0))

        return output, h_output

# 3. Sentence Attention

In [None]:
class SentAttNet(nn.Module):
    """Sentence-level encoder: bidirectional GRU, attention, MLP classifier.

    Args:
        sent_hidden_size: GRU hidden size per direction.
        word_hidden_size: Hidden size of the word-level GRU; the input
            feature size here is ``2 * word_hidden_size``.
        num_classes: Number of output logits.
    """

    def __init__(self, sent_hidden_size=50, word_hidden_size=50, num_classes=8):
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.Tensor(2 * sent_hidden_size, 2 * sent_hidden_size))
        self.sent_bias = nn.Parameter(torch.Tensor(1, 2 * sent_hidden_size))
        self.context_weight = nn.Parameter(torch.Tensor(2 * sent_hidden_size, 1))

        self.gru = nn.GRU(2 * word_hidden_size, sent_hidden_size, bidirectional=True)
        # `fc` and the two softmax modules are not used by forward(). They are
        # kept because `fc` contributes keys to the state dict, and removing
        # it would break loading of previously saved checkpoints.
        self.fc = nn.Linear(2 * sent_hidden_size, num_classes)

        self.sent_softmax = nn.Softmax(dim=1)
        self.fc_softmax = nn.Softmax(dim=1)

        self.classifier = nn.Sequential(
            nn.Linear(2 * sent_hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, num_classes)
        )

        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters."""
        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # torch.Tensor(...) returns uninitialised memory; without this the
        # bias would start from arbitrary (possibly non-finite) values.
        self.sent_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Classify documents from their sentence vectors.

        Args:
            input: Sentence vectors. HierAttNet passes them as
                ``(sentences, batch, 2 * word_hidden_size)``.
            hidden_state: Initial GRU hidden state.

        Returns:
            ``(logits, hidden_state)``: the classifier output for the
            attention-pooled document vector and the final GRU hidden state.
        """
        f_output, h_output = self.gru(input, hidden_state)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        output = F.softmax(output, dim=1)  # normalise attention scores over the sentence axis
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        output = self.classifier(output)

        return output, h_output

# 4. Hierarchical Attention

In [None]:
class HierAttNet(nn.Module):
    """Hierarchical attention network: WordAttNet per sentence, then SentAttNet.

    Args:
        word_hidden_size: Hidden size per direction of the word-level GRU.
        sent_hidden_size: Hidden size per direction of the sentence-level GRU.
        batch_size: Default batch size used when (re)creating hidden states.
        num_classes: Number of output logits.
        pretrained_dictionary_path: Word-vector file handed to WordAttNet.
        max_sent_length: Stored on the instance; not used by forward().
        max_word_length: Stored on the instance; not used by forward().
    """

    def __init__(self, word_hidden_size, sent_hidden_size, batch_size, num_classes, pretrained_dictionary_path,
                 max_sent_length, max_word_length):
        super(HierAttNet, self).__init__()
        self.batch_size = batch_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length
        self.word_att_net = WordAttNet(pretrained_dictionary_path, word_hidden_size)
        self.sent_att_net = SentAttNet(sent_hidden_size, word_hidden_size, num_classes)
        self._init_hidden_state()

    def _init_hidden_state(self, last_batch_size=None):
        """Reset both GRU hidden states to zeros.

        Args:
            last_batch_size: Batch size to allocate for. A falsy value
                (``None`` or 0) falls back to ``self.batch_size``.

        The states are created on the device that holds the model parameters
        rather than "CUDA whenever it is available". This keeps a CPU model
        usable on a machine that also has a GPU.
        """
        batch_size = last_batch_size if last_batch_size else self.batch_size
        device = next(self.parameters()).device
        self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size, device=device)
        self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size, device=device)

    def forward(self, input):
        """Return class logits for a batch of encoded documents.

        Args:
            input: Index tensor of shape ``(batch, sentences, words)``.

        Returns:
            The output of the sentence-level network for the batch.
        """
        # The states may have been created before the model was moved to
        # another device; follow the input so the GRUs never see a mismatch.
        word_hidden = self.word_hidden_state.to(input.device)
        sent_hidden = self.sent_hidden_state.to(input.device)

        sentence_vectors = []
        # Iterate over sentence positions; each slice is (batch, words).
        for sentence in input.permute(1, 0, 2):
            # The word-level hidden state is carried from one sentence to the next.
            vector, word_hidden = self.word_att_net(sentence.permute(1, 0), word_hidden)
            sentence_vectors.append(vector)
        self.word_hidden_state = word_hidden

        output = torch.cat(sentence_vectors, 0)
        output, self.sent_hidden_state = self.sent_att_net(output, sent_hidden)

        return output

# parameter tuning

In [None]:
# Optimisation settings.
batch_size = 32
num_epochs = 10
learning_rate = 1e-3
####################################################################################
# Hidden sizes (per direction) for the word-level and sentence-level networks.
word_hidden_size = 64
sent_hidden_size = 64
####################################################################################
# CSV files used for training and validation; paths are relative to the working directory.
# train_data = 'data/Training_dataset.txt' # training data
train_data = 'data/new_training_0823.txt'
test_data = 'data/new_validation_0823.txt'
####################################################################################
# NOTE(review): this rebinds `tokenizer`. The instance created in the earlier cell
# was given an explicit dicpath; this one is constructed without one — confirm
# which dictionary is intended.
from konlpy.tag import Mecab
tokenizer = Mecab().nouns
# Space-separated word-vector file read by both MyDataset and WordAttNet.
dictionary_path = 'data/ko_w2v_version2.txt'

In [None]:
# Training batches are shuffled and the final partial batch is dropped;
# evaluation keeps every sample in its original order.
training_params = {"batch_size": batch_size, "shuffle": True, "drop_last": True}
test_params = {'batch_size': batch_size, 'shuffle':False, "drop_last":False}
# Fixed caps are used here instead of calling get_max_lengths(train_data).
max_word_length, max_sent_length = 32, 20 #get_max_lengths(train_data) # about 32 / 20 should be enough ..??
# MyDataset takes the sentence cap first, then the word cap.
training_set = MyDataset(train_data, dictionary_path, max_sent_length, max_word_length, tokenizer=tokenizer)
training_generator = DataLoader(training_set, **training_params)
test_set = MyDataset(test_data, dictionary_path, max_sent_length, max_word_length, tokenizer=tokenizer)
test_generator = DataLoader(test_set, **test_params)

In [None]:
max_word_length, max_sent_length

(32, 20)

# model

In [None]:
# Build the network from the configuration defined in the cells above instead
# of repeating the numbers (batch size 32, 20 sentences, 32 words), so the
# model cannot drift out of sync with the data loaders.
# The class count stays at 2 (binary classification).
model = HierAttNet(
    word_hidden_size, sent_hidden_size,
    batch_size, 2,
    dictionary_path, max_sent_length, max_word_length
    )

# move model to GPU
if torch.cuda.is_available():
    model.cuda()

In [None]:
from tqdm import tqdm, tqdm_notebook

### Hyperparameter Tuning

In [None]:
# Define the loss function and the optimizer.
criterion = nn.CrossEntropyLoss()
# criterion = nn.BCEWithLogitsLoss()
# The optimizer is bound to the parameters of the `model` object that exists
# when this cell runs; re-run this cell if `model` is re-created afterwards.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# NOTE(review): best_loss / best_epoch are not updated in the cells visible here.
best_loss = 1e5
best_epoch = 0
num_iter_per_epoch = len(training_generator)

# 저장된 checkpoint 불러와서 다시 학습

In [None]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard event files are written under ./logs (relative to the working directory).
logdir = 'logs'
# NOTE(review): `writer` is not referenced by the cells visible here — confirm it is still needed.
writer = SummaryWriter(logdir)

In [None]:
""" 모델 epoch 평가 """
def eval_epoch(num_epoch, epoch, model, criterion, optimizer, data_loader):
    """Evaluate ``model`` on ``data_loader`` and print a one-line summary.

    Args:
        num_epoch: Total number of epochs (used only in the printed summary).
        epoch: Zero-based index of the current epoch (printed as ``epoch + 1``).
        model: Network to evaluate; it is left in eval mode on return.
        criterion: Loss function called as ``criterion(predictions, label)``.
        optimizer: Only read for the current learning rate in the summary.
        data_loader: Iterable of ``(feature, label)`` batches.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` where the confusion matrix is
        the string produced by ``get_evaluation``.
    """
    # tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
    from tqdm.notebook import tqdm as notebook_tqdm

    model.eval()  # evaluation mode (disables dropout)

    # Run on whatever device the model lives on.
    device = next(model.parameters()).device

    weighted_loss_sum = 0.0
    num_seen = 0
    te_label_ls = []
    te_pred_ls = []

    with notebook_tqdm(total=len(data_loader), desc="Valid") as pbar:
        for feature, label in data_loader:
            num_sample = len(label)
            feature = feature.to(device)
            label = label.to(device)

            with torch.no_grad():
                # The last batch may be smaller, so size the hidden state per batch.
                model._init_hidden_state(num_sample)
                te_predictions = model(feature)
                te_loss = criterion(te_predictions, label)

            # Weight by batch size so the final value is a per-sample mean.
            # .item() keeps the running sum on the host.
            weighted_loss_sum += te_loss.item() * num_sample
            num_seen += num_sample
            te_label_ls.extend(label.cpu().tolist())
            te_pred_ls.append(te_predictions.cpu())
            pbar.update(1)

    te_pred = torch.cat(te_pred_ls, 0)
    # Divide by the number of samples actually evaluated rather than by the
    # size of a global dataset, so any loader can be passed in.
    te_loss = weighted_loss_sum / num_seen
    test_metrics = get_evaluation(np.array(te_label_ls), te_pred.numpy(),
                                  list_metrics=["accuracy", "confusion_matrix"])

    print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
        epoch + 1,
        num_epoch,
        optimizer.param_groups[0]['lr'],
        te_loss, test_metrics["accuracy"]))

    return test_metrics["accuracy"], test_metrics["confusion_matrix"]

In [None]:
train_losses = []

In [None]:
# Re-create the network from the shared configuration values (batch size,
# sentence cap, word cap) rather than repeating the literals 32 / 20 / 32.
# NOTE: any optimizer built earlier still references the previous model's
# parameters and must be rebuilt before training this instance.
model = HierAttNet(word_hidden_size, sent_hidden_size, batch_size, 2, dictionary_path,
                   max_sent_length, max_word_length)
# The training loop moves batches to the GPU when one is available, so the
# freshly built model has to live there too.
if torch.cuda.is_available():
    model.cuda()

In [None]:
word_hidden_size, sent_hidden_size, dictionary_path

(64, 64, 'data/ko_w2v_version2.txt')

In [50]:
ls

[0m[01;34mcheckpoints[0m/                               Model_Saved_dict.pth
Classify_HAN.ipynb                         Model_Saved.pth
[01;34mdata[0m/                                      models.py
install_mecab-ko_on_colab_light_220429.sh  [01;34m__pycache__[0m/
ko_w2v_128.txt                             Training_0823_WED_sub.ipynb
[01;34mlogs[0m/                                      Untitled0.ipynb
[01;34mmecab-ko-dic-2.1.1-20180720[0m/


In [None]:
checkpoint_path = 'checkpoints/N_model_epoch_5_0.83.pth'  # checkpoint file to restore
# Load onto the CPU first so a checkpoint saved from a GPU run can also be
# restored on a CPU-only runtime. load_state_dict copies the values onto
# whatever device the model's parameters already use.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
model_path = 'checkpoints/'
model_name_template = 'N_model_epoch_{}_{}.pth'
num_iter_per_epoch = len(training_generator)

checkpoint_path = 'checkpoints/N_model_epoch_4_0.81.pth'  # checkpoint file to resume from

# The loop below moves every batch to the GPU when one is available, so the
# model must be there before the optimizer is bound to its parameters.
if torch.cuda.is_available():
    model.cuda()

# Rebind the optimizer to the CURRENT model. If `model` was re-created after
# the optimizer cell ran, the old optimizer would keep updating the discarded
# network. The learning rate and moment estimates come from the checkpoint.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# map_location='cpu' makes GPU-saved checkpoints loadable on any runtime;
# load_state_dict moves the values to the devices of the model / optimizer.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1  # epoch to resume from


# Resume training

# Snapshot of the full model object as restored, before any further updates.
PATH = f'{model_path}/Model_save0.pth'
torch.save(model, PATH)


for epoch in range(start_epoch, num_epochs):
    params = {'num_epoch': num_epochs, 'epoch': epoch, 'model': model, 'optimizer': optimizer, 'criterion': criterion}

    for step, (feature, label) in enumerate(training_generator, start=1):
        # eval_epoch() leaves the model in eval mode, so switch back every step.
        model.train()
        if torch.cuda.is_available():
            feature = feature.cuda()
            label = label.cuda()

        model._init_hidden_state()
        predictions = model(feature)

        # compute the loss
        loss = criterion(predictions, label)

        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()  # compute gradients
        optimizer.step()  # update the model parameters

        # Report every 100 iterations. The accuracy is only computed here
        # because it forces a device-to-host copy.
        if step % 100 == 0:
            training_metrics = get_evaluation(label.cpu().numpy(), predictions.cpu().detach().numpy(), list_metrics=["accuracy"])
            print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                epoch + 1,
                num_epochs,
                step,
                num_iter_per_epoch,
                optimizer.param_groups[0]['lr'],
                loss.item(), training_metrics["accuracy"]))

        # Every 400 iterations: validate, then save a checkpoint.
        if step % 400 == 0:
            accuracy, confusion_matrix = eval_epoch(**params, data_loader=test_generator)
            print(confusion_matrix)
            # Save model and optimizer state so training can be resumed.
            model_file_name = model_name_template.format(epoch, round(accuracy,2))
            torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'epoch': epoch}, model_path + model_file_name)
            # Also keep a pickle of the whole model object for this epoch.
            PATH = f'{model_path}/Model_save_{epoch}'
            torch.save(model, PATH)


Epoch: 6/10, Iteration: 100/1195, Lr: 0.001, Loss: 0.2434108555316925, Accuracy: 0.875
Epoch: 6/10, Iteration: 200/1195, Lr: 0.001, Loss: 0.19735310971736908, Accuracy: 0.9375
Epoch: 6/10, Iteration: 300/1195, Lr: 0.001, Loss: 0.2981373369693756, Accuracy: 0.9375
Epoch: 6/10, Iteration: 400/1195, Lr: 0.001, Loss: 0.22876238822937012, Accuracy: 0.9375


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm_notebook(total=len(data_loader), desc=f"Valid") as pbar:


Valid:   0%|          | 0/71 [00:00<?, ?it/s]

Epoch: 6/10, Lr: 0.001, Loss: 0.41569772362709045, Accuracy: 0.8142222222222222
[[929 247]
 [171 903]]
Epoch: 6/10, Iteration: 500/1195, Lr: 0.001, Loss: 0.24215760827064514, Accuracy: 0.90625
Epoch: 6/10, Iteration: 600/1195, Lr: 0.001, Loss: 0.326753169298172, Accuracy: 0.84375
Epoch: 6/10, Iteration: 700/1195, Lr: 0.001, Loss: 0.1374119371175766, Accuracy: 0.96875
Epoch: 6/10, Iteration: 800/1195, Lr: 0.001, Loss: 0.4024469554424286, Accuracy: 0.875


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm_notebook(total=len(data_loader), desc=f"Valid") as pbar:


Valid:   0%|          | 0/71 [00:00<?, ?it/s]

Epoch: 6/10, Lr: 0.001, Loss: 0.4005464017391205, Accuracy: 0.832
[[893 283]
 [ 95 979]]
Epoch: 6/10, Iteration: 900/1195, Lr: 0.001, Loss: 0.24977155029773712, Accuracy: 0.9375
Epoch: 6/10, Iteration: 1000/1195, Lr: 0.001, Loss: 0.2938868999481201, Accuracy: 0.875
Epoch: 6/10, Iteration: 1100/1195, Lr: 0.001, Loss: 0.18118931353092194, Accuracy: 0.9375
Epoch: 7/10, Iteration: 100/1195, Lr: 0.001, Loss: 0.09860434383153915, Accuracy: 1.0
Epoch: 7/10, Iteration: 200/1195, Lr: 0.001, Loss: 0.13084714114665985, Accuracy: 0.9375
Epoch: 7/10, Iteration: 300/1195, Lr: 0.001, Loss: 0.22303692996501923, Accuracy: 0.96875
Epoch: 7/10, Iteration: 400/1195, Lr: 0.001, Loss: 0.22302667796611786, Accuracy: 0.875


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm_notebook(total=len(data_loader), desc=f"Valid") as pbar:


Valid:   0%|          | 0/71 [00:00<?, ?it/s]

Epoch: 7/10, Lr: 0.001, Loss: 0.46935051679611206, Accuracy: 0.8217777777777778
[[923 253]
 [148 926]]


KeyboardInterrupt: ignored