In [None]:
import itertools
import time
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchtext import data
from torchtext import datasets
from torchtext.data import Field
from torchtext.data import BucketIterator
import torch.optim as optim
from konlpy.tag import *

# Komoran tagger from konlpy; its `morphs` method is used as the tokenizer
# for the review text.
tagger = Komoran()
tokenize = tagger.morphs

# Seed the RNGs used in this notebook so a fresh "Restart & Run All"
# reproduces the same split and the same weight initialisation.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Review text: tokenized, mapped through a vocabulary, returned together with
# the per-example lengths as (batch, seq) tensors.
REVIEW = Field(sequential=True,
               tokenize=tokenize,
               use_vocab=True,
               include_lengths=True,
               batch_first=True)

# Labels are read as-is (no vocabulary lookup).
LABEL = Field(sequential=False, use_vocab=False)

train, test = data.TabularDataset.splits(
    path='./',
    train='train.tsv',
    test='test.tsv',
    format='tsv',
    fields=[('review', REVIEW), ('label', LABEL)],
)

# `random.seed()` returns None, so the original `random_state=random.seed(0)`
# handed None to `split` and was reproducible only through the side effect of
# evaluating the argument. Seed explicitly and pass the state object instead.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())
print("train length : {}".format(len(train)))
print("test length : {}".format(len(test)))
print("valid length : {}".format(len(valid)))

# The vocabulary is built from the training portion only.
REVIEW.build_vocab(train)

In [None]:
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

batch_size = 128

# The vocabulary was already built from `train` in the data-loading cell;
# report its size here instead of rebuilding it.
print("vocab size : {}".format(len(REVIEW.vocab)))

# One iterator per split, returned in the same order as the datasets passed in.
train_iter, test_iter, val_iter = BucketIterator.splits(
    (train, test, valid),
    batch_size=batch_size,
    device=device,
    # Examples are ordered by the number of tokens in the review.
    sort_key=lambda x: len(x.review),
    # Batches stay sorted by length; the model packs them with
    # pack_padded_sequence, which relies on that ordering.
    sort_within_batch=True,
    repeat=False,
)

# <center>3. Build Model</center>

In [None]:
class Classifier(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers and to the final hidden state.
        pad_idx: vocabulary index of the padding token (its embedding is not
            updated).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           # Inter-layer dropout is meaningless (and warns)
                           # with a single layer.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape (batch, output_dim).

        Args:
            text: (batch, seq) tensor of token indices, padded with `pad_idx`.
            text_lengths: true length of every row of `text`, in the order
                required by `pack_padded_sequence` (longest first).
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence needs the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        lengths_cpu = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths_cpu, batch_first=True)

        # Only the final hidden state is used; the per-step outputs are not.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last two entries: forward and backward state of the top layer.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        # No squeeze here: the batch axis must survive even for a batch of one.
        return self.fc(self.dropout(final_state))

In [None]:
# --- values derived from the vocabulary ---
# Index of the padding token in the REVIEW vocabulary.
PAD_IDX = REVIEW.vocab.stoi[REVIEW.pad_token]
# Number of entries in the REVIEW vocabulary.
INPUT_DIM = len(REVIEW.vocab)

# --- architecture ---
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
N_LAYERS = 4
BIDIRECTIONAL = True
OUTPUT_DIM = 3

# --- regularisation ---
DROPOUT = 0.3

In [None]:
# Instantiate the network from the hyper-parameter constants and move it to
# the selected device.
model = Classifier(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
model.to(device)

import numpy as np
# Change the numpy float print option: three decimals.
np.set_printoptions(formatter={'float_kind': "{0:0.3f}".format})

# Number of passes the training cell makes over the training iterator.
STEP = 50

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
def categorical_accuracy(preds, y):
    """Return the fraction of correct predictions in one batch.

    Args:
        preds: (batch, n_classes) tensor of scores; the predicted class is the
            arg-max over dimension 1.
        y: (batch,) tensor of true class indices.

    Returns:
        A float tensor of shape (1,) on the same device as `preds`.
    """
    predicted = preds.argmax(dim=1)
    n_correct = predicted.eq(y).sum()
    # Divide by a plain Python int rather than a CPU FloatTensor, so the
    # computation never mixes a GPU tensor with a CPU one.
    return n_correct.float().reshape(1) / y.shape[0]

# Lowest validation loss seen so far. Start at +inf so the first finite
# validation loss always counts as an improvement and a checkpoint is written.
best_valid_loss = float("inf")

# Running totals (not updated by the training cell below).
epoch_loss = 0
epoch_acc = 0

In [None]:
# Train for STEP passes over `train_iter`. After every pass the model is
# scored on `val_iter`, and the weights are saved whenever the mean validation
# loss improves on the best value seen so far.
for step in range(STEP):
    # ---- training pass ----
    model.train()
    losses = []
    for batch in train_iter:
        inputs, lengths = batch.review
        targets = batch.label

        optimizer.zero_grad()

        # Reshape to (batch, n_classes) so a batch holding a single example
        # keeps its batch axis.
        preds = model(inputs, lengths).view(inputs.size(0), -1)
        loss = loss_function(preds, targets.long())

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    # ---- validation pass ----
    # No optimizer step here: stepping during evaluation would re-apply the
    # gradients left over from the last training batch and change the weights
    # while they are being scored.
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_iter:
            inputs, lengths = batch.review
            targets = batch.label

            preds = model(inputs, lengths).view(inputs.size(0), -1)
            val_losses.append(loss_function(preds, targets.long()).item())

    mean_val_loss = float(np.mean(val_losses))

    print()
    print('[{}/{}] val_loss: {:.4f}'.format(step + 1, STEP, mean_val_loss))
    print()

    if mean_val_loss < best_valid_loss:
        best_valid_loss = mean_val_loss
        print("save model")
        print()

        torch.save(model.state_dict(), 'model_base.pt')

In [None]:
# Restore the checkpoint written by the training cell.
# - map_location puts the tensors on the device selected earlier, so a
#   checkpoint saved from a GPU run can also be loaded on a CPU-only machine.
# - Strict loading (the default) is used on purpose: a checkpoint that does not
#   match the architecture should fail loudly rather than leave some layers at
#   their random initialisation.
model.load_state_dict(torch.load('model_base.pt', map_location=device))

import torch.nn.functional as F

In [None]:
# Score the restored model on the test iterator, collecting the predicted and
# true labels of every batch for the reports in the following cells.
y_hat = []       # per-batch lists of predicted class indices
y_real = []      # per-batch lists of true class indices

num_equal = 0    # number of correct predictions
val_loss = 0     # sum of the per-batch losses
n_batches = 0    # batches that were actually scored

model.eval()
with torch.no_grad():
    for batch in test_iter:
        inputs, lengths = batch.review
        targets = batch.label

        # Reviews that tokenize to nothing have length 0 and cannot be packed
        # by the model; drop them from the batch.
        keep = lengths.ne(0)
        if not bool(keep.all()):
            inputs = inputs[keep]
            lengths = lengths[keep]
            targets = targets[keep]
        if inputs.size(0) == 0:
            continue

        # Reshape to (batch, n_classes) so a batch holding a single example
        # keeps its batch axis.
        preds = model(inputs, lengths).view(inputs.size(0), -1)
        loss = loss_function(preds, targets.long())

        # Index of the highest score = predicted class, one per example.
        max_preds = preds.argmax(dim=1)

        y_hat.append(max_preds.tolist())
        y_real.append(targets.tolist())
        num_equal += int(max_preds.eq(targets).sum())
        val_loss += loss.item()
        n_batches += 1

# The denominator is the full test set, so examples dropped above count as
# misclassified (same as counting the rows of test.tsv, without re-reading it).
print("Accuracy : " , num_equal / len(test))
print("loss : ", val_loss / max(n_batches, 1))


y_hat_flat = list(itertools.chain.from_iterable(y_hat))
y_real_flat = list(itertools.chain.from_iterable(y_real))

In [None]:
# Import only the two functions used below instead of the whole namespace.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: first argument is the true labels, second the predictions.
print(confusion_matrix(y_real_flat, y_hat_flat))

# Per-class precision / recall / F1, labelled with readable class names.
print(classification_report(y_real_flat, y_hat_flat, target_names=['class 0', 'class 1','class missing' ]))

In [None]:
# Same report as printed above, but returned as a nested dict so individual
# scores can be looked up by class name and metric.
result_dic = classification_report(
    y_real_flat,
    y_hat_flat,
    target_names=['class 0', 'class 1', 'class missing'],
    output_dict=True,
)

In [None]:
# Display the module tree (layers and their sizes) as this cell's output.
# `model.parameters` only referenced the bound method without calling it.
model

In [None]:
# Mean of the F1-scores of 'class 0' and 'class 1'; 'class missing' is left out.
# NOTE: despite the name, this is a score (higher is better), not an error.
err = sum(result_dic[name]['f1-score'] for name in ('class 0', 'class 1')) / 2

In [None]:
# Show the averaged F1 value stored in `err` as this cell's output.
err

In [None]:
import json
import os

# The report is written to logging/<err>.json. Create the directory first:
# open() does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout.
os.makedirs('logging', exist_ok=True)
with open('logging/{}.json'.format(err), 'w', encoding='utf-8') as make_file:
    json.dump(result_dic, make_file, ensure_ascii=False, indent="\t")