In [1]:
import itertools
import time
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchtext import data
from torchtext import datasets
from torchtext.data import Field
from torchtext.data import BucketIterator
import torch.optim as optim
from konlpy.tag import *

tagger = Komoran()
tokenize = tagger.morphs

torch.manual_seed(0)
REVIEW = Field(sequential=True,
               tokenize=tokenize,
               use_vocab=True,
               include_lengths=True,
               batch_first=True)

LABEL = Field(sequential=False, use_vocab=False)

train, test = data.TabularDataset.splits(
                                         path='./',
                                         train='train.tsv',
                                         test='test.tsv', format='tsv',
                                         fields=[('review', REVIEW), ('label', LABEL)]

                                         )



train, valid = train.split(random_state=random.seed(0))
print("train length : {}".format(len(train)))
print("test length : {}".format(len(test)))
print("valid length : {}".format(len(valid)))


REVIEW.build_vocab(train)

train length : 2054
test length : 734
valid length : 881


In [2]:
device = torch.device("cuda:0")

batch_size = 32
REVIEW.build_vocab(train)
len(REVIEW.vocab)

# Make iterator for splits
train_iter, test_iter, val_iter = BucketIterator.splits(
    (train, test, valid), batch_size=batch_size, device=device, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.review), sort_within_batch=True, repeat=False) # x.TEXT 길이 기준으로 정렬

# <center>3. Build Model

In [3]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=dropout)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):

        embedded = self.dropout(self.embedding(text))

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, batch_first=True)

        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        output, output_lengths = nn.utils.rnn.pad_packed_sequence(
            packed_output)

        hidden = self.dropout(
            torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        return self.fc(hidden.squeeze(0))

In [4]:
INPUT_DIM = len(REVIEW.vocab)
EMBEDDING_DIM = 70
HIDDEN_DIM = 128
OUTPUT_DIM = 3
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.3

PAD_IDX = REVIEW.vocab.stoi[REVIEW.pad_token]

In [5]:
model = Classifier(INPUT_DIM,
                   EMBEDDING_DIM,
                   HIDDEN_DIM,
                   OUTPUT_DIM,
                   N_LAYERS,
                   BIDIRECTIONAL,
                   DROPOUT,
                   PAD_IDX)
model.to(device)

import numpy as np
# numpy float 출력옵션 변경
np.set_printoptions(formatter={'float_kind': lambda x: "{0:0.3f}".format(x)})

STEP = 50

optimizer = optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss()

In [6]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch
    """
    max_preds = preds.argmax(dim=1, keepdim=True)

    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

best_valid_loss =9999999

epoch_loss = 0
epoch_acc = 0

In [7]:
model.train()
for step in range(STEP):
    losses=[]
    for i, batch in enumerate(train_iter):
        model.train()
        inputs, lengths = batch.review
        targets = batch.label   
        model.zero_grad()

        preds = model(inputs, lengths).squeeze(1)


        loss = loss_function(preds, targets.long()) 

        losses.append(loss.item())

        loss.backward()
        optimizer.step()
    with torch.no_grad():

        model.eval()
        val_losses=[]
        val_accu = []
        for i, batch in enumerate(val_iter):
            inputs, lengths = batch.review
            targets = batch.label
            preds = model(inputs, lengths).squeeze(1)



            val_loss = loss_function(preds, targets.long())

            val_losses.append(val_loss.item())
            optimizer.step()
        print()
        string = '[{}/{}] val_loss: {:.4f}'.format(step+1, STEP, np.mean(val_losses))
        print(string)

        print()

        if np.mean(val_losses) < best_valid_loss:
            best_valid_loss = np.mean(val_losses)
            print("save model")
            print()

            torch.save(model.state_dict(), 'model_base.pt')


[1/50] val_loss: 0.8977

save model


[2/50] val_loss: 0.5335

save model


[3/50] val_loss: 0.4959

save model


[4/50] val_loss: 0.4900

save model


[5/50] val_loss: 0.4764

save model


[6/50] val_loss: 1.0130


[7/50] val_loss: 0.5033


[8/50] val_loss: 0.8399


[9/50] val_loss: 0.4762

save model


[10/50] val_loss: 0.4785


[11/50] val_loss: 0.4371

save model


[12/50] val_loss: 0.4670


[13/50] val_loss: 1.1172


[14/50] val_loss: 0.4614


[15/50] val_loss: 0.4784


[16/50] val_loss: 0.4429


[17/50] val_loss: 0.5288


[18/50] val_loss: 0.4885


[19/50] val_loss: 0.5676


[20/50] val_loss: 0.4585


[21/50] val_loss: 0.5371


[22/50] val_loss: 0.9924


[23/50] val_loss: 0.4948


[24/50] val_loss: 0.5683


[25/50] val_loss: 0.8437


[26/50] val_loss: 0.5667


[27/50] val_loss: 0.5149


[28/50] val_loss: 0.5938


[29/50] val_loss: 0.5429


[30/50] val_loss: 0.5424


[31/50] val_loss: 0.5870


[32/50] val_loss: 0.5964


[33/50] val_loss: 0.7695


[34/50] val_loss: 0.6454


[35/50

In [8]:
model.load_state_dict(torch.load('model_base.pt'),strict=False)

import torch.nn.functional as F

In [9]:
y_hat = []
y_real = []

num_equal=0

with torch.no_grad():
    model.eval()
    num_equal = 0
    val_loss = 0
    for i, batch in enumerate(test_iter):

        inputs, lengths = batch.review
        targets = batch.label
        
        if 0 in lengths:
            idxes = torch.arange(inputs.size(0))
            if USE_CUDA:
                idxes = idxes.cuda()
            mask = idxes[lengths.ne(0)].long()

            inputs = inputs.index_select(0, mask)
            lengths = lengths.masked_select(lengths.ne(0))
            targets = targets.index_select(0, mask)


        preds = model(inputs, lengths)
        loss = loss_function(preds, targets) 


        acc = categorical_accuracy(preds, targets)

        max_preds = preds.argmax(dim = 1, keepdim = True).squeeze(0) # get the index of the max probability
        correct = max_preds.squeeze(1).eq(targets) # 같은것만 찾는 코드

        max_preds = max_preds.squeeze()
        y_hat.append(max_preds.tolist())
        y_real.append(targets.tolist())
        num_equal += int(torch.eq(max_preds, targets).sum())
        val_loss += loss.item()
        
print("Accuracy : " , num_equal / len(pd.DataFrame.from_csv('test.tsv', sep='\t', header=None)))
print("loss : ", val_loss/len(test_iter))


y_hat_flat = list(itertools.chain(*y_hat))
y_real_flat = list(itertools.chain(*y_real))

Accuracy :  0.8065395095367848
loss :  0.48776885618766147




In [10]:
from sklearn.metrics import *
print(confusion_matrix(y_real_flat,y_hat_flat))

print(classification_report(y_real_flat, y_hat_flat, target_names=['class 1','class missing' ]))

[[ 67  88]
 [ 54 525]]
               precision    recall  f1-score   support

      class 1       0.55      0.43      0.49       155
class missing       0.86      0.91      0.88       579

    micro avg       0.81      0.81      0.81       734
    macro avg       0.71      0.67      0.68       734
 weighted avg       0.79      0.81      0.80       734



In [11]:
result_dic = classification_report(y_real_flat, y_hat_flat, target_names=['class 1','class missing' ], output_dict=True)

In [12]:
model.parameters

<bound method Module.parameters of Classifier(
  (embedding): Embedding(9218, 70, padding_idx=1)
  (rnn): LSTM(70, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.3)
)>

In [13]:
err = result_dic['class 1']['f1-score'] 

In [14]:
err

0.48550724637681164

In [15]:
import json
with open('logging/{}_base.json'.format(err), 'w', encoding='utf-8') as make_file:
    json.dump(result_dic, make_file, ensure_ascii=False, indent="\t")