In [1]:
# 0 := delete the newline (\n)
# 1 := leave it as is

# model = gensim.models.Word2Vec.load('../../../pretrained_model/kor/ko.bin')

# from hangul_utils import split_syllables, join_jamos
# import gensim

In [2]:
import torch
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import data
import bpe
import utils
import pretrained_model as pm
import data_loader as dl
import trainer
import initializer as init
import tester

In [3]:
# df = data.getInitData()
# data.mk_initData(df)
# first_np, second_np, label_np = utils.process(df)

In [4]:
# Load the arrays and push them through the utils pipeline:
# process_splitted -> tokenize -> encode.
first_np, second_np, label_np = data.getData()
first_np, second_np = utils.process_splitted(first_np, second_np)

first_ls, second_ls, ch2idx, max_len = utils.tokenize(first_np, second_np)
first2idx_np, second2idx_np = utils.encode(
    first_ls,
    second_ls,
    ch2idx,
    max_len,
)

In [5]:
# Fetch the pretrained vectors for the vocabulary in `ch2idx` and wrap the
# result in a torch tensor in a single step.
pretrained_word2vec = torch.tensor(pm.load_pretrained_model(ch2idx))

There are 27 / 132 pretrained vectors found.


In [6]:
# Hold out 10% of the encoded pairs as the test set, then split a further 10%
# of the remainder off as the validation set. Both splits share the same
# options, so they are kept in one place.
split_kwargs = {"test_size": 0.1, "random_state": 43}

(
    train_first, test_first,
    train_second, test_second,
    train_labels, test_labels,
) = train_test_split(first2idx_np, second2idx_np, label_np, **split_kwargs)

(
    train_first, val_first,
    train_second, val_second,
    train_labels, val_labels,
) = train_test_split(train_first, train_second, train_labels, **split_kwargs)

In [7]:
# Build the three loaders in one call. The split arrays are passed
# positionally, so their order below must stay exactly as it is.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_first, train_second,
    val_first, val_second,
    test_first, test_second,
    train_labels, val_labels, test_labels,
    batch_size=50,
)

In [8]:
# PyTorch TensorBoard support
# Each run gets its own timestamped sub-directory so that event files from
# separate runs do not end up mixed together in one folder. The parent
# directory is unchanged, so `tensorboard --logdir ocr/tests` still shows
# every run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'ocr/tests/{timestamp}')

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3070


In [9]:
# Fix the seed before the model is created.
trainer.set_seed(42)

# Keep every hyper-parameter in one mapping and unpack it into the factory,
# so the configuration can be inspected or logged as a single object.
model_config = dict(
    # architecture
    model_name="RNN",
    vocab_size=len(ch2idx),
    embed_dim=200,
    hidden_size=100,
    rnn_layers=1,
    num_classes=2,
    dropout=0.0,
    # embedding initialisation
    pretrained_model=pretrained_word2vec,
    freeze_embedding=False,
    # optimisation
    optim_name="Adam",
    learning_rate=0.001,
    loss_fn_name="CEL",
    # placement
    device=device,
)
model, optimizer, loss_fn = init.initialize_model(**model_config)

print(model)

doing with pretrained model!!!
OCR(
  (emb): Embedding(132, 200)
  (lstm1): RNN(
    (rnn): LSTM(200, 100, batch_first=True, bidirectional=True)
  )
  (lstm2): RNN(
    (rnn): LSTM(200, 100, batch_first=True, bidirectional=True)
  )
  (fc1): Linear(in_features=400, out_features=2, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)


In [10]:
# Launch training. All arguments are passed by keyword; they are grouped here
# as model pieces, data, runtime settings, then logging.
trainer.train(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    epochs=20,
    writer=writer,
    title='test3',
)

Start training...

 Epoch  |  Train Loss  | Train Acc  | Val Loss | Val Acc | Elapsed
--------------------------------------------------------------------------------
   1    |   0.239527   | 90.979424  | 0.187160 | 93.33  | 70.99 
   2    |   0.152824   | 94.358025  | 0.150207 | 94.81  | 70.17 
   3    |   0.131749   | 95.139918  | 0.145590 | 94.37  | 70.48 
   4    |   0.119353   | 95.674897  | 0.138006 | 94.89  | 71.33 
   5    |   0.109190   | 95.917695  | 0.137656 | 95.04  | 71.48 
   6    |   0.103007   | 96.111111  | 0.125594 | 95.63  | 71.63 
   7    |   0.096227   | 96.399177  | 0.132787 | 95.00  | 71.85 
   8    |   0.089348   | 96.596708  | 0.132698 | 94.78  | 71.56 
   9    |   0.082214   | 97.004115  | 0.125319 | 95.26  | 71.93 
  10    |   0.076732   | 97.230453  | 0.126546 | 95.19  | 71.92 
  11    |   0.070428   | 97.395062  | 0.143881 | 94.78  | 71.06 
  12    |   0.066333   | 97.674897  | 0.129552 | 95.19  | 71.02 
  13    |   0.059489   | 97.851852  | 0.138091 | 95.1

In [None]:
# Run the model over the test loader and collect predictions and labels.
tot_pred, tot_label = tester.test(
    test_dataloader=test_dataloader,
    device=device,
    model=model,
)

# Move both results to the CPU before handing them to scikit-learn.
y_true = tot_label.cpu()
y_pred = tot_pred.cpu()
results = metrics.classification_report(y_true, y_pred, output_dict=True)

# One row per report entry, then write the table to the Excel file.
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/test3.xlsx', sheet_name='sheet1')

In [16]:
# Grab a single mini-batch from the training loader to use as the sample
# input for graph tracing.
dataiter = iter(train_dataloader)
# Use the built-in next(): the iterator's own .next() method is not available
# on current PyTorch DataLoader iterators and raises AttributeError there.
first, second, labels = next(dataiter)

# add_graph() will trace the sample input through your model,
# and render it as a graph.
# The trace is run on the CPU: model.cpu() moves the model in place, and the
# batch is passed exactly as the loader produced it.
writer.add_graph(model.cpu(), (first, second))
writer.flush()

# model.cpu() above changed the model itself, not a copy. Move it back to the
# selected device so that re-running the training or test cells afterwards
# does not hit a model/input device mismatch.
model = model.to(device)