In [1]:
# Installs naruto_skills into the current environment (no version is pinned here).
!pip install naruto_skills



In [0]:
import sys
# Put /source/main on the import path; the project-local modules imported further
# down (model_def, utils, data_for_train, train) are presumably resolved from there.
sys.path.append('/source/main')

In [0]:
import logging
logging.basicConfig(level=logging.INFO)
import time
from itertools import chain

import torch
from torch import nn
import numpy as np
import pandas as pd
# Show full cell contents when displaying DataFrames. `None` means "no limit";
# the legacy sentinel -1 is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

from model_def.baseline import Baseline
from utils import pytorch_utils
from data_for_train import my_dataset
from train import trainer

# Setup and load model

In [4]:
# Initialise the dataset module. Per the log output it reports the source and
# target vocabulary sizes; presumably this is what populates
# my_dataset.voc_src / my_dataset.voc_tgt used by the cells below.
my_dataset.bootstrap()

INFO:root:Src vocab contains 3519 tokens
INFO:root:Tgt vocab contains 8758 tokens


In [5]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU inference instead: device = torch.device("cpu")
# When re-running this cell, `del model` first releases the previous instance.
# Vocabulary sizes are taken from the dataset's index2word tables.
model = Baseline(src_word_vocab_size=len(my_dataset.voc_src.index2word),
                     tgt_word_vocab_size=len(my_dataset.voc_tgt.index2word))
model.to(device)
# Checkpoint under evaluation: run 2019-05-04T01:16:45, file 180000.pt.
PRE_TRAINED_MODEL='/source/main/train/output/saved_models/Baseline/2019-05-04T01:16:45/180000.pt'
# map_location remaps the stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(PRE_TRAINED_MODEL, map_location=device)
# Only the 'model_state_dict' entry of the checkpoint is used here.
model.load_state_dict(checkpoint['model_state_dict'])
logging.info('Load pre-trained model from %s successfully', PRE_TRAINED_MODEL)

# Switch to evaluation mode (affects layers such as Dropout and BatchNorm).
model.eval()
logging.info('Model architecture: \n%s', model)
logging.info('Total trainable parameters: %s', pytorch_utils.count_parameters(model))

INFO:root:Load pre-trained model from /source/main/train/output/saved_models/Baseline/2019-05-04T01:16:45/180000.pt successfully
INFO:root:Model architecture: 
Baseline(
  (input_embedding): Embedding(3519, 512)
  (conv1): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3_bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.3)
  (conv4): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv4_bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv1d(512, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv5_bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv6_bn): BatchNorm1d(256, eps=1

In [0]:
# Fixed length passed as `equal_length` to voc_src.docs2idx when encoding source sentences.
MAX_LEN = 100

In [0]:
def docs2input_tensors(preprocessed_docs, device):
    """Encode preprocessed source sentences as model input tensors.

    Args:
        preprocessed_docs: iterable of whitespace-tokenised source sentences.
        device: torch device the resulting tensors are moved to.

    Returns:
        A two-element list ``[word_ids, seq_len]``: the vocabulary indices of
        every sentence (each encoded with ``equal_length=MAX_LEN``) and the
        number of whitespace-separated tokens of every sentence.
    """
    # Token count of each sentence, measured on the raw text.
    # NOTE(review): this is not capped at MAX_LEN - confirm the model copes with
    # a length larger than the encoded sequence.
    lengths = list(map(lambda doc: len(doc.split()), preprocessed_docs))

    # Each sentence is encoded on its own so that every row is handled independently.
    encoded = []
    for doc in preprocessed_docs:
        rows = my_dataset.voc_src.docs2idx([doc], equal_length=MAX_LEN)
        encoded.append(rows[0])

    # list -> numpy array -> torch tensor -> target device, for both inputs.
    return [torch.from_numpy(np.array(values)).to(device) for values in (encoded, lengths)]

def replace_unk_tok(pred, src, unk_tok='¶'):
    """Replace unknown-token placeholders in a prediction with the source tokens.

    Both strings are split on whitespace and walked position by position; every
    predicted token equal to ``unk_tok`` is swapped for the source token at the
    same position.

    Args:
        pred: predicted sentence (whitespace-separated tokens).
        src: source sentence the prediction was generated from.
        unk_tok: placeholder emitted for unknown tokens. Defaults to '¶',
            the value that was previously hard-coded.

    Returns:
        The repaired sentence joined with single spaces. The result has as many
        tokens as the shorter of ``pred`` and ``src`` (zip stops at the shorter one).
    """
    restored = [src_tok if pred_tok == unk_tok else pred_tok
                for pred_tok, src_tok in zip(pred.split(), src.split())]
    return ' '.join(restored)

def predict_batch(docs):
    """Translate one batch of preprocessed source sentences.

    Uses the notebook-level ``model``, ``device`` and ``my_dataset``.

    Args:
        docs: sequence of whitespace-tokenised source sentences.

    Returns:
        A list with one translated sentence per input sentence. Each output is
        cut to at most the number of tokens of its source sentence, and
        unknown-token placeholders are replaced by the source token at the
        same position.
    """
    if len(docs) == 0:
        # Nothing to translate; avoid pushing empty tensors through the model.
        return []

    input_tensors = docs2input_tensors(docs, device)
    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        predict_tensor = model.cvt_output(model(*input_tensors))
    predict_numpy = predict_tensor.cpu().numpy()

    translated_docs = my_dataset.voc_tgt.idx2docs(predict_numpy)
    # Keep only as many predicted tokens as the source sentence has
    # (anything beyond that presumably comes from padded positions).
    translated_docs = [' '.join(pred_doc.split()[:len(src_doc.split())])
                       for src_doc, pred_doc in zip(docs, translated_docs)]
    # Copy source tokens over unknown-token placeholders.
    translated_docs = [replace_unk_tok(pred, src) for pred, src in zip(translated_docs, docs)]
    return translated_docs

def predict_docs(docs, batch_size):
    """Translate ``docs`` in consecutive batches of at most ``batch_size`` sentences.

    Args:
        docs: sliceable sequence of preprocessed source sentences.
        batch_size: number of sentences handed to ``predict_batch`` per call.

    Returns:
        A flat list of translated sentences, in the same order as ``docs``.
    """
    translated = []
    for offset in range(0, len(docs), batch_size):
        batch = docs[offset: offset + batch_size]
        translated.extend(predict_batch(batch))
    return translated


In [8]:
# Quick sanity check of the full prediction pipeline on a single unaccented sentence.
predict_docs(['hom nay toi di hoc karetoal'], batch_size=10)

['hôm nay tôi đi học karetoal']

In [0]:
def get_metrics(df):
    """Compute sentence-level and token-level accuracy of the predictions.

    Args:
        df: DataFrame with string columns 'tgt' (reference) and 'pred' (prediction).

    Returns:
        A tuple ``(sen_acc, sen_tok)``:
        * ``sen_acc`` - fraction of rows whose 'pred' equals 'tgt' exactly,
          over all rows;
        * ``sen_tok`` - fraction of matching tokens, computed only over rows
          where 'tgt' and 'pred' have the same number of whitespace tokens.
    """
    def _token_count(text):
        return len(text.split())

    n_sentences = df.shape[0]
    logging.info('Total sentences: %s', n_sentences)
    exact_matches = (df['tgt'] == df['pred']).sum()
    sen_acc = exact_matches / n_sentences

    # Token accuracy needs position-wise alignment, so rows whose token count
    # changed are left out of the token-level comparison.
    same_length = df['tgt'].map(_token_count) == df['pred'].map(_token_count)
    aligned = df[same_length]
    logging.info('Total predicted sequences without changing len: %s', aligned.shape[0])

    tok_tgt = []
    for tgt_doc in aligned['tgt']:
        tok_tgt.extend(tgt_doc.split())
    tok_pred = []
    for pred_doc in aligned['pred']:
        tok_pred.extend(pred_doc.split())

    n_hits = (np.array(tok_tgt) == np.array(tok_pred)).sum()
    sen_tok = n_hits / len(tok_tgt)

    return sen_acc, sen_tok

# Predict

In [0]:
# Test set; the cells below read its 'src' and 'tgt' columns.
df = pd.read_csv('/source/main/data_for_train/output/my_test.csv')


In [11]:
# Time inference over the whole test set. perf_counter is a monotonic,
# high-resolution clock, so the measurement is not affected by system clock adjustments.
start = time.perf_counter()
pred = predict_docs(list(df['src']), batch_size=128)
end = time.perf_counter()
# Store the predictions next to their sources for the metric computation.
df['pred'] = pred
# Pass the value as a logging argument (lazy formatting), like the other logging calls here.
logging.info('Duration: %.2f s', end - start)

INFO:root:Duration: 3.09 s


In [12]:
# Returns (sentence-level accuracy, token-level accuracy) for df['pred'] against df['tgt'].
get_metrics(df)

INFO:root:Total sentences: 5000
INFO:root:Total predicted sequences without changing len: 5000


(0.3934, 0.9739834829348896)

- Baseline/2019-05-04T01:16:45 (checkpoint step: sentence accuracy, token accuracy):
  + 150k: 0.383, 0.9732856714953901
  + 180k: 0.3934, 0.9739834829348896