In [1]:
!pip install naruto_skills



In [2]:
import sys

# Extend the import path so the project-local imports in the next cell can be
# resolved (presumably those packages live under this directory).
# The membership check keeps the cell idempotent: re-running it does not add
# duplicate entries to sys.path.
if '/source/main' not in sys.path:
    sys.path.append('/source/main')

In [3]:
import logging
logging.basicConfig(level=logging.INFO)
import time
from itertools import chain

import torch
from torch import nn
import numpy as np
import pandas as pd
# Show full cell contents when displaying DataFrames. `None` is the supported
# "no limit" value; `-1` is deprecated since pandas 1.0 and rejected by newer
# releases. Old releases that only accept integers raise ValueError for None,
# so fall back to the legacy value there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

from preprocess import preprocessor
from model_def.seq2seq_attn import Seq2SeqAttn
from utils import pytorch_utils, text_utils
from data_for_train import dataset as my_dataset
from train import trainer

# Setup and load model

In [4]:
# Initialise the dataset module. Per the log output of this cell, this indexes
# the vocabularies and loads them from the pickled vocab files.
my_dataset.bootstrap()

INFO:root:Indexing vocabs successfully. Total vocabs: 25390
INFO:root:Indexing vocabs successfully. Total vocabs: 50437
INFO:root:Vocab for source from file /source/main/vocab/output/src.pkl contains 25390 tokens
INFO:root:Vocab for source from file /source/main/vocab/output/tgt.pkl contains 50437 tokens


In [5]:
# Shorthand for the target-side vocabulary object held by my_dataset.
voc = my_dataset.voc_tgt

In [7]:
# Scratch cell: evaluating the bare name only displays the ufunc's repr.
np.mod

<ufunc 'remainder'>

In [13]:
# One row of float-valued indices to decode back into text as a sanity check.
temp = [[ 1201.,  7082.,  1099.,   167.,   644., 11650.,   772.,   897.,  1357., 4310.]]
# Cast to int and wrap the values modulo the vocabulary size before decoding,
# so every index falls inside the range of voc.index2word.
voc.idx2docs(np.mod(np.array(temp).astype(int), len(voc.index2word)))

['nỗ Bạo nổ năng áp Lắm Vũ nợ Điện Áp']

['lực lực lực lực lực lực lực lực lực lúc']

In [6]:
# Inference is pinned to the CPU; the commented-out line below would select
# CUDA when it is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
# del model
# Vocabulary sizes are taken from the vocabularies held by my_dataset.
# NOTE(review): start_idx=2 / end_idx=3 are presumably the start/end token ids
# of the target vocabulary — confirm against the vocab definition.
model = Seq2SeqAttn(src_vocab_size=len(my_dataset.voc_src.index2word),
                    tgt_vocab_size=len(my_dataset.voc_tgt.index2word),
                    start_idx=2,
                    end_idx=3
                    )
model.to(device)
PRE_TRAINED_MODEL='/source/main/train/output/saved_models//Seq2SeqAttn/2019-05-30T06:27:24/210000.pt'
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source. map_location remaps stored tensors onto `device`.
checkpoint = torch.load(PRE_TRAINED_MODEL, map_location=device)
# Only the 'model_state_dict' entry of the checkpoint is used here.
model.load_state_dict(checkpoint['model_state_dict'])
logging.info('Load pre-trained model from %s successfully', PRE_TRAINED_MODEL)

# Switch to evaluation mode before running inference.
model.eval()
logging.info('Model architecture: \n%s', model)
logging.info('Total trainable parameters: %s', pytorch_utils.count_parameters(model))

INFO:root:Load pre-trained model from /source/main/train/output/saved_models//Seq2SeqAttn/2019-05-30T06:27:24/210000.pt successfully
INFO:root:Model architecture: 
Seq2SeqAttn(
  (encoder): Encoder(
    (embedding): Embedding(25390, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.3, bidirectional=True)
    (dropout): Dropout(p=0.3)
  )
  (flatten_hidden_lstm): FlattenHiddenLSTM()
  (core_decoder): AttnRawDecoder(
    (embedding): Embedding(50437, 256)
    (lstm): LSTM(256, 512, num_layers=3, dropout=0.3)
    (attention): Attention(
      (scoring): Linear(in_features=512, out_features=1024, bias=True)
      (softmax): Softmax()
    )
    (output_mapping): Linear(in_features=1536, out_features=50437, bias=True)
    (dropout): Dropout(p=0.3)
  )
  (greedy_infer): DecoderGreedyInfer(
    (core_decoder): AttnRawDecoder(
      (embedding): Embedding(50437, 256)
      (lstm): LSTM(256, 512, num_layers=3, dropout=0.3)
      (attention): Attention(
        (scoring): Linear(in_features

In [7]:
# Length limit passed to preprocessor.infer_preprocess (max_length) and to
# voc_src.docs2idx (equal_length) when building model inputs.
MAX_LEN = 100

In [12]:
def docs2input_tensors(docs, device):
    """Convert raw documents into the tensors fed to the model.

    Args:
        docs: documents to run through ``preprocessor.infer_preprocess``.
        device: torch device the resulting tensors are moved to.

    Returns:
        A two-element list ``[word_indices, sequence_lengths]`` of tensors
        located on ``device``.
    """
    cleaned_docs = preprocessor.infer_preprocess(docs, max_length=MAX_LEN)
    # Lengths are counted in whitespace-separated tokens of the preprocessed text.
    lengths = [len(text.split()) for text in cleaned_docs]
    indices = my_dataset.voc_src.docs2idx(cleaned_docs, equal_length=MAX_LEN)

    tensors = []
    for raw in (indices, lengths):
        tensors.append(torch.from_numpy(np.array(raw)).to(device))
    return tensors

def replace_unk_tok(pred, src, unk_tok='__o__'):
    """Fill unknown-token placeholders in a prediction with the source tokens.

    Both strings are split on whitespace and paired position by position.
    Wherever the predicted token equals ``unk_tok`` the source token at the
    same position is used instead. Pairing stops at the shorter of the two
    token sequences, so surplus tokens on either side are dropped.

    Args:
        pred: predicted sentence.
        src: source sentence, aligned token-by-token with ``pred``.
        unk_tok: placeholder token to replace; defaults to ``'__o__'``.

    Returns:
        The repaired sentence with tokens joined by single spaces.
    """
    tokens = [s if p == unk_tok else p for p, s in zip(pred.split(), src.split())]
    return ' '.join(tokens)

def predict_batch(docs):
    """Run the model on one batch of documents and return the decoded text.

    Each prediction is cut to the number of whitespace tokens of its source
    document, and unknown-token placeholders are replaced by the source token
    at the same position (see ``replace_unk_tok``).

    Args:
        docs: list of source documents forming a single batch.

    Returns:
        A list of predicted strings, one per input document.
    """
    input_tensors = docs2input_tensors(docs, device)
    # Inference only: disable gradient tracking so no autograd state is kept
    # alive for the forward pass. The predicted values are unchanged.
    with torch.no_grad():
        predict_tensor = model(*input_tensors)
    predict_numpy = predict_tensor.cpu().numpy()

    translated_docs = my_dataset.voc_tgt.idx2docs(predict_numpy)
    results = []
    for src_doc, pred_doc in zip(docs, translated_docs):
        # Keep at most as many predicted tokens as the source has.
        n_src_tokens = len(src_doc.split())
        clipped = ' '.join(pred_doc.split()[:n_src_tokens])
        results.append(replace_unk_tok(clipped, src_doc))
    return results

def predict_docs(docs, batch_size):
    """Predict every document in ``docs``, processing ``batch_size`` at a time.

    Args:
        docs: sequence of source documents (sliced into consecutive batches).
        batch_size: number of documents passed to ``predict_batch`` per call.

    Returns:
        A flat list of predictions in the same order as ``docs``.
    """
    batch_results = []
    for offset in range(0, len(docs), batch_size):
        batch_results.append(predict_batch(docs[offset: offset + batch_size]))

    predictions = []
    for result in batch_results:
        predictions.extend(result)
    return predictions


In [14]:
# Smoke test on a single unaccented sentence. In the recorded output the last
# word comes back unchanged while the other words gain diacritics.
predict_docs(['Hom nay toi di hoc karetoal'], batch_size=10)

['Hôm nay tôi đi học karetoal']

In [15]:
# NOTE(review): helper from text_utils applied to the sample prediction file;
# what it writes or returns is not visible from this notebook — confirm.
text_utils.decompose_predicted_test_file('/source/main/data_download/output/sample_pred.txt')

In [16]:
def get_metrics(df):
    """Compute sentence-level and token-level accuracy of the predictions.

    Args:
        df: DataFrame with string columns ``'tgt'`` (reference) and ``'pred'``
            (prediction).

    Returns:
        A tuple ``(sentence_accuracy, token_accuracy)``. Sentence accuracy is
        the share of rows whose prediction equals the target exactly. Token
        accuracy is computed only over rows whose target and prediction have
        the same number of whitespace-separated tokens.
    """
    n_sentences = df.shape[0]
    logging.info('Total sentences: %s', n_sentences)
    exact_matches = (df['tgt'] == df['pred']).sum()
    sen_acc = exact_matches / n_sentences

    def count_tokens(text):
        return len(text.split())

    # Rows where the prediction changed the token count cannot be compared
    # position by position, so they are left out of the token-level score.
    same_length = df['tgt'].map(count_tokens) == df['pred'].map(count_tokens)
    aligned = df[same_length]
    logging.info('Total predicted sequences without changing len: %s', aligned.shape[0])

    tgt_tokens = []
    pred_tokens = []
    for tgt_doc, pred_doc in zip(aligned['tgt'], aligned['pred']):
        tgt_tokens.extend(tgt_doc.split())
        pred_tokens.extend(pred_doc.split())

    token_matches = np.array(tgt_tokens) == np.array(pred_tokens)
    sen_tok = token_matches.sum() / len(tgt_tokens)

    return sen_acc, sen_tok

# Predict

In [17]:
# Load the test CSV; later cells read its 'src' and 'tgt' columns.
df = pd.read_csv('/source/main/data_for_train/output/my_test.csv')

In [18]:
# Predict the whole 'src' column in batches of 128 and time the run.
start = time.time()
pred = predict_docs(list(df['src']), batch_size=128)
end = time.time()
# Store predictions next to their sources; get_metrics reads df['pred'].
df['pred'] = pred
# Pass the duration as a logging argument (lazy formatting), matching the
# other logging calls in this notebook.
logging.info('Duration: %.2f s', end - start)

KeyboardInterrupt: 

In [None]:
# Displays (sentence accuracy, token accuracy) for the predictions in df['pred'].
get_metrics(df)

- Baseline/2019-05-04T01:16:45:
  + 150k: 0.383, 0.9732856714953901
  + 180k: 0.3934, 0.9739834829348896
  + 120k (version on gpu): 0.3726, 0.9717781354609999
- SimpleButHuge/2019-05-04T01:40:37:
  + 80k: 0.3714, 0.9716497644886392
  + 150k: 0.4134, 0.9751091976155914

In [None]:
# Run text_utils.process_line over every target sentence, then transpose the
# per-sentence results with zip(*...) and keep position 1 of each result.
# NOTE(review): presumably position 1 is the per-sentence sequence that is
# compared below — confirm against process_line.
tgt = [text_utils.process_line(doc) for doc in df['tgt']]
tgt = list(zip(*tgt))[1]
# tgt = list(chain(*tgt))
# Same extraction for the predictions. This rebinds `pred`, which previously
# held the raw output of predict_docs.
pred = [text_utils.process_line(doc) for doc in df['pred']]
pred = list(zip(*pred))[1]
# pred = list(chain(*pred))

In [None]:
# Store the extracted sequences as new columns so they stay aligned row by row.
df['tgt_'] = tgt
df['pred_'] = pred

In [None]:
# Shape of the subset whose target and prediction sequences differ in length
# (the first entry is the number of such rows).
df[df['tgt_'].map(len) != df['pred_'].map(len)].shape

In [None]:
# Keep only rows where target and prediction sequences have the same length,
# so they can be compared position by position.
df_ = df[df['tgt_'].map(len) == df['pred_'].map(len)]

In [None]:
# Flatten the per-row sequences into one flat list each (rebinds tgt / pred).
tgt = list(chain(*df_['tgt_']))
pred = list(chain(*df_['pred_']))

In [None]:
# Number of flattened target elements.
len(tgt)

In [None]:
# Number of flattened predicted elements; should equal len(tgt) because only
# equal-length rows were kept.
len(pred)

In [None]:
# Fraction of positions where the flattened target and prediction agree.
(np.array(tgt) == np.array(pred)).sum()/len(tgt)

- SimpleButHuge/2019-05-04T01:40:37:
 + 470k: 0.9750297875460562

# Predict to submit

In [None]:
# Read the competition test file into a two-column frame: column 0 holds the
# first three characters of each line, column 1 everything from the fifth
# character on (the fourth character is skipped).
# NOTE(review): presumably a 3-character line id followed by one separator
# character — confirm against the file format.
with open('/dataset/vietnamese_tone_prediction/test.txt', 'rt', encoding='utf-8') as i_f:
    # Strip only the line terminator. Slicing off the last character would
    # cut real text when the final line has no trailing newline.
    lines = [line.rstrip('\n') for line in i_f]
lines = [(line[:3], line[4:]) for line in lines]
df_submit = pd.DataFrame(lines)

In [None]:
# Preprocess the text column (column 1) with a length limit of 200.
# NOTE(review): this differs from MAX_LEN (100) used when building model
# inputs — confirm that the larger limit is intentional.
df_submit[1] = preprocessor.infer_preprocess(df_submit[1], 200)

In [None]:
# Number of whitespace-separated tokens in each preprocessed line.
df_submit['len'] = df_submit[1].map(lambda x: len(x.split()))

In [None]:
# Summary statistics of the per-line token counts.
df_submit['len'].describe()