In [None]:
from fastai2.basics import *
from transformers import AutoTokenizer
from fastai_transformers_utils.all import *

from nmt_try.models.patch import *
from nmt_try.models.qrnn2qrnn import *
from nmt_try.data.tatoeba import *
from nmt_try.metrics import compute_bleu

In [None]:
# all_skip

In [None]:
# Location of the tokenized CSV consumed by `get_tatoeba_dss` further down.
tok_data_loc = './test_data/tok_cmn.csv'

# Pretrained checkpoints whose tokenizers are used for the source (encoder)
# and target (decoder) sides.
enc_model_name, dec_model_name = 'hfl/chinese-bert-wwm-ext', 'distilgpt2'

# Fixed sequence lengths handed to the dataset builder for each side.
enc_seq_len, dec_seq_len = 50, 40

In [None]:
# Source-side tokenizer, loaded from the checkpoint named by `enc_model_name`.
enc_tokenizer = AutoTokenizer.from_pretrained(enc_model_name)
# Target-side tokenizer, loaded from `dec_model_name` through GPT2DecoderTokenizer.
# NOTE(review): GPT2DecoderTokenizer is not defined in this notebook; presumably it
# adds the extra special tokens the decoder needs — confirm against its source.
dec_tokenizer = GPT2DecoderTokenizer.from_pretrained(dec_model_name)

# Full Test of QRNN2AttnQRNN

## Datasets

In [None]:
# Positional arguments shared by both dataset variants.
tatoeba_args = (tok_data_loc, enc_tokenizer, dec_tokenizer, enc_seq_len, dec_seq_len)

# `small_dss` is built with pct=0.2 for quick experiments; `dss` uses the default.
small_dss = get_tatoeba_dss(*tatoeba_args, pct=0.2)
dss = get_tatoeba_dss(*tatoeba_args)

# Show the training-set sizes of both variants.
len(small_dss.train), len(dss.train)

(3392, 16964)

In [None]:
# Fetch the training item once and reuse it, so the raw tensors and the decoded
# text displayed below are guaranteed to come from the same lookup (the original
# indexed the dataset twice).
sample_item = dss.train[10]
sample_item, dss.decode(sample_item)

((TensorText([ 101,  800, 6651,  749,  511,  102,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0]),
  TensorText([50257,  1544,  4966,    13, 50256, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258]),
  TensorText([ 1544,  4966,    13, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 5

In [None]:
# Disabled debug helper: print shape, dtype, device and type of each of the
# three tensors in the first training batch, then stop.
# dls = dss.dataloaders(bs=2)
# for x in dls.train:
#     print(x[0].shape, x[0].dtype, x[0].device, type(x[0]))
#     print(x[1].shape, x[1].dtype, x[1].device, type(x[1]))
#     print(x[2].shape, x[2].dtype, x[2].device, type(x[2]))
#     break

## Model

In [None]:
# Vocabulary size and padding id for each side, taken from the tokenizers.
enc_vocab_size, enc_pad_id = len(enc_tokenizer), enc_tokenizer.pad_token_id
dec_vocab_size, dec_pad_id = len(dec_tokenizer), dec_tokenizer.pad_token_id

# Model hyperparameters passed to the encoder/decoder constructors below.
embeded_size = 256
num_encoder_layers = num_decoder_layers = 2
drop_p = 0.1

# Number of heads handed to the cross-attention decoder.
num_heads = 1

In [None]:
# Clear model objects left over from an earlier run of this cell before
# rebuilding them. `%xdel` prints "NameError: name ... is not defined" when the
# variable does not exist yet (e.g. on a fresh kernel), so it is only invoked
# for names that are actually present in the notebook namespace.
for _stale in ('qrnn2attnqrnn', 'decoder', 'encoder'):
    if _stale in globals():
        get_ipython().run_line_magic('xdel', _stale)

# Encoder and cross-attention decoder share the same embedding size and dropout.
encoder = QRNNEncoder(enc_vocab_size, embeded_size, enc_pad_id, num_encoder_layers, drop_p)
decoder = CrossAttnQRNNDecoder(dec_vocab_size, embeded_size, dec_pad_id, num_decoder_layers, drop_p, num_heads)
# Full sequence-to-sequence model wrapping the encoder and decoder built above.
qrnn2attnqrnn = QRNN2AttnQRNN(encoder, decoder, num_encoder_layers, num_decoder_layers, embeded_size, enc_pad_id)

NameError: name 'qrnn2attnqrnn' is not defined
NameError: name 'decoder' is not defined
NameError: name 'encoder' is not defined


## Learner and Train

In [None]:
# Clear the DataLoaders/Learner from an earlier run of this cell. `%xdel` prints
# a NameError message for names that do not exist yet (fresh kernel), so it is
# only invoked for names that are actually defined.
for _stale in ('dls', 'learn'):
    if _stale in globals():
        get_ipython().run_line_magic('xdel', _stale)

# Train on the reduced dataset; switch to the commented line for the full one.
dls = small_dss.dataloaders(bs=128)
# dls = dss.dataloaders(bs=128)

learn = Learner(
    dls,
    qrnn2attnqrnn,
    # Target positions equal to the decoder pad id are excluded from the loss.
    loss_func=CrossEntropyLossFlat(ignore_index=dec_pad_id),
    opt_func=Adam,
    # NOTE(review): `accuracy` is given no pad id here, so it presumably also
    # counts padded positions — confirm before reading much into the number.
    metrics=[accuracy, Perplexity()],
    cbs=[QRNN2QRNNCallback()],
).to_fp16()

NameError: name 'dls' is not defined
NameError: name 'learn' is not defined


In [None]:
# Short one-cycle run: a single epoch with a peak learning rate of 5e-4.
learn.fit_one_cycle(n_epoch=1, lr_max=5e-4)

In [None]:
# Disabled longer run (3 epochs at the same peak learning rate of 5e-4).
# NOTE(review): the three-row results table saved under this cell appears to
# date from when this line was active — re-run or clear the output to confirm.
# learn.fit_one_cycle(3, 5e-4)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.785617,5.789774,0.025,326.939117,00:48
1,5.883923,5.723858,0.025,306.083496,00:47
2,5.720279,5.64751,0.024375,283.584351,00:48


## BLEU

In [None]:
# Generation settings for the BLEU evaluation: single beam, sampling left
# disabled (presumably plain greedy decoding — confirm in GenerateArgs).
generate_args = GenerateArgs(
    num_beams=1,
#     do_sample=True,
    max_length=20,
    temperature=1.0,
    repetition_penalty=1,
    length_penalty=1.0,
)

# Wrap the trained model together with both tokenizers for text generation.
generated_qrnn2attnqrnn = GeneratedQRNN2AttnQRNN(qrnn2attnqrnn, enc_tokenizer, dec_tokenizer)

In [None]:
# Score the generation wrapper on the validation DataLoader with the settings
# in `generate_args`. The output recorded below is ~2e-52 together with a
# warning about zero 2-gram overlaps, i.e. the score is effectively zero.
compute_bleu(generated_qrnn2attnqrnn, generate_args, dec_tokenizer, dls.valid)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


2.1120610680284926e-52

## Generate

In [None]:
# Settings for the qualitative generation examples below: same as the BLEU run
# except that three beams are used instead of one.
beam_search_kwargs = dict(
    max_length=20,
#     do_sample=True,
    num_beams=3,
    temperature=1.0,
    repetition_penalty=1,
    length_penalty=1.0,
)
generate_args = GenerateArgs(**beam_search_kwargs)

In [None]:
# Example sentences kept as (Chinese source, English reference) pairs so each
# source stays next to its reference translation.
sentence_pairs = [
    (
        '他们有自己的愿景，自己的规划师、建筑师和工程师，自己的劳动力。',
        'They have their own vision; their own planners, architects, and engineers; and their own manpower.',
    ),
    (
        '随着需求不断攀升，不仅亚洲会创造出更多更好的就业机会，全球范围内的供应链及整个生产网络也将会从中受益。',
        'As demand rises, more and better jobs will be created not only in Asia, but also globally, along supply chains and across production networks.',
    ),
    (
        '如果欧盟想要突破共同的经济和货币政策的界限，在发展安全防卫政策的同时发展共同的外交政策，英国必须参与。',
        'If the EU is to progress beyond the limits of a common economic and monetary policy and develop a defense and security policy along with a common foreign policy, the UK must be on board.',
    ),
    (
        '如今，当新的流感菌株在亚洲出现时，科学家收集咽喉棉签，分离病毒，测定毒株的基因序列。',
        'Today, when a new strain of influenza appears in Asia, scientists collect a throat swab, isolate the virus, and run the strain’s genetic sequence.',
    ),
    (
        '李总统计划的其他要素还包括建设生态友好的运输网络，例如高速铁路以及几百公里长的自行车车道，并且从垃圾堆中利用甲烷来制造能源。',
        'Other elements of Lee’s plan include construction of eco-friendly transportation networks, such as high-speed railways and hundreds of kilometers of bicycle tracks, and generating energy using waste methane from landfills.',
    ),
]
src_strs = [zh for zh, en in sentence_pairs]
# Reference translations; not passed to the generation call in this cell.
tgt_strs = [en for zh, en in sentence_pairs]

# Generate translations for the source sentences on the first CUDA device.
result = generated_qrnn2attnqrnn.generate_from_strs(src_strs, generate_args, device='cuda:0')
result

['..................',
 '..................',
 '..................',
 '..................',
 '..................']

In [None]:
# Re-run generation on the example sentences. `src_strs` (and the matching
# `tgt_strs` references) are already defined by the previous cell, so they are
# reused here instead of re-declaring byte-identical copies of both lists.
result = generated_qrnn2attnqrnn.generate_from_strs(src_strs, generate_args, device='cuda:0')
result

['..................',
 '..................',
 '..................',
 '..................',
 '..................']