In [None]:
# export
from transformers import AutoTokenizer
from fastai2.basics import *

from fastai_transformers_utils.generated_lm import GenerateArgs
from fastai_transformers_utils.tokenizers import GPT2DecoderTokenizer

from nmt_try.models.gru2gru import GeneratedGRU2GRU, GRUEncoder, GRUDecoder, GRU2GRU
from nmt_try.models.tran2tran import GeneratedTran2Tran, TranEncoder, TranDecoder, Tran2Tran
from nmt_try.models.bert2gpt2 import BertEncoder, GPT2Decoder, Bert2GPT2, GeneratedBert2GPT2
from nmt_try.models.qrnn2qrnn import GeneratedQRNN2QRNN, QRNNEncoder, QRNNDecoder, QRNN2QRNN, GeneratedQRNN2AttnQRNN, CrossAttnQRNNDecoder, QRNN2AttnQRNN


In [None]:
# default_exp models.patch

# Models Patch
> Patches a `generate_from_strs()` convenience method onto the generated seq2seq model wrappers.

In [None]:
# Shared tokenizers for the test cells below (this cell carries no `# export` flag).
# Source side: tokenizer loaded via AutoTokenizer from the 'hfl/chinese-bert-wwm-ext' name.
enc_tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')
# Target side: GPT2DecoderTokenizer loaded from the 'distilgpt2' name.
dec_tokenizer = GPT2DecoderTokenizer.from_pretrained('distilgpt2')

## GeneratedSeq2Seq.generate_from_strs()

In [None]:
# export
@patch
@torch.no_grad()
def generate_from_strs(self: [GeneratedTran2Tran, GeneratedGRU2GRU, GeneratedBert2GPT2, GeneratedQRNN2QRNN, GeneratedQRNN2AttnQRNN], 
                       src_strs, generate_args: GenerateArgs, device):
    '''
    Generate one output string for every string in `src_strs`.

    Patched onto each class listed in the `self` annotation and run under
    `torch.no_grad()`. `self.seq2seq` is switched to eval mode first and is
    not switched back afterwards.

    Each source string is handled on its own, in order:
    `self.enc_tokenizer.encode` -> single-row tensor on `device` ->
    `self.generate_from_ids(..., generate_args)` -> first entry of the result
    decoded with `self.dec_tokenizer.decode(..., skip_special_tokens=True)`.

    Returns a list with one decoded string per element of `src_strs`.
    '''
    self.seq2seq.eval()

    def _generate_one(text):
        # Wrap the encoded ids in an outer list so the tensor has a single row.
        token_ids = self.enc_tokenizer.encode(text)
        batch = torch.tensor([token_ids], device=device)
        generated = self.generate_from_ids(batch, generate_args)
        # Only the first entry of the generation result is decoded.
        return self.dec_tokenizer.decode(list(generated[0]), skip_special_tokens=True)

    return [_generate_one(text) for text in src_strs]

In [None]:
# Test for GeneratedGRU2GRU
# Smoke test: build a GRU encoder/decoder sized to the two tokenizers' vocabularies
# (no weights are loaded in this cell), wrap it, and generate on CPU.
# NOTE(review): the positional 256 / 1 arguments are presumably size and layer-count
# settings -- confirm against the GRUEncoder / GRUDecoder / GRU2GRU signatures.
encoder = GRUEncoder(len(enc_tokenizer), 256, enc_tokenizer.pad_token_id, 1)
decoder = GRUDecoder(len(dec_tokenizer), 256, dec_tokenizer.pad_token_id, 1)
gru2gru = GRU2GRU(encoder, decoder, 1, 1)
generated_gru2gru = GeneratedGRU2GRU(gru2gru, enc_tokenizer, dec_tokenizer)

generate_args = GenerateArgs(max_length=10, num_beams=2)
src_strs = ['天氣不錯', '你好']
pred_strs = generated_gru2gru.generate_from_strs(src_strs, generate_args, device='cpu')

# The generated text itself is not meaningful here, so only the contract is checked:
# one decoded string comes back per source string.
assert len(pred_strs) == len(src_strs)
assert all(isinstance(pred_str, str) for pred_str in pred_strs)
pred_strs

Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))


['Bo Stronghold Jian LW morality Tubico Welch',
 ' pondevaluate Retro HR Assass Productionsawa gate']

In [None]:
# Test for GeneratedTran2Tran
# Smoke test: build a transformer encoder/decoder sized to the two tokenizers'
# vocabularies (no weights are loaded in this cell), wrap it, and generate on CPU.
# NOTE(review): the positional 256 / 100 arguments are presumably model size and
# maximum length settings -- confirm against the TranEncoder / TranDecoder signatures.
encoder = TranEncoder(len(enc_tokenizer), 256, 100, enc_tokenizer.pad_token_id)
decoder = TranDecoder(len(dec_tokenizer), 256, 100, dec_tokenizer.pad_token_id)
tran2tran = Tran2Tran(encoder, decoder, enc_tokenizer.pad_token_id)
generated_tran2tran = GeneratedTran2Tran(tran2tran, enc_tokenizer, dec_tokenizer)

generate_args = GenerateArgs(max_length=10, num_beams=2)
src_strs = ['天氣不錯', '你好']
pred_strs = generated_tran2tran.generate_from_strs(src_strs, generate_args, device='cpu')

# The generated text itself is not meaningful here, so only the contract is checked:
# one decoded string comes back per source string.
assert len(pred_strs) == len(src_strs)
assert all(isinstance(pred_str, str) for pred_str in pred_strs)
pred_strs

Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))


[' encodegrowthgrowthgrowthgrowthgrowthgrowthgrowth',
 ' Economy Economy Economy Economy Economy Economy Economy Economy']

In [None]:
# Test for GeneratedBert2GPT2
# Smoke test: build the encoder from the 'hfl/chinese-bert-wwm-ext' name and the
# decoder from the 'distilgpt2' name, wrap them, and generate on CPU.
# No training happens in this cell.
encoder = BertEncoder('hfl/chinese-bert-wwm-ext')
decoder = GPT2Decoder(
    'distilgpt2', dec_tokenizer.pad_token_id,
    vocab_size=len(dec_tokenizer),
    num_heads=2, drop_p=0, num_layers=2,
)
bert2gpt2 = Bert2GPT2(encoder, decoder, enc_tokenizer.pad_token_id)
generated_bert2gpt2 = GeneratedBert2GPT2(bert2gpt2, enc_tokenizer, dec_tokenizer)

generate_args = GenerateArgs(max_length=10, num_beams=2)
src_strs = ['天氣不錯', '你好']
pred_strs = generated_bert2gpt2.generate_from_strs(src_strs, generate_args, device='cpu')

# The generated text itself is not meaningful here, so only the contract is checked:
# one decoded string comes back per source string.
assert len(pred_strs) == len(src_strs)
assert all(isinstance(pred_str, str) for pred_str in pred_strs)
pred_strs

Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))


[' boss boss boss boss boss boss boss boss',
 ' supervisors supervisors supervisors supervisors supervisors supervisors supervisors supervisors']

In [None]:
# Test for GeneratedQRNN2QRNN
# Smoke test: build a QRNN encoder/decoder sized to the two tokenizers' vocabularies
# (no weights are loaded in this cell), wrap it, and generate on CPU.
# NOTE(review): the positional 256 / 1 arguments are presumably size and layer-count
# settings -- confirm against the QRNNEncoder / QRNNDecoder / QRNN2QRNN signatures.
encoder = QRNNEncoder(len(enc_tokenizer), 256, enc_tokenizer.pad_token_id, 1)
decoder = QRNNDecoder(len(dec_tokenizer), 256, dec_tokenizer.pad_token_id, 1)
qrnn2qrnn = QRNN2QRNN(encoder, decoder, 1, 1)
generated_qrnn2qrnn = GeneratedQRNN2QRNN(qrnn2qrnn, enc_tokenizer, dec_tokenizer)

generate_args = GenerateArgs(max_length=10, num_beams=2)
src_strs = ['天氣不錯', '你好']
pred_strs = generated_qrnn2qrnn.generate_from_strs(src_strs, generate_args, device='cpu')

# The generated text itself is not meaningful here, so only the contract is checked:
# one decoded string comes back per source string.
assert len(pred_strs) == len(src_strs)
assert all(isinstance(pred_str, str) for pred_str in pred_strs)
pred_strs

Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))


[' four Antar XTmillion Zstri david funding',
 ' Fame therapeutic Ruk climb Kee bacter TekUpload']

In [None]:
# Test for GeneratedQRNN2AttnQRNN
# Smoke test: build a QRNN encoder and a cross-attention QRNN decoder sized to the
# two tokenizers' vocabularies (no weights are loaded in this cell), wrap them,
# and generate on CPU.
# NOTE(review): the meaning of the positional 256 / 1 arguments is not visible here --
# confirm against the QRNNEncoder / CrossAttnQRNNDecoder / QRNN2AttnQRNN signatures.
encoder = QRNNEncoder(len(enc_tokenizer), 256, enc_tokenizer.pad_token_id, 1)
decoder = CrossAttnQRNNDecoder(len(dec_tokenizer), 256, dec_tokenizer.pad_token_id, 1)
qrnn2attnqrnn = QRNN2AttnQRNN(encoder, decoder, 1, 1, 256, 1)
generated_qrnn2attnqrnn = GeneratedQRNN2AttnQRNN(qrnn2attnqrnn, enc_tokenizer, dec_tokenizer)

generate_args = GenerateArgs(max_length=10, num_beams=2)
src_strs = ['天氣不錯', '你好']
pred_strs = generated_qrnn2attnqrnn.generate_from_strs(src_strs, generate_args, device='cpu')

# The generated text itself is not meaningful here, so only the contract is checked:
# one decoded string comes back per source string.
assert len(pred_strs) == len(src_strs)
assert all(isinstance(pred_str, str) for pred_str in pred_strs)
pred_strs

Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
Disabled padding because no padding token set (pad_token: [PAD], pad_token_id: 0).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))


[' Hound Hound Hound Hound Hound Hound Hound Hound',
 ' deputy deputy deputy deputy deputy deputy deputy deputy']

## Export

In [None]:
# hide
# Run nbdev's notebook-to-script conversion; the cell output lists each
# notebook that was converted.
from nbdev.export import notebook2script
notebook2script()

Converted 02_data.news_commentary.ipynb.
Converted 02_data.tatoeba.ipynb.
Converted 03a_models.patch.ipynb.
Converted 03c_models.bert2gpt2.ipynb.
Converted 03c_models.gru2gru.ipynb.
Converted 03c_models.qrnn2qrnn.ipynb.
Converted 03c_models.tran2tran.ipynb.
Converted 04_metrics.ipynb.
Converted 90_fulltest_bert2gpt2.ipynb.
Converted 90_fulltest_gru2gru.ipynb.
Converted 90_fulltest_qrnn2attnqrnn.ipynb.
Converted 90_fulltest_qrnn2qrnn.ipynb.
Converted 90_fulltest_tran2tran.ipynb.
Converted 95_nc_gru2gru.ipynb.
Converted 95_nc_qrnn2qrnn.ipynb.
Converted 95_nc_tran2tran.ipynb.
Converted index.ipynb.
