In [None]:
# export
from fastai2.basics import *
from transformers import AutoTokenizer

from fastai2_utils.data.all import *
from fastai_transformers_utils.all import *

In [None]:
# default_exp data.tatoeba

# Data Tatoeba
> Chinese-to-English sentence pairs from the Tatoeba corpus, tokenized for encoder-decoder translation models.

In [None]:
# Raw tab-separated Tatoeba file and the cached, pre-tokenized CSV derived from it.
ori_data_loc = './test_data/cmn.txt'
tok_data_loc = './test_data/tok_cmn.csv'
# Encoder-side tokenizer (applied to the Chinese column), loaded from 'hfl/chinese-bert-wwm-ext'.
enc_tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')
# Decoder-side tokenizer (applied to the English column), loaded from 'distilgpt2'.
dec_tokenizer = GPT2DecoderTokenizer.from_pretrained('distilgpt2')
# Fixed sequence lengths that get_tatoeba_dss uses when padding encoder / decoder sequences.
enc_seq_len = 50
dec_seq_len = 40

## tokenize_data

In [None]:
# export
def _tokenize_data(ori_data_loc, enc_tokenizer, dec_tokenizer, valid_pct=0.2, seed=42):
    """Tokenize the raw Tatoeba file and mark a random validation subset.

    Args:
        ori_data_loc: path to the tab-separated file whose columns are read as
            English, Chinese and Contributor (no header row).
        enc_tokenizer: tokenizer applied to the `Chinese` column.
        dec_tokenizer: tokenizer applied to the `English` column.
        valid_pct: fraction of rows flagged as validation (default 0.2).
        seed: seed for the shuffle that scatters the validation flags (default 42).

    Returns:
        A DataFrame with columns `English` and `Chinese`, each holding the
        tokens joined by a single space, plus a boolean `is_valid` column.

    Raises:
        ValueError: if `valid_pct` is outside the range [0, 1].
    """
    if not 0 <= valid_pct <= 1:
        raise ValueError(f"valid_pct must be in [0, 1], got {valid_pct}")

    raw_df = pd.read_csv(ori_data_loc, header=None, names=['English', 'Chinese', 'Contributor'], delimiter='\t')
    # `drop` returns a new frame, so the raw columns stay available for tokenization below.
    tok_df = raw_df.drop(columns=['Contributor'])

    # The results are sorted before item 1 of each is taken; presumably parallel_gen
    # yields (index, tokens) pairs out of order -- verify against fastai2's parallel_gen.
    encoder_tok_list = L(parallel_gen(TransformersTokenizer, raw_df.Chinese, tokenizer=enc_tokenizer)).sorted().itemgot(1)
    tok_df['Chinese'] = encoder_tok_list.map(lambda x: ' '.join(x))  # split tokens by ' '
    decoder_tok_list = L(parallel_gen(TransformersTokenizer, raw_df.English, tokenizer=dec_tokenizer)).sorted().itemgot(1)
    tok_df['English'] = decoder_tok_list.map(lambda x: ' '.join(x))  # split tokens by ' '

    # Flag the first `valid_pct` of the rows, then shuffle the flags with a fixed seed.
    # The array construction mirrors the original so the default split stays identical.
    n_rows = len(tok_df)
    is_valid = np.zeros(n_rows)
    is_valid[:int(n_rows * valid_pct)] = 1
    np.random.RandomState(seed).shuffle(is_valid)
    # Builtin `bool` instead of the `np.bool` alias, which was removed in NumPy 1.24.
    tok_df['is_valid'] = is_valid.astype(bool)

    return tok_df

In [None]:
# skip
# tokenize data and save it to tok_data_loc
# The CSV written here is the file that get_tatoeba_dss later loads via pd.read_csv.
tok_df = _tokenize_data(ori_data_loc, enc_tokenizer, dec_tokenizer)
tok_df.to_csv(tok_data_loc, index=False)
# Read the file back so the cell displays what was actually persisted.
pd.read_csv(tok_data_loc)

Unnamed: 0,English,Chinese,is_valid
0,<|bos|> Hi . <|endoftext|>,嗨 。,False
1,<|bos|> Hi . <|endoftext|>,你 好 。,False
2,<|bos|> Run . <|endoftext|>,你 用 跑 的 。,False
3,<|bos|> Wait ! <|endoftext|>,等 等 ！,False
4,<|bos|> Hello ! <|endoftext|>,你 好 。,False
...,...,...,...
21200,"<|bos|> Last Ġyear Ġin Ġthe ĠPhilippines , Ġearthquakes Ġand Ġtidal Ġwaves Ġresulted Ġin Ġthe Ġdeaths Ġof Ġmore Ġthan Ġ6 , 000 Ġpeople . <|endoftext|>",去 年 在 菲 律 宾 ， 地 震 和 海 啸 造 成 了 超 过 6000 人 的 死 亡 。,False
21201,"<|bos|> My Ġmother Ġspeaks ĠFrench Ġbetter Ġthan Ġmy Ġfather Ġspeaks ĠEnglish , Ġso Ġthey Ġusually Ġspeak Ġto Ġeach Ġother Ġin ĠFrench . <|endoftext|>",我 母 亲 的 法 语 比 我 父 亲 的 英 语 要 好 ， 所 以 他 们 通 常 用 法 语 交 流 。,False
21202,"<|bos|> Tom Ġdidn 't Ġknow Ġhow Ġto Ġtranslate Ġthe Ġword Ġ"" computer "" Ġbecause Ġthe Ġpeople Ġhe Ġwas Ġtalking Ġto Ġhad Ġnever Ġseen Ġone . <|endoftext|>",汤 姆 不 知 如 何 翻 译 [UNK] 计 算 机 [UNK] 一 词 ， 因 为 同 他 谈 话 的 人 从 未 见 过 一 台 。,False
21203,"<|bos|> Even Ġnow , ĠI Ġoccasionally Ġthink ĠI 'd Ġlike Ġto Ġsee Ġyou . ĠNot Ġthe Ġyou Ġthat Ġyou Ġare Ġtoday , Ġbut Ġthe Ġyou ĠI Ġremember Ġfrom Ġthe Ġpast . <|endoftext|>",即 使 是 现 在 ， 我 偶 尔 还 是 想 见 到 你 。 不 是 今 天 的 你 ， 而 是 我 记 忆 中 曾 经 的 你 。,True


## get_tatoeba_dss

In [None]:
# export
def get_tatoeba_dss(tok_data_loc, enc_tokenizer, dec_tokenizer, enc_seq_len, dec_seq_len, pct=1.0):
    """Build a `Datasets` of (encoder input, decoder input, decoder target) items.

    Args:
        tok_data_loc: path to the tokenized CSV (columns `Chinese`, `English`
            holding space-joined tokens; read with `pd.read_csv`).
        enc_tokenizer: tokenizer used to numericalize and pad the Chinese tokens.
        dec_tokenizer: tokenizer used to numericalize and pad the English tokens.
        enc_seq_len: length the encoder sequence is padded to.
        dec_seq_len: length of the decoder input and of the decoder target.
        pct: forwarded to `pct_splits` together with the splits from `ColSplitter`.

    Returns:
        A `Datasets` with three transform pipelines, the first two being inputs (`n_inp=2`).
    """
    def _split_tokens(text):
        # Undo the ' '.join applied when the CSV was written.
        return text.split(' ')

    def _drop_last(ids):
        return ids[:-1]

    def _drop_first(ids):
        return ids[1:]

    def _english_pipeline(trim):
        # Pad to dec_seq_len + 1 so that trimming one position leaves dec_seq_len ids.
        # New transform instances are created on every call; nothing is shared between pipelines.
        return [
            attrgetter('English'),
            _split_tokens,
            TransformersNumericalize(dec_tokenizer),
            Pad2Max(dec_seq_len + 1, dec_tokenizer.pad_token_id),
            trim,
        ]

    frame = pd.read_csv(tok_data_loc)
    splits = pct_splits(ColSplitter()(frame), pct=pct)

    chinese_pipeline = [
        attrgetter('Chinese'),
        _split_tokens,
        TransformersNumericalize(enc_tokenizer),
        Pad2Max(enc_seq_len, enc_tokenizer.pad_token_id),
    ]
    # Decoder input drops the last position, decoder target drops the first (shift by one).
    decoder_in_pipeline = _english_pipeline(_drop_last)
    decoder_out_pipeline = _english_pipeline(_drop_first)

    return Datasets(
        frame,
        tfms=[chinese_pipeline, decoder_in_pipeline, decoder_out_pipeline],
        splits=splits,
        n_inp=2,
    )

In [None]:
# Build the datasets once with pct=0.5 and once with the default pct=1.0.
small_dss = get_tatoeba_dss(tok_data_loc, enc_tokenizer, dec_tokenizer, enc_seq_len, dec_seq_len, pct=0.5)
dss = get_tatoeba_dss(tok_data_loc, enc_tokenizer, dec_tokenizer, enc_seq_len, dec_seq_len)
# With pct=0.5 the training split must be half the size (integer division) of the full one.
test_eq(len(small_dss.train), len(dss.train)//2)

In [None]:
# Display one training item as produced by the pipelines, next to its decoded form.
dss.train[10], dss.decode(dss.train[10])

((TensorText([ 101,  800, 6651,  749,  511,  102,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0]),
  TensorText([50257,  1544,  4966,    13, 50256, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258]),
  TensorText([ 1544,  4966,    13, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 5

In [None]:
# Wrap the datasets in dataloaders with batch size 2 and inspect a single training batch.
dls = dss.dataloaders(bs=2)
for x in dls.train:
    # x[0], x[1], x[2] correspond to the three transform pipelines set up in get_tatoeba_dss.
    print(x[0].shape, x[0].dtype, x[0].device, type(x[0]))
    print(x[1].shape, x[1].dtype, x[1].device, type(x[1]))
    print(x[2].shape, x[2].dtype, x[2].device, type(x[2]))
    break  # only the first batch is needed

torch.Size([2, 50]) torch.int64 cuda:0 <class 'fastai2.text.data.TensorText'>
torch.Size([2, 40]) torch.int64 cuda:0 <class 'fastai2.text.data.TensorText'>
torch.Size([2, 40]) torch.int64 cuda:0 <class 'fastai2.text.data.TensorText'>


## Export -

In [None]:
# hide
# Run nbdev's notebook2script; the output below lists every notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 02_data.tatoeba.ipynb.
Converted 03a_models.patch.ipynb.
Converted 03c_models.bert2gpt2.ipynb.
Converted 03c_models.gru2gru.ipynb.
Converted 03c_models.tran2tran.ipynb.
Converted 04_metrics.ipynb.
Converted index.ipynb.
