In [124]:
# Root directory of the en-es corpus. The trailing slash is required:
# the cells below build file paths by plain string concatenation.
EN_ES_CORPUS_DIR = "/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/"

## Converting Tatoeba TSV files to line-aligned parallel text files

In [125]:
import csv

# Convert the tab-separated validation file into two parallel text files,
# dev.es and dev.en, one sentence per line.
# All three handles are owned by the `with` statement, so the output files are
# flushed and closed even if a row raises part-way through.
with open(EN_ES_CORPUS_DIR + "original/valid.txt", encoding='utf8') as f, \
        open(EN_ES_CORPUS_DIR + "dev.es", 'w', encoding='utf8') as es, \
        open(EN_ES_CORPUS_DIR + "dev.en", 'w', encoding='utf8') as en:
    for line in csv.reader(f, delimiter="\t"):
        # Column 3 is written to the .es file, column 2 to the .en file.
        # Rows are written as-is; no malformed-row filtering happens here.
        es.write(line[3] + "\n")
        en.write(line[2] + "\n")
    
# Convert the tab-separated source file into two parallel text files,
# train.es and train.en, one sentence per line.
# NOTE(review): the input is named original/dev.txt but is written out as the
# *train* split - confirm this mapping is intended.
with open(EN_ES_CORPUS_DIR + "original/dev.txt", encoding='utf8') as f, \
        open(EN_ES_CORPUS_DIR + "train.es", 'w', encoding='utf8') as es, \
        open(EN_ES_CORPUS_DIR + "train.en", 'w', encoding='utf8') as en:
    for line in csv.reader(f, delimiter="\t"):
        # Check the column count BEFORE indexing: a short or blank row would
        # otherwise raise IndexError on line[2] / line[3] instead of being
        # skipped.
        if len(line) != 4:
            continue
        # A newline embedded in a field would break the one-sentence-per-line
        # alignment between the two output files, so such rows are dropped.
        if "\n" in line[2] or "\n" in line[3]:
            continue

        # Column 3 is written to the .es file, column 2 to the .en file.
        es.write(line[3] + "\n")
        en.write(line[2] + "\n")
        

### Tokenizing data

In [126]:
pip install tokenizers==0.10.3

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [127]:
# Directory that receives the tokenized corpus files and the saved tokenizer.
# The trailing slash is required: file names are appended by string
# concatenation.
tokenized_path = "/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/"

In [128]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Untrained WordPiece tokenizer; "[UNK]" is configured as the unknown token.
# It is trained further down on both the .en and the .es files.
en_es_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [129]:
from tokenizers.trainers import WordPieceTrainer

# Trainer for the WordPiece model. Only the special tokens are specified here;
# every other trainer option is left at the library default.
en_es_trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [130]:
from tokenizers.pre_tokenizers import Whitespace

# Pre-split the raw text before the WordPiece model is applied.
en_es_tokenizer.pre_tokenizer = Whitespace()

# Build the corpus file lists from EN_ES_CORPUS_DIR rather than repeating the
# absolute path, so the corpus location is defined in exactly one place.
# EN_ES_CORPUS_DIR ends with "/", so the resulting paths are unchanged.
en_files = [f"{EN_ES_CORPUS_DIR}{split}.en" for split in ["dev", "train"]]
es_files = [f"{EN_ES_CORPUS_DIR}{split}.es" for split in ["dev", "train"]]
files = en_files + es_files

# One tokenizer is trained on the English and Spanish files together.
en_es_tokenizer.train(files=files, trainer=en_es_trainer)

In [131]:
from pathlib import Path



def tokenize_files(tokenizer, files, extension, output_dir=None):
    """Tokenize text files and write the tokens space-separated, line by line.

    For every input file an output file named ``<input stem>.<extension>`` is
    written into ``output_dir``. Each output line holds the tokens of the
    corresponding input line joined by single spaces.

    Args:
        tokenizer: Object exposing ``encode_batch(list_of_str)``; each returned
            item must have a ``tokens`` list of strings.
        files: Iterable of paths to UTF-8 encoded text files.
        extension: Output file suffix, without the leading dot.
        output_dir: Directory for the output files. Defaults to the
            notebook-level ``tokenized_path``. Created if it does not exist.
    """
    if output_dir is None:
        # Backward-compatible default: the notebook-level output directory.
        output_dir = tokenized_path
    output_dir = Path(output_dir)
    # Avoid a FileNotFoundError on a fresh checkout / fresh machine.
    output_dir.mkdir(parents=True, exist_ok=True)

    for file in files:
        print(f"Reading file {file}")
        with open(file, encoding='utf8') as f:
            lines = f.readlines()
        # The input handle is released before encoding and writing start.
        tokenized_lines = tokenizer.encode_batch(lines)

        # Path joining works whether or not output_dir has a trailing slash.
        tokenized_name = output_dir / f"{Path(file).stem}.{extension}"
        print(tokenized_name)
        with open(tokenized_name, 'w', encoding='utf8') as wf:
            wf.writelines(" ".join(t.tokens) + "\n" for t in tokenized_lines)

In [132]:
# Persist the trained tokenizer as JSON inside tokenized_path.
# NOTE(review): presumably the directory must already exist at this point - confirm.
en_es_tokenizer.save(tokenized_path + "en-es-tokenizer.json")

In [133]:
# Leftover debug check: this only prints the object's default repr
# (class name and address), not the vocabulary or configuration.
print(en_es_tokenizer)

<tokenizers.Tokenizer object at 0x2cf5440>


In [134]:
# Sanity check: encode one Spanish sentence and inspect the resulting tokens
# (sub-word continuations carry the "##" prefix in the printed output).
tok = en_es_tokenizer.encode("En mis sueños tengo mi propio idioma. ¿Será qué me comunico con mi planeta mientras duermo?")
print(tok.tokens)


['En', 'mis', 'sueños', 'tengo', 'mi', 'propio', 'idioma', '.', '¿', 'Será', 'qué', 'me', 'comunic', '##o', 'con', 'mi', 'planeta', 'mientras', 'duermo', '?']


In [135]:
# Tokenize both languages with the shared tokenizer. Output files keep the
# input stem (dev / train) and get the given suffix, inside tokenized_path.
tokenize_files(en_es_tokenizer, en_files, "en")
tokenize_files(en_es_tokenizer, es_files, "es")

Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/dev.en
/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/dev.en
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/train.en
/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/train.en
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/dev.es
/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/dev.es
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/train.es
/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/train.es


## Binarizing data

In [4]:
# Export the tokenized-corpus directory for the shell commands below.
# Same location as tokenized_path, but without the trailing slash.
%env DEST_DIR = /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es

env: DEST_DIR=/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es


In [137]:
pip install fairseq

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [140]:
# Binarize the tokenized data for the es -> en direction. Input prefixes are
# $DEST_DIR/train and $DEST_DIR/dev (the files written by tokenize_files);
# results go to $DEST_DIR/bin/.
# No dictionary options are passed; the logged Namespace shows
# joined_dictionary=False, srcdict=None and tgtdict=None.
! fairseq-preprocess --source-lang es --target-lang en \
    --trainpref $DEST_DIR/train --validpref $DEST_DIR/dev \
    --destdir $DEST_DIR/bin/ \
    --workers 20

2021-09-29 17:40:54 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bf16=False, bpe=None, checkpoint_shard_count=1, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/bin/', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, scoring='bleu', seed=1, source_lang='es', srcdict=None, target_lang='en', task='translation', tensorboard_logdir=None, testpref=None, tgtdict=None, threshold_loss_scale=None, thresholdsr

In [1]:
# Export the checkpoint directory for the training command below, which uses
# it for both --restore-file and --save-dir.
%env MODEL_DIR = /notebooks/master-thesis/models/es-en

env: MODEL_DIR=/notebooks/master-thesis/models/es-en


In [None]:
# Train an es -> en model with the `transformer` architecture on the binarized
# data in $DEST_DIR/bin/. Checkpoints are written to $MODEL_DIR, and the run
# restores from $MODEL_DIR/checkpoint_last.pt.
# NOTE(review): --reset-optimizer is combined with --restore-file, presumably
# to continue from the saved weights with fresh optimizer state - confirm this
# is intended for every re-run of this cell.
! fairseq-train $DEST_DIR/bin/ \
    --source-lang es --target-lang en \
    --arch=transformer \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --restore-file $MODEL_DIR/checkpoint_last.pt \
    --save-dir $MODEL_DIR \
    --keep-last-epochs 3 \
    --reset-optimizer \
    --fp16


2021-09-29 21:18:31 | INFO | fairseq_cli.train | Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, all_gather_list_size=16384, arch='transformer', attention_dropout=0.0, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, cpu=False, criterion='label_smoothed_cross_entropy', cross_self_attention=False, curriculum=0, data='/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/tokenizers/en-es/bin/', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_

In [None]:
ls

In [None]:
cd storage


In [None]:
ls

In [None]:
cd ..



In [None]:
cd ..

In [None]:
ls

In [None]:
ls

In [None]:
cd master-thesis

In [None]:
! unzip models-20210929T123614Z-003.zip

In [None]:
! rm models-20210929T123614Z-003.zip