## ENVIRONMENT VARIABLES

In [1]:
EN_ES_CORPUS_DIR = "/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/"

In [2]:
NAH_ES_CORPUS_DIR = "/notebooks/master-thesis/corpora/americasnlp2021/data/nahuatl-spanish"

In [3]:
tokenizer_path = "/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/"

In [4]:
%env TOKENIZER_PATH = /storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en

env: TOKENIZER_PATH=/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en


In [5]:
tokenized_path_es_en = "/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en/"
tokenized_path_nah_es = "/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/nah-es/"

In [6]:
%env TOKENIZED_PATH_ES_EN = /storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en
%env TOKENIZED_PATH_NAH_ES = /storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/nah-es

env: TOKENIZED_PATH_ES_EN=/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en
env: TOKENIZED_PATH_NAH_ES=/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/nah-es


In [7]:
%env BIN_DIR = /storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin

env: BIN_DIR=/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin


In [8]:
! mkdir -p $TOKENIZED_PATH_ES_EN
! mkdir -p $TOKENIZED_PATH_NAH_ES
! mkdir -p $BIN_DIR

In [9]:
%env MODEL_DIR = /storage/master-thesis/models/resdrop__nah-es_es-en

env: MODEL_DIR=/storage/master-thesis/models/resdrop__nah-es_es-en


In [27]:
%env CODE_STORAGE = /storage/code/

env: CODE_STORAGE=/storage/code/


In [11]:
! locale

LANG=
LANGUAGE=
LC_CTYPE="POSIX"
LC_NUMERIC="POSIX"
LC_TIME="POSIX"
LC_COLLATE="POSIX"
LC_MONETARY="POSIX"
LC_MESSAGES="POSIX"
LC_PAPER="POSIX"
LC_NAME="POSIX"
LC_ADDRESS="POSIX"
LC_TELEPHONE="POSIX"
LC_MEASUREMENT="POSIX"
LC_IDENTIFICATION="POSIX"
LC_ALL=


In [None]:
! update-locale LANG=en_US.UTF-8 LANGUAGE=en.UTF-8

%env LANG=en_US.UTF-8
%env LC_CTYPE=en_US.UTF-8
%env LC_ALL=en_US.UTF-8

/bin/sh: 1: update-locale: not found
env: LANG=en_US.UTF-8
env: LC_CTYPE=en_US.UTF-8
env: LC_ALL=en_US.UTF-8


In [None]:
! cat /etc/rc.conf

cat: /etc/rc.conf: No such file or directory


## Libraries

In [None]:
pip install tokenizers==0.10.3

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip uninstall torch torchvision torchaudio -y

In [None]:
pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

In [None]:
pip uninstall fairseq -y

In [None]:
pip install fairseq

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
! pip uninstall apex -y

In [None]:
rm -r apex

In [None]:
#! git clone https://github.com/NVIDIA/apex

! pip install -v --disable-pip-version-check --no-cache-dir apex

In [None]:
pip install tensorboardX

## Converting tatoeba format to standard format

In [None]:
import csv

with open(EN_ES_CORPUS_DIR + "original/valid.txt", encoding='utf8') as f:
    es = open(EN_ES_CORPUS_DIR + "dev.es", 'w', encoding='utf8')
    en = open(EN_ES_CORPUS_DIR + "dev.en", 'w', encoding='utf8')
    for line in csv.reader(f, delimiter="\t"):
        es.write(line[3] + "\n")
        en.write(line[2] + "\n")
    en.close()
    es.close()
    
with open(EN_ES_CORPUS_DIR + "original/dev.txt", encoding='utf8') as f:
    es = open(EN_ES_CORPUS_DIR + "train.es", 'w', encoding='utf8')
    en = open(EN_ES_CORPUS_DIR + "train.en", 'w', encoding='utf8')
    for line in csv.reader(f, delimiter="\t"):
        if "\n" in line[2] or "\n" in line[3]:
            continue
        if len(line) != 4:
            continue
            
        es.write(line[3] + "\n")
        en.write(line[2] + "\n")
        
    en.close()
    es.close()
        

### Tokenizing data

In [12]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

en_es_nah_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [13]:
from tokenizers.trainers import WordPieceTrainer

en_es_nah_trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], show_progress=True)

In [14]:
from tokenizers.pre_tokenizers import Whitespace

en_es_nah_tokenizer.pre_tokenizer = Whitespace()

en_files = [f"/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/{split}.en" for split in ["dev", "train"]]
es_files = [f"/notebooks/master-thesis/corpora/tatoeba-challenge/en-es/{split}.es" for split in ["dev", "train"]]
nah_files = [f"/notebooks/master-thesis/corpora/americasnlp2021/data/nahuatl-spanish/{split}.nah" for split in ["dev", "train"]]
es2_files = [f"/notebooks/master-thesis/corpora/americasnlp2021/data/nahuatl-spanish/{split}.es" for split in ["dev", "train"]]

files = en_files + es_files + es2_files + nah_files
#print(files)
en_es_nah_tokenizer.train(files= files, trainer=en_es_nah_trainer)

In [15]:
from pathlib import Path



def tokenize_files(tokenizer, files, extension, output_path):
    for file in files:
        print(f"Reading file {file}")
        with open(file, encoding='utf8') as f:
          lines = f.readlines()
          tokenized_lines = tokenizer.encode_batch(lines)
          tokenized_name = Path(file).stem
          tokenized_name = output_path + tokenized_name + "." + extension
          print(tokenized_name)
          with open(tokenized_name, 'w', encoding='utf8') as wf:

            wf.writelines([" ".join(t.tokens) + "\n" for t in tokenized_lines])

In [16]:
en_es_nah_tokenizer.save(tokenizer_path + "nah-es-en-tokenizer.json")

In [17]:
print(en_es_nah_tokenizer)

<tokenizers.Tokenizer object at 0x7fea4000f6c0>


In [19]:
tok = en_es_nah_tokenizer.encode("we hold this truth to be self evidently that everyone is created equal?")
print(tok.tokens)

['we', 'hold', 'this', 'truth', 'to', 'be', 'self', 'evident', '##ly', 'that', 'everyone', 'is', 'created', 'equal', '?']


In [20]:
tokenize_files(en_es_nah_tokenizer, en_files, "en", tokenized_path_es_en)
tokenize_files(en_es_nah_tokenizer, es_files, "es", tokenized_path_es_en)

tokenize_files(en_es_nah_tokenizer, nah_files, "nah", tokenized_path_nah_es)
tokenize_files(en_es_nah_tokenizer, es2_files, "es", tokenized_path_nah_es)

Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/dev.en
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en/dev.en
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/train.en
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en/train.en
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/dev.es
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en/dev.es
Reading file /notebooks/master-thesis/corpora/tatoeba-challenge/en-es/train.es
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/es-en/train.es
Reading file /notebooks/master-thesis/corpora/americasnlp2021/data/nahuatl-spanish/dev.nah
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/nah-es/dev.nah
Reading file /notebooks/master-thesis/corpora/americasnlp2021/data/nahuatl-spanish/train.nah
/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-e

## Binarizing data

In [21]:
### Removing previous dict files
! rm $BIN_DIR/dict.nah.txt
! rm $BIN_DIR/dict.es.txt
! rm $BIN_DIR/dict.en.txt

rm: cannot remove '/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin/dict.nah.txt': No such file or directory
rm: cannot remove '/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin/dict.es.txt': No such file or directory
rm: cannot remove '/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin/dict.en.txt': No such file or directory


In [22]:
## Concatenating all training data
! cat $TOKENIZED_PATH_NAH_ES/train.nah $TOKENIZED_PATH_NAH_ES/train.es $TOKENIZED_PATH_ES_EN/train.es $TOKENIZED_PATH_ES_EN/train.en > $BIN_DIR/train.all

In [23]:
! fairseq-preprocess --source-lang all \
    --trainpref $BIN_DIR/train \
    --destdir $BIN_DIR \
    --workers 20 \
    --only-source

2022-09-20 10:08:26 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=True, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=

In [24]:
! fairseq-preprocess --source-lang nah --target-lang es \
    --trainpref $TOKENIZED_PATH_NAH_ES/train --validpref $TOKENIZED_PATH_NAH_ES/dev \
    --destdir $BIN_DIR \
    --srcdict $BIN_DIR/dict.all.txt \
    --tgtdict $BIN_DIR/dict.all.txt \
    --workers 20

2022-09-20 10:08:42 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging

### Reusing es dict from quy-es preprocessing

In [25]:
! fairseq-preprocess --source-lang es --target-lang en \
    --trainpref $TOKENIZED_PATH_ES_EN/train --validpref $TOKENIZED_PATH_ES_EN/dev \
    --destdir $BIN_DIR \
    --srcdict $BIN_DIR/dict.all.txt \
    --tgtdict $BIN_DIR/dict.all.txt \
    --workers 20

2022-09-20 10:08:48 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/storage/master-thesis/models/resdrop__nah-es_es-en/tokenizers/nah-es-en/bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging

## Training model

In [28]:
! cd $CODE_STORAGE

In [29]:
! git clone https://github.com/dannigt/fairseq.git

Cloning into 'fairseq'...
remote: Enumerating objects: 25231, done.[K
remote: Total 25231 (delta 0), reused 0 (delta 0), pack-reused 25231[K
Receiving objects: 100% (25231/25231), 19.74 MiB | 16.78 MiB/s, done.
Resolving deltas: 100% (18304/18304), done.
Checking out files: 100% (771/771), done.


In [30]:
cd ..

/


In [32]:
! fairseq-train $BIN_DIR \
    --user-dir $CODE_STORAGE/fairseq/examples/residual_drop/residual_drop_src/ \
    --arch=residual_drop_transformer --share-all-embeddings \
    --task translation_multi_simple_epoch --lang-pairs nah-es,es-en \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --restore-file $MODEL_DIR/checkpoint_last.pt \
    --save-dir $MODEL_DIR/ \
    --keep-last-epochs 2 \
    --reset-optimizer \
    --encoder-langtok "src" \
    --decoder-langtok \
    --fp16 \
    --max-epoch 200 \
    --patience 10 \
    --encoder-drop-residual 2


2022-09-20 11:25:10 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/storage/code//fairseq/examples/residual_drop/residual_drop_src/', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': Fal

In [None]:
! echo $MODEL_DIR

/storage/master-thesis/models/quy-es+es-en


In [None]:
cd /storage/master-thesis/models/quy-es+es-en

/storage/master-thesis/models/quy-es+es-en


In [None]:
! rm checkpoint108.pt
! rm checkpoint109.pt
! rm checkpoint_best.pt
! rm checkpoint_last.pt

In [None]:
! rm dict.en.txt
! rm dict.es.txt
! rm dict.quy.txt

In [None]:
! du -shc /notebooks/*

2.5K	/notebooks/CITATION.cff
5.5K	/notebooks/CODE_OF_CONDUCT.md
15K	/notebooks/CONTRIBUTING.md
19K	/notebooks/ISSUES.md
12K	/notebooks/LICENSE
512	/notebooks/MANIFEST.in
3.5K	/notebooks/Makefile
41K	/notebooks/README.md
41K	/notebooks/README_zh-hans.md
42K	/notebooks/README_zh-hant.md
12K	/notebooks/docker
4.6M	/notebooks/docs
5.0M	/notebooks/examples
8.5K	/notebooks/hubconf.py
7.8G	/notebooks/master-thesis
1.5K	/notebooks/model_cards
9.5K	/notebooks/notebooks
512	/notebooks/pyproject.toml
64K	/notebooks/scripts
1.0K	/notebooks/setup.cfg
13K	/notebooks/setup.py
13M	/notebooks/src
731K	/notebooks/templates
4.5K	/notebooks/test quy-es -> es-en model.ipynb
6.8M	/notebooks/tests
147K	/notebooks/train es-en model.ipynb
158K	/notebooks/train quy-es + es-en model.ipynb
160K	/notebooks/utils
3.5K	/notebooks/valohai.yaml
7.9G	total


In [None]:
! du -shc /notebooks/master-thesis/*

512	/notebooks/master-thesis/Untitled.ipynb
362M	/notebooks/master-thesis/corpora
7.5G	/notebooks/master-thesis/models
7.8G	total


In [None]:
! du -shc /notebooks/master-thesis/models/*

1.6G	/notebooks/master-thesis/models/es-en
802M	/notebooks/master-thesis/models/quy-es
5.2G	/notebooks/master-thesis/models/quy-es+es-en
7.5G	total


In [None]:
! du -shc /*

19M	/apex
5.0M	/bin
4.0K	/boot
24K	/content
^C


In [None]:
ls

 CITATION.cff         [0m[01;34mdocker[0m/          setup.py
 CODE_OF_CONDUCT.md   [01;34mdocs[0m/            [01;34msrc[0m/
 CONTRIBUTING.md      [01;34mexamples[0m/        [01;34mtemplates[0m/
 ISSUES.md            hubconf.py      'test quy-es -> es-en model.ipynb'
 LICENSE              [01;34mmaster-thesis[0m/   [01;34mtests[0m/
 MANIFEST.in          [01;34mmodel_cards[0m/    'train es-en model.ipynb'
 Makefile             [01;34mnotebooks[0m/      'train quy-es + es-en model.ipynb'
 README.md            pyproject.toml   [01;34mutils[0m/
 README_zh-hans.md    [01;34mscripts[0m/         valohai.yaml
 README_zh-hant.md    setup.cfg


/storage/master-thesis
