## Data preparation WORK

**Adjust the paths and settings below to match your configuration and environment.**

**Make sure the 'scripts' and 'data' folders are present.**

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# The sacrebleu package provides the `sacrebleu` command used in the evaluation cell.
%pip install datasets OpenNMT-py sentencepiece sacrebleu

In [None]:
# Download the NLLB-200 assets used below:
#   - the SentencePiece model and dictionary.txt go into data/ (the training config
#     references them as src/tgt_subword_model and src/tgt_vocab),
#   - the inference config and the 600M checkpoint (train_from in the training config)
#     go into the working directory.
# --continue resumes a partial download and skips files that are already complete,
# so re-running this cell does not fetch everything again or leave ".1" duplicates.
!wget --continue https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model -P data
!wget --continue https://s3.amazonaws.com/opennmt-models/nllb-200/nllb-inference.yaml
!wget --continue https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt -P data
!wget --continue https://s3.amazonaws.com/opennmt-models/nllb-200/nllb-200-600M-onmt.pt


In [None]:
# Cleaning task: run the filter script on the parallel training files.
# Presumably this produces the data/train.*.fil.txt files consumed by the next command —
# confirm in scripts/filter.py.
!python scripts/filter.py data/train.fr data/train.bam
# Training the Sentence piece subwording models for French and Bam
!python scripts/unigram.py data/train.fr.fil.txt data/train.bam.fil.txt
# Move the files created by the scripts (*.vocab, *.model) from the working directory into the data folder
!mv *.vocab *.model data

In [None]:
# Apply the source/target SentencePiece models to each split (train, dev, test).
# The training split uses the filtered files; dev and test use the raw files.
for src_file, tgt_file in (
    ("data/train.fr.fil.txt", "data/train.bam.fil.txt"),
    ("data/dev.fr", "data/dev.bam"),
    ("data/test.fr", "data/test.bam"),
):
    !python scripts/subword.py data/source.model data/target.model {src_file} {tgt_file}

In [None]:
# Rename the subworded outputs inside data/ so each split follows the
# <split>.sub.fr / <split>.sub.bam naming expected by the training config.
for split in ("train", "dev", "test"):
    !mv data/{split}.sub-src.txt data/{split}.sub.fr && mv data/{split}.sub-trg.txt data/{split}.sub.bam

In [None]:
# Sanity check: show the first ten lines of both subworded training files.
!head -n 10 data/train.sub.fr data/train.sub.bam

## Model / Training Configuration

In [None]:
# Try to avoid running out of memory.
# `!export` only changes a throwaway subshell, so the setting was silently lost.
# %env sets the variable in the kernel's own environment, where later `!` commands
# (e.g. onmt_train) inherit it.
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

In [None]:
import os

# --- Tunable settings -------------------------------------------------------
model_name = "fr2bam"   # also the folder that receives config.yaml
vocab_size = 50000      # written to src_vocab_size / tgt_vocab_size below

training_steps = 2500
# Validate and checkpoint five times per run; warm up over the first 10% of steps.
valid_steps = training_steps // 5
save_ckpt_freq = training_steps // 5
warmup_steps = training_steps // 10
reporting = 10  # alternative: training_steps // 10
GPU = 1  # TOGGLE for GPU (truthy -> append world_size / gpu_ranks to the config)

# Idempotent: does nothing when the folder already exists.
os.makedirs(model_name, exist_ok=True)

# NOTE(review): the "Model" section below describes a 6-layer / 512-dim transformer while
# train_from loads the nllb-200-600M checkpoint — confirm which settings actually take effect.
config = f"""

# config.yaml


## Where the samples will be written
save_data: run

# Training files
data:
    corpus_1:
        path_src: data/train.sub.fr
        path_tgt: data/train.sub.bam
        transforms: [filtertoolong] # change the transform method
    valid:
        path_src: data/dev.sub.fr
        path_tgt: data/dev.sub.bam
        transforms: [filtertoolong] # change the transform method

# Vocabulary files, generated by onmt_build_vocab
src_vocab: data/dictionary.txt
tgt_vocab: data/dictionary.txt

train_from: nllb-200-600M-onmt.pt

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: {vocab_size}
tgt_vocab_size: {vocab_size}

# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 200
tgt_seq_length: 200

# Tokenization options
src_subword_model: data/flores200_sacrebleu_tokenizer_spm.model
tgt_subword_model: data/flores200_sacrebleu_tokenizer_spm.model

# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models/{model_name}

# Stop training if it does not imporve after n validations
early_stopping: 3

# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: {save_ckpt_freq}

# To save space, limit checkpoints to last n
keep_checkpoint: 2

seed: 3456

# Default: 100000 - Train the model to max n steps 
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
train_steps: {training_steps}

# Default: 10000 - Run validation after n steps
valid_steps: {valid_steps}

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: {warmup_steps}
report_every: {reporting}

# Batching
num_workers: 2  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 512  # Tokens per batch, change when CUDA out of memory
valid_batch_size: 512
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]

"""

# Single-GPU settings are appended only when the GPU toggle is on.
if GPU:
  config += """
world_size: 1
gpu_ranks: [0]
  """

# Write the assembled YAML where the onmt_* commands expect it: <model_name>/config.yaml
with open(f"{model_name}/config.yaml", "w", encoding="utf-8") as fp:
  fp.write(config)

In [None]:
# Print the number of processors on this machine (helps when choosing thread/worker counts).
!nproc --all

In [None]:
# Run OpenNMT's vocabulary builder on the generated config, using 2 threads.
# -n_sample -1 presumably means "use the whole corpus" — confirm against the OpenNMT-py docs.
# NOTE(review): the config points src_vocab/tgt_vocab at the downloaded data/dictionary.txt;
# confirm this step is needed and does not clash with that existing file.
!onmt_build_vocab -config {model_name}/config.yaml -n_sample -1 -num_threads 2

In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using, before training starts.
torch.cuda.empty_cache()

In [None]:
# Launch training with the generated config; {model_name} is interpolated from the Python variable.
!onmt_train -config {model_name}/config.yaml -verbose 

In [None]:
# List the checkpoints written by training (explicit magic form of the bare `ls`).
%ls models/

## Model Evaluation

In [None]:
# Translate the subworded French test set with a fine-tuned checkpoint.
# -gpu is a CUDA device index, not an on/off flag: 0 matches the gpu_ranks: [0] used for
# training, whereas 1 would require a second GPU.
# NOTE(review): the config saves a checkpoint every training_steps/5 = 500 steps, so confirm
# with the `ls models/` cell that a step-400 checkpoint actually exists before running this.
!onmt_translate -model models/fr2bam_step_400.pt -src data/test.sub.fr -output models/nllb.pred_400.txt -gpu 0 -verbose

In [None]:
# Eyeball the first ten subworded reference lines next to the first ten predictions.
!head -n 10 data/test.sub.bam
!head -n 10 models/nllb.pred_400.txt

In [None]:
# De-subword the predictions. onmt_translate wrote them to models/nllb.pred_400.txt,
# so read them from models/ (not fr2bam/models/, which nothing in this notebook creates).
# Presumably the script writes <input>.desub.txt next to its input — confirm in scripts/desubword.py.
!python scripts/desubword.py data/target.model models/nllb.pred_400.txt

In [None]:
# Sacrebleu testing CODE
# Score the de-subworded hypotheses against the target-language reference data/test.bam.
# data/test.fr is the *source* side, so scoring against it yields meaningless numbers.
# -b prints only the score; -w 4 prints it with four decimal places.
bleu = !sacrebleu data/test.bam -i models/nllb.pred_400.txt.desub.txt -m bleu -b -w 4
ter = !sacrebleu data/test.bam -i models/nllb.pred_400.txt.desub.txt -m ter -b -w 4

print(bleu)
print(ter)