<a href="https://colab.research.google.com/github/gigajet/transformer/blob/master/running_fairseq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparations

In [1]:
# Mount Google Drive at /content/drive. Later cells read the datasets from, and
# write binarized data/checkpoints to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')
# Google Colab doesn't support `ln`.
# NOTE(review): presumably this means Drive content cannot be symlinked into the
# workspace, which is why later cells use absolute Drive paths — confirm.

Mounted at /content/drive


In [2]:
# Install fairseq
!git clone https://github.com/pytorch/fairseq
%cd fairseq
!pip install --editable ./

Cloning into 'fairseq'...
remote: Enumerating objects: 31258, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 31258 (delta 10), reused 17 (delta 8), pack-reused 31225[K
Receiving objects: 100% (31258/31258), 21.51 MiB | 31.29 MiB/s, done.
Resolving deltas: 100% (23060/23060), done.
/content/fairseq
Obtaining file:///content/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting bitarray
  Downloading bitarray-2.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (236 kB)
[K     |████████████████████████████████| 236 kB 5.1 MB/s 
Collecting omegaconf<2.1
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting hydra-core<1.1,>=1.0.7
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[K     |████████████████

In [3]:
# Dependencies for preprocessing
!pip install fastBPE sacremoses subword_nmt

Collecting fastBPE
  Downloading fastBPE-0.1.0.tar.gz (35 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 8.2 MB/s 
[?25hCollecting subword_nmt
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: fastBPE, sacremoses
  Building wheel for fastBPE (setup.py) ... [?25l[?25hdone
  Created wheel for fastBPE: filename=fastBPE-0.1.0-cp37-cp37m-linux_x86_64.whl size=483188 sha256=61ef3682fc23cf9be18ff16594aec0dab98b000f80ca41fce0af805ed5f4db50
  Stored in directory: /root/.cache/pip/wheels/bd/d4/0e/0d317a65f77d3f8049fedd8a2ee0519164cf3e6bd77ef886f1
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=9cad2f35f4a217b0b7f48e509fa00d7fce5ad11435c1872385b0913ac1b593de
  Stored in directory: /root/.cache/pip/

In [4]:
# Run this if you get error "importlib_metadata.PackageNotFoundError: No package metadata was found for fairseq" in next cell
import os

# Add the fairseq checkout to PYTHONPATH so that processes started from the
# notebook inherit it.
# - PYTHONPATH may be unset, so read it with a default instead of `+=`
#   (which raises KeyError on a missing key).
# - Only append when the entry is missing, so re-running the cell does not
#   keep growing the variable.
_fairseq_root = "/content/fairseq/"
_current_pythonpath = os.environ.get("PYTHONPATH", "")
if _fairseq_root not in _current_pythonpath.split(os.pathsep):
    os.environ["PYTHONPATH"] = (
        f"{_current_pythonpath}{os.pathsep}{_fairseq_root}"
        if _current_pythonpath
        else _fairseq_root
    )

## Verify that fairseq is now usable (optional)

In [None]:
# This cell is for verifying the install
# Interactive translation with a pretrained model loaded through torch.hub.
# Optional: loading the model downloads a multi-gigabyte archive (about 2.2 GB
# according to the progress bar captured below), so skip it when not needed.
import torch
import fairseq

# List available models
torch.hub.list('pytorch/fairseq')  # [..., 'transformer.wmt16.en-de', ... ]

# Load a transformer trained on WMT'16 En-De
# Note: WMT'19 models use fastBPE instead of subword_nmt, see instructions below
en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt16.en-de',
                       tokenizer='moses', bpe='subword_nmt')
en2de.eval()  # disable dropout

# The underlying model is available under the *models* attribute
assert isinstance(en2de.models[0], fairseq.models.transformer.TransformerModel)

# Move model to GPU for faster translation
# en2de.cuda()

# Translate a sentence
# (not displayed by the notebook: only the last expression of a cell is shown)
en2de.translate('Hello world!')
# 'Hallo Welt!'

# Batched translation
en2de.translate(['Hello world!', 'The cat sat on the mat.'])
# ['Hallo Welt!', 'Die Katze saß auf der Matte.']

2022-05-02 04:29:06 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
Downloading: "https://github.com/pytorch/fairseq/archive/main.zip" to /root/.cache/torch/hub/main.zip
Using cache found in /root/.cache/torch/hub/pytorch_fairseq_main
2022-05-02 04:29:10 | INFO | fairseq.file_utils | https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2 not found in cache, downloading to /tmp/tmpy9uuosu6
  7%|▋         | 150277120/2193287384 [00:05<01:50, 18416384.24B/s]

KeyboardInterrupt: ignored

## Prepare & Train IWSLT'14 German to English (Transformer)

In [None]:
# Download and prepare the data
# The preparation script is run from examples/translation/ (relative to the
# current directory); afterwards the working directory is moved back up two levels.
%cd examples/translation/
!bash prepare-iwslt14.sh
%cd ../..

/content/fairseq/examples/translation
Cloning Moses github repository (for tokenization scripts)...
Cloning into 'mosesdecoder'...
remote: Enumerating objects: 148090, done.[K
remote: Counting objects: 100% (518/518), done.[K
remote: Compressing objects: 100% (223/223), done.[K
^C
/content/fairseq


In [None]:
# Preprocess/binarize the data
# `%env` sets TEXT so that the shell command below can expand $TEXT.
# The binarized dataset is written to Google Drive (--destdir).
%env TEXT=examples/translation/iwslt14.tokenized.de-en
!fairseq-preprocess --source-lang de --target-lang en \
    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
    --destdir /content/drive/MyDrive/translation/iwslt14.tokenized.de-en \
    --workers 20


env: TEXT=examples/translation/iwslt14.tokenized.de-en
/bin/bash: fairseq-preprocess: command not found


In [None]:
# Train the model on the binarized IWSLT'14 de-en data stored on Google Drive.
# Adjust --max-epoch depending on whether you train on GPU or CPU.
# Checkpoints are saved to Drive (--save-dir); the best one is selected by
# validation BLEU (--best-checkpoint-metric bleu).
%env CUDA_VISIBLE_DEVICES=0 
!fairseq-train \
    /content/drive/MyDrive/translation/iwslt14.tokenized.de-en \
    --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-epoch 15 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --save-dir /content/drive/MyDrive/translation/iwslt14.tokenized.de-en/checkpoints


env: CUDA_VISIBLE_DEVICES=0
/bin/bash: fairseq-train: command not found


In [None]:
# Evaluate our model
# Generates translations with `checkpoint_best.pt` from the save directory used
# by the training cell above (beam size 5, BPE markers removed from the output).
!fairseq-generate /content/drive/MyDrive/translation/iwslt14.tokenized.de-en \
    --path /content/drive/MyDrive/translation/iwslt14.tokenized.de-en/checkpoints/checkpoint_best.pt \
    --batch-size 128 --beam 5 --remove-bpe

/bin/bash: fairseq-generate: command not found


## Prepare IWSLT'15 English - Vietnamese data

In [5]:
# Extract the en-vi archive stored on Google Drive into ./en-vi and copy the
# custom preparation script from Drive into the current directory.
!unzip /content/drive/MyDrive/translation/en-vi.zip -d en-vi
!cp /content/drive/MyDrive/translation/myprepare-iwslt15-en-vi.sh .

Archive:  /content/drive/MyDrive/translation/en-vi.zip
  inflating: en-vi/IWSLT15.TED.dev2010.en-vi.en  
  inflating: en-vi/IWSLT15.TED.dev2010.en-vi.vi  
  inflating: en-vi/IWSLT15.TED.tst2010.en-vi.en  
  inflating: en-vi/IWSLT15.TED.tst2010.en-vi.vi  
  inflating: en-vi/IWSLT15.TED.tst2011.en-vi.en  
  inflating: en-vi/IWSLT15.TED.tst2011.en-vi.vi  
  inflating: en-vi/IWSLT15.TED.tst2012.en-vi.en  
  inflating: en-vi/IWSLT15.TED.tst2012.en-vi.vi  
  inflating: en-vi/IWSLT15.TED.tst2013.en-vi.en  
  inflating: en-vi/IWSLT15.TED.tst2013.en-vi.vi  
  inflating: en-vi/IWSLT15.TED.tst2015.en-vi.en  
  inflating: en-vi/IWSLT15.TED.tst2015.en-vi.vi  
  inflating: en-vi/train.en          
  inflating: en-vi/train.en-vi       
  inflating: en-vi/train.tags.en-vi.clean.en  
  inflating: en-vi/train.tags.en-vi.clean.vi  
  inflating: en-vi/train.tags.en-vi.en  
  inflating: en-vi/train.tags.en-vi.tok.en  
  inflating: en-vi/train.tags.en-vi.tok.vi  
  inflating: en-vi/train.tags.en-vi.vi  
  i

In [6]:
# Run the custom preparation script that was copied from Drive in the previous cell.
# NOTE(review): the IWSLT'14 cell runs its script with `bash`, this one with `sh` —
# confirm the script does not rely on bash-only features.
!sh myprepare-iwslt15-en-vi.sh
# After this, we have `train valid test` in iwslt15.tokenized.en-vi

Cloning Moses github repository (for tokenization scripts)...
Cloning into 'mosesdecoder'...
remote: Enumerating objects: 148090, done.[K
remote: Counting objects: 100% (518/518), done.[K
remote: Compressing objects: 100% (223/223), done.[K
remote: Total 148090 (delta 319), reused 443 (delta 292), pack-reused 147572[K
Receiving objects: 100% (148090/148090), 129.87 MiB | 18.58 MiB/s, done.
Resolving deltas: 100% (114345/114345), done.
Cloning Subword NMT repository (for BPE pre-processing)...
Cloning into 'subword-nmt'...
remote: Enumerating objects: 590, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 590 (delta 3), reused 4 (delta 1), pack-reused 576[K
Receiving objects: 100% (590/590), 245.03 KiB | 4.90 MiB/s, done.
Resolving deltas: 100% (352/352), done.
Creating backup of train.en train.vi
creating train, valid, test...
learn_bpe.py on en-vi/train.en-vi...
100% 10000/10000 [00:12<00:00, 813.07it/s]
ap

In [7]:
# Preprocess/binarize the data
# `%env` sets TEXT so that the shell command below can expand $TEXT.
# The binarized dataset is written to Google Drive (--destdir).
%env TEXT=iwslt15.tokenized.en-vi
!fairseq-preprocess --source-lang en --target-lang vi \
    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
    --destdir /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/dataset \
    --workers 20

env: TEXT=iwslt15.tokenized.en-vi
2022-05-06 15:02:56 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-05-06 15:02:56 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/content/drive/MyDrive/iwslt15.tokenized.en-vi/dataset', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=F

## Train IWSLT'15 English-Vietnamese (Transformer)

In [None]:
# Architecture: Transformer. `transformer_iwslt_de_en` is used here to reuse that
# predefined architecture for the en-vi data.
# Train the model; adjust --max-epoch depending on whether you train on GPU or CPU.
# See also fairseq's stop-time-hours option (presumably to cap training time — confirm).
# NOTE(review): checkpoints are saved to the relative `checkpoints` directory (the
# Drive save-dir is commented out at the end of this cell), while the evaluation
# cell below loads the checkpoint from the Drive path — confirm which is intended.
%env CUDA_VISIBLE_DEVICES=0 
!fairseq-train \
    /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/dataset \
    --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-epoch 36 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --save-dir checkpoints
    # --save-dir /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/checkpoints


env: CUDA_VISIBLE_DEVICES=0
/bin/bash: fairseq-train: command not found


In [None]:
# Evaluate our model
# Generates translations with `checkpoint_best.pt` (beam size 5, BPE markers removed).
# NOTE(review): the training cell above currently passes `--save-dir checkpoints`
# (its Drive save-dir is commented out), so the Drive checkpoint path used here
# only exists if the checkpoints were written or copied there — confirm.
!fairseq-generate /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/dataset \
    --path /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/checkpoints/checkpoint_best.pt \
    --batch-size 128 --beam 5 --remove-bpe

/bin/bash: fairseq-generate: command not found


## Try to add FuzzyLayer to Transformer


In [None]:
# FuzzyLayer
import torch
import math
from torch import nn

class FuzzyLayer(nn.Module):
    """Exponential membership layer with ``output_dim`` learnable units.

    The layer owns two parameter vectors of length ``output_dim``:
    ``fuzzy_degree`` (subtracted from the input) and ``sigma`` (whose square
    divides that difference).

    For an input whose last dimension holds the features, unit ``j`` outputs
    ``exp(-sum_i ((input[..., i] - fuzzy_degree[j]) / sigma[j] ** 2) ** 2)``,
    so the feature dimension is replaced by a dimension of size ``output_dim``.

    Args:
        output_dim: Number of units, i.e. the size of the last output dimension.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, output_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.output_dim = output_dim
        self.fuzzy_degree = nn.Parameter(torch.empty(output_dim))
        self.sigma = nn.Parameter(torch.ones(output_dim))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialize both parameters.

        ``fuzzy_degree`` is drawn with ``nn.init.uniform_`` (default bounds) and
        ``sigma`` is set back to ones, matching its value at construction.
        """
        nn.init.uniform_(self.fuzzy_degree)
        nn.init.ones_(self.sigma)

    def forward(self, input, **kwargs):
        """Apply the membership function.

        Args:
            input: Tensor with at least one dimension; the last one is reduced.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            Tensor shaped like ``input`` with the last dimension replaced by
            ``output_dim``.
        """
        # (..., features, 1) broadcasts against the (output_dim,) parameters, so
        # no explicit repetition of the input along a new axis is needed.
        diff = input.unsqueeze(-1) - self.fuzzy_degree
        scaled = diff / (self.sigma ** 2)
        # Reduce over the feature axis (second to last after the unsqueeze).
        return torch.exp(-torch.square(scaled).sum(dim=-2))


class FuzzyRuleLayer(nn.Module):
    """Element-wise product of ``input_dim`` independent ``FuzzyLayer`` outputs.

    Every sub-layer receives the complete input; their outputs are multiplied
    together into a single ``(batch_size, output_dim)`` tensor.

    Args:
        input_dim: Number of ``FuzzyLayer`` sub-layers to create.
        output_dim: Output size of every sub-layer (and of this layer).
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, input_dim: int, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layers = nn.ModuleList([
            FuzzyLayer(output_dim) for _ in range(input_dim)
        ])

    def forward(self, input):
        """Multiply the outputs of all sub-layers.

        Args:
            input: 2-D tensor; unpacking ``input.size()`` raises for any other rank.

        Returns:
            Tensor of shape ``(batch_size, output_dim)``.
        """
        batch_size, _ = input.size()
        # Create the accumulator on the input's device; a plain torch.ones(...)
        # is always allocated on the CPU and fails once the input is on the GPU.
        an = torch.ones(batch_size, self.output_dim, device=input.device)
        for layer in self.layers:
            an = an * layer(input)
        return an

# Smoke test: 5 sub-layers with 35 outputs each, applied to a batch of 3 all-ones rows.
z = FuzzyRuleLayer(5, 35)
y = torch.ones(3, 5)
# Run the forward pass once and reuse the result for both printouts.
out = z(y)
print(out.shape)
print(out)

torch.Size([3, 35])
tensor([[6.7112e-06, 6.8293e-04, 1.1360e-05, 1.4338e-05, 3.1253e-06, 4.4289e-04,
         1.2499e-04, 2.6953e-04, 5.7319e-06, 3.1216e-05, 5.5655e-05, 3.2802e-04,
         3.7528e-02, 1.1846e-05, 7.1205e-07, 2.0275e-06, 7.1357e-07, 4.7466e-04,
         1.9231e-06, 2.3387e-03, 8.4445e-06, 1.2212e-04, 1.2321e-04, 4.9719e-04,
         9.7817e-03, 7.2931e-03, 2.3511e-06, 5.0083e-02, 1.0146e-05, 1.0202e-06,
         5.4978e-04, 8.7630e-04, 9.0879e-02, 2.8873e-08, 5.0999e-03],
        [6.7112e-06, 6.8293e-04, 1.1360e-05, 1.4338e-05, 3.1253e-06, 4.4289e-04,
         1.2499e-04, 2.6953e-04, 5.7319e-06, 3.1216e-05, 5.5655e-05, 3.2802e-04,
         3.7528e-02, 1.1846e-05, 7.1205e-07, 2.0275e-06, 7.1357e-07, 4.7466e-04,
         1.9231e-06, 2.3387e-03, 8.4445e-06, 1.2212e-04, 1.2321e-04, 4.9719e-04,
         9.7817e-03, 7.2931e-03, 2.3511e-06, 5.0083e-02, 1.0146e-05, 1.0202e-06,
         5.4978e-04, 8.7630e-04, 9.0879e-02, 2.8873e-08, 5.0999e-03],
        [6.7112e-06, 6.8293e-0

## Try to add MyLSTM to fairseq (CPU only)


### Some notes
+ Follow the tutorial at https://fairseq.readthedocs.io/en/latest/tutorial_simple_lstm.html and add the `@register_model` and `@register_model_architecture` decorators to the appropriate model class and architecture function.

+ That's not all, you must either:

1. Move your `model_name.py` to `fairseq/models`

2. Inside your own directory (say, `user_dir`), create a new folder named `models` and move your file there. Then pass `--user-dir /path/to/user_dir` to the command-line tool (such as `fairseq-train`).

This is required because every time a command-line tool runs, it first executes some bootstrap code to collect the list of available models, parse your arguments, and so on. Your model is only found if it follows these rules.

In [None]:
# Make mymodel path and prepare my_model (in this case `MyLSTM.py`) in user_dir (in this case `mymodel`)
%cd /content
!rm -rf my_transformer
!git clone https://github.com/gigajet/transformer my_transformer
!mkdir mymodel
%cd mymodel
!mkdir models
%cd models 
!cp /content/my_transformer/MyFairseqLSTM.py .
!cp -r /content/my_transformer/layer .


/content
fatal: destination path 'my_transformer' already exists and is not an empty directory.
mkdir: cannot create directory ‘mymodel’: File exists
/content/mymodel
mkdir: cannot create directory ‘models’: File exists
/content/mymodel/models


In [None]:
# Train the custom LSTM (`--arch mylstm_default`), which fairseq loads from the
# user dir /content/mymodel, on the binarized IWSLT'14 de-en data.
# Adjust --max-epoch depending on whether you train on GPU or CPU (1 epoch here).
# Note that `--save-dir checkpoints` is relative to the current working directory.
%env CUDA_VISIBLE_DEVICES=0 
!fairseq-train \
    /content/drive/MyDrive/translation/iwslt14.tokenized.de-en \
    --encoder-dropout 0.2 --decoder-dropout 0.2 \
    --optimizer adam --lr 0.005 --lr-shrink 0.5 \
    --max-epoch 1 \
    --max-tokens 12000 \
    --save-dir checkpoints \
    --user-dir /content/mymodel \
    --arch mylstm_default \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric

env: CUDA_VISIBLE_DEVICES=0
2022-04-22 10:02:58 | INFO | numexpr.utils | NumExpr defaulting to 2 threads.
2022-04-22 10:02:59 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-04-22 10:03:01 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/content/mymodel', 'empty_cache_freq': 0, 'all_gath

## Add MyTransformer to fairseq and train it using IWSLT15 en-vi dataset

In [23]:
# Make mymodel path and prepare my_model (in this case `MyLSTM.py`) in user_dir (in this case `mymodel`)
%cd /content
!rm -rf my_transformer
!git clone https://github.com/gigajet/transformer my_transformer
!mkdir mymodel
%cd mymodel
!mkdir models
%cd models 
!cp /content/my_transformer/MyFairseqTransformer.py .
!cp /content/my_transformer/nnFairseqTransformer.py .
!cp -r /content/my_transformer/layer .
%cd /content/fairseq


/content
Cloning into 'my_transformer'...
remote: Enumerating objects: 215, done.[K
remote: Counting objects: 100% (215/215), done.[K
remote: Compressing objects: 100% (147/147), done.[K
remote: Total 215 (delta 118), reused 158 (delta 61), pack-reused 0[K
Receiving objects: 100% (215/215), 27.84 MiB | 38.31 MiB/s, done.
Resolving deltas: 100% (118/118), done.
mkdir: cannot create directory ‘mymodel’: File exists
/content/mymodel
mkdir: cannot create directory ‘models’: File exists
/content/mymodel/models
/content/fairseq


In [24]:
# Rewrite every `from layer.` import to `from mymodel.models.layer.` in all files
# under /content/mymodel (in place, via sed).
# NOTE(review): presumably needed because fairseq imports the --user-dir as the
# package `mymodel`, so the bare `layer` package cannot be resolved — confirm.
!find /content/mymodel -type f -exec sed -i "s/from layer\./from mymodel.models.layer./g" {} \;

In [26]:
# Use MyTransformer to train en-vi data: the custom architecture
# `nntransformer_default` is loaded from the user dir /content/mymodel.
# Adjust --max-epoch depending on whether you train on GPU or CPU.
# See also fairseq's stop-time-hours option (presumably to cap training time — confirm).
# NOTE(review): --save-dir points under /content/drive/MyDrive/iwslt15.tokenized.en-vi,
# whereas the dataset is read from /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi —
# confirm that the missing `translation/` component is intentional.
%env CUDA_VISIBLE_DEVICES=0 
!fairseq-train \
    /content/drive/MyDrive/translation/iwslt15.tokenized.en-vi/dataset \
    --arch nntransformer_default \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-epoch 100 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --save-dir /content/drive/MyDrive/iwslt15.tokenized.en-vi/checkpoints-nntransformer \
    --user-dir /content/mymodel


env: CUDA_VISIBLE_DEVICES=0
2022-05-06 15:12:28 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-05-06 15:12:30 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/content/mymodel', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': No