# Train a Q&A model using Fairseq on Colab

In [None]:
# Install fairseq and the tokenisation/BPE helpers used by this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install fairseq sacrebleu==1.5.1 sacremoses
%pip install fastBPE subword_nmt omegaconf hydra-core

In [None]:
# Fetch the fairseq sources (the example scripts directory is used below).
# Skip the clone when the checkout already exists so the cell can be re-run.
![ -d fairseq ] || git clone https://github.com/pytorch/fairseq

In [None]:
import os

## Training


In [None]:
# Work inside fairseq's translation examples directory; the relative output
# directory created by the corpus-writing cell resolves against it.
%cd /content/fairseq/examples/translation/

In [None]:
import csv, random

# Read the question/answer pairs from the chit-chat TSV files and shuffle them
# with a fixed seed so the later train/valid/test split is reproducible.
input_files = ['/content/qna_chitchat_caring.tsv',
              '/content/qna_chitchat_witty.tsv',
              '/content/qna_chitchat_friendly.tsv',
              '/content/qna_chitchat_professional.tsv'
              ]

# Parallel lists: pair_list['src'][k] and pair_list['des'][k] come from the
# first and second column of the same TSV row.
pair_list = {
    'src':[],
    'des':[]
}

# NOTE(review): if these TSV files start with a header row, that row is read as
# an ordinary pair here - confirm against the data and skip it if so.
for path in input_files:
  # newline='' is what the csv module expects so it can handle line endings
  # itself; utf-8-sig decodes plain UTF-8 and also drops a leading BOM.
  with open(path, newline='', encoding='utf-8-sig') as file:
    for row in csv.reader(file, delimiter="\t"):
      if len(row) < 2:
        # skip if the line doesn't contain the pair of text
        continue
      pair_list['src'].append(row[0])
      pair_list['des'].append(row[1])

n = len(pair_list['src'])

# Shuffle a list of positions once and apply it to both columns so that
# questions and answers stay aligned.
indexes = list(range(n))
random.seed(1)
random.shuffle(indexes)

pair_list['src'] = [pair_list['src'][i] for i in indexes]
pair_list['des'] = [pair_list['des'][i] for i in indexes]

In [None]:
# Preview the first five source-side entries after shuffling.
pair_list['src'][:5]

In [None]:
# Preview the matching first five target-side entries.
pair_list['des'][:5]

In [None]:
# Cumulative split boundaries, as fractions of the shuffled data. Consecutive
# pairs of boundaries delimit the train, valid and test slices taken later.
split_ratio = [0, 0.90,0.98,1.00]

list_len = len(pair_list['src'])
print('number of sample: %d'%(list_len))

# Turn every fraction into an absolute list position (truncated by int()).
split_index = []
for ratio in split_ratio:
  split_index.append(int(ratio*list_len))

In [None]:
# Slice the shuffled pairs into train/valid/test and write one plain-text file
# per split and side: <output_dir>/tmp/<split>.<q|a>, one sentence per line.
dataset_name = 'qna_chitchat'
langs = {
    'src':'q',
    'des':'a'
}

output_dir = '%s.%s-%s'%(dataset_name, langs['src'], langs['des'])
os.makedirs(output_dir, exist_ok=True)
os.makedirs("%s/tmp"%output_dir, exist_ok=True)

corpus = {}

# split_index holds the cumulative boundaries; split number i covers
# [split_index[i], split_index[i+1]).
for i, s in enumerate(['train', 'valid', 'test']):
  start, end = split_index[i], split_index[i+1]
  corpus[s] = {}
  for p in pair_list.keys():
    corpus[s][p] = pair_list[p][start:end]

    with open('%s/tmp/%s.%s'%(output_dir, s, langs[p]), 'w', encoding='utf-8') as f:
      # Terminate every line, including the last one, so line-oriented tools
      # that concatenate or count these files see exactly one record per line.
      f.writelines(text + '\n' for text in corpus[s][p])
  print('number of %s: %d'%(s, len(corpus[s]['src'])))

In [None]:
# List the current directory to check what has been created so far.
!ls

In [None]:
%cd /content/fairseq/examples/translation/
# Fetch the helper repositories used by the preprocessing cell. Each clone is
# skipped when its directory already exists, so the cell can be re-run.
!echo 'Cloning Moses github repository (for tokenization scripts)...'
![ -d mosesdecoder ] || git clone https://github.com/moses-smt/mosesdecoder.git

!echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
![ -d subword-nmt ] || git clone https://github.com/rsennrich/subword-nmt.git


In [None]:
# Make sure the kernel is in the translation examples directory before the
# preprocessing cell runs.
%cd /content/fairseq/examples/translation/

In [None]:
%%shell
# Run the Moses tokenise -> clean -> lowercase pipeline on every split, then
# learn a BPE model on the training text and apply it to train/valid/test.
set -e  # abort on the first failing command instead of continuing with partial files

cd /content/fairseq/examples/translation/

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=10000

data_name=qna_chitchat
src=q
tgt=a
lang=$src-$tgt
prep=$data_name.$lang
tmp=$prep/tmp
orig=orig

mkdir -p $orig $tmp $prep

# NOTE(review): the BPE steps further down read $tmp/{train,valid,test}.$l and
# not the *.tags.* files produced by this loop, so the Moses output is not what
# ends up in $prep. Confirm which input is intended before changing it; the
# inference side would have to apply the same tokenisation and lowercasing.
for split in train valid test; do
    echo "pre-processing $split data..."
    for l in $src $tgt; do
        f=$tmp/$split.$l
        tok=$split.tags.$lang.tok.$l

        perl $TOKENIZER -threads 8 -l $l < $f > $tmp/$tok
        echo ""
    done

    # Arguments: length ratio 1.5, then min/max sentence length 1 and 175.
    perl $CLEAN -ratio 1.5 $tmp/$split.tags.$lang.tok $src $tgt $tmp/$split.tags.$lang.clean 1 175
    for l in $src $tgt; do
        perl $LC < $tmp/$split.tags.$lang.clean.$l > $tmp/$split.tags.$lang.$l
    done
done

TRAIN=$tmp/train.$src-$tgt

echo $TRAIN

BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
    # awk 1 re-emits every line with a terminating newline, so an input whose
    # last line is unterminated cannot be glued onto the next file's first line.
    awk 1 $tmp/train.$l >> $TRAIN
done

echo $TRAIN

# A single BPE model is learned on the concatenated question+answer text.
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
    done
done


In [None]:
# Return to the fairseq repository root; the binarisation cell below uses
# paths relative to it (examples/translation/..., data-bin/...).
%cd /content/fairseq/

In [None]:
%%bash
# Binarise the BPE-encoded train/valid/test files into data-bin/.

data_name=qna_chitchat
src=q
tgt=a
lang=$src-$tgt

# Input prefix (BPE output of the previous step) and output directory.
TEXT=examples/translation/$data_name.$lang
DEST=data-bin/$data_name.tokenized.$lang

# Remove the output of any previous run before regenerating it.
rm -rf $DEST

fairseq-preprocess \
    --source-lang $src \
    --target-lang $tgt \
    --trainpref $TEXT/train \
    --validpref $TEXT/valid \
    --testpref $TEXT/test \
    --destdir $DEST \
    --workers 20

In [None]:
# Export the dataset/language names as environment variables so the `!` shell
# lines below can read them as $data_name, $src and $tgt.
%env data_name=qna_chitchat
%env src=q
%env tgt=a
# Written out literally: IPython fills $name in magics from Python variables,
# not from the environment, so `$src-$tgt` would be stored unexpanded.
%env lang=q-a

# keep these files for inference later
!mkdir -p /content/$data_name.tokenized.$src-$tgt/
!cp /content/fairseq/examples/translation/$data_name.$src-$tgt/code \
      /content/$data_name.tokenized.$src-$tgt/
!cp -r /content/fairseq/data-bin/$data_name.tokenized.$src-$tgt/dict.$src.txt \
      /content/$data_name.tokenized.$src-$tgt/
!cp -r /content/fairseq/data-bin/$data_name.tokenized.$src-$tgt/dict.$tgt.txt \
      /content/$data_name.tokenized.$src-$tgt/
%cd /content/
!zip -rq $data_name.tokenized.$src-$tgt.zip $data_name.tokenized.$src-$tgt
# The run directory has to exist before the archive is copied into it; nothing
# earlier in the notebook creates it.
!mkdir -p /content/runs/fairseq/$src-$tgt/1/
!cp $data_name.tokenized.$src-$tgt.zip \
      /content/runs/fairseq/$src-$tgt/1/$data_name.tokenized.$src-$tgt.zip

# Train a transformer on the binarised data; checkpoints go to the run
# directory that already holds the archive copied above.
%cd fairseq/
!CUDA_VISIBLE_DEVICES=0 fairseq-train \
    data-bin/$data_name.tokenized.$src-$tgt \
    --save-dir /content/runs/fairseq/$src-$tgt/1/ \
    --arch transformer \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.0001 --lr-scheduler inverse_sqrt --warmup-updates 16 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 2048 \
    --no-epoch-checkpoints \
    --skip-invalid-size-inputs-valid-test \
    --max-epoch 256 \
    --encoder-embed-dim 256 \
    --decoder-embed-dim 256 \
    --validate-interval 16
# Optional flag, to be appended to the command above when wanted:
#   --finetune-from-model /content/runs/fairseq/$src-$tgt/1/checkpoint_best.pt

## Evaluation

In [None]:
# The data-bin/ path passed to fairseq-generate is relative to the fairseq
# checkout, so enter it explicitly instead of relying on the previous cell.
%cd /content/fairseq/

%env data_name=qna_chitchat
%env src=q
%env tgt=a
# Written out literally: IPython fills $name in magics from Python variables,
# not from the environment, so `$src-$tgt` would be stored unexpanded.
%env lang=q-a

# Decode the binarised data with the best checkpoint (beam size 5) and strip
# the BPE markers from the output.
!fairseq-generate \
    data-bin/$data_name.tokenized.$src-$tgt \
    --path /content/runs/fairseq/$src-$tgt/1/checkpoint_best.pt \
    --beam 5 --remove-bpe

## Testing

In [None]:
%cd /content/fairseq/

# Environment variables read by the `!` shell lines and the os.getenv() calls
# in the cells below.
%env data_name=qna_chitchat
%env src=q
%env tgt=a
# Written out literally: IPython fills $name in magics from Python variables,
# not from the environment, so `$src-$tgt` would be stored unexpanded.
%env lang=q-a

In [None]:
# Install the cloned fairseq checkout (the current directory) in editable mode.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --editable ./

In [None]:
from fairseq.models.transformer import TransformerModel
import warnings

In [None]:
train_num = 1

warnings.filterwarnings('ignore', category=UserWarning, module='fairseq')

# Names are rebuilt from the environment variables set in the cell above.
src_lang, tgt_lang = os.getenv('src'), os.getenv('tgt')
bundle_name = '%s.tokenized.%s-%s'%(os.getenv('data_name'), src_lang, tgt_lang)

checkpoint_dir = '/content/runs/fairseq/%s-%s/%s/'%(src_lang, tgt_lang, train_num)
checkpoint_file = 'checkpoint_best.pt'

# Restore the BPE codes and dictionaries that were archived before training.
%cd /content/
!cp /content/runs/fairseq/$src-$tgt/1/$data_name.tokenized.$src-$tgt.zip \
      /content/
# -o extracts and overwrites without prompting. -f (freshen) would only update
# files that already exist on disk, i.e. extract nothing on a fresh runtime.
!unzip -o $data_name.tokenized.$src-$tgt.zip

# The relative paths below resolve against /content/ (see %cd above), where the
# archive was just unpacked.
qna = TransformerModel.from_pretrained(
  checkpoint_dir,
  checkpoint_file=checkpoint_file,
  data_name_or_path=bundle_name,
  bpe='subword_nmt',
  bpe_codes='%s/code'%bundle_name
)

In [None]:
# Smoke-test the loaded model on one hand-written question and print whatever
# qna.translate() returns for it.
input_text = "hello, how do you do?"
output_text = qna.translate(input_text)
print(output_text)