# Example Transformer model for machine translation (English to Kimbundu, en → kmb)

## Dependencies

In [1]:
# Install the OPUS tools package.
# NOTE(review): presumably this provides the `opus_read` command used in the
# data-gathering cell — confirm. The version is not pinned.
! pip install opustools-pkg

Collecting opustools-pkg
[?25l  Downloading https://files.pythonhosted.org/packages/6c/9f/e829a0cceccc603450cd18e1ff80807b6237a88d9a8df2c0bb320796e900/opustools_pkg-0.0.52-py3-none-any.whl (80kB)
[K     |████                            | 10kB 19.8MB/s eta 0:00:01[K     |████████                        | 20kB 3.1MB/s eta 0:00:01[K     |████████████▏                   | 30kB 4.1MB/s eta 0:00:01[K     |████████████████▏               | 40kB 3.0MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 3.2MB/s eta 0:00:01[K     |████████████████████████▎       | 61kB 3.8MB/s eta 0:00:01[K     |████████████████████████████▎   | 71kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.5MB/s 
[?25hInstalling collected packages: opustools-pkg
Successfully installed opustools-pkg-0.0.52


In [2]:
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

Cloning into 'joeynmt'...
remote: Enumerating objects: 2380, done.[K
remote: Total 2380 (delta 0), reused 0 (delta 0), pack-reused 2380[K
Receiving objects: 100% (2380/2380), 2.60 MiB | 30.59 MiB/s, done.
Resolving deltas: 100% (1669/1669), done.
Processing /content/joeynmt
Collecting sacrebleu>=1.3.6
[?25l  Downloading https://files.pythonhosted.org/packages/6e/9d/9846507837ca50ae20917f59d83b79246b8313bd19d4f5bf575ecb98132b/sacrebleu-1.4.9-py3-none-any.whl (60kB)
[K     |████████████████████████████████| 61kB 2.9MB/s 
[?25hCollecting subword-nmt
  Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl
Collecting pyyaml>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/64/c2/b80047c7ac2478f9501676c988a5411ed5572f35d1beff9cae07d321512c/PyYAML-5.3.1.tar.gz (269kB)
[K     |████████████████████████████████| 276kB 12.3MB/s 
[?25hCollecting pylint
[?25l  Downloading h

## Imports

In [0]:
from os import path
import os
import time

import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer

## Data Gathering

In [0]:
# Language pair used throughout the notebook (source -> target).
source_language, target_language = 'en', 'kmb'

# Export the settings as environment variables so that the `!` shell cells
# can refer to them as $data_path, $src and $tgt.
os.environ.update({
    "data_path": path.join("joeynmt", "data", source_language + target_language),
    "src": source_language,
    "tgt": target_language,
})

In [5]:
# JW300 data
! opus_read -d JW300 -s $tgt -t $src -wm moses -w jw300.$tgt jw300.$src -q

source = []
target = []
with open('jw300.' + source_language) as f:
  for _, line in enumerate(f):
    source.append(line.strip())
with open('jw300.' + target_language) as f:
  for _, line in enumerate(f):
    target.append(line.strip())

jw300_raw = []
for idx, line in enumerate(source):
  if len(line) > 2:
    if len(target[idx]) > 2:
      jw300_raw.append([line, target[idx]])

jw300 = pd.DataFrame(jw300_raw, columns=['source_sentence', 'target_sentence'])
jw300.head(3)


Alignment file /proj/nlpl/data/OPUS/JW300/latest/xml/en-kmb.xml.gz not found. The following files are available for downloading:

 920 KB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en-kmb.xml.gz
 263 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en.zip
  10 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/kmb.zip

 274 MB Total size
./JW300_latest_xml_en-kmb.xml.gz ... 100% of 920 KB
./JW300_latest_xml_en.zip ... 100% of 263 MB
./JW300_latest_xml_kmb.zip ... 100% of 10 MB


Unnamed: 0,source_sentence,target_sentence
0,Table of Contents,Iala – mu
1,"December 1 , 2010",1 Ua Katatu Ua 2011
2,Who Inhabit the Spirit Realm ?,O Kuiala ku Diulu Kuene Muene Athu mu Nzumbi


In [6]:
# Common test data
source_test_file = 'test.en-' + target_language + '.en'
target_test_file = 'test.en-' + target_language + '.' + target_language

! wget https://raw.githubusercontent.com/jaderabbit/masakhane/master/jw300_utils/test/test.en-$tgt.en
! wget https://raw.githubusercontent.com/jaderabbit/masakhane/master/jw300_utils/test/test.en-$tgt.$tgt

source = []
target = []
with open(source_test_file) as f:
  for _, line in enumerate(f):
    source.append(line.strip())
with open(target_test_file) as f:
  for _, line in enumerate(f):
    target.append(line.strip())

! rm test.en-$tgt.en
! rm test.en-$tgt.$tgt

test_raw = []
for idx, line in enumerate(source):
  if len(line) > 2:
    if len(target[idx]) > 2:
      test_raw.append([line, target[idx]])

df_test = pd.DataFrame(test_raw, columns=['source_sentence', 'target_sentence'])
df_test.head(3)

--2020-05-05 18:32:35--  https://raw.githubusercontent.com/jaderabbit/masakhane/master/jw300_utils/test/test.en-kmb.en
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 204945 (200K) [text/plain]
Saving to: ‘test.en-kmb.en’


2020-05-05 18:32:36 (7.47 MB/s) - ‘test.en-kmb.en’ saved [204945/204945]

--2020-05-05 18:32:36--  https://raw.githubusercontent.com/jaderabbit/masakhane/master/jw300_utils/test/test.en-kmb.kmb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 230668 (225K) [text/plain]
Saving to: ‘test.en-kmb.kmb’


2020-05-05 18:32:36 (8

Unnamed: 0,source_sentence,target_sentence
0,Dorcas “ abounded in good deeds and gifts of m...,"Dorka , “ [ uavudile ] jimbote ni jimola [ ja ..."
1,"What will be considered in this article , and ...","Ihi i tua - nda di longa ku mbandu íii , ni mu..."
2,Some names in this article have been changed .,Saí majina a a lungulula .


## Pre-processing

In [7]:
# drop test data from common
# The comparison is case-insensitive: the corpus is lower-cased in a later cell,
# so a training sentence that differs from a test sentence only by letter case
# would survive an exact-match filter and end up identical to a test sentence.
test_sources = set(df_test['source_sentence'].str.lower())
test_targets = set(df_test['target_sentence'].str.lower())
in_test = (
    jw300['source_sentence'].str.lower().isin(test_sources)
    | jw300['target_sentence'].str.lower().isin(test_targets)
)

# Chained (non-inplace) calls: the filtered frame is a slice of `jw300`, and
# mutating a slice in place can trigger SettingWithCopyWarning.
df_pp = (
    jw300[~in_test]
    # remove duplicates
    .drop_duplicates()
    # remove conflicting translations
    .drop_duplicates(subset='source_sentence')
    .drop_duplicates(subset='target_sentence')
)

# what's left in terms of number of samples?
len(df_pp)/len(jw300)

0.9117961694336439

In [0]:
# Renumber the rows of the filtered training set from 0. With drop=False the
# previous row labels are kept as a regular column named `index`.
df_pp = df_pp.reset_index(drop=False)

In [0]:
## Lower case the corpus
df_pp["source_sentence"] = df_pp["source_sentence"].str.lower()
df_pp["target_sentence"] = df_pp["target_sentence"].str.lower()
df_test["source_sentence"] = df_test["source_sentence"].str.lower()
df_test["target_sentence"] = df_test["target_sentence"].str.lower()

# shuffle the training/dev data
# A fixed seed makes the shuffle, and therefore the train/dev split, identical
# on every Restart & Run All.
SPLIT_SEED = 42
df_pp = df_pp.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Do the split between dev/train: the last `num_dev_patterns` shuffled rows
# become the dev set, everything before them is the training set.
num_dev_patterns = 1000
dev = df_pp.tail(num_dev_patterns)
stripped = df_pp.drop(dev.index)


def _write_parallel_corpus(prefix, frame):
  """Write `frame` as two aligned text files, one sentence per line.

  The files are named `<prefix>.<source_language>` and
  `<prefix>.<target_language>` and are written as UTF-8 regardless of the
  machine's locale (the text contains non-ASCII characters).
  """
  src_path = prefix + "." + source_language
  trg_path = prefix + "." + target_language
  with open(src_path, "w", encoding="utf-8") as src_file, \
       open(trg_path, "w", encoding="utf-8") as trg_file:
    for src_sentence, trg_sentence in zip(frame["source_sentence"],
                                          frame["target_sentence"]):
      src_file.write(src_sentence + "\n")
      trg_file.write(trg_sentence + "\n")


# output the final parallel corpus files
_write_parallel_corpus("train", stripped)
_write_parallel_corpus("dev", dev)
_write_parallel_corpus("test", df_test)

In [10]:
! head train.en

it hurt him even to contemplate the deed she wanted him to commit .
with the help of jehovah’s spirit , your children can have strong faith .
god can awaken the dead , just as we can awaken a person from sleep . ​ — job 14 : 13 - 15 .
so whether our home will be in heaven with jesus or on a paradise earth , pentecost of the year 33 is very important to us . ​ — see endnote .
but remember what jehoshaphat did .
• what are some ways married people can let spirituality guide them ?
“ the father incomprehensible , the son incomprehensible , and the holy ghost incomprehensible . ” ​ — the athanasian creed , describing the trinity taught by many churches of christendom .
why not ? the lamp’s light is diminishing so gradually that you are not aware of it . similarly , the influences of satan’s world may cause our zeal to diminish little by little .
but they are not accurate .
joshua did not ask god what to do when the gibeonites wanted to make an agreement with him .


In [11]:
! head train.kmb

o ku xinganeka ngó ku ima ia mesenene o muhatu kuila muéne u bhanga , kia luualesa kiavulu o muxima ua zuze .
ni kikuatekesu kia nzumbi ikola ia jihova , o tuana tué a tena kukala ni kixikanu kia kolo .
( nzuá 11 : 11 - 14 ) nzambi uala ni kutena kua kuphaphumuna uoso ua fu , kála ki tuene mu balumuna mukuetu ua mu zeka . ​ — jobe 14 : 13 - 15 .
mu kiki , kikale se tua kingila kutunga kumoxi ni jezú ku diulu , mba mu palaízu mu ixi , o fesa ia pendekoxi ia muvu ua 33 , iala ni valolo ia dikota phala etu . ​ — tala ku disukilu dia milongi .
nange tu kala ni uôma uavulu .
• mu ukexilu uahi o jikidistá a tokala kuehela o itumu ia nzambi ku a endesa ?
“ ki tu tena ku tendela o tata , o mona ki tu tena ku mu tendela , o nzumbi ikôla ué ki tu tena ku i tendela . ” — o milongi ia atanasiano , ia tilindade iene mu longa mu jingeleja javulu ja kidistándade .
kiki kia difu ni jindunge ja mundu ua satanaji , ji tena ku tu bhangesa ku zozesa o vondadi ietu ia ku sidivila jihova .
maji jene , ki ja

## Subword BPE Tokens

In [12]:
# Do BPE
! subword-nmt learn-joint-bpe-and-vocab --input train.$src train.$tgt -s 4000 -o bpe.codes.4000 --write-vocabulary vocab.$src vocab.$tgt

! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < train.$src > train.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < train.$tgt > train.bpe.$tgt
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < dev.$src > dev.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < dev.$tgt > dev.bpe.$tgt
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < test.$src > test.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < test.$tgt > test.bpe.$tgt

# Create directory, move everyone we care about to the correct location
! mkdir -p $data_path
! cp train.* $data_path
! cp test.* $data_path
! cp dev.* $data_path
! cp bpe.codes.4000 $data_path
! ls $data_path

# Create that vocab using build_vocab
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py joeynmt/data/$src$tgt/train.bpe.$src joeynmt/data/$src$tgt/train.bpe.$tgt --output_path joeynmt/data/$src$tgt/vocab.txt

# Some output
! echo "BPE Target Sentences"
! tail -n 5 test.bpe.$tgt
! echo "Combined BPE Vocab"
! tail -n 10 joeynmt/data/en$tgt/vocab.txt

bpe.codes.4000	dev.en	     test.bpe.kmb  train.bpe.en   train.kmb
dev.bpe.en	dev.kmb      test.en	   train.bpe.kmb
dev.bpe.kmb	test.bpe.en  test.kmb	   train.en
BPE Target Sentences
o ngu@@ b@@ u ya kuxikana ( tala o kaxi 12 - 14 )
o ka@@ pas@@ e@@ te ka kubh@@ uluka ( tala o kaxi 15 - 18 )
nga mono kwila o athu a xikina dingi se a mona kwila ey@@ e wa zolo mwene o bibidya , wa mu bhanga yoso i u tena phala ku a kwatekesa . ”
o xi@@ bhata ya nzumbi ikôla ( tala o kaxi 19 - 20 )
ni ki@@ kwat@@ ek@@ esu kya jihova tu tena kubh@@ ânga nê !
Combined BPE Vocab
iókio
sambu@@
mulang@@
ízu
iiale
langu@@
njin@@
kobo
fuxi
c.@@


## JoeyNMT Config

In [0]:
# Build the JoeyNMT YAML configuration for the current language pair and save
# it under joeynmt/configs/.
name = source_language + target_language

# YAML template; the {name}, {source_language} and {target_language}
# placeholders are filled in below.
config_template = """
name: "{name}_transformer"

data:
    src: "{source_language}"
    trg: "{target_language}"
    train: "data/{name}/train.bpe"
    dev:   "data/{name}/dev.bpe"
    test:  "data/{name}/test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    src_vocab: "data/{name}/vocab.txt"
    trg_vocab: "data/{name}/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    #load_model: "models/{name}_transformer/12000.ckpt" # if given, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "noam"            # Try switching from plateau to Noam scheduling
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    patience: 8
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0002
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "eval_metric" # "ppl"
    epochs: 40
    validation_freq: 2000
    logging_freq: 200
    eval_metric: "bleu"
    model_dir: "models/{name}_transformer"
    overwrite: True
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 512
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 512
        ff_size: 2048
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 512
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 512
        ff_size: 2048
        dropout: 0.3
"""

config = config_template.format(
    name=name,
    source_language=source_language,
    target_language=target_language,
)

# Save the rendered configuration where the training cell looks for it.
with open("joeynmt/configs/transformer_{name}.yaml".format(name=name), 'w') as config_file:
    config_file.write(config)

## Train the model

In [14]:
!cd joeynmt; python3 -m joeynmt train configs/transformer_$src$tgt.yaml

2020-05-05 18:37:35,212 Hello! This is Joey-NMT.
2020-05-05 18:37:35.336316: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-05-05 18:37:36,615 Total params: 46259200
2020-05-05 18:37:36,616 Trainable parameters: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.dec_layer_norm.bias', 'decoder.layers.0.dec_layer_norm.weight', 'decoder.layers.0.feed_forward.layer_norm.bias', 'decoder.layers.0.feed_forward.layer_norm.weight', 'decoder.layers.0.feed_forward.pwff_layer.0.bias', 'decoder.layers.0.feed_forward.pwff_layer.0.weight', 'decoder.layers.0.feed_forward.pwff_layer.3.bias', 'decoder.layers.0.feed_forward.pwff_layer.3.weight', 'decoder.layers.0.src_trg_att.k_layer.bias', 'decoder.layers.0.src_trg_att.k_layer.weight', 'decoder.layers.0.src_trg_att.output_layer.bias', 'decoder.layers.0.src_trg_att.output_layer.weight', 'decoder.layers.0.src_trg_att.q_layer.bias', 'decoder.layers.0.src_trg_

In [0]:
! cat joeynmt/models/enkmb_transformer/validations.txt

Steps: 2000	Loss: 47097.83984	PPL: 8.26328	bleu: 14.85684	LR: 0.00049411	*
Steps: 4000	Loss: 39856.20312	PPL: 5.97219	bleu: 20.91001	LR: 0.00034939	*
Steps: 6000	Loss: 37121.82812	PPL: 5.28307	bleu: 23.80892	LR: 0.00028527	*
Steps: 8000	Loss: 35674.49609	PPL: 4.95110	bleu: 25.00066	LR: 0.00024705	*
Steps: 10000	Loss: 34720.73828	PPL: 4.74383	bleu: 26.10309	LR: 0.00022097	*
Steps: 12000	Loss: 34417.08594	PPL: 4.67967	bleu: 26.15923	LR: 0.00020172	*
Steps: 14000	Loss: 34284.73047	PPL: 4.65198	bleu: 27.20639	LR: 0.00018675	*
Steps: 16000	Loss: 34352.42969	PPL: 4.66613	bleu: 27.29388	LR: 0.00017469	*
Steps: 18000	Loss: 34467.03906	PPL: 4.69017	bleu: 27.56135	LR: 0.00016470	*
Steps: 20000	Loss: 34512.35938	PPL: 4.69971	bleu: 27.82828	LR: 0.00015625	*
Steps: 22000	Loss: 34950.40234	PPL: 4.79293	bleu: 27.65960	LR: 0.00014898	
Steps: 24000	Loss: 35239.23828	PPL: 4.85541	bleu: 27.80829	LR: 0.00014264	
Steps: 26000	Loss: 35323.68750	PPL: 4.87383	bleu: 28.57189	LR: 0.00013704	*
Steps: 28000	Loss:

## Evaluate the model on the test set

In [0]:
! cd joeynmt; python3 -m joeynmt test models/enkmb_transformer/config.yaml 

2020-02-04 22:38:57,549 Hello! This is Joey-NMT.
2020-02-04 22:39:38,351  dev bleu:  28.81 [Beam search decoding with beam size = 5 and alpha = 1.0]
2020-02-04 22:41:09,183 test bleu:  32.76 [Beam search decoding with beam size = 5 and alpha = 1.0]
