In [37]:
import sys
import os
import numpy as np
import pandas as pd
from igbo_text import IgboText
from keras.optimizers import Adam
from keras.preprocessing.text import text_to_word_sequence
from collections import Counter
sys.path.append(os.path.join('..'))
import dataloader
from transformer import transformer, extras

In [38]:
# Parallel corpus: the 'ig' and 'en' columns are read from this frame further down.
dataset = pd.read_csv('../data/data.csv')

# Igbo-aware tokenizer used when preprocessing the 'ig' column.
igbo_text = IgboText()

In [39]:
# Two independent, empty accumulators:
#   splitted_igbo - flat list of every Igbo token
#   igbo_list     - one token list per Igbo sentence
splitted_igbo, igbo_list = [], []

In [40]:
# Raw Igbo sentences, in row order, as a plain Python list.
igbo_data = dataset.loc[:, 'ig'].tolist()

In [41]:
# Clean and tokenize every Igbo sentence.
#
# Both accumulators are rebuilt from scratch inside this cell, so running the
# cell more than once cannot append the whole corpus a second time (which
# would break the one-to-one pairing with the English sentences).

# Characters replaced by a space before tokenizing. The double quote and the
# hyphen are not in this string, so they are left in the text.
igbo_filters = '!#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'

igbo_list = []
for text in igbo_data:
    # Strip the filtered characters and re-join the remaining pieces with
    # single spaces. Case is left untouched here (lower=False); lower-casing
    # is delegated to igbo_text.tokenize below.
    text = " ".join(text_to_word_sequence(text, filters=igbo_filters, lower=False, split=' '))
    if text.strip() == "":
        # Nothing is left after filtering: the sentence is skipped.
        # NOTE: the English sentence of the same row is not dropped here, so
        # the English side has to be re-aligned before the pairs are built.
        continue
    igbo_list.append(igbo_text.tokenize(text, convert_to_lower=True))

# Flat list of every token, in corpus order (input of the vocabulary counter).
splitted_igbo = [token for sentence_tokens in igbo_list for token in sentence_tokens]

In [42]:
# Frequency table of the Igbo tokens; its keys are the distinct tokens.
igbo_counter = Counter()
igbo_counter.update(splitted_igbo)

In [43]:
# Number of distinct Igbo tokens in the counter.
len(igbo_counter)

18513

In [44]:
# Raw English sentences, in row order, as a plain Python list.
english_data = dataset.loc[:, 'en'].tolist()

In [45]:
# Two independent, empty accumulators:
#   splitted_english - flat list of every English token
#   english_list     - one token list per English sentence
splitted_english, english_list = [], []

In [46]:
# Tokenize every English sentence with the default text_to_word_sequence
# settings.
#
# Both lists are rebuilt from english_data on every run, so executing this
# cell again cannot append the corpus a second time.

# One token list per sentence, in row order.
english_list = [text_to_word_sequence(text) for text in english_data]

# Flat list of every token, in corpus order (input of the vocabulary counter).
splitted_english = [token for sentence_tokens in english_list for token in sentence_tokens]

In [69]:
# Re-align english_list with igbo_list.
#
# The Igbo preprocessing loop skips every sentence that is empty after
# filtering, so the English side has to drop exactly the same rows. Deleting
# hard-coded positions goes stale as soon as the data changes and removes two
# more sentences each time the cell is re-run. Instead, the rows to keep are
# derived with the same emptiness test the Igbo loop applies, and english_list
# is rebuilt from english_data so the cell can be executed any number of times.

# Same filter string as the Igbo preprocessing loop.
igbo_filters = '!#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'

english_list = [
    text_to_word_sequence(english_text)
    for english_text, igbo_source in zip(english_data, igbo_data)
    if " ".join(
        text_to_word_sequence(igbo_source, filters=igbo_filters, lower=False, split=' ')
    ).strip() != ""
]

# Every remaining English sentence must have exactly one Igbo counterpart.
assert len(english_list) == len(igbo_list), (
    "english_list (%d) and igbo_list (%d) are out of step"
    % (len(english_list), len(igbo_list))
)

In [47]:
# Frequency table of the English tokens; its keys are the distinct tokens.
english_counter = Counter()
english_counter.update(splitted_english)

In [48]:
# Number of distinct English tokens in the counter.
len(english_counter)

18245

In [49]:
# Wrap each vocabulary (the distinct tokens, in first-seen order) in a
# dataloader.TokenList for the data pipeline and the model.
igbo_tokens = dataloader.TokenList(list(igbo_counter.keys()))
eng_tokens = dataloader.TokenList(list(english_counter.keys()))

In [50]:
# Size of the Igbo token list.
# NOTE(review): the recorded value is 4 larger than len(igbo_counter) —
# presumably TokenList adds special tokens; confirm in dataloader.
igbo_tokens.length()

18517

In [51]:
# Size of the English token list.
# NOTE(review): the recorded value is 4 larger than len(english_counter) —
# presumably TokenList adds special tokens; confirm in dataloader.
eng_tokens.length()

18249

In [55]:
# Turn the per-sentence token lists into the two model inputs.
# NOTE(review): presumably make_data pairs english_list[i] with igbo_list[i],
# so both lists must have the same length and order — confirm in dataloader.
english, igbo = dataloader.make_data(english_list, igbo_list, eng_tokens, igbo_tokens)

In [63]:
# Shared size passed both to the Transformer constructor and to the
# learning-rate scheduler further down.
model_size = 512

In [57]:
# Dimensions of the Igbo array returned by make_data
# (presumably sentences x padded length — TODO confirm).
igbo.shape

(45163, 108)

In [65]:
# Build the model from the English and Igbo token lists.
# NOTE(review): 2048, 6 and 8 are passed positionally; presumably the
# feed-forward width, the number of layers and the number of attention heads —
# confirm against the transformer.Transformer signature.
trans = transformer.Transformer(eng_tokens, igbo_tokens, model_size, 2048, 6, 8)

In [66]:
# Compile with Adam: base learning rate 0.001, beta_1 = 0.9, beta_2 = 0.98.
trans.compile(
    Adam(0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
)

In [67]:
# Print the layer-by-layer summary of the model wrapped by `trans`.
trans.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embeddings_1 (Embeddings)       (None, None, 512)    9343488     input_1[0][0]                    
__________________________________________________________________________________________________
position_encoding_1 (PositionEn (None, None, 512)    0           embeddings_1[0][0]               
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 512)    0           position_encoding_1[0][0]        
__________________________________________________________________________________________________
layer_norm

In [68]:
# Callback handed to fit() below.
# NOTE(review): presumably adjusts the learning rate on every training step
# from model_size with 4000 warm-up steps — confirm in extras.LRSchedulerPerStep.
lr_scheduler = extras.LRSchedulerPerStep(model_size, 4000)

In [None]:
# Train for 50 epochs with batches of 64, using the scheduler callback.
# The targets argument is None — presumably the model computes its loss from
# the two inputs internally; confirm in transformer.Transformer.compile.
trans.model.fit([english, igbo], None, batch_size=64, epochs=50, callbacks=[lr_scheduler])

In [None]:
# Decode a sample English sentence with the trained model.
sentence = "Let it go"
# NOTE(review): len(sentence) counts characters, not tokens, so len_limit is
# 9 + 30 = 39 here — confirm which unit decode_sequence expects.
decoded = trans.decode_sequence(sentence, len_limit=len(sentence) + 30)

In [None]:
# Show the decoded output for the sample sentence.
print(decoded)