In [24]:
import pandas as pd
from keras.models import load_model
import sys
import os
import numpy as np
sys.path.append(os.path.join('..'))
from models.utils import get_all_characters, get_vocab, get_inv_vocab, preprocess_input_data
from models.utils import clean_lines, get_short_sentences, decode_sequence
from models.rnn_attention import custom_model, AttentionLayer

In [2]:
# Load the parallel corpus from CSV. The preview of this frame shows English (`en`) and
# Igbo (`ig`) text columns next to two leftover "Unnamed" index columns.
dataset = pd.read_csv('../data/data.csv')

In [3]:
# Peek at the first rows to confirm the column layout before filtering.
dataset.head()

Unnamed: 0.1,Unnamed: 0,en,ig
0,0,into two,yiwa
1,1,"11 Like a dog that returns to its vomit, The s...",11 Dị nnọọ ka nkịta nke na-alaghachi n’agbọ ọ ...
2,2,rush,kpọwa
3,3,"4 So his father-in-law, the young woman’s fath...","4 Ọgọ nwoke ahụ, bụ́ nna nwa agbọghọ ahụ, ekwe..."
4,4,trap for animals,igbụdụ


In [4]:
# Limits handed to get_short_sentences together with the raw frame.
# NOTE(review): presumably Tx/Ty are maximum character lengths for the English/Igbo
# side respectively — confirm against models.utils.get_short_sentences.
Tx, Ty = 100, 120
english_texts, igbo_texts = get_short_sentences(dataset, Tx, Ty)

In [5]:
# Number of English sentences returned by get_short_sentences.
len(english_texts)

23452

In [6]:
# Number of Igbo sentences; expected to equal the English count so the pairs stay aligned.
len(igbo_texts)

23452

In [7]:
# Run the English sentences through clean_lines.
# NOTE(review): the meaning of the second argument (True) is not visible here — confirm
# in models.utils. This rebinds english_texts, so re-running the cell feeds
# already-cleaned text back into clean_lines.
english_texts = clean_lines(english_texts, True)

In [8]:
# Run the Igbo sentences through clean_lines with the same flag as the English side.
# NOTE(review): the meaning of the second argument (True) is not visible here — confirm
# in models.utils. This rebinds igbo_texts, so re-running the cell feeds
# already-cleaned text back into clean_lines.
igbo_texts = clean_lines(igbo_texts, True)

In [9]:
# Spot-check the same five examples (indices 110-114) on both sides, Igbo first,
# one list per output line.
print(igbo_texts[110:115], english_texts[110:115], sep='\n')

['\t fepu \n', '\t 56 O ga erukwa na di nnoo ka m bu n obi ime ha , otu ahu ka m ga eme unu . \n', '\t 163 Akporo m okwu ugha asi , ana m aso ya oyi . Ahuru m iwu gi n anya . \n', '\t aru ofufo \n', '\t O cho isekapu ofu aka ewi \n']
['\t hurry off \n', '\t 56 And I will do to you what I intended to do to them . \n', '\t 163 I hate falsehood I detest it I love your law . \n', '\t freedom from care \n', '\t She wants to tear off one foreleg of the giant rat \n']


In [10]:
# Length of the longest cleaned sentence on each side. Tx/Ty are rebound here, so from
# this point on they no longer hold the limits passed to get_short_sentences.
Tx = max(map(len, english_texts))
Ty = max(map(len, igbo_texts))
# One shared sequence length is used for both the encoder and decoder arrays.
timesteps = max(Tx, Ty)
print(Tx, Ty, timesteps)

112 128 128


In [11]:
# Gather the characters that occur in each language's cleaned sentences
# (English first, then Igbo).
english_characters, igbo_characters = map(
    get_all_characters, (english_texts, igbo_texts)
)

In [12]:
# Token counts per language: one token per collected character.
num_english_tokens, num_igbo_tokens = (
    len(english_characters),
    len(igbo_characters),
)

In [13]:
# Corpus summary: sentence-pair count followed by the two character-set sizes.
for label, value in (
    ("Number of input", len(english_texts)),
    ("Number of unique igbo characters", num_igbo_tokens),
    ("Number of unique english characters", num_english_tokens),
):
    print(label, value)

Number of input 23452
Number of unique igbo characters 65
Number of unique english characters 67


In [14]:
# Character -> index lookups for both languages.
igbo_vocab, english_vocab = get_vocab(igbo_characters), get_vocab(english_characters)
# Reverse (index -> character) lookup, used later to turn predicted indices into text.
igbo_inv_vocab = get_inv_vocab(igbo_vocab)

In [15]:
# Inspect the English character -> index mapping.
english_vocab

defaultdict(<function models.utils.get_vocab.<locals>.<lambda>()>,
            {'\t': 0,
             '\n': 1,
             ' ': 2,
             '!': 3,
             ',': 4,
             '.': 5,
             '1': 6,
             '2': 7,
             '3': 8,
             '4': 9,
             '5': 10,
             '6': 11,
             '7': 12,
             '8': 13,
             '9': 14,
             '?': 15,
             'A': 16,
             'B': 17,
             'C': 18,
             'D': 19,
             'E': 20,
             'F': 21,
             'G': 22,
             'H': 23,
             'I': 24,
             'J': 25,
             'K': 26,
             'L': 27,
             'M': 28,
             'N': 29,
             'O': 30,
             'P': 31,
             'Q': 32,
             'R': 33,
             'S': 34,
             'T': 35,
             'U': 36,
             'V': 37,
             'W': 38,
             'Y': 39,
             'Z': 40,
             'a': 41,
             'b':

In [16]:
# Encode each side with its own vocabulary, using the shared sequence length `timesteps`.
# The .shape checks further down show 3-D arrays of (samples, timesteps, tokens);
# presumably one-hot encoded per character — TODO confirm in models.utils.
encoder_data = preprocess_input_data(english_texts, english_vocab, timesteps, num_english_tokens)
decoder_data = preprocess_input_data(igbo_texts, igbo_vocab, timesteps, num_igbo_tokens)

In [17]:
# Hyperparameters: `dimension` is the size argument of the (currently commented-out)
# custom_model(...) call; `epochs` and `batch_size` are forwarded to model.fit.
dimension, epochs, batch_size = 256, 1, 64

In [19]:
# To train from scratch instead of resuming, build a fresh network:
# model = custom_model(num_english_tokens, num_igbo_tokens, dimension, timesteps)
# Resume from the saved checkpoint. AttentionLayer comes from models.rnn_attention, so it
# is passed via custom_objects to let Keras resolve the layer name when deserialising.
model = load_model("../saved-models/attention-ascii.h5", custom_objects={'AttentionLayer': AttentionLayer})

In [19]:
# Compile with categorical cross-entropy loss and the Adam optimizer before training.
# NOTE(review): compiling after load_model presumably replaces any optimizer state
# restored from the checkpoint — confirm that starting from a fresh Adam is intended.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 128, 67)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128, 512)          663552    
_________________________________________________________________
attention (AttentionLayer)   (None, 128, 65)           1022914   
Total params: 1,686,466
Trainable params: 1,686,466
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Sanity check: encoder input array dimensions.
encoder_data.shape

(23452, 128, 67)

In [22]:
# Sanity check: decoder target array dimensions.
decoder_data.shape

(23452, 128, 65)

In [23]:
# Continue training on the encoded corpus, holding out 10% of the samples for validation.
# NOTE(review): the recorded run was stopped by hand (KeyboardInterrupt) after the first
# batch, so the predictions below appear to come from essentially the checkpoint weights.
model.fit(encoder_data, decoder_data, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 21106 samples, validate on 2346 samples
Epoch 1/1
   64/21106 [..............................] - ETA: 1:20:30 - loss: 1.3971

KeyboardInterrupt: 

In [27]:
# Decode a fixed slice of examples (rows 750-799) by taking the most likely character
# at every timestep, and print each prediction next to its English source.
for i in range(750, 800):
    # Slice rather than index so the leading batch axis is kept for predict().
    input_seq = encoder_data[i:i + 1]
    decoded = model.predict(input_seq)
    # Highest-scoring token index per timestep.
    pred = np.argmax(decoded, axis=-1)
    # Map indices back to characters; strip() drops leading/trailing whitespace,
    # which includes tab and newline characters.
    decoded_string = "".join(
        igbo_inv_vocab[pred[0, j]] for j in range(timesteps)
    ).strip()
    print("Source: ", english_texts[i])
    print("Output: ", decoded_string)

Source:  	 26 Give thanks to the God of the heavens , For his loyal love endures forever . 

Output:  26 Kele ne Ce e e
Source:  	 11 And they began to question him , saying Why do the scribes say that E li jah must come first ? 

Output:  11 Ha wee ji
Source:  	 language 

Output:  agu
Source:  	 shut one s mouth 

Output:  gp
Source:  	 dig ground 

Output:  ge
Source:  	 11 By Hu shim he became father to A bi tub and El pa al . 

Output:  11 Iasasa a
Source:  	 intestinal worm 

Output:  aki
Source:  	 22 Then King Je hoi a kim sent El na than the son of Ach bor and other men with him to Egypt . 

Output:  22 Eo Ee
Source:  	 14 Ne than el the fourth , Rad dai the fifth , 

Output:  14 Naaa  b
Source:  	 22 and whoever swears by heaven is swearing by the throne of God and by the One sitting on it . 

Output:  22 onye na a
Source:  	 Afor and Nkwo 

Output:  agi
Source:  	 she herself 

Output:  ya  a
Source:  	 commit abortion 

Output:  ab
Source:  	 be disobedient 

Output:  da   