In [1]:
import pandas as pd
from keras.models import load_model
import sys
import os
sys.path.append(os.path.join('..'))
from models.utils import get_all_characters, get_vocab, get_inv_vocab, preprocess_input_data
from models.utils import preprocess_target_data, clean_lines, get_short_sentences, decode_sequence
from models.seq2seq import custom_model, encoder_model, decoder_model

Using TensorFlow backend.


In [2]:
# Load the parallel corpus; the columns used downstream are 'en' (English) and 'ig' (Igbo).
# NOTE(review): the preview of this frame also shows two 'Unnamed' columns, which looks like
# a saved index being read back as data — confirm, and drop them at the source if unused.
dataset = pd.read_csv('../data/data.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,en,ig
0,0,into two,yiwa
1,1,"11 Like a dog that returns to its vomit, The s...",11 Dị nnọọ ka nkịta nke na-alaghachi n’agbọ ọ ...
2,2,rush,kpọwa
3,3,"4 So his father-in-law, the young woman’s fath...","4 Ọgọ nwoke ahụ, bụ́ nna nwa agbọghọ ahụ, ekwe..."
4,4,trap for animals,igbụdụ


Because of performance constraints, I will only use sentences whose length is below a fixed limit (`Tx` for English, `Ty` for Igbo).

In [4]:
# Length limits handed to get_short_sentences: Tx for the English side, Ty for the Igbo side.
Tx, Ty = 100, 120

# Keep only the sentence pairs selected by the project helper for these limits.
english_texts, igbo_texts = get_short_sentences(dataset, Tx, Ty)

In [5]:
len(english_texts)

23452

In [6]:
len(igbo_texts)

23452

In [7]:
# Clean the English lines with the project helper (see models.utils.clean_lines for the flag).
# NOTE(review): the result is stored back under the same name, so re-running this cell
# passes already-cleaned text through clean_lines a second time.
english_texts = clean_lines(english_texts, True)

In [8]:
# Clean the Igbo lines with the project helper (see models.utils.clean_lines for the flag).
# NOTE(review): the result is stored back under the same name, so re-running this cell
# passes already-cleaned text through clean_lines a second time.
igbo_texts = clean_lines(igbo_texts, True)

In [9]:
print(igbo_texts[110:115])
print(english_texts[110:115])

['\t fepu \n', '\t 56 O ga erukwa na di nnoo ka m bu n obi ime ha , otu ahu ka m ga eme unu . \n', '\t 163 Akporo m okwu ugha asi , ana m aso ya oyi . Ahuru m iwu gi n anya . \n', '\t aru ofufo \n', '\t O cho isekapu ofu aka ewi \n']
['\t hurry off \n', '\t 56 And I will do to you what I intended to do to them . \n', '\t 163 I hate falsehood I detest it I love your law . \n', '\t freedom from care \n', '\t She wants to tear off one foreleg of the giant rat \n']


In [10]:
# Replace the filtering limits with the longest sequence actually present on each side,
# measured in characters on the cleaned texts.
Tx = max(map(len, english_texts))
Ty = max(map(len, igbo_texts))
print(Tx, Ty)

112 128


In [11]:
english_characters = get_all_characters(english_texts)
igbo_characters = get_all_characters(igbo_texts)

In [12]:
num_english_tokens = len(english_characters)
num_igbo_tokens = len(igbo_characters)

In [13]:
print("Number of input", len(english_texts))
print("Number of unique igbo characters", num_igbo_tokens)
print("Number of unique english characters", num_english_tokens)

Number of input 23452
Number of unique igbo characters 65
Number of unique english characters 67


In [14]:
# Character -> index tables, one per language.
english_vocab = get_vocab(english_characters)
igbo_vocab = get_vocab(igbo_characters)

# Reverse table for the Igbo side only; it is passed to decode_sequence when translating.
igbo_inv_vocab = get_inv_vocab(igbo_vocab)

In [27]:
english_vocab

defaultdict(<function models.utils.get_vocab.<locals>.<lambda>()>,
            {'\t': 0,
             '\n': 1,
             ' ': 2,
             '!': 3,
             ',': 4,
             '.': 5,
             '1': 6,
             '2': 7,
             '3': 8,
             '4': 9,
             '5': 10,
             '6': 11,
             '7': 12,
             '8': 13,
             '9': 14,
             '?': 15,
             'A': 16,
             'B': 17,
             'C': 18,
             'D': 19,
             'E': 20,
             'F': 21,
             'G': 22,
             'H': 23,
             'I': 24,
             'J': 25,
             'K': 26,
             'L': 27,
             'M': 28,
             'N': 29,
             'O': 30,
             'P': 31,
             'Q': 32,
             'R': 33,
             'S': 34,
             'T': 35,
             'U': 36,
             'V': 37,
             'W': 38,
             'Y': 39,
             'Z': 40,
             'a': 41,
             'b':

In [15]:
# Encode the texts with the character vocabularies for the seq2seq model.
# The English side becomes the encoder input; the Igbo side is encoded twice, once as the
# decoder input and once as the decoder target.
# NOTE(review): presumably one-hot tensors of shape (samples, Tx or Ty, num_tokens), with the
# target offset by one step from the decoder input — confirm in models.utils.
encoder_input_data = preprocess_input_data(english_texts, english_vocab, Tx, num_english_tokens)
decoder_input_data = preprocess_input_data(igbo_texts, igbo_vocab, Ty, num_igbo_tokens)
decoder_target_data = preprocess_target_data(igbo_texts, igbo_vocab, Ty, num_igbo_tokens)

In [16]:
# Settings shared by the model-building, training and inference cells.
dimension = 256  # size handed to custom_model and decoder_model (the LSTMs in the summary are 256 wide)
epochs = 1  # number of passes used by model.fit
batch_size = 64  # mini-batch size used by model.fit

In [17]:
model = load_model('../saved-models/seq2seq-ascii.h5')

In [13]:
model  = custom_model(num_english_tokens, num_igbo_tokens, dimension)

In [18]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [18]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 67)     0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 65)     0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 256), (None, 331776      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  329728      decoder_inputs[0][0]             
                                                                 encoder_lstm[0][1]               
          

In [26]:
# Train on (encoder input, decoder input) -> decoder target; 10% of the samples are held
# out for validation. The call is the cell's last expression, so its History is displayed.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 21059 samples, validate on 2340 samples
Epoch 1/1


<keras.callbacks.History at 0x25fca914ac8>

In [19]:
# First input tensor of the training model; the summary lists `encoder_inputs` first.
encoder_inputs = model.inputs[0]

In [20]:
# Fetch the encoder LSTM layer by name so it can be reused when building the inference encoder.
encoder_lstm = model.get_layer(name='encoder_lstm')

In [21]:
# Second input tensor of the training model; the summary lists `decoder_inputs` second.
decoder_inputs = model.inputs[1]

In [22]:
# Fetch the decoder LSTM layer by name so it can be reused when building the inference decoder.
decoder_lstm = model.get_layer(name='decoder_lstm')

In [23]:
# Fetch the layer named 'decoder_dense' for the inference decoder.
# NOTE(review): presumably the output projection over Igbo characters — confirm in models.seq2seq.
decoder_dense = model.get_layer(name='decoder_dense')

In [24]:
# This cell rebinds `encoder_model` from the imported builder function to the model it
# returns, so calling the bare name on a second run would call the model, not the builder.
# Importing the builder under an alias keeps the cell re-runnable while leaving
# `encoder_model` bound to the built inference encoder for the cells that follow.
from models.seq2seq import encoder_model as build_encoder_model
encoder_model = build_encoder_model(encoder_inputs, encoder_lstm)

In [25]:
# This cell rebinds `decoder_model` from the imported builder function to the model it
# returns, so calling the bare name on a second run would call the model, not the builder.
# Importing the builder under an alias keeps the cell re-runnable while leaving
# `decoder_model` bound to the built inference decoder for the cells that follow.
from models.seq2seq import decoder_model as build_decoder_model
decoder_model = build_decoder_model(dimension, decoder_lstm, decoder_inputs, decoder_dense)

In [26]:
# Translate a fixed window of 50 corpus entries and print each source with the model output.
for i in range(750, 800):
    # A one-element slice (not a single item) is what gets handed to the decoder helper.
    input_seq = encoder_input_data[i:i + 1]
    decoded_sentence = decode_sequence(
        input_seq,
        encoder_model,
        num_igbo_tokens,
        igbo_vocab,
        decoder_model,
        igbo_inv_vocab,
        Ty,
    )
    print("Source: ", english_texts[i])
    print("Output: ", decoded_sentence)

Source:  	 26 Give thanks to the God of the heavens , For his loyal love endures forever . 

Output:   26 Chineke , biko , ka m ga eme ka m buru ihe ndina gi , m ga aga nke umu Izrel . 

Source:  	 11 And they began to question him , saying Why do the scribes say that E li jah must come first ? 

Output:   11 O wee si ha O bu na ndi a ga agha wee mee mmadu ahu , o bukwa n ebe ahu ka ha na eme omume ozugbo . 

Source:  	 language 

Output:   obu anya 

Source:  	 shut one s mouth 

Output:   kpu oku 

Source:  	 dig ground 

Output:   wa ani 

Source:  	 11 By Hu shim he became father to A bi tub and El pa al . 

Output:   11 Obi m dikwara m , si , O buru na i meere m amara . 

Source:  	 intestinal worm 

Output:   akpa ego 

Source:  	 22 Then King Je hoi a kim sent El na than the son of Ach bor and other men with him to Egypt . 

Output:   22 Jotam nwunye nwaanyi Sila na Zeron na Jehoshan na Jefiya . 

Source:  	 14 Ne than el the fourth , Rad dai the fifth , 

Output:   14 na Haza n