In [1]:
import pandas as pd
from models.utils import get_all_characters, get_vocab, get_inv_vocab, preprocess_input_data
from models.utils import preprocess_target_data, clean_lines, get_short_sentences, decode_sequence
from models.seq2seq import custom_model, encoder_model, decoder_model

Using TensorFlow backend.


In [2]:
dataset = pd.read_csv('data/data.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,en,ig
0,0,into two,yiwa
1,1,"11 Like a dog that returns to its vomit, The s...",11 Dị nnọọ ka nkịta nke na-alaghachi n’agbọ ọ ...
2,2,rush,kpọwa
3,3,"4 So his father-in-law, the young woman’s fath...","4 Ọgọ nwoke ahụ, bụ́ nna nwa agbọghọ ahụ, ekwe..."
4,4,trap for animals,igbụdụ


Because of performance constraint, I will only use those sentences that are below a certain size

In [4]:
Tx = 100
Ty = 120
english_texts, igbo_texts = get_short_sentences(dataset, Tx, Ty)

In [5]:
len(english_texts)

23399

In [6]:
len(igbo_texts)

23399

In [7]:
english_characters = get_all_characters(english_texts)
igbo_characters = get_all_characters(igbo_texts)

In [8]:
num_english_tokens = len(english_characters)
num_igbo_tokens = len(igbo_characters)

In [9]:
print("Number of input", len(english_texts))
print("Number of unique igbo characters", num_igbo_tokens)
print("Number of unique english characters", num_english_tokens)

Number of input 23399
Number of unique igbo characters 119
Number of unique english characters 94


In [10]:
igbo_vocab = get_vocab(igbo_characters)
english_vocab = get_vocab(english_characters)
igbo_inv_vocab = get_inv_vocab(igbo_vocab)

In [11]:
encoder_input_data = preprocess_input_data(english_texts, english_vocab, Tx, num_english_tokens)
decoder_input_data = preprocess_input_data(igbo_texts, igbo_vocab, Ty, num_igbo_tokens)
decoder_target_data = preprocess_target_data(igbo_texts, igbo_vocab, Ty, num_igbo_tokens)

In [12]:
dimension = 256
epochs = 1
batch_size = 64

In [13]:
model  = custom_model(num_english_tokens, num_igbo_tokens, dimension)

In [14]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [15]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 94)     0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 119)    0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 256), (None, 359424      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  385024      decoder_inputs[0][0]             
                                                                 encoder_lstm[0][1]               
          

In [26]:
model.fit([encoder_input_data, decoder_input_data], 
                  decoder_target_data, 
                  batch_size=batch_size, 
                  epochs=epochs, validation_split=0.1)

Train on 21059 samples, validate on 2340 samples
Epoch 1/1


<keras.callbacks.History at 0x25fca914ac8>

In [27]:
encoder_inputs = model.inputs[0]

In [28]:
encoder_lstm = model.get_layer(name='encoder_lstm')

In [29]:
decoder_inputs = model.inputs[1]

In [30]:
decoder_lstm = model.get_layer(name='decoder_lstm')

In [31]:
decoder_dense = model.get_layer(name='decoder_dense')

In [36]:
encoder_model = encoder_model(encoder_inputs, encoder_lstm)

TypeError: __call__() takes 2 positional arguments but 3 were given

In [37]:
decoder_model = decoder_model(dimension, decoder_lstm, decoder_inputs, decoder_dense)

TypeError: __call__() takes 2 positional arguments but 5 were given

In [38]:
for i in range(10):
    input_seq = encoder_input_data[i:i+1]
    decoded_sentence = decode_sequence(input_seq, encoder_model, num_igbo_tokens, igbo_vocab, decoder_model, igbo_inv_vocab, Ty)
    print("Source: ", english_texts[i])
    print("Output: ", decoded_sentence)

Source:  into two
Output:  2 na anụ anụ na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  11 Like a dog that returns to its vomit, The stupid one repeats his foolishness.
Output:  we na na-enu na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  rush
Output:  we na na-enu na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  trap for animals
Output:  we na na-enu na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  cat
Output:  2 na anụ anụ na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  Rhynchophorus phoenicis
Output:  wa na na-enu na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 
Source:  write again