In [1]:
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Using TensorFlow backend.


In [2]:
import pandas as pd
import io
import csv
from docx import Document

def read_docx_tab(tab, **kwargs):
    vf = io.StringIO()
    writer = csv.writer(vf)
    for row in tab.rows:
        writer.writerow(cell.text for cell in row.cells)
    vf.seek(0)
    return pd.read_csv(vf, **kwargs)

def read_docx_tables(filename, tab_id=None, **kwargs):
    doc = Document(filename)
    if tab_id is None:
        return [read_docx_tab(tab, **kwargs) for tab in doc.tables]
    else:
        try:
            return read_docx_tab(doc.tables[tab_id], **kwargs)
        except IndexError:
            print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
            raise

            
table = read_docx_tables(filename = 'Tabular.docx', tab_id = 0)

In [3]:
table.head()

Unnamed: 0,S/N,ENG VERB,ENG VERB.1,PAST TENSE,PAST TENSE .1,PAST TENSE .2,English Sentence,IGBO SENTENCES
0,1,FIND,FIND,FOUND,FOUND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,2,DO,DO,DID,DID,DID,I did it,émèrè ḿ yá
2,3,MAKE,MAKE,MADE,MADE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,4,GET,GET,GOT,GOT,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,5,SAY,SAY,SAID,SAID,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [4]:
table.columns

Index(['S/N', 'ENG VERB', 'ENG VERB.1', 'PAST TENSE ', 'PAST TENSE .1',
       'PAST TENSE .2', 'English Sentence ', 'IGBO SENTENCES '],
      dtype='object')

In [5]:
updated_table = table.drop(['S/N','ENG VERB.1','PAST TENSE .1','PAST TENSE .2'], axis = 1)

In [6]:
updated_table.head()

Unnamed: 0,ENG VERB,PAST TENSE,English Sentence,IGBO SENTENCES
0,FIND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,DO,DID,I did it,émèrè ḿ yá
2,MAKE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,GET,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,SAY,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [7]:
updated_table = updated_table.dropna()
updated_table.shape

(93, 4)

In [8]:
x, y = updated_table['English Sentence '], updated_table['IGBO SENTENCES ']
raw_dataset = updated_table[['English Sentence ','IGBO SENTENCES ']]

In [9]:
x[:2]

0    I found the book
1           I did it 
Name: English Sentence , dtype: object

In [10]:
y[:2]

0    Áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

# Data Cleaning and preprocessing

In [11]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [12]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [13]:
# remove quotes
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [14]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [15]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [16]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [17]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [18]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [19]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_igbo_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_igbo_words:
            all_igbo_words.add(word)

In [20]:
print(len(all_eng_words))
print(len(all_igbo_words))

177
183


In [21]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_igbo_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_igbo_words)
num_encoder_tokens, num_decoder_tokens

(177, 183)

In [22]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [32]:
x_lower[:4]

0     i found the book
1             i did it
2    he made  the cake
3           i got home
Name: English Sentence , dtype: object

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_lower, y_lower, test_size = 0.2)
x_train.shape, x_test.shape

((74,), (19,))

In [34]:
x_train[:2]

55    he promoted the argument
49      she developed the idea
Name: English Sentence , dtype: object

In [39]:
len(x_train)

74

In [38]:
max_encoder_seq_length = 5
max_decoder_seq_length = 7

In [40]:
encoder_input_data = np.zeros(
    (len(x_train), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(x_train), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(x_train), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [48]:
list_me = ['i', ' ', 'am', 'her']
out = list_me.remove(' ')
out

In [50]:
for i, (input_text, target_text) in enumerate(zip(x_train, y_train)):
    x_transform = input_text.split(' ')
    try:
        x_transform.remove(' ')
    except ValueError as err:
        continue
        
    for t, char in enumerate(x_transform):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    y_transform = target_text.split(' ')
    try:
        y_transform.remove(' ')
    except ValueError as err:
        continue
    
    for t, char in enumerate(y_transform):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.


In [57]:
from keras.models import Input
from keras.layers import LSTM, Dense
latent_dim = 8

encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [58]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [59]:
from keras.models import Model
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [63]:
batch_size = 4
epochs = 10
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Train on 59 samples, validate on 15 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fd82e39cb90>

In [64]:
# Save model
model.save('MT.h5')

In [65]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [66]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


In [71]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['START_']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [74]:

for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', x_train[seq_index])
    print('Actual sentence Translation: ', y_train[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: i found the book
Actual sentence Translation:  START_ áchọ̀tárà ḿ ákwúkwọ́ _END
Decoded sentence: nkirinaebu
-
Input sentence: i did it
Actual sentence Translation:  START_ émèrè ḿ yá _END
Decoded sentence: naarahụịkwụsị
-
Input sentence: he made  the cake
Actual sentence Translation:  START_ émèrè ḿ áchíchá ahù _END
Decoded sentence: naarahụịkwụsị
-
Input sentence: i got home
Actual sentence Translation:  START_ énwò ḿ nụ́lọ̀ _END
Decoded sentence: naarahụịkwụsị
-


KeyError: 4