In [1]:
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Using TensorFlow backend.


In [2]:
import pandas as pd
import io
import csv
from docx import Document

def read_docx_tab(tab, **kwargs):
    vf = io.StringIO()
    writer = csv.writer(vf)
    for row in tab.rows:
        writer.writerow(cell.text for cell in row.cells)
    vf.seek(0)
    return pd.read_csv(vf, **kwargs)

def read_docx_tables(filename, tab_id=None, **kwargs):
    doc = Document(filename)
    if tab_id is None:
        return [read_docx_tab(tab, **kwargs) for tab in doc.tables]
    else:
        try:
            return read_docx_tab(doc.tables[tab_id], **kwargs)
        except IndexError:
            print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
            raise

            
table = read_docx_tables(filename = 'Tabular.docx', tab_id = 0)

In [3]:
table.head()

Unnamed: 0,S/N,ENG VERB,ENG VERB.1,PAST TENSE,PAST TENSE .1,PAST TENSE .2,English Sentence,IGBO SENTENCES
0,1,FIND,FIND,FOUND,FOUND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,2,DO,DO,DID,DID,DID,I did it,émèrè ḿ yá
2,3,MAKE,MAKE,MADE,MADE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,4,GET,GET,GOT,GOT,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,5,SAY,SAY,SAID,SAID,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [4]:
table.columns

Index(['S/N', 'ENG VERB', 'ENG VERB.1', 'PAST TENSE ', 'PAST TENSE .1',
       'PAST TENSE .2', 'English Sentence ', 'IGBO SENTENCES '],
      dtype='object')

In [5]:
updated_table = table.drop(['S/N','ENG VERB.1','PAST TENSE .1','PAST TENSE .2'], axis = 1)

In [6]:
updated_table.head()

Unnamed: 0,ENG VERB,PAST TENSE,English Sentence,IGBO SENTENCES
0,FIND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,DO,DID,I did it,émèrè ḿ yá
2,MAKE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,GET,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,SAY,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [7]:
updated_table = updated_table.dropna()
updated_table.shape

(93, 4)

In [8]:
x, y = updated_table['English Sentence '], updated_table['IGBO SENTENCES ']
raw_dataset = updated_table[['English Sentence ','IGBO SENTENCES ']]

In [9]:
x[:2]

0    I found the book
1           I did it 
Name: English Sentence , dtype: object

In [10]:
y[:2]

0    Áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

# Data Cleaning and preprocessing

In [11]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [12]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [13]:
# remove quotes
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [14]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [15]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [16]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [17]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [18]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [19]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_igbo_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_igbo_words:
            all_igbo_words.add(word)

In [20]:
print(len(all_eng_words))
print(len(all_igbo_words))

177
183


In [21]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_igbo_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_igbo_words)
num_encoder_tokens, num_decoder_tokens

(177, 183)

In [22]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [28]:
target_token_index

{'START_': 1,
 '_END': 2,
 'abalị': 3,
 'agaghị': 4,
 'ahù': 5,
 'ahịhịa': 6,
 'ahụ': 7,
 'akwụ̀siri': 8,
 'akụkọ': 9,
 'ala': 10,
 'ama': 11,
 'amụ̀tàrà': 12,
 'anya': 13,
 'anyi': 14,
 'anyí': 15,
 'anyị': 16,
 'anụrụ': 17,
 'bgàkwúnye': 18,
 'bụla': 19,
 'bụ́lá': 20,
 'chọ̀rọ̀': 21,
 'dì': 22,
 'dịghị': 23,
 'dọtara': 24,
 'dọwara': 25,
 'echere': 26,
 'echiche': 27,
 'ege': 28,
 'ego': 29,
 'egosi': 30,
 'egwuregwu': 31,
 'ehi': 32,
 'enwere': 33,
 'enye': 34,
 'esemokwu': 35,
 'furu': 36,
 'gara': 37,
 'gbanwere': 38,
 'gbasara': 39,
 'gburu': 40,
 'gbàpụ̀rụ̀': 41,
 'ghọrọ': 42,
 'gwara': 43,
 'gwụchara': 44,
 'gị': 45,
 'ha': 46,
 'hàpụ̀rụ̀': 47,
 'há': 48,
 'họọrọ': 49,
 'i': 50,
 'ihe': 51,
 'ihé': 52,
 'ima': 53,
 'ire': 54,
 'ji': 55,
 'jìdèrè': 56,
 'jụ̀rụ̀': 57,
 'kacha': 58,
 'kpebiri': 59,
 'kpochapuru': 60,
 'kpuchiri': 61,
 'kpọ̀chìrí': 62,
 'kpọ̀rọ̀': 63,
 'kwadoro': 64,
 'kwalitere': 65,
 'kwuru': 66,
 'kwàgàrà': 67,
 'kwèrè': 68,
 'kwụ́ghi': 69,
 'kèrè': 70,
 'kụrụ':

In [23]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_lower, y_lower, test_size = 0.2)
x_train.shape, x_test.shape

((74,), (19,))

In [24]:
x_train[:2]

34     i wrote the letter
16    he knocked the door
Name: English Sentence , dtype: object

In [25]:
y_train[:2]

34    START_ édèrè ḿ ákwụ́kwọ́ ozí ahù _END
16        START_ ọ kụrụ áká nọnú ụ́zọ̀ _END
Name: IGBO SENTENCES , dtype: object

In [27]:
y_train[0].split()

['START_', 'áchọ̀tárà', 'ḿ', 'ákwúkwọ́', '_END']

In [29]:
src_length = 5
tar_length = 9

def batch_generator(X,y, batch_size):
    while True:
        for j in enumerate(0, len(X), batch_size):
            #preparing a dummy array of zeros that will be encoded for the corresponding batch_size
            encode_input = np.zeros((batch_size, src_length), dtype = np.float32)
            decode_input_target = np.zeros((batch_size, tar_length), dtype = np.float32)
            decode_target_data = np.zeros((batch_size, tar_length,num_decoder_tokens), dtype = np.float32)
            
            #encoding the data in batches
            for i, (text_input, target) in enumerate(zip(X[j:j+batch_size],y[j:j+batch_size])):
                # dealing with each input, target (sentence by sentence in the batch)
                for t, word in enumerate(text_input.split()):
                    # encode input data
                    encode_input[i][t] = input_token_index[word] #get the encoded number for every corresponding word
                    
                for t, word in enumerate(target_text.split()):
                    #encode_target data
                    decode_input_target[i][t] = target_token_index[word] # get the docoded number for every corresponding word in the target.
                    
                    if t > 0:
                        # creating an offset of one time step
                        # that is excluding the start token
                        decode_target_data[i][t-1][target_token_index[word]] = 1
            return encoder_input, decode_input_target, decode_target_data
        
        

In [32]:
# encoder-decoder architecture

#encoder
from keras.layers import Dropout, Input, Embedding, LSTM

dimension = 5
#Encoder
encoder_inputs = Input(shape = (None,))
enc_emb = Embedding(num_encoder_tokens, dimension, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(dimension, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

encoder_states = [state_h, state_c]

In [33]:
# Decoder model
from keras.layers import Dense
from keras.models import Model

decoder_inputs = Input(shape = (None,))
dec_emb_layer = Embedding(num_decoder_tokens, dimension,mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(dimension, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = encoder_states)
output = Dense(num_decoder_tokens, activation = 'softmax')(decoder_outputs)


In [34]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [35]:
model.compile(optimizer= 'rmsprop', loss = 'categorical_crossentropy')

In [None]:
# decoder at test time

#encode the input sequence to get the 'thought vectors'
encoder_model = Model(encoder_inputs, encoder_states)