In [2]:
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Using TensorFlow backend.


In [3]:
#!pip install python-docx 

In [4]:
import pandas as pd
import io
import csv
from docx import Document

def read_docx_tab(tab, **kwargs):
    vf = io.StringIO()
    writer = csv.writer(vf)
    for row in tab.rows:
        writer.writerow(cell.text for cell in row.cells)
    vf.seek(0)
    return pd.read_csv(vf, **kwargs)

def read_docx_tables(filename, tab_id=None, **kwargs):
    doc = Document(filename)
    if tab_id is None:
        return [read_docx_tab(tab, **kwargs) for tab in doc.tables]
    else:
        try:
            return read_docx_tab(doc.tables[tab_id], **kwargs)
        except IndexError:
            print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
            raise

            
table = read_docx_tables(filename = 'Tabular.docx', tab_id = 0)

In [5]:
table.head()

Unnamed: 0,S/N,ENG VERB,ENG VERB.1,PAST TENSE,PAST TENSE .1,PAST TENSE .2,English Sentence,IGBO SENTENCES
0,1,FIND,FIND,FOUND,FOUND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,2,DO,DO,DID,DID,DID,I did it,émèrè ḿ yá
2,3,MAKE,MAKE,MADE,MADE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,4,GET,GET,GOT,GOT,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,5,SAY,SAY,SAID,SAID,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [6]:
table.columns

Index(['S/N', 'ENG VERB', 'ENG VERB.1', 'PAST TENSE ', 'PAST TENSE .1',
       'PAST TENSE .2', 'English Sentence ', 'IGBO SENTENCES '],
      dtype='object')

In [7]:
updated_table = table.drop(['S/N','ENG VERB.1','PAST TENSE .1','PAST TENSE .2'], axis = 1)

In [8]:
updated_table.head()

Unnamed: 0,ENG VERB,PAST TENSE,English Sentence,IGBO SENTENCES
0,FIND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,DO,DID,I did it,émèrè ḿ yá
2,MAKE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,GET,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,SAY,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [9]:
updated_table = updated_table.dropna()
updated_table.shape

(93, 4)

In [10]:
x, y = updated_table['English Sentence '], updated_table['IGBO SENTENCES ']
raw_dataset = updated_table[['English Sentence ','IGBO SENTENCES ']]

In [11]:
x[:2]

0    I found the book
1           I did it 
Name: English Sentence , dtype: object

In [12]:
y[:2]

0    Áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [13]:
# lowercase all letters
x_lower = x.apply(lambda x:x.lower())
y_lower = y.apply(lambda x:x.lower())

In [16]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [17]:
# remove quotes
import re
x_lower = x_lower.apply(lambda x:re.sub("'",'',x))
y_lower = y_lower.apply(lambda x:re.sub("'",'',x))

In [18]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [19]:
import string
exclude = set(string.punctuation) # set of all special characters
#remove all the special characters
x_lower = x_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
y_lower = y_lower.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [20]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [21]:
# remove all numbers from text
digits = string.digits
remove_digits = str.maketrans('','',digits)
x_lower = x_lower.apply(lambda x:x.translate(remove_digits))
y_lower = y_lower.apply(lambda x:x.translate(remove_digits))

In [22]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [23]:
# remove extra spaces
x_lower = x_lower.apply(lambda x: x.strip())
y_lower = y_lower.apply(lambda x: x.strip())

In [24]:
y_lower[:2]

0    áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
Name: IGBO SENTENCES , dtype: object

In [25]:
# add start and end tokens to target sequences
y_lower = y_lower.apply(lambda x: 'START_ '+x +' _END')

In [26]:
y_lower[:2]

0    START_ áchọ̀tárà ḿ ákwúkwọ́ _END
1              START_ émèrè ḿ yá _END
Name: IGBO SENTENCES , dtype: object

In [27]:
# get english and igbo vocabulary
all_eng_words = set()
for eng in x_lower:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
            
all_igbo_words = set()

for igb in y_lower:
    for word in igb.split():
        if word not in all_igbo_words:
            all_igbo_words.add(word)

In [30]:
print(len(all_eng_words))
print(len(all_igbo_words))

177
183


In [38]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_igbo_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_igbo_words)
num_encoder_tokens, num_decoder_tokens

(177, 183)

In [39]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())


In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_lower, y_lower, test_size = 0.2)
x_train.shape, x_test.shape

((74,), (19,))

In [45]:
x_train[:2]

19    he provided for us
9            i came back
Name: English Sentence , dtype: object

In [46]:
y_train[:2]

19    START_ o nyèrè anyí _END
9      START_ ábiàkwàrà ḿ _END
Name: IGBO SENTENCES , dtype: object

In [47]:
max_length_src = 5
max_length_tar = 7
def generate_batch(X = x_train, y = y_train, batch_size = 9):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype = 'float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype = 'float32')
            
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i,t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                        
                    if t>0:
                        # decoder target sequence (one hot decoder)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i,t-1, target_token_index[word]] =1
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [48]:
# encoder-decoder arhitecture

# encoder
from keras.layers import Dropout,Input, Embedding, LSTM
latent_dim = 7
#Encoder 
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# we discard encoder_outputs and only keep the states
encoder_states = [state_h, state_c]


In [49]:
# Decoder
from keras.layers import Dense
from keras.models import Model
decoder_inputs = Input(shape = (None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

# we set up out decoder to return full output sequences
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inferences
decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# define the model that will turn
# encoder_input_data and decoder_input data into decoder_target_data
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [50]:
# train the model for 100 epochs
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')


In [51]:
# decoder at test time

# encode the input sequence to get the 'thought vectors'
encoder_model = Model(encoder_inputs, encoder_states)

#Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape = (latent_dim,))
decoder_state_input_c = Input(shape = (latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs) # get the embedding of the decoder sequence

# to predict the next word in the sequence, set the initial states to the states
# from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state = decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate
# prob dist. over the target vec

# final decoder model

decoder_model = Model([decoder_inputs]+decoder_states_inputs,
                     [decoder_outputs2]+decoder_states2)

In [55]:
def decode_sequence(input_seq):
    # encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    # generate empty target sequence of length 1
    target_seq = np.zeros((1,1))
    # populate the first character of target sequence with the start character
    target_seq[0,0] = target_token_index['START_']
    
    # sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sequence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq]+states_value)
        
        # sample a token
        sampled_token_index = np.argmax(output_tokens[0,-1,:])
        sampled_char= reverse_target_char_index[sampled_token_index]
        decoded_sequence += ' '+ sampled_char
        
        # Exit condition either hit max length
        # or find stop character
        if (sampled_char == '_END' or len(decoded_sequence)>50):
            stop_condition = True
            
            
        # update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sampled_token_index
        
        
        #update states
        states_value = [h,c]
    return decoded_sequence


In [56]:
train_gen = generate_batch(x_train, y_train, batch_size = 1)
k= -1

k+= 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print("Input English sentence:", x_train[k:k+1].values[0])
print('Actual Igbo Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Igbo Translation:', decoded_sentence[:-4])

Input English sentence: he provided for us
Actual Igbo Translation:  o nyèrè anyí 
Predicted Igbo Translation:  kpọ̀chìrí naegwu amụ̀tàrà amụ̀tàrà gburu ama ama


https://medium.com/analytics-vidhya/machine-translation-encoder-decoder-model-7e4867377161