In [1]:
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Using TensorFlow backend.


In [2]:
import pandas as pd
import io
import csv
from docx import Document

def read_docx_tab(tab, **kwargs):
    vf = io.StringIO()
    writer = csv.writer(vf)
    for row in tab.rows:
        writer.writerow(cell.text for cell in row.cells)
    vf.seek(0)
    return pd.read_csv(vf, **kwargs)

def read_docx_tables(filename, tab_id=None, **kwargs):
    """
    parse table(s) from a Word Document (.docx) into Pandas DataFrame(s)

    Parameters:
        filename:   file name of a Word Document

        tab_id:     parse a single table with the index: [tab_id] (counting from 0).
                    When [None] - return a list of DataFrames (parse all tables)

        kwargs:     arguments to pass to `pd.read_csv()` function

    Return: a single DataFrame if tab_id != None or a list of DataFrames otherwise
    """
    

    doc = Document(filename)
    if tab_id is None:
        return [read_docx_tab(tab, **kwargs) for tab in doc.tables]
    else:
        try:
            return read_docx_tab(doc.tables[tab_id], **kwargs)
        except IndexError:
            print('Error: specified [tab_id]: {}  does not exist.'.format(tab_id))
            raise

            
table = read_docx_tables(filename = 'Tabular.docx', tab_id = 0)

In [3]:
table.head()

Unnamed: 0,S/N,ENG VERB,ENG VERB.1,PAST TENSE,PAST TENSE .1,PAST TENSE .2,English Sentence,IGBO SENTENCES
0,1,FIND,FIND,FOUND,FOUND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,2,DO,DO,DID,DID,DID,I did it,émèrè ḿ yá
2,3,MAKE,MAKE,MADE,MADE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,4,GET,GET,GOT,GOT,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,5,SAY,SAY,SAID,SAID,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [4]:
table.columns

Index(['S/N', 'ENG VERB', 'ENG VERB.1', 'PAST TENSE ', 'PAST TENSE .1',
       'PAST TENSE .2', 'English Sentence ', 'IGBO SENTENCES '],
      dtype='object')

In [5]:
updated_table = table.drop(['S/N','ENG VERB.1','PAST TENSE .1','PAST TENSE .2'], axis = 1)

In [6]:
updated_table.head()

Unnamed: 0,ENG VERB,PAST TENSE,English Sentence,IGBO SENTENCES
0,FIND,FOUND,I found the book,Áchọ̀tárà ḿ ákwúkwọ́
1,DO,DID,I did it,émèrè ḿ yá
2,MAKE,MADE,He made the cake,émèrè ḿ áchíchá ahù
3,GET,GOT,I got home,énwò ḿ n'ụ́lọ̀
4,SAY,SAID,He said nothing,O kwụ́ghi ihé ọ bụ́lá


In [7]:
updated_table = updated_table.dropna()
updated_table.shape

(93, 4)

In [8]:
x, y = updated_table['English Sentence '], updated_table['IGBO SENTENCES ']
raw_dataset = updated_table[['English Sentence ','IGBO SENTENCES ']]

In [9]:
x[:4]

0     I found the book
1            I did it 
2    He made  the cake
3           I got home
Name: English Sentence , dtype: object

In [10]:
y[:4]

0    Áchọ̀tárà ḿ ákwúkwọ́
1              émèrè ḿ yá
2     émèrè ḿ áchíchá ahù
3          énwò ḿ n'ụ́lọ̀
Name: IGBO SENTENCES , dtype: object

In [11]:
splited_x = [k.split() for k in x]
splited_y = [k.split() for k in y]

In [12]:
splited_x[0], splited_y[0]

(['I', 'found', 'the', 'book'], ['Áchọ̀tárà', 'ḿ', 'ákwúkwọ́'])

In [13]:
x[0], y[0]

('I found the book', 'Áchọ̀tárà ḿ ákwúkwọ́')

In [14]:
def input_char(b):
    characters = []
    for l in b:
        for m in l:
            if m.lower() in characters:
                pass
            else:
                characters.append(m.lower())
    return sorted(characters)

def output_char(b):
    characters = []
    for l in b:
        for m in l:
            if m in characters:
                pass
            else:
                characters.append(m)
    return sorted(characters)


input_characters = input_char(splited_x)
target_characters = output_char(splited_y)

In [15]:
len(input_characters), len(target_characters)

(177, 185)

In [16]:
input_characters[:5]

['a', 'added', 'allowed', 'appeared', 'argument']

In [17]:
target_characters[:5]

['Agaghị', 'Akwụ̀siri', 'Amụ̀tàrà', 'Anyị', 'Anụrụ']

In [18]:
num_of_encoder_tokens, num_of_decoder_tokens = len(input_characters), len(target_characters)

In [19]:
def find_max(x):
    list_length = []
    for k in x:
        length = len(k)
        list_length.append(length)
    return max(list_length)

find_max(splited_x), find_max(splited_y)

(5, 7)

In [20]:
ratio = 10/177
ratio

0.05649717514124294

In [21]:
dict_input_char = {}
ratio = 0.0565
update = ratio
#zero is for input-target 0
for k in input_characters:
    dict_input_char[k] = update
    update = update+ ratio



In [22]:
output_ratio = 10/num_of_decoder_tokens
output_ratio

0.05405405405405406

In [23]:
dict_output_char = {}
output_ratio = 0.0541
update = output_ratio
#zero is for input-target 0
for k in target_characters:
    dict_output_char[k] = update
    update = update+ output_ratio



In [24]:
# pad the data

def padding(x, val):
    copy_x = x.copy()
    for i,k in enumerate(x):
        if len(k) <val:
            dif = val- len(k)
            list_diff = [' ']*dif
            for k in range(dif):
                copy_x[i].append(' ')
        else:
            pass
    return copy_x

copy_splited_x = padding(splited_x,5)


In [25]:
copy_splited_y = padding(splited_y,7)

In [26]:
# input data is of shape 5
# output data is of shape 7
dict_input_char[' '] = 0.000
dict_output_char[' '] = 0.000



In [27]:
def encode_input_data(data):
    for k,l in enumerate(data):
        for i,j in enumerate(l):
            data[k][i] = dict_input_char[j.lower()]
    return data

encoded_input_data = encode_input_data(copy_splited_x)

In [28]:
#encoded_input_data

In [29]:
def encode_target_data(data):
    for k,l in enumerate(data):
        for i,j in enumerate(l):
            data[k][i] = dict_output_char[j]
    return data

encoded_target_data = encode_target_data(copy_splited_y)

In [30]:
encoded_input_data[:5]

[[4.463500000000006, 3.277000000000005, 8.81399999999999, 0.8475, 0.0],
 [4.463500000000006, 2.1470000000000007, 4.802500000000005, 0.0, 0.0],
 [3.9550000000000076, 5.819500000000001, 8.81399999999999, 1.1865, 0.0],
 [4.463500000000006, 3.559500000000006, 4.3505000000000065, 0.0, 0.0],
 [3.9550000000000076, 7.796999999999993, 6.666999999999997, 0.0, 0.0]]

In [31]:
encoded_target_data[:5]

[[7.8445000000000045,
  8.980600000000004,
  8.331400000000004,
  0.0,
  0.0,
  0.0,
  0.0],
 [8.601900000000004, 8.980600000000004, 7.519900000000004, 0.0, 0.0, 0.0, 0.0],
 [8.601900000000004,
  8.980600000000004,
  8.115000000000004,
  0.9197000000000002,
  0.0,
  0.0,
  0.0],
 [8.656000000000004,
  8.980600000000004,
  4.9772000000000025,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.8115000000000001,
  3.841100000000002,
  2.9214000000000016,
  9.467500000000005,
  1.5148000000000006,
  0.0,
  0.0]]

# modelling

In [32]:
#create dataframe

def create_frame(x,y, index = 0):
    a,b,c,d,e, target = [],[],[],[],[],[]
    
    for l,m in zip(x,y):
        a.append(l[0])
        b.append(l[1])
        c.append(l[2])
        d.append(l[3])
        e.append(l[4])
        target.append(m[index])
    dict_frame = {'a':a,'b':b,'c':c,'d':d,'e':e}
    y = {'target': target}
    
    return pd.DataFrame(dict_frame), np.array(target)

x0, y0 = create_frame(encoded_input_data,encoded_target_data, index = 0)
x1, y1 = create_frame(encoded_input_data,encoded_target_data, index = 1)
x2, y2 = create_frame(encoded_input_data,encoded_target_data, index = 2)
x3, y3 = create_frame(encoded_input_data,encoded_target_data, index = 3)
x4, y4 = create_frame(encoded_input_data,encoded_target_data, index = 4)
x5, y5 = create_frame(encoded_input_data,encoded_target_data, index = 5)
x6, y6 = create_frame(encoded_input_data,encoded_target_data, index = 6)

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train0, x_test0, y_train0, y_test0 = train_test_split(x0, y0, test_size = 0.1, random_state = 42)

In [35]:
from sklearn.tree import DecisionTreeRegressor
lr0 = DecisionTreeRegressor()
lr0.fit(x_train0, y_train0)

print('The training accuracy is: ',lr0.score(x_train0, y_train0))
print('The test accuracy is: ',lr0.score(x_test0, y_test0))



The training accuracy is:  1.0
The test accuracy is:  -2.3239361993152285


In [36]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size = 0.2, random_state = 42)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size = 0.2, random_state = 42)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size = 0.2, random_state = 42)
x_train4, x_test4, y_train4, y_test4 = train_test_split(x4, y4, test_size = 0.2, random_state = 42)
x_train5, x_test5, y_train5, y_test5 = train_test_split(x5, y5, test_size = 0.2, random_state = 42)
x_train6, x_test6, y_train6, y_test6 = train_test_split(x6, y6, test_size = 0.2, random_state = 42)


In [37]:
lr1 = DecisionTreeRegressor()
lr1.fit(x_train1, y_train1)

print('The training accuracy is: ',lr1.score(x_train1, y_train1))
print('The test accuracy is: ',lr1.score(x_test1, y_test1))



The training accuracy is:  1.0
The test accuracy is:  -0.4749105176211456


In [38]:
lr2 = DecisionTreeRegressor()
lr2.fit(x_train2, y_train2)

print('The training accuracy is: ',lr2.score(x_train2, y_train2))
print('The test accuracy is: ',lr2.score(x_test2, y_test2))



The training accuracy is:  1.0
The test accuracy is:  -0.13692036920883344


In [39]:
lr3 = DecisionTreeRegressor()
lr3.fit(x_train3, y_train3)

print('The training accuracy is: ',lr3.score(x_train3, y_train3))
print('The test accuracy is: ',lr3.score(x_test3, y_test3))



The training accuracy is:  1.0
The test accuracy is:  -1.3598738941165367


In [40]:
lr4 = DecisionTreeRegressor()
lr4.fit(x_train4, y_train4)

print('The training accuracy is: ',lr4.score(x_train4, y_train4))
print('The test accuracy is: ',lr4.score(x_test4, y_test4))



The training accuracy is:  1.0
The test accuracy is:  -2.474303514900809


In [41]:
lr5 = DecisionTreeRegressor()
lr5.fit(x_train5, y_train5)

print('The training accuracy is: ',lr5.score(x_train5, y_train5))
print('The test accuracy is: ',lr5.score(x_test5, y_test5))



The training accuracy is:  1.0
The test accuracy is:  -0.08912632744804205


In [42]:
lr6 = DecisionTreeRegressor()
lr6.fit(x_train6, y_train6)

print('The training accuracy is: ',lr6.score(x_train6, y_train6))
print('The test accuracy is: ',lr6.score(x_test6, y_test6))



The training accuracy is:  1.0
The test accuracy is:  -0.05555555555555558


In [43]:
import joblib, os
current_dir = os.getcwd()
save_directory = os.path.join(current_dir, 'models/dtr/')
if not os.path.exists(save_directory):
    os.makedirs(save_directory)
joblib.dump(lr0, save_directory+'/lr0dtr.pkl')
joblib.dump(lr1, save_directory+'/lr1dtr.pkl')
joblib.dump(lr2, save_directory+'/lr2dtr.pkl')
joblib.dump(lr3, save_directory+'/lr3dtr.pkl')
joblib.dump(lr4, save_directory+'/lr4dtr.pkl')
joblib.dump(lr5, save_directory+'/lr5dtr.pkl')
joblib.dump(lr6, save_directory+'/lr6dtr.pkl')

['C:\\Users\\user\\Desktop\\NLP\\models/dtr//lr6dtr.pkl']

In [44]:
# write the dict_input_char
joblib.dump(dict_input_char, save_directory+'/dict_input_char.pkl')
joblib.dump(dict_output_char, save_directory+'/dict_output_char.pkl')


['C:\\Users\\user\\Desktop\\NLP\\models/dtr//dict_output_char.pkl']

# create pipeline

In [45]:
os.getcwd()

'C:\\Users\\user\\Desktop\\NLP'

In [46]:
import pickle
#os.chdir(os.getcwd() +'/models/dtr')
input_dict = pickle.load(open('dict_input_char.pkl', 'rb'))
output_dict = pickle.load(open('dict_output_char.pkl','rb'))
lr0 = pickle.load(open('lr0dtr.pkl', 'rb'))
lr1 = pickle.load(open('lr1dtr.pkl', 'rb'))
lr2 = pickle.load(open('lr2dtr.pkl', 'rb'))
lr3 = pickle.load(open('lr3dtr.pkl', 'rb'))
lr4 = pickle.load(open('lr4dtr.pkl', 'rb'))
lr5 = pickle.load(open('lr5dtr.pkl', 'rb'))
lr6 = pickle.load(open('lr6dtr.pkl', 'rb'))

FileNotFoundError: [Errno 2] No such file or directory: 'dict_input_char.pkl'

In [47]:

def encode_input_data(data):
    data = data.split()
    
    data_length = len(data)
    if data_length <5:
        diff = 5 - data_length
        for k in range(diff):
            data.append(' ')
    for i,j in enumerate(data):
        data[i] = dict_input_char[j.lower()]
    return data

input_test = 'I did it'
encoded_input_data = encode_input_data(input_test)
encoded_input_data

[4.463500000000006, 2.1470000000000007, 4.802500000000005, 0.0, 0.0]

In [48]:
def predict_encoder(data):
    data = np.array(data).reshape(1,-1)
    p0 = lr0.predict(data)
    p1 = lr1.predict(data)
    p2 = lr2.predict(data)
    p3 = lr3.predict(data)
    p4 = lr4.predict(data)
    p5 = lr5.predict(data)
    p6 = lr6.predict(data)
    return [float(p0),float(p1),float(p2),float(p3),float(p4),float(p5),float(p6)]

prediction = predict_encoder(encoded_input_data)
prediction

[8.601900000000004, 8.980600000000004, 7.519900000000004, 0.0, 0.0, 0.0, 0.0]

In [49]:
list(dict_output_char.values())[0]

0.0541

In [50]:
def three_dp(val):
    val_str = str(val)
    val_length = len(val_str)
    if val_length <5:
        diff = 5 - val_length
        for k in range(diff):
            val_str = val_str +'0'
        cut_val = val_str
    else:
        cut_val = val_str[:5]
    return float(cut_val)
three_dp(3.10000)

3.1

In [51]:

def get_value(value):
    va = three_dp(0.0541/2)
    upper_range, lower_range = value +va, value - va
    key = ''
    for i,k in zip(list(dict_output_char.keys()), dict_output_char.values()):
        if k < three_dp(value) and k > lower_range:
            key = i
        elif k > three_dp(value) and k < upper_range:
            key = i
        else:
            continue
    return key
get_value(8.980600000000004)

'ḿ'

In [52]:

def model_decoder(data):
    result = []
    for k in data:
        result.append(get_value(k))
    output = ''
    for k in result:
        output += ' ' +k
    return output

model_decoder(prediction)

' émèrè ḿ yá    '

In [61]:
import os, sys
import numpy as np

def encode_input_data(data):
    data = data.split()
    
    data_length = len(data)
    if data_length <5:
        diff = 5 - data_length
        for k in range(diff):
            data.append(' ')
    for i,j in enumerate(data):
        data[i] = dict_input_char[j.lower()]
    return data

def predict_encoder(data):
    data = np.array(data).reshape(1,-1)
    p0 = lr0.predict(data)
    p1 = lr1.predict(data)
    p2 = lr2.predict(data)
    p3 = lr3.predict(data)
    p4 = lr4.predict(data)
    p5 = lr5.predict(data)
    p6 = lr6.predict(data)
    return [float(p0),float(p1),float(p2),float(p3),float(p4),float(p5),float(p6)]


def three_dp(val):
    val_str = str(val)
    val_length = len(val_str)
    if val_length <5:
        diff = 5 - val_length
        for k in range(diff):
            val_str = val_str +'0'
        cut_val = val_str
    else:
        cut_val = val_str[:5]
    return float(cut_val)


def get_value(value):
    va = three_dp(0.0541/2)
    upper_range, lower_range = value +va, value - va
    key = ''
    for i,k in zip(list(dict_output_char.keys()), dict_output_char.values()):
        if k < three_dp(value) and k > lower_range:
            key = i
        elif k > three_dp(value) and k < upper_range:
            key = i
        else:
            continue
    return key

def model_decoder(data):
    result = []
    for k in data:
        result.append(get_value(k))
    output = ''
    for k in result:
        output += ' ' +k
    return output



    import pickle
#os.chdir(os.getcwd() +'/models/dtr')
#input_dict = pickle.load(open('dict_input_char.pkl', 'rb'))
#output_dict = pickle.load(open('dict_output_char.pkl','rb'))
#lr0 = pickle.load(open('lr0dtr.pkl', 'rb'))
#lr1 = pickle.load(open('lr1dtr.pkl', 'rb'))
#lr2 = pickle.load(open('lr2dtr.pkl', 'rb'))
#lr3 = pickle.load(open('lr3dtr.pkl', 'rb'))
#lr4 = pickle.load(open('lr4dtr.pkl', 'rb'))
#lr5 = pickle.load(open('lr5dtr.pkl', 'rb'))
#lr6 = pickle.load(open('lr6dtr.pkl', 'rb'))



def main(string):
    val = ''
    encoded_input_data = encode_input_data(string)
    prediction = predict_encoder(encoded_input_data)
    val = model_decoder(prediction)
    return val


if __name__== '__main__':
    string = 'He worked hard'
    result = main(string)
    print(result)

 Ọ́ rụ̀sìrì ọ́rụ́ ire   


In [59]:
x[15:20]

15         He worked hard 
16    He knocked the door 
17    I included his name 
18            I helped him
19      He provided for us
Name: English Sentence , dtype: object

In [60]:
y[15:20]

15      Ọ́ rụ̀sìrì ọ́rụ́ ire
16    Ọ kụrụ áká n'ọnú ụ́zọ̀
17       Ḿ tinyèrè yá áhà yá
18           Ḿ nyeèrè yá áká
19              O nyèrè anyí
Name: IGBO SENTENCES , dtype: object