In [1]:
import numpy as np
from numpy import array
import pickle
from pickle import dump
import re
import string
from unicodedata import normalize
from nltk.stem.porter import PorterStemmer
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense

Using Theano backend.


In [2]:
import pandas.core

In [3]:
def clean_string(row):
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    # normalize unicode characters
    row['text'] = normalize('NFD', row['text']).encode('ascii', 'ignore')
    row['text'] = row['text'].decode('UTF-8')
    # tokenize on white space
    row['text'] = row['text'].split()
    # convert to lowercase
    row['text'] = [word.lower() for word in row['text']]
    # remove punctuation from each token
    row['text'] = [word.translate(table) for word in row['text']]
    # remove non-printable chars form each token
    row['text'] = [re_print.sub('', w) for w in row['text']]
    # remove tokens with numbers in them
    row['text'] = [word for word in row['text'] if word.isalpha()]
    row['text'] = [x.strip(' ') for x in row['text']]
    return row['text']


#row['text'].strip().lower().replace('(', '').replace(')', '')

In [4]:
# import and clean all language pkl files
#EN = English, PT = Portuguese, HI = Hindi
pkl_file = open('../data/sentences_EN.pkl', 'rb')
sentences = pickle.load(pkl_file)
pkl_file.close()
sentences['text']=sentences.apply(clean_string, axis=1)
clean_EN_df = sentences

pkl_file = open('../data/sentences_PT.pkl', 'rb')
sentences = pickle.load(pkl_file)
pkl_file.close()
sentences['text']=sentences.apply(clean_string, axis=1)
clean_PT_df = sentences

pkl_file = open('../data/sentences_HI.pkl', 'rb')
sentences = pickle.load(pkl_file)
pkl_file.close()
print(sentences.head())
sentences['text']=sentences.apply(clean_string, axis=1)
print(sentences.head())
clean_HI_df = sentences

#TODO: trying to remove blank values is not working
#these methods are not working
#clean_HI_df = clean_HI_df["text"].apply(lambda x: pd.Series(x[0]) if x else pd.Series()).dropna()
#clean_HI_df = clean_HI_df[clean_HI_df["text"] != ""]
#indexNames = clean_HI_df[len(clean_HI_df['text'])==0].index

# Get names of indexes for which text is blank, currently all hindi unicode
# by nature of how the data are loaded the hindi unicode is loaded third out of 4
# This is fragile, but works for now
indexNames = clean_HI_df[ clean_HI_df['sentence_id'] % 4 == 3 ].index

# Delete these row indexes from dataFrame
clean_HI_df.drop(indexNames , inplace=True)
print(clean_HI_df.head())

                                  text  sentence_id
0       मैंने अपना पासपोर्ट खो दिया है            3
1  mainne apana paasaport kho diya hai            4
2          किसी ने मेरा पैसा चुरा लिया            7
3       kisee ne mera paisa chura liya            8
4                                  मदद           11
                                         text  sentence_id
0                                          []            3
1  [mainne, apana, paasaport, kho, diya, hai]            4
2                                          []            7
3       [kisee, ne, mera, paisa, chura, liya]            8
4                                          []           11
                                         text  sentence_id
1  [mainne, apana, paasaport, kho, diya, hai]            4
3       [kisee, ne, mera, paisa, chura, liya]            8
5                                     [madad]           12
7         [kya, mujhe, bil, mil, sakata, hai]           16
9                 [main, mithaee, chaah

In [5]:
#view the first 5 rows of one of the new dataframes to 
print(clean_EN_df.head())
print(clean_PT_df.head())
print(clean_HI_df.head())

                            text  sentence_id
0  [i, have, lost, my, passport]            1
1    [someone, stole, my, money]            5
2                         [help]            9
3      [may, i, have, the, bill]           13
4      [i, would, like, dessert]           17
                              text  sentence_id
0     [eu, perdi, meu, passaporte]            2
1  [alguem, roubou, meu, dinheiro]            6
2                        [socorro]           10
3         [pode, trazer, a, conta]           14
4    [eu, gostaria, de, sobremesa]           18
                                         text  sentence_id
1  [mainne, apana, paasaport, kho, diya, hai]            4
3       [kisee, ne, mera, paisa, chura, liya]            8
5                                     [madad]           12
7         [kya, mujhe, bil, mil, sakata, hai]           16
9                 [main, mithaee, chaahoonga]           20


In [6]:
# stemming of words
#Stemming refers to the process of reducing each word to its root or base.
#For example “fishing,” “fished,” “fisher” all reduce to the stem “fish.”
def stem_string(row):
    row['text'] = [porter.stem(word) for word in row['text']]
    return row['text']

In [7]:
#cioying to preserve integrity
stemmed_clean_EN_df = clean_EN_df.copy()
stemmed_clean_PT_df = clean_PT_df.copy()
stemmed_clean_HI_df = clean_HI_df.copy()

#Stem wirds to get to roots of words
porter = PorterStemmer()
stemmed_clean_EN_df['text'] = stemmed_clean_EN_df.apply(stem_string, axis=1)
#stemmed_clean_PT_df['text'] = stemmed_clean_PT_df.apply(stem_string, axis=1)
#stemmed_clean_HI_df['text'] = stemmed_clean_HI_df.apply(stem_string, axis=1)

print(stemmed_clean_EN_df.head())
print(stemmed_clean_PT_df.head())
print(stemmed_clean_HI_df.head())

                            text  sentence_id
0  [i, have, lost, my, passport]            1
1     [someon, stole, my, money]            5
2                         [help]            9
3      [may, i, have, the, bill]           13
4      [i, would, like, dessert]           17
                              text  sentence_id
0     [eu, perdi, meu, passaporte]            2
1  [alguem, roubou, meu, dinheiro]            6
2                        [socorro]           10
3         [pode, trazer, a, conta]           14
4    [eu, gostaria, de, sobremesa]           18
                                         text  sentence_id
1  [mainne, apana, paasaport, kho, diya, hai]            4
3       [kisee, ne, mera, paisa, chura, liya]            8
5                                     [madad]           12
7         [kya, mujhe, bil, mil, sakata, hai]           16
9                 [main, mithaee, chaahoonga]           20


In [8]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [9]:
# max sentence length
def max_length(lines):
    max_line_length = 0
    for line in lines:
        if(len(line) > max_line_length):
            max_line_length=len(line)
    return max_line_length


#def max_length(lines):
    #return max(len(line.split()) for line in lines)

In [10]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    #X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [11]:
# define NMT model
def define_model(src_vocab, src_timesteps, n_units):
    model = Sequential()
    model.add(Dense(12, input_dim=1, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model


# define NMT model - original
#def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
#    model = Sequential()
#    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
#    model.add(LSTM(n_units))
#    model.add(RepeatVector(tar_timesteps))
#    model.add(LSTM(n_units, return_sequences=True))
#    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
#    return model

In [12]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(stemmed_clean_EN_df['text'])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(stemmed_clean_EN_df['text'])

print(eng_tokenizer)
print(eng_vocab_size)
print(eng_length)



<keras_preprocessing.text.Tokenizer object at 0x000001AC76744A58>
306
12


In [13]:
X_EN=eng_tokenizer
Y_EN=stemmed_clean_EN_df['sentence_id'].to_frame()

In [14]:
type(X_EN)

keras_preprocessing.text.Tokenizer

In [15]:

# define English model
model = define_model(eng_vocab_size, eng_length, 256)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [16]:
model.fit(X_EN, Y_EN, epochs=5, batch_size=10,  verbose=2)

AttributeError: 'Tokenizer' object has no attribute 'ndim'

In [None]:

# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)

In [None]:

# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)