In [1]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.


In [2]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [3]:
# load datasets

dataset = load_clean_sentences('english-portuguese-training-both.pkl')
train = load_clean_sentences('english-portuguese-training.pkl')
validation = load_clean_sentences('english-portuguese-validation.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
por_tokenizer = create_tokenizer(dataset[:, 1])
por_vocab_size = len(por_tokenizer.word_index) + 1
por_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(por_tokenizer, por_length, train[:, 1])
validationX = encode_sequences(por_tokenizer, por_length, validation[:, 1])


In [4]:
# load model
model = load_model('model.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


### Without one hot encoding

In [5]:
preds = model.predict_classes(validationX.reshape((validationX.shape[0],validationX.shape[1])))

In [6]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None


preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                 temp.append('')
            else:
                 temp.append(t)
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t) 

    preds_text.append(' '.join(temp))

In [9]:
import pandas as pd
pred_df = pd.DataFrame({'actual' : validation[:,0], 'predicted' : preds_text})


In [10]:
# print 15 rows randomly
pred_df.sample(15)

Unnamed: 0,actual,predicted
3068,tom seems upset,tom seems bummed
4931,i didnt like what tom did,i didnt what tom did ...
4685,the audience clapped when the concert was over,the united took when was ...
5270,i let my sister use my new computer,i left my hair my pen ...
7892,you know where it is dont you,you know where dont you ...
1606,tom decided to marry mary,tom decided to mary ...
2059,tom didnt seem to like your cooking,tom doesnt like his ...
821,the pond is meters deep,the basket has full different apples ...
5698,actually we dont have a choice,no didnt no choice
4997,do you have any sunscreen,do you use on sunscreen ...


### Test data

In [15]:
# load test dataset

test = load_clean_sentences('en_pt_test.pkl')

testX = encode_sequences(por_tokenizer, por_length, array(test)[:, 1])
test_preds = model.predict_classes(array(testX).reshape((array(testX).shape[0],array(testX).shape[1])))

In [21]:
preds_text = []
for i in test_preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                 temp.append('')
            else:
                 temp.append(t)
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t) 

    preds_text.append(' '.join(temp))

(200, 33)

In [22]:
pred_df = pd.DataFrame({'actual' : array(test)[:,0], 'predicted' : preds_text})

In [24]:
# print 15 rows randomly
pred_df.sample(25)

Unnamed: 0,actual,predicted
107,where can i get some ice,where can i turn ice ...
103,where is the elevator,wheres the
127,do you mind,do you doubt
159,whats your favorite,whats your favorite ...
48,what time is it,what time is it
163,ill try my best,ill try to best
61,hello,hello
182,my mouth is watering,im getting in of mouth ...
188,let bygones be bygones,youve forget the
171,here comes a bus,come it bus


### One hot encoding

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        #print(i)
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        #print("Before ", source.shape)
        source = source.reshape((1, source.shape[0]))
        #print("After ", source.shape)
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))



In [7]:
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, validationX, validation)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
train
src=[nao seja rude], target=[dont be rude], predicted=[dont be rude]
src=[sou um homem ocupado], target=[im a busy man], predicted=[im a busy man]
src=[eu nao ri], target=[i didnt laugh], predicted=[i didnt laugh]
src=[falo rapido], target=[i talk fast], predicted=[i laughed fast]
src=[eu o quero], target=[i want you], predicted=[i want it]
src=[eu sou culpado], target=[im guilty], predicted=[im guilty]
src=[volte mais tarde], target=[come back later], predicted=[come back later]
src=[pule], target=[jump], predicted=[jump]
src=[o que sao eles], target=[what are those], predicted=[what are those]
src=[espere], target=[wait up], predicted=[wait]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.078774
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
test
src=[eu nao estou me sentindo bem], target=[im not well], predicted=[i not unwell]
src=[voce me conhece], target=[you know me], predicted=[do you know me]
src=[eu falo rapido], target=[i talk fast], predicted=[i tried fast]
src=[tom fez arroz], target=[tom made rice], predicted=[tom got walk]
src=[eu fiquei chateado], target=[i got upset], predicted=[i got lonely]
src=[eu odeio perder], target=[i hate to lose], predicted=[i hate losing]
src=[nao culpe tom], target=[dont blame tom], predicted=[dont up tom]
src=[eles morrerao], target=[they will die], predicted=[they voted]
src=[tom nao vai vir], target=[tom wont come], predicted=[tom will die]
src=[leve o tom], target=[take tom], predicted=[take that tom]
BLEU-1: 0.070933
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


### Running the Test data

In [12]:

# load model
#model = load_model('model.h5')

print('final test')
evaluate_model(model, eng_tokenizer, array(testX), array(test))

final test
src=[eu perdi meu passaporte], target=[i have lost my passport], predicted=[i felt hungry]
src=[alguem roubou meu dinheiro], target=[someone stole my money], predicted=[how is money]
src=[socorro], target=[help], predicted=[help]
src=[pode trazer a conta], target=[may i have the bill], predicted=[do it show me]
src=[eu gostaria de sobremesa], target=[i would like dessert], predicted=[i like driving]
src=[eu gostaria de pedir], target=[i would like to order], predicted=[i was fun]
src=[posso ver um menu], target=[may i see a menu], predicted=[can i see]
src=[quero uma bebida], target=[i would like a drink], predicted=[i want a knife]
src=[eu gostaria de um pouco de agua], target=[i would like some water], predicted=[i need both bed]
src=[uma mesa para dois], target=[a table for two], predicted=[take a minute]
BLEU-1: 0.100276
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
