In [1]:
import numpy as np
import string
import re
import pandas as pd
from pickle import dump
from unicodedata import normalize
from pickle import load
from pickle import dump
from numpy.random import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers import Dropout
from keras.layers import Flatten
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.





In [2]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def text_pairs(text):
    text = text.strip().split('\n')
    text_pairs = [line.split('\t')[:2] for line in text]
    return text_pairs

In [4]:
def clean_text(lines):
    clean_pair = []
    re_punct = re.compile('[%s]'%re.escape(string.punctuation))
    re_print = re.compile('[^%s]'%re.escape(string.printable))
    
    for line in lines:
        clean_sentence = []
        for sentence in line:
            
            sentence = normalize('NFD', sentence).encode('ascii', 'ignore')
            
            sentence = sentence.decode('UTF-8')
            
            tokens = sentence.split()
            
            cleaned_text = [re_punct.sub('',word) for word in tokens]
            
            cleaned_text = [re_print.sub('',word) for word in cleaned_text]
            
            cleaned_text = [word.lower() for word in cleaned_text]
            
            cleaned_text = [word for word in cleaned_text if word.isalpha()]
            
            cleaned_text = ' '.join(cleaned_text)
            
            
            clean_sentence.append(cleaned_text)
        
        
        clean_pair.append(clean_sentence)
        
    return np.array(clean_pair)

In [5]:
def save_file(text, filename):
    dump(text, open(filename,'wb'))

In [6]:
### Observation :  Total Sentences :  208487
### Observation : Take only the first Two elements afger splitting on '\t'

In [7]:
filename = 'deu.txt'
text = load_doc(filename)
lines = text_pairs(text)
clean_pairs = clean_text(lines)
filename = 'english_german.pkl'
save_file(clean_pairs, filename)

In [8]:
for i in range(10):
    print(clean_pairs[i][0], clean_pairs[i][1], sep =" : ")

go : geh
hi : hallo
hi : gru gott
run : lauf
run : lauf
wow : potzdonner
wow : donnerwetter
fire : feuer
help : hilfe
help : zu hulf


In [9]:
def load_cleaned_file(filename):
    return load(open(filename,'rb'))

In [10]:
def split_dataset(filename, num_samples):
    sentence_pairs = load_cleaned_file(filename)
    dataset = sentence_pairs[:num_samples]
    
    splitpoint = int(num_samples * 0.9)
    shuffle(dataset)
    train,test= dataset[:splitpoint],dataset[splitpoint:]
    
    return train, test, dataset

In [11]:
filename = 'english_german.pkl'
num_samples = 10000
train, test, dataset = split_dataset(filename, num_samples)

In [12]:
train

array([['i had jeans on', 'ich trug eine nietenhose'],
       ['youll like it', 'du wirst es mogen'],
       ['what is truth', 'was ist wahrheit'],
       ...,
       ['well miss you', 'wir werden sie vermissen'],
       ['it cant be', 'das kann nicht sein'],
       ['you work hard', 'ihr arbeitet schwer']], dtype='<U527')

In [13]:
save_file(dataset, 'english-german-both.pkl') 
save_file(train, 'english-german-train.pkl') 
save_file(test, 'english-german-test.pkl')

In [14]:
document = load_cleaned_file('english-german-both.pkl')
train = load_cleaned_file('english-german-train.pkl')
test = load_cleaned_file('english-german-test.pkl')

In [15]:
def create_tokeinzer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [16]:
def max_sentence_length(lines):
    max_len = max([len(line) for line in lines])
    return max_len

In [17]:
english_tokenizer = create_tokeinzer(document[:,0])
german_tokenizer = create_tokeinzer(document[:,1])
english_max_sentence_length = max_sentence_length(document[:,0])
german_max_sentence_length = max_sentence_length(document[:,1])
english_vocabulary_size = len(english_tokenizer.word_index)+1
german_vocabulary_size = len(german_tokenizer.word_index)+1

In [18]:
print("English Max Sentence Length", english_max_sentence_length, sep =" : ")
print("German Max Sentence Length", german_max_sentence_length, sep =" : ")

English Max Sentence Length : 14
German Max Sentence Length : 42


In [19]:
print("English Vocabulary Size", english_vocabulary_size, sep =" : ")
print("German Vocabulary Size", german_vocabulary_size, sep =" : ")

English Vocabulary Size : 2214
German Vocabulary Size : 3526


In [20]:
def encode_sequences(lines, tokenizer, max_length):
    X = tokenizer.texts_to_sequences(lines)
    X = np.array(pad_sequences(X, maxlen = max_length))
    return X

### Observation :  

Since this is a German to English Translation

We will prepare the data as follows : 

For X - 
 - Step 1  : Transform the sentences into tokens
 - Step 2  : Pad the tokenized sentences as per the respective language vocabulary size
 
For Y -  
 - Step 3  : Transform the sentences into tokens
 - Step 4  : Encode the tokens using to_categorical function, and reshape them to a 3D array with shape :  
             ((data in Step3).shape[0], (data in Step3).shape[1], respective language vocabulary size)

In [21]:
def encode_output(sequences, vocabulary_size):
    encoded = []
    for line in sequences:
        to_cat = to_categorical(line, num_classes= vocabulary_size)
        encoded.append(to_cat)
    encoded_data = np.array(encoded)
    y = np.reshape(encoded_data,(sequences.shape[0], sequences.shape[1], vocabulary_size))
    return y

In [22]:
train_X = encode_sequences(train[:,1], german_tokenizer, german_max_sentence_length)
train_y = encode_sequences(train[:,0], english_tokenizer, english_max_sentence_length)
train_y = encode_output(train_y, english_vocabulary_size)

In [23]:
test_X = encode_sequences(test[:,1], german_tokenizer, german_max_sentence_length)
test_y = encode_sequences(test[:,0], english_tokenizer, english_max_sentence_length)
test_y = encode_output(test_y, english_vocabulary_size)

In [24]:
def make_model(source_language_vocabulary_size, n_units, source_language_timesteps, target_language_timesteps, target_language_vocabulary_size):
    model = Sequential()
    model.add(Embedding(input_dim = source_language_vocabulary_size, output_dim = n_units, input_length = source_language_timesteps, mask_zero= True))
   
    model.add(Bidirectional(LSTM(n_units , activation = 'relu', input_shape=(source_language_timesteps, 1), return_sequences= True)))
    #model.add(Bidirectional(LSTM(n_units, return_sequences=True), input_shape=(source_language_timesteps, 1)))
   
    model.add(LSTM(n_units , activation = 'relu', input_shape=(source_language_timesteps, 1), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(n_units , activation = 'relu'))
    model.add(RepeatVector(target_language_timesteps))
    #model.add(Bidirectional(LSTM(n_units, return_sequences=True), input_shape=(target_language_timesteps, 1)))
    model.add(LSTM(n_units, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(n_units, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(n_units, return_sequences = True))
    model.add(TimeDistributed(Dense(target_language_vocabulary_size, activation = 'softmax')))
    model.compile(loss = 'categorical_crossentropy', metrics = ['accuracy'], optimizer = 'adam')
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [25]:
# Source language : German | Target Language : English

# Number of words in the German vocabulary
source_language_vocabulary_size = german_vocabulary_size
# we will form a 256 dimensional embedding
n_units = 256
# timesteps would be the max sequence length for the source and 
source_language_timesteps = german_max_sentence_length
target_language_timesteps = english_max_sentence_length

# Number words in the English vocabulary
target_language_vocabulary_size = english_vocabulary_size


In [26]:
model = make_model(source_language_vocabulary_size, n_units, source_language_timesteps, target_language_timesteps, target_language_vocabulary_size)





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 42, 256)           902656    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 42, 512)           1050624   
_________________________________________________________________
lstm_2 (LSTM)                (None, 42, 256)           787456    
_________________________________________________________________
dropout_1 (Dropout)          (None, 42, 256)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
___________________________________________________________

# Model Training : AWS

In [27]:
model = make_model(source_language_vocabulary_size, n_units, source_language_timesteps, target_language_timesteps, target_language_vocabulary_size)
checkpoint = ModelCheckpoint('final_model_NMT.h5', monitor='val_loss',  save_best_only= True, mode = 'min', verbose=1)
# model.fit(train_X, train_y, batch_size = 64, epochs = 200 ,validation_data = (test_X, test_y), callbacks = [checkpoint] )

# loss,_ = model.evaluate(test_X, test_y, batch_size = 64, verbose = 2)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 42, 256)           902656    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 42, 512)           1050624   
_________________________________________________________________
lstm_8 (LSTM)                (None, 42, 256)           787456    
_________________________________________________________________
dropout_4 (Dropout)          (None, 42, 256)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 14, 256)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 14, 256)           525312    
__________

In [28]:
# model = make_model(source_language_vocabulary_size, n_units, source_language_timesteps, target_language_timesteps, target_language_vocabulary_size)
# checkpoint = ModelCheckpoint('best_nmt_model.h5', monitor='val_loss', save_best_only=True, mode='min')
# model.fit(train_X, train_y, batch_size = 64, epochs = 30, callbacks = [checkpoint],validation_data = (test_X, test_y))
# loss,_ = model.evaluate(test_X, test_y, batch_size = 64, verbose = 2)



In [29]:
# model.save('final_model_NMT.h5')

In [30]:
model_nmt = load_model('final_model_NMT.h5')











In [37]:
model_nmt.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 42, 256)           902656    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 42, 512)           1050624   
_________________________________________________________________
lstm_8 (LSTM)                (None, 42, 256)           787456    
_________________________________________________________________
dropout_4 (Dropout)          (None, 42, 256)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 14, 256)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 14, 256)           525312    
__________

In [31]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            
            return word
    return None

In [32]:
for i in [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 71, 1, 1]:
    word_for_id(i, english_tokenizer)

In [33]:
def predict_sequence(model, tokenizer, source_text):
    prediction = model.predict(source_text, verbose = 0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    
    translated = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            continue
        translated.append(word)
    return ' '.join(translated)

In [34]:
def evaluate_model(model, sources, raw_dataset, english_tokenizer): 
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        print("Sample : ",i+1)
    # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, english_tokenizer, source) 
        
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation)) 
        actual.append([raw_target.split()])
        predicted.append(translation.split())
  # calculate BLEU score
    print("Calculating BLEU Score")
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))) 
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))) 
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [35]:
print("Train")
evaluate_model(model_nmt, train_X, train, english_tokenizer)

Train
Sample :  1
src=[ich trug eine nietenhose], target=[i had jeans on], predicted=[i check up]
Sample :  2
src=[du wirst es mogen], target=[youll like it], predicted=[there didnt]
Sample :  3
src=[was ist wahrheit], target=[what is truth], predicted=[this is lucky]
Sample :  4
src=[geh schlafen], target=[go to sleep], predicted=[keep up]
Sample :  5
src=[ihr habt was gut bei uns], target=[we owe you one], predicted=[have a hugged]
Sample :  6
src=[tom hat recht], target=[tom is right], predicted=[tom came up]
Sample :  7
src=[tom ist ein weichei], target=[tom is a wimp], predicted=[tom is a gets]
Sample :  8
src=[ich habe aufgelegt], target=[i hung up], predicted=[i begin]
Sample :  9
src=[tom wird gehen], target=[tom will go], predicted=[tom will tom]
Sample :  10
src=[tom ist gesund], target=[tom is healthy], predicted=[toms dying]
Sample :  11
Sample :  12
Sample :  13
Sample :  14
Sample :  15
Sample :  16
Sample :  17
Sample :  18
Sample :  19
Sample :  20
Sample :  21
Sample :

Sample :  542
Sample :  543
Sample :  544
Sample :  545
Sample :  546
Sample :  547
Sample :  548
Sample :  549
Sample :  550
Sample :  551
Sample :  552
Sample :  553
Sample :  554
Sample :  555
Sample :  556
Sample :  557
Sample :  558
Sample :  559
Sample :  560
Sample :  561
Sample :  562
Sample :  563
Sample :  564
Sample :  565
Sample :  566
Sample :  567
Sample :  568
Sample :  569
Sample :  570
Sample :  571
Sample :  572
Sample :  573
Sample :  574
Sample :  575
Sample :  576
Sample :  577
Sample :  578
Sample :  579
Sample :  580
Sample :  581
Sample :  582
Sample :  583
Sample :  584
Sample :  585
Sample :  586
Sample :  587
Sample :  588
Sample :  589
Sample :  590
Sample :  591
Sample :  592
Sample :  593
Sample :  594
Sample :  595
Sample :  596
Sample :  597
Sample :  598
Sample :  599
Sample :  600
Sample :  601
Sample :  602
Sample :  603
Sample :  604
Sample :  605
Sample :  606
Sample :  607
Sample :  608
Sample :  609
Sample :  610
Sample :  611
Sample :  612
Sample

Sample :  1120
Sample :  1121
Sample :  1122
Sample :  1123
Sample :  1124
Sample :  1125
Sample :  1126
Sample :  1127
Sample :  1128
Sample :  1129
Sample :  1130
Sample :  1131
Sample :  1132
Sample :  1133
Sample :  1134
Sample :  1135
Sample :  1136
Sample :  1137
Sample :  1138
Sample :  1139
Sample :  1140
Sample :  1141
Sample :  1142
Sample :  1143
Sample :  1144
Sample :  1145
Sample :  1146
Sample :  1147
Sample :  1148
Sample :  1149
Sample :  1150
Sample :  1151
Sample :  1152
Sample :  1153
Sample :  1154
Sample :  1155
Sample :  1156
Sample :  1157
Sample :  1158
Sample :  1159
Sample :  1160
Sample :  1161
Sample :  1162
Sample :  1163
Sample :  1164
Sample :  1165
Sample :  1166
Sample :  1167
Sample :  1168
Sample :  1169
Sample :  1170
Sample :  1171
Sample :  1172
Sample :  1173
Sample :  1174
Sample :  1175
Sample :  1176
Sample :  1177
Sample :  1178
Sample :  1179
Sample :  1180
Sample :  1181
Sample :  1182
Sample :  1183
Sample :  1184
Sample :  1185
Sample :  

Sample :  1668
Sample :  1669
Sample :  1670
Sample :  1671
Sample :  1672
Sample :  1673
Sample :  1674
Sample :  1675
Sample :  1676
Sample :  1677
Sample :  1678
Sample :  1679
Sample :  1680
Sample :  1681
Sample :  1682
Sample :  1683
Sample :  1684
Sample :  1685
Sample :  1686
Sample :  1687
Sample :  1688
Sample :  1689
Sample :  1690
Sample :  1691
Sample :  1692
Sample :  1693
Sample :  1694
Sample :  1695
Sample :  1696
Sample :  1697
Sample :  1698
Sample :  1699
Sample :  1700
Sample :  1701
Sample :  1702
Sample :  1703
Sample :  1704
Sample :  1705
Sample :  1706
Sample :  1707
Sample :  1708
Sample :  1709
Sample :  1710
Sample :  1711
Sample :  1712
Sample :  1713
Sample :  1714
Sample :  1715
Sample :  1716
Sample :  1717
Sample :  1718
Sample :  1719
Sample :  1720
Sample :  1721
Sample :  1722
Sample :  1723
Sample :  1724
Sample :  1725
Sample :  1726
Sample :  1727
Sample :  1728
Sample :  1729
Sample :  1730
Sample :  1731
Sample :  1732
Sample :  1733
Sample :  

Sample :  2216
Sample :  2217
Sample :  2218
Sample :  2219
Sample :  2220
Sample :  2221
Sample :  2222
Sample :  2223
Sample :  2224
Sample :  2225
Sample :  2226
Sample :  2227
Sample :  2228
Sample :  2229
Sample :  2230
Sample :  2231
Sample :  2232
Sample :  2233
Sample :  2234
Sample :  2235
Sample :  2236
Sample :  2237
Sample :  2238
Sample :  2239
Sample :  2240
Sample :  2241
Sample :  2242
Sample :  2243
Sample :  2244
Sample :  2245
Sample :  2246
Sample :  2247
Sample :  2248
Sample :  2249
Sample :  2250
Sample :  2251
Sample :  2252
Sample :  2253
Sample :  2254
Sample :  2255
Sample :  2256
Sample :  2257
Sample :  2258
Sample :  2259
Sample :  2260
Sample :  2261
Sample :  2262
Sample :  2263
Sample :  2264
Sample :  2265
Sample :  2266
Sample :  2267
Sample :  2268
Sample :  2269
Sample :  2270
Sample :  2271
Sample :  2272
Sample :  2273
Sample :  2274
Sample :  2275
Sample :  2276
Sample :  2277
Sample :  2278
Sample :  2279
Sample :  2280
Sample :  2281
Sample :  

Sample :  2764
Sample :  2765
Sample :  2766
Sample :  2767
Sample :  2768
Sample :  2769
Sample :  2770
Sample :  2771
Sample :  2772
Sample :  2773
Sample :  2774
Sample :  2775
Sample :  2776
Sample :  2777
Sample :  2778
Sample :  2779
Sample :  2780
Sample :  2781
Sample :  2782
Sample :  2783
Sample :  2784
Sample :  2785
Sample :  2786
Sample :  2787
Sample :  2788
Sample :  2789
Sample :  2790
Sample :  2791
Sample :  2792
Sample :  2793
Sample :  2794
Sample :  2795
Sample :  2796
Sample :  2797
Sample :  2798
Sample :  2799
Sample :  2800
Sample :  2801
Sample :  2802
Sample :  2803
Sample :  2804
Sample :  2805
Sample :  2806
Sample :  2807
Sample :  2808
Sample :  2809
Sample :  2810
Sample :  2811
Sample :  2812
Sample :  2813
Sample :  2814
Sample :  2815
Sample :  2816
Sample :  2817
Sample :  2818
Sample :  2819
Sample :  2820
Sample :  2821
Sample :  2822
Sample :  2823
Sample :  2824
Sample :  2825
Sample :  2826
Sample :  2827
Sample :  2828
Sample :  2829
Sample :  

Sample :  3312
Sample :  3313
Sample :  3314
Sample :  3315
Sample :  3316
Sample :  3317
Sample :  3318
Sample :  3319
Sample :  3320
Sample :  3321
Sample :  3322
Sample :  3323
Sample :  3324
Sample :  3325
Sample :  3326
Sample :  3327
Sample :  3328
Sample :  3329
Sample :  3330
Sample :  3331
Sample :  3332
Sample :  3333
Sample :  3334
Sample :  3335
Sample :  3336
Sample :  3337
Sample :  3338
Sample :  3339
Sample :  3340
Sample :  3341
Sample :  3342
Sample :  3343
Sample :  3344
Sample :  3345
Sample :  3346
Sample :  3347
Sample :  3348
Sample :  3349
Sample :  3350
Sample :  3351
Sample :  3352
Sample :  3353
Sample :  3354
Sample :  3355
Sample :  3356
Sample :  3357
Sample :  3358
Sample :  3359
Sample :  3360
Sample :  3361
Sample :  3362
Sample :  3363
Sample :  3364
Sample :  3365
Sample :  3366
Sample :  3367
Sample :  3368
Sample :  3369
Sample :  3370
Sample :  3371
Sample :  3372
Sample :  3373
Sample :  3374
Sample :  3375
Sample :  3376
Sample :  3377
Sample :  

Sample :  3860
Sample :  3861
Sample :  3862
Sample :  3863
Sample :  3864
Sample :  3865
Sample :  3866
Sample :  3867
Sample :  3868
Sample :  3869
Sample :  3870
Sample :  3871
Sample :  3872
Sample :  3873
Sample :  3874
Sample :  3875
Sample :  3876
Sample :  3877
Sample :  3878
Sample :  3879
Sample :  3880
Sample :  3881
Sample :  3882
Sample :  3883
Sample :  3884
Sample :  3885
Sample :  3886
Sample :  3887
Sample :  3888
Sample :  3889
Sample :  3890
Sample :  3891
Sample :  3892
Sample :  3893
Sample :  3894
Sample :  3895
Sample :  3896
Sample :  3897
Sample :  3898
Sample :  3899
Sample :  3900
Sample :  3901
Sample :  3902
Sample :  3903
Sample :  3904
Sample :  3905
Sample :  3906
Sample :  3907
Sample :  3908
Sample :  3909
Sample :  3910
Sample :  3911
Sample :  3912
Sample :  3913
Sample :  3914
Sample :  3915
Sample :  3916
Sample :  3917
Sample :  3918
Sample :  3919
Sample :  3920
Sample :  3921
Sample :  3922
Sample :  3923
Sample :  3924
Sample :  3925
Sample :  

Sample :  4408
Sample :  4409
Sample :  4410
Sample :  4411
Sample :  4412
Sample :  4413
Sample :  4414
Sample :  4415
Sample :  4416
Sample :  4417
Sample :  4418
Sample :  4419
Sample :  4420
Sample :  4421
Sample :  4422
Sample :  4423
Sample :  4424
Sample :  4425
Sample :  4426
Sample :  4427
Sample :  4428
Sample :  4429
Sample :  4430
Sample :  4431
Sample :  4432
Sample :  4433
Sample :  4434
Sample :  4435
Sample :  4436
Sample :  4437
Sample :  4438
Sample :  4439
Sample :  4440
Sample :  4441
Sample :  4442
Sample :  4443
Sample :  4444
Sample :  4445
Sample :  4446
Sample :  4447
Sample :  4448
Sample :  4449
Sample :  4450
Sample :  4451
Sample :  4452
Sample :  4453
Sample :  4454
Sample :  4455
Sample :  4456
Sample :  4457
Sample :  4458
Sample :  4459
Sample :  4460
Sample :  4461
Sample :  4462
Sample :  4463
Sample :  4464
Sample :  4465
Sample :  4466
Sample :  4467
Sample :  4468
Sample :  4469
Sample :  4470
Sample :  4471
Sample :  4472
Sample :  4473
Sample :  

Sample :  4956
Sample :  4957
Sample :  4958
Sample :  4959
Sample :  4960
Sample :  4961
Sample :  4962
Sample :  4963
Sample :  4964
Sample :  4965
Sample :  4966
Sample :  4967
Sample :  4968
Sample :  4969
Sample :  4970
Sample :  4971
Sample :  4972
Sample :  4973
Sample :  4974
Sample :  4975
Sample :  4976
Sample :  4977
Sample :  4978
Sample :  4979
Sample :  4980
Sample :  4981
Sample :  4982
Sample :  4983
Sample :  4984
Sample :  4985
Sample :  4986
Sample :  4987
Sample :  4988
Sample :  4989
Sample :  4990
Sample :  4991
Sample :  4992
Sample :  4993
Sample :  4994
Sample :  4995
Sample :  4996
Sample :  4997
Sample :  4998
Sample :  4999
Sample :  5000
Sample :  5001
Sample :  5002
Sample :  5003
Sample :  5004
Sample :  5005
Sample :  5006
Sample :  5007
Sample :  5008
Sample :  5009
Sample :  5010
Sample :  5011
Sample :  5012
Sample :  5013
Sample :  5014
Sample :  5015
Sample :  5016
Sample :  5017
Sample :  5018
Sample :  5019
Sample :  5020
Sample :  5021
Sample :  

Sample :  5504
Sample :  5505
Sample :  5506
Sample :  5507
Sample :  5508
Sample :  5509
Sample :  5510
Sample :  5511
Sample :  5512
Sample :  5513
Sample :  5514
Sample :  5515
Sample :  5516
Sample :  5517
Sample :  5518
Sample :  5519
Sample :  5520
Sample :  5521
Sample :  5522
Sample :  5523
Sample :  5524
Sample :  5525
Sample :  5526
Sample :  5527
Sample :  5528
Sample :  5529
Sample :  5530
Sample :  5531
Sample :  5532
Sample :  5533
Sample :  5534
Sample :  5535
Sample :  5536
Sample :  5537
Sample :  5538
Sample :  5539
Sample :  5540
Sample :  5541
Sample :  5542
Sample :  5543
Sample :  5544
Sample :  5545
Sample :  5546
Sample :  5547
Sample :  5548
Sample :  5549
Sample :  5550
Sample :  5551
Sample :  5552
Sample :  5553
Sample :  5554
Sample :  5555
Sample :  5556
Sample :  5557
Sample :  5558
Sample :  5559
Sample :  5560
Sample :  5561
Sample :  5562
Sample :  5563
Sample :  5564
Sample :  5565
Sample :  5566
Sample :  5567
Sample :  5568
Sample :  5569
Sample :  

Sample :  6052
Sample :  6053
Sample :  6054
Sample :  6055
Sample :  6056
Sample :  6057
Sample :  6058
Sample :  6059
Sample :  6060
Sample :  6061
Sample :  6062
Sample :  6063
Sample :  6064
Sample :  6065
Sample :  6066
Sample :  6067
Sample :  6068
Sample :  6069
Sample :  6070
Sample :  6071
Sample :  6072
Sample :  6073
Sample :  6074
Sample :  6075
Sample :  6076
Sample :  6077
Sample :  6078
Sample :  6079
Sample :  6080
Sample :  6081
Sample :  6082
Sample :  6083
Sample :  6084
Sample :  6085
Sample :  6086
Sample :  6087
Sample :  6088
Sample :  6089
Sample :  6090
Sample :  6091
Sample :  6092
Sample :  6093
Sample :  6094
Sample :  6095
Sample :  6096
Sample :  6097
Sample :  6098
Sample :  6099
Sample :  6100
Sample :  6101
Sample :  6102
Sample :  6103
Sample :  6104
Sample :  6105
Sample :  6106
Sample :  6107
Sample :  6108
Sample :  6109
Sample :  6110
Sample :  6111
Sample :  6112
Sample :  6113
Sample :  6114
Sample :  6115
Sample :  6116
Sample :  6117
Sample :  

Sample :  6600
Sample :  6601
Sample :  6602
Sample :  6603
Sample :  6604
Sample :  6605
Sample :  6606
Sample :  6607
Sample :  6608
Sample :  6609
Sample :  6610
Sample :  6611
Sample :  6612
Sample :  6613
Sample :  6614
Sample :  6615
Sample :  6616
Sample :  6617
Sample :  6618
Sample :  6619
Sample :  6620
Sample :  6621
Sample :  6622
Sample :  6623
Sample :  6624
Sample :  6625
Sample :  6626
Sample :  6627
Sample :  6628
Sample :  6629
Sample :  6630
Sample :  6631
Sample :  6632
Sample :  6633
Sample :  6634
Sample :  6635
Sample :  6636
Sample :  6637
Sample :  6638
Sample :  6639
Sample :  6640
Sample :  6641
Sample :  6642
Sample :  6643
Sample :  6644
Sample :  6645
Sample :  6646
Sample :  6647
Sample :  6648
Sample :  6649
Sample :  6650
Sample :  6651
Sample :  6652
Sample :  6653
Sample :  6654
Sample :  6655
Sample :  6656
Sample :  6657
Sample :  6658
Sample :  6659
Sample :  6660
Sample :  6661
Sample :  6662
Sample :  6663
Sample :  6664
Sample :  6665
Sample :  

Sample :  7148
Sample :  7149
Sample :  7150
Sample :  7151
Sample :  7152
Sample :  7153
Sample :  7154
Sample :  7155
Sample :  7156
Sample :  7157
Sample :  7158
Sample :  7159
Sample :  7160
Sample :  7161
Sample :  7162
Sample :  7163
Sample :  7164
Sample :  7165
Sample :  7166
Sample :  7167
Sample :  7168
Sample :  7169
Sample :  7170
Sample :  7171
Sample :  7172
Sample :  7173
Sample :  7174
Sample :  7175
Sample :  7176
Sample :  7177
Sample :  7178
Sample :  7179
Sample :  7180
Sample :  7181
Sample :  7182
Sample :  7183
Sample :  7184
Sample :  7185
Sample :  7186
Sample :  7187
Sample :  7188
Sample :  7189
Sample :  7190
Sample :  7191
Sample :  7192
Sample :  7193
Sample :  7194
Sample :  7195
Sample :  7196
Sample :  7197
Sample :  7198
Sample :  7199
Sample :  7200
Sample :  7201
Sample :  7202
Sample :  7203
Sample :  7204
Sample :  7205
Sample :  7206
Sample :  7207
Sample :  7208
Sample :  7209
Sample :  7210
Sample :  7211
Sample :  7212
Sample :  7213
Sample :  

Sample :  7696
Sample :  7697
Sample :  7698
Sample :  7699
Sample :  7700
Sample :  7701
Sample :  7702
Sample :  7703
Sample :  7704
Sample :  7705
Sample :  7706
Sample :  7707
Sample :  7708
Sample :  7709
Sample :  7710
Sample :  7711
Sample :  7712
Sample :  7713
Sample :  7714
Sample :  7715
Sample :  7716
Sample :  7717
Sample :  7718
Sample :  7719
Sample :  7720
Sample :  7721
Sample :  7722
Sample :  7723
Sample :  7724
Sample :  7725
Sample :  7726
Sample :  7727
Sample :  7728
Sample :  7729
Sample :  7730
Sample :  7731
Sample :  7732
Sample :  7733
Sample :  7734
Sample :  7735
Sample :  7736
Sample :  7737
Sample :  7738
Sample :  7739
Sample :  7740
Sample :  7741
Sample :  7742
Sample :  7743
Sample :  7744
Sample :  7745
Sample :  7746
Sample :  7747
Sample :  7748
Sample :  7749
Sample :  7750
Sample :  7751
Sample :  7752
Sample :  7753
Sample :  7754
Sample :  7755
Sample :  7756
Sample :  7757
Sample :  7758
Sample :  7759
Sample :  7760
Sample :  7761
Sample :  

Sample :  8244
Sample :  8245
Sample :  8246
Sample :  8247
Sample :  8248
Sample :  8249
Sample :  8250
Sample :  8251
Sample :  8252
Sample :  8253
Sample :  8254
Sample :  8255
Sample :  8256
Sample :  8257
Sample :  8258
Sample :  8259
Sample :  8260
Sample :  8261
Sample :  8262
Sample :  8263
Sample :  8264
Sample :  8265
Sample :  8266
Sample :  8267
Sample :  8268
Sample :  8269
Sample :  8270
Sample :  8271
Sample :  8272
Sample :  8273
Sample :  8274
Sample :  8275
Sample :  8276
Sample :  8277
Sample :  8278
Sample :  8279
Sample :  8280
Sample :  8281
Sample :  8282
Sample :  8283
Sample :  8284
Sample :  8285
Sample :  8286
Sample :  8287
Sample :  8288
Sample :  8289
Sample :  8290
Sample :  8291
Sample :  8292
Sample :  8293
Sample :  8294
Sample :  8295
Sample :  8296
Sample :  8297
Sample :  8298
Sample :  8299
Sample :  8300
Sample :  8301
Sample :  8302
Sample :  8303
Sample :  8304
Sample :  8305
Sample :  8306
Sample :  8307
Sample :  8308
Sample :  8309
Sample :  

Sample :  8792
Sample :  8793
Sample :  8794
Sample :  8795
Sample :  8796
Sample :  8797
Sample :  8798
Sample :  8799
Sample :  8800
Sample :  8801
Sample :  8802
Sample :  8803
Sample :  8804
Sample :  8805
Sample :  8806
Sample :  8807
Sample :  8808
Sample :  8809
Sample :  8810
Sample :  8811
Sample :  8812
Sample :  8813
Sample :  8814
Sample :  8815
Sample :  8816
Sample :  8817
Sample :  8818
Sample :  8819
Sample :  8820
Sample :  8821
Sample :  8822
Sample :  8823
Sample :  8824
Sample :  8825
Sample :  8826
Sample :  8827
Sample :  8828
Sample :  8829
Sample :  8830
Sample :  8831
Sample :  8832
Sample :  8833
Sample :  8834
Sample :  8835
Sample :  8836
Sample :  8837
Sample :  8838
Sample :  8839
Sample :  8840
Sample :  8841
Sample :  8842
Sample :  8843
Sample :  8844
Sample :  8845
Sample :  8846
Sample :  8847
Sample :  8848
Sample :  8849
Sample :  8850
Sample :  8851
Sample :  8852
Sample :  8853
Sample :  8854
Sample :  8855
Sample :  8856
Sample :  8857
Sample :  

In [36]:
print("Test")
evaluate_model(model_nmt, test_X, test, english_tokenizer)

Test
Sample :  1
src=[lasst uns uben], target=[lets practice], predicted=[busy go]
Sample :  2
src=[in keinster weise], target=[no way], predicted=[there good see]
Sample :  3
src=[sie heien es gut], target=[they approve], predicted=[there these]
Sample :  4
src=[ich singe selten], target=[i seldom sing], predicted=[i what happy]
Sample :  5
src=[nimms leicht], target=[take it easy], predicted=[deaf]
Sample :  6
src=[ich habe auch geweint], target=[i cried too], predicted=[i like it]
Sample :  7
src=[tom meidet mich], target=[tom avoids me], predicted=[tom will it]
Sample :  8
src=[hast du es vergessen], target=[did you forget], predicted=[do to it]
Sample :  9
src=[ich kann rennen], target=[i can run], predicted=[i cant go]
Sample :  10
src=[ich arbeite gerne], target=[i like to work], predicted=[i i catch]
Sample :  11
Sample :  12
Sample :  13
Sample :  14
Sample :  15
Sample :  16
Sample :  17
Sample :  18
Sample :  19
Sample :  20
Sample :  21
Sample :  22
Sample :  23
Sample :  2

Sample :  545
Sample :  546
Sample :  547
Sample :  548
Sample :  549
Sample :  550
Sample :  551
Sample :  552
Sample :  553
Sample :  554
Sample :  555
Sample :  556
Sample :  557
Sample :  558
Sample :  559
Sample :  560
Sample :  561
Sample :  562
Sample :  563
Sample :  564
Sample :  565
Sample :  566
Sample :  567
Sample :  568
Sample :  569
Sample :  570
Sample :  571
Sample :  572
Sample :  573
Sample :  574
Sample :  575
Sample :  576
Sample :  577
Sample :  578
Sample :  579
Sample :  580
Sample :  581
Sample :  582
Sample :  583
Sample :  584
Sample :  585
Sample :  586
Sample :  587
Sample :  588
Sample :  589
Sample :  590
Sample :  591
Sample :  592
Sample :  593
Sample :  594
Sample :  595
Sample :  596
Sample :  597
Sample :  598
Sample :  599
Sample :  600
Sample :  601
Sample :  602
Sample :  603
Sample :  604
Sample :  605
Sample :  606
Sample :  607
Sample :  608
Sample :  609
Sample :  610
Sample :  611
Sample :  612
Sample :  613
Sample :  614
Sample :  615
Sample