In [407]:
import nltk
import pandas as pd
import numpy as np
from unicodedata import normalize
from pickle import dump
from pickle import load
from numpy import array
from numpy.random import rand
from numpy.random import shuffle
import string
import re

import pickle
from collections import Counter
from nltk.corpus import comtrans
from googletrans import Translator

from nltk.translate import Alignment, AlignedSent
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.translate.bleu_score import corpus_bleu

In [280]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [281]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [282]:
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [283]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

### Data

In [284]:
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')

Saved: english-german.pkl


In [286]:
# 221533 texts
clean_pairs.shape

(221533, 3)

### Take sample of total

In [288]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [289]:
# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')
 
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


### Tokenize

In [296]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [297]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [298]:
from keras.preprocessing.text import Tokenizer

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)

In [299]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 2241
English Max Length: 5


In [300]:
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

German Vocabulary Size: 3572
German Max Length: 9


In [178]:
#eng_tokenizer.index_word
#eng_tokenizer.word_counts

### Embedding

In [303]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [304]:
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [305]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [306]:
# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_output(trainY, ger_vocab_size)
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_output(testY, ger_vocab_size)

In [307]:
# 9000 x 9 
trainX[0]

array([12, 56,  0,  0,  0], dtype=int32)

In [192]:
trainX.shape

(9000, 5)

In [191]:
trainY.shape

(9000, 9, 3572)

### Model

In [309]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint

In [310]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [311]:
# define model
model = define_model(eng_vocab_size, ger_vocab_size, eng_length, ger_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
print(model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 5, 256)            573696    
_________________________________________________________________
lstm_12 (LSTM)               (None, 256)               525312    
_________________________________________________________________
repeat_vector_6 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 9, 256)            525312    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 9, 3572)           918004    
Total params: 2,542,324
Trainable params: 2,542,324
Non-trainable params: 0
_________________________________________________________________
None


In [312]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 2.35695, saving model to model.h5
141/141 - 20s - loss: 3.2473 - val_loss: 2.3570
Epoch 2/30

Epoch 00002: val_loss improved from 2.35695 to 2.19320, saving model to model.h5
141/141 - 16s - loss: 2.2330 - val_loss: 2.1932
Epoch 3/30

Epoch 00003: val_loss improved from 2.19320 to 2.14883, saving model to model.h5
141/141 - 15s - loss: 2.1249 - val_loss: 2.1488
Epoch 4/30

Epoch 00004: val_loss improved from 2.14883 to 2.08886, saving model to model.h5
141/141 - 16s - loss: 2.0514 - val_loss: 2.0889
Epoch 5/30

Epoch 00005: val_loss improved from 2.08886 to 1.99659, saving model to model.h5
141/141 - 16s - loss: 1.9549 - val_loss: 1.9966
Epoch 6/30

Epoch 00006: val_loss improved from 1.99659 to 1.93944, saving model to model.h5
141/141 - 16s - loss: 1.8569 - val_loss: 1.9394
Epoch 7/30

Epoch 00007: val_loss improved from 1.93944 to 1.87789, saving model to model.h5
141/141 - 16s - loss: 1.7782 - val_loss: 1.8779
Epoch 8/30

Epoch

<tensorflow.python.keras.callbacks.History at 0x7f26c5106be0>

In [313]:
testX[2]

array([ 2, 63, 14,  0,  0], dtype=int32)

### Testing the model

In [319]:
from keras.models import load_model
from numpy import array
from numpy import argmax

In [320]:
model = load_model('model.h5')

In [321]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [475]:
testing = raw_dataset[10000:10100,0]
testing_encoded = encode_sequences(eng_tokenizer, eng_length, testing)

In [525]:
# testing

In [476]:
testing_encoded.shape

(100, 5)

In [477]:
preds = []
for i in range(testing_encoded.shape[0]):
    res = model.predict(testing_encoded[i].reshape((1, testing_encoded[i].shape[0])), verbose=0)[0]
    integers = [argmax(vector) for vector in res]
    target = list()
    for j in integers:
        word = word_for_id(j, ger_tokenizer)
        if word is None:
            break
        target.append(word)
    translated = ' '.join(target)
    preds.append([testing[i],translated])

In [489]:
mine = []
for i in range(len(preds)):
    translator = Translator()
    result = translator.translate(preds[i][1], src='de', dest='en')
    mine.append(result.text)
    #print('actual:', preds[i][0])
    #print('mine:', result.text, '\n')

### testing singular 

In [531]:
train[30]

array(['theyre quiet', 'sie sind still',
       'ccby france attribution tatoebaorg ck zaghawa'], dtype='<U527')

In [532]:
to_test = ["theyre not quiet"]
temp = encode_sequences(eng_tokenizer, eng_length, to_test)
temp[0]

array([ 76,  41, 144,   0,   0], dtype=int32)

In [533]:
res = model.predict(temp[0].reshape((1, temp[0].shape[0])), verbose=0)[0]

In [534]:
integers = [argmax(vector) for vector in res]
integers

[4, 19, 8, 167, 0, 0, 0, 0, 0]

In [535]:
target = list()
for i in integers:
    word = word_for_id(i, ger_tokenizer)
    if word is None:
        break
    target.append(word)
translated = ' '.join(target)
print(translated)

sie sind nicht warten


### using Google

In [536]:
translator = Translator()
result = translator.translate(translated, src='de', dest='en')
print(to_test[0])
print(result.text)

theyre not quiet
they are not waiting


# BLEU

In [402]:
from nltk.translate.bleu_score import corpus_bleu

In [483]:
preds[:5]

[['tom was gentle', 'tom ist klug'],
 ['tom was greedy', 'tom war durst'],
 ['tom was grumpy', 'tom ist klug'],
 ['tom was guilty', 'tom war gelangweilt'],
 ['tom was heroic', 'tom war wein']]

In [506]:
hypothesis = []
reference = []
for i in range(len(preds)):
    hyp_split = preds[i][0].split(" ") 
    ref_split = mine[i].split(" ")
    hypothesis.append(hyp_split)
    reference.append(ref_split)
    #print(hyp_split, ref_split)

In [508]:
import nltk
hyp = hypothesis
ref = reference

In [537]:
num=10
for i in range(num):
    print(ref[i],hyp[i])

['tom', 'is', 'smart'] ['tom', 'was', 'gentle']
['tom', 'was', 'thirsty'] ['tom', 'was', 'greedy']
['tom', 'is', 'smart'] ['tom', 'was', 'grumpy']
['tom', 'was', 'bored'] ['tom', 'was', 'guilty']
['tom', 'was', 'wine'] ['tom', 'was', 'heroic']
['tom', 'was', 'blind'] ['tom', 'was', 'honest']
['tom', 'was', 'scary'] ['tom', 'was', 'humble']
['tom', 'was', 'thirsty'] ['tom', 'was', 'hungry']
['tom', 'was', 'angry'] ['tom', 'was', 'insane']
['tom', 'was', 'naked'] ['tom', 'was', 'inside']


In [517]:
nltk.translate.bleu_score.corpus_bleu(ref[num], hyp[num])

1.646211035903463e-231