## Language Translator

In [None]:
import nltk
import pandas as pd
import numpy as np
from unicodedata import normalize
from pickle import dump
from pickle import load
from numpy import array
from numpy.random import rand
from numpy.random import shuffle
import string
import re

import pickle
from collections import Counter
from nltk.corpus import comtrans

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.translate import Alignment, AlignedSent
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

The `nltk` `comtrans` corpus provides aligned, translated sentences for several language pairs. The example below uses **English and French**, but feel free to try other combinations as well.

In [2]:
#nltk.download('comtrans')

In [320]:
from nltk.corpus import comtrans
# Show the first aligned sentence pair from the English-French alignment file
# as a quick check that the corpus is installed and readable.
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [321]:
# Number of aligned sentence pairs available in the English-French file.
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [348]:
# 100002
# Read the alignment file once and reuse the result; calling
# comtrans.aligned_sents() separately for each language loaded the corpus twice.
aligned_sents = comtrans.aligned_sents('alignment-en-fr.txt')
# `words` holds the source-side tokens, `mots` the target-side tokens
# of each aligned sentence.
english = [sent.words for sent in aligned_sents]
french = [sent.mots for sent in aligned_sents]

In [351]:
# Re-join the token lists into plain sentences and pair them up as
# [english_sentence, french_sentence], one entry per aligned sentence.
data = [
    [" ".join(eng_tokens), " ".join(french[idx])]
    for idx, eng_tokens in enumerate(english)
]

In [352]:
# Peek at the first [english, french] pair to sanity-check the join above.
data[0]

['Resumption of the session', 'Reprise de la session']

# data prep

In [353]:
def load_doc(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error); the manual open()/close() pair leaked it.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [354]:
def clean_pairs(lines):
    """Normalise every sentence in a collection of sentence pairs.

    Each sentence is ASCII-folded (accents dropped), split on whitespace,
    lower-cased, stripped of punctuation and non-printable characters, and
    finally reduced to its purely alphabetic tokens.

    Args:
        lines: Iterable of pairs (or longer groups) of raw sentence strings.

    Returns:
        A numpy array with one row per input pair holding the cleaned
        sentences, tokens re-joined by single spaces.
    """
    # Matches any character outside string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(text):
        # Decompose accented characters, then drop the non-ASCII remainder.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # Discards tokens containing digits as well as tokens left empty.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(text) for text in pair] for pair in lines])

In [355]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here, the array of cleaned pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # Use a context manager so the handle is flushed and closed right away;
    # passing a bare open() to dump() left it open until garbage collection.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [356]:
# Normalise every [english, french] pair with clean_pairs (ASCII-fold,
# lowercase, strip punctuation, keep alphabetic tokens only).
clean_data = clean_pairs(data)
# Persist the cleaned pairs so later cells can reload them from
# 'english-french.pkl' instead of re-cleaning the corpus.
save_clean_data(clean_data, 'english-french.pkl')

Saved: english-french.pkl


# Take a subsample

In [357]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file, e.g. one written by save_clean_data.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files this notebook produced itself.
    # The context manager closes the handle promptly; load(open(...)) left it
    # open until garbage collection.
    with open(filename, 'rb') as handle:
        return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` without printing anything.

    This redefinition replaces the earlier ``save_clean_data`` in the
    notebook, so cells that run after it no longer print the
    'Saved: ...' confirmation.

    Args:
        sentences: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    # Close the handle deterministically instead of relying on garbage
    # collection to flush and release it.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)

In [358]:
# Reload the cleaned sentence pairs from 'english-french.pkl'.
raw_dataset = load_clean_sentences('english-french.pkl')

In [359]:
 # reduce dataset size
n_sentences = 5000
# Rows [0, n_train) become the training split, the rest the test split.
n_train = 4000
# Copy the slice: a basic slice is a view, so shuffling it in place would also
# reorder the first n_sentences rows of raw_dataset and make re-running this
# cell produce a shuffle of an already-shuffled array.
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle (rows only; each [english, french] pair stays together)
# NOTE(review): no random seed is set, so the split differs on every run.
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

In [387]:
# Inspect one row of the shuffled sub-sample.
# NOTE(review): the saved output below shows an English/German row with a third
# attribution column, which does not match the two-column English-French pairs
# built above — it appears to be stale; re-run the notebook to refresh it.
dataset[10]

array(['i quite agree', 'ich stimme ganz zu',
       'ccby france attribution tatoebaorg ck pfirsichbaeumchen'],
      dtype='<U527')

# Tokenize

In [388]:
# Reload the three pickles written by the sub-sampling cell: the full
# sub-sample plus its train and test splits.
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')

In [389]:
def create_tokenizer(lines):
    """Build a ``Tokenizer`` and fit it on the given sentences.

    Args:
        lines: Iterable of sentence strings to fit the tokenizer on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        The token count of the longest sentence, or 0 when ``lines`` is
        empty (previously an empty input raised ``ValueError`` from ``max``).
    """
    return max((len(line.split()) for line in lines), default=0)

In [390]:
# English (column 0 of the dataset)
eng_tokenizer = create_tokenizer(dataset[:,0])
# +1 on top of the number of known words leaves room for index 0, the value
# the padding step fills sequences with.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English sentence, in tokens; used as the padded input length.
eng_length = max_length(dataset[:,0])

In [391]:
# French (column 1 of the dataset)
fre_tokenizer = create_tokenizer(dataset[:,1])
# +1 on top of the number of known words leaves room for index 0, the value
# the padding step fills sequences with.
fre_vocab_size = len(fre_tokenizer.word_index) + 1
# Longest French sentence, in tokens; used as the padded output length.
fre_length = max_length(dataset[:,1])

In [392]:
# English vocabulary size and maximum sentence length (in tokens).
print(eng_vocab_size)
print(eng_length)

9173
39


In [393]:
# French vocabulary size and maximum sentence length (in tokens).
print(fre_vocab_size)
print(fre_length)

11892
39


# Embedding

In [394]:
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-length integer sequences.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        length: Target length of every output sequence.
        lines: Iterable of sentence strings to encode.

    Returns:
        The result of ``pad_sequences``: the integer-encoded sentences,
        padded at the end ('post') to ``length``.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

In [395]:
def encode_output(sequences, vocab_size):
    """One-hot encode integer target sequences.

    Args:
        sequences: 2-D array of integer token ids (its first two ``shape``
            entries are used for the final reshape).
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``
        holding one one-hot vector per token position.
    """
    one_hot = array([
        to_categorical(row, num_classes=vocab_size) for row in sequences
    ])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [None]:
# prepare training data
# Inputs: padded integer sequences for the English column.
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# Targets: padded integer sequences for the French column, then one-hot
# encoded to shape (n_samples, fre_length, fre_vocab_size).
trainY = encode_sequences(fre_tokenizer, fre_length, train[:, 1])
trainY = encode_output(trainY, fre_vocab_size)
# prepare validation data (same encoding, applied to the test split)
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(fre_tokenizer, fre_length, test[:, 1])
testY = encode_output(testY, fre_vocab_size)

# Model

In [370]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint

In [371]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder translation network.

    Layer stack, in order: a zero-masking Embedding over the source
    vocabulary, an encoder LSTM, a RepeatVector that repeats the encoder
    output ``tar_timesteps`` times, a sequence-returning decoder LSTM, and a
    time-distributed softmax Dense layer over the target vocabulary.

    Args:
        src_vocab: Source vocabulary size (Embedding input dimension).
        tar_vocab: Target vocabulary size (softmax width).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Length of the padded target sequences.
        n_units: Embedding size and number of units in both LSTMs.

    Returns:
        The ``Sequential`` model; compiling is left to the caller.
    """
    model = Sequential()
    stack = (
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    for layer in stack:
        model.add(layer)
    return model

In [372]:
# define model: 256 is n_units, used for the embedding size and both LSTMs
model = define_model(eng_vocab_size, fre_vocab_size, eng_length, fre_length, 256)
# categorical_crossentropy matches the one-hot targets built by encode_output
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 256)            573696    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 9, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9, 3572)           918004    
Total params: 2,542,324
Trainable params: 2,542,324
Non-trainable params: 0
_________________________________________________________________
None


In [373]:
# Checkpoint file: with save_best_only=True the model is written only when
# val_loss improves on the best value seen so far.
filename = 'eng_to_fre_model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): validation_data is the test split, so the "best" checkpoint is
# selected on the same data later used to judge the model — consider a separate
# validation split.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 2.38867, saving model to fr_model.h5
141/141 - 16s - loss: 3.2527 - val_loss: 2.3887
Epoch 2/30

Epoch 00002: val_loss improved from 2.38867 to 2.22132, saving model to fr_model.h5
141/141 - 14s - loss: 2.2289 - val_loss: 2.2213
Epoch 3/30

Epoch 00003: val_loss improved from 2.22132 to 2.16376, saving model to fr_model.h5
141/141 - 13s - loss: 2.1111 - val_loss: 2.1638
Epoch 4/30

Epoch 00004: val_loss improved from 2.16376 to 2.06626, saving model to fr_model.h5
141/141 - 14s - loss: 2.0092 - val_loss: 2.0663
Epoch 5/30

Epoch 00005: val_loss improved from 2.06626 to 1.98578, saving model to fr_model.h5
141/141 - 14s - loss: 1.9004 - val_loss: 1.9858
Epoch 6/30

Epoch 00006: val_loss improved from 1.98578 to 1.92941, saving model to fr_model.h5
141/141 - 13s - loss: 1.8179 - val_loss: 1.9294
Epoch 7/30

Epoch 00007: val_loss improved from 1.92941 to 1.87072, saving model to fr_model.h5
141/141 - 13s - loss: 1.7331 - val_loss: 1.8

<tensorflow.python.keras.callbacks.History at 0x7f226eb69040>

# Testing model

In [374]:
def word_for_id(integer, tokenizer):
    """Reverse-lookup the word that a tokenizer maps to ``integer``.

    Args:
        integer: Token id to look up.
        tokenizer: Object whose ``word_index`` maps words to integer ids.

    Returns:
        The first word in ``word_index`` whose id equals ``integer``, or
        ``None`` when no word has that id.
    """
    matches = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matches, None)

In [375]:
from keras.models import load_model
from numpy import array
from numpy import argmax

In [288]:
# # prepare english tokenizer
# eng_tokenizer = create_tokenizer(dataset[:, 0])
# eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_length = max_length(dataset[:, 0])
# # prepare german tokenizer
# ger_tokenizer = create_tokenizer(dataset[:, 1])
# ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_length = max_length(dataset[:, 1])
# # prepare data
# trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

'er kann nicht fahren'

In [376]:
# Inspect one training row to pick a sentence for the manual test below.
train[1]

array(['be a man', 'sei ein mann',
       'ccby france attribution tatoebaorg hybrid pfirsichbaeumchen'],
      dtype='<U527')

In [377]:
# Hand-written English sentence to translate.
to_test = ["tom be a man"]
# Encode it exactly like the training inputs: integer ids padded to eng_length.
temp = encode_sequences(eng_tokenizer, eng_length, to_test)
# Show the encoded ids of the single sentence.
temp[0]

array([  1,  30,   6, 272,   0], dtype=int32)

In [378]:
# Reshape the one encoded sentence into a batch of size 1, predict, and keep
# the prediction for that single batch entry.
res = model.predict(temp[0].reshape((1, temp[0].shape[0])), verbose=0)[0]

In [379]:
# Greedy decoding: take the index of the largest value in each output vector.
integers = [argmax(vector) for vector in res]
integers

[2, 3, 16, 1091, 0, 0, 0, 0, 0]

In [380]:
# Map the predicted ids back to French words and join them into a sentence.
target = list()
for i in integers:
    word = word_for_id(i, fre_tokenizer)
    # Stop at the first id that has no word — presumably the padding id 0,
    # which marks the end of the predicted sentence.
    if word is None:
        break
    target.append(word)
print(' '.join(target))

tom ist ein boot
