## Language Translator

In [1]:
import nltk
import pandas as pd
import numpy as np
from unicodedata import normalize
from pickle import dump
from pickle import load
from numpy import array
from numpy.random import rand
from numpy.random import shuffle
import string
import re

import pickle
from collections import Counter
from nltk.corpus import comtrans

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.translate import Alignment, AlignedSent
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

From `nltk` we can download corpora of sentences that have been translated between different languages. The example below uses the **English–French** pairs, but feel free to try other language combinations as well.

In [2]:
# One-time setup: uncomment and run once to download the `comtrans` corpus used below.
#nltk.download('comtrans')

In [3]:
# `comtrans` is already imported in the first cell; this repeated import is harmless.
from nltk.corpus import comtrans
# Print the first aligned sentence from the English-French file as a quick check that the corpus loads.
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [4]:
# Number of aligned sentences in the English-French file
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [5]:
# Walk the aligned corpus a single time and collect both sides together.
# The original built each list with its own call to comtrans.aligned_sents(),
# i.e. two full passes over the corpus; one pass also makes it explicit that
# english[i] and french[i] come from the same AlignedSent.
# NOTE(review): the original cell carried only the bare comment "100002" --
# presumably a raw line count of the corpus file; confirm or drop.
english, french = [], []
for sent in comtrans.aligned_sents('alignment-en-fr.txt'):
    english.append(sent.words)  # source-side (English) tokens
    french.append(sent.mots)    # target-side (French) tokens

In [6]:
# Join each token list back into one space-separated sentence and keep the
# two languages side by side as [english, french] pairs.
data = [
    [" ".join(eng_tokens), " ".join(fre_tokens)]
    for eng_tokens, fre_tokens in zip(english, french)
]

In [7]:
# Inspect the first [english, french] sentence pair
data[0]

['Resumption of the session', 'Reprise de la session']

# data prep

In [8]:
def load_doc(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [9]:
def clean_pairs(lines):
    """Normalise every sentence in a collection of sentence pairs.

    Each sentence is reduced to ASCII (accents are stripped through NFD
    decomposition), split on whitespace, lower-cased, and stripped of
    punctuation and non-printable characters. Tokens that are not purely
    alphabetic afterwards (tokens containing digits, or tokens left empty)
    are dropped, and the survivors are re-joined with single spaces.

    Args:
        lines: Iterable of pairs, each pair being an iterable of strings.

    Returns:
        A NumPy array built from the list of cleaned pairs.
    """
    # Matches any character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(text):
        # Decompose accented characters, then discard everything non-ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # isalpha() is False for tokens with digits and for empty tokens
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(text) for text in pair] for pair in lines])

In [10]:
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: Object to serialise (here, the array of cleaned pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # Open through a context manager so the file is flushed and closed
    # before the function returns; the bare open() left that to the GC.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [11]:
# Normalise every [english, french] pair with clean_pairs (ASCII, lowercase, alphabetic tokens only)
clean_data = clean_pairs(data)
# Persist the cleaned array as a pickle so the later cells can reload it
save_clean_data(clean_data, 'english-french.pkl')

Saved: english-french.pkl


# Take a subsample

In [12]:
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle, which the bare open() did not.
    with open(filename, 'rb') as handle:
        return load(handle)

# NOTE: this redefines the save_clean_data from the earlier cell; this
# version does not print a "Saved: ..." confirmation.
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename`.

    Args:
        sentences: Object to serialise.
        filename: Destination path; an existing file is overwritten.
    """
    # Context manager guarantees the data is flushed and the handle closed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)

In [13]:
# Reload the cleaned sentence pairs that were pickled to 'english-french.pkl' above
raw_dataset = load_clean_sentences('english-french.pkl')

In [14]:
 # reduce dataset size
n_sentences = 5000
n_train = 4000  # rows kept for training; the remaining rows form the test split
# Copy the slice: basic slicing returns a view, so shuffling it in place
# would also reorder the first rows of raw_dataset.
dataset = raw_dataset[:n_sentences, :].copy()
# Seed NumPy's global RNG so the shuffle -- and therefore the train/test
# split written to disk -- is the same on every run.
np.random.seed(42)
# random shuffle (rows only, so each English/French pair stays together)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

In [17]:
# Spot-check one cleaned pair from the shuffled subsample
dataset[20]

array(['that is very much the right approach',
       'cette approche est tout a fait appropriee'], dtype='<U274')

# Tokenize

In [18]:
# Reload the three pickles written by the subsampling cell:
# the full subsample plus its train and test portions
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')

In [19]:
def create_tokenizer(lines):
    """Return a Keras `Tokenizer` fitted on the given texts.

    Args:
        lines: Iterable of sentences used to build the vocabulary.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest whitespace-separated token count among `lines`.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        The token count of the longest sentence, or 0 when `lines` is empty
        (a bare max() would raise ValueError on an empty iterable).
    """
    return max((len(line.split()) for line in lines), default=0)

In [20]:
# English
# Fitted on column 0 of `dataset` (train and test rows together), so both splits share one vocabulary
eng_tokenizer = create_tokenizer(dataset[:,0])
# +1 leaves room for index 0, which encode_sequences uses as the padding value
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English sentence in tokens; used as the padded input length
eng_length = max_length(dataset[:,0])

In [21]:
# French
# Fitted on column 1 of `dataset` (train and test rows together), so both splits share one vocabulary
fre_tokenizer = create_tokenizer(dataset[:,1])
# +1 leaves room for index 0, which encode_sequences uses as the padding value
fre_vocab_size = len(fre_tokenizer.word_index) + 1
# Longest French sentence in tokens; used as the padded output length
fre_length = max_length(dataset[:,1])

In [22]:
# English vocabulary size, then maximum sentence length, one per line
print(eng_vocab_size, eng_length, sep='\n')

6701
39


In [23]:
# French vocabulary size, then maximum sentence length, one per line
print(fre_vocab_size, fre_length, sep='\n')

8381
39


# Encoding

In [24]:
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-length rows of integer word ids.

    Args:
        tokenizer: Fitted tokenizer used to map words to integer ids.
        length: Number of columns every row is padded to.
        lines: Iterable of sentences to encode.

    Returns:
        The result of `pad_sequences`: the id sequences padded at the end
        with zeros up to `length`.
    """
    word_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(word_ids, maxlen=length, padding='post')

In [25]:
def encode_output(sequences, vocab_size):
    """One-hot encode every integer id in a 2-D array of padded sequences.

    Args:
        sequences: 2-D array of integer ids (one row per sentence).
        vocab_size: Number of classes, i.e. the length of each one-hot vector.

    Returns:
        Array of shape (sequences.shape[0], sequences.shape[1], vocab_size).
        Note that its size grows with rows x timesteps x vocab_size.
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [26]:
# Training split: padded English ids as input, one-hot French ids as target
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(
    encode_sequences(fre_tokenizer, fre_length, train[:, 1]),
    fre_vocab_size,
)
# Held-out split, encoded the same way and used as validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(
    encode_sequences(fre_tokenizer, fre_length, test[:, 1]),
    fre_vocab_size,
)

# Model

In [29]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint

In [30]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence translation network.

    Args:
        src_vocab: Size of the source vocabulary (Embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax width).
        src_timesteps: Length of the padded source sequences.
        tar_timesteps: Number of output timesteps to generate.
        n_units: Embedding width and LSTM state size.

    Returns:
        A `Sequential` model; the caller is responsible for compiling it.
    """
    # Encoder: embed the source ids and squeeze the sentence into one vector
    encoder = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    # Decoder: repeat that vector once per target step and predict a word at each
    decoder = [
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in encoder + decoder:
        model.add(layer)
    return model

In [31]:
# define model (256 = embedding width and LSTM state size)
model = define_model(eng_vocab_size, fre_vocab_size, eng_length, fre_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 39, 256)           1715456   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 39, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 39, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 39, 8381)          2153917   
Total params: 4,919,997
Trainable params: 4,919,997
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
filename = 'eng_to_fre_model.h5'
# Save the model only on epochs where the validation loss improves
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
# The test split doubles as validation data for the checkpoint criterion
model.fit(
    trainX,
    trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2,
)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 4.05077, saving model to eng_to_fre_model.h5
63/63 - 139s - loss: 4.9291 - val_loss: 4.0508
Epoch 2/30

Epoch 00002: val_loss improved from 4.05077 to 3.98098, saving model to eng_to_fre_model.h5
63/63 - 131s - loss: 3.9440 - val_loss: 3.9810
Epoch 3/30

Epoch 00003: val_loss improved from 3.98098 to 3.87314, saving model to eng_to_fre_model.h5
63/63 - 135s - loss: 3.8446 - val_loss: 3.8731
Epoch 4/30

Epoch 00004: val_loss improved from 3.87314 to 3.73898, saving model to eng_to_fre_model.h5
63/63 - 131s - loss: 3.7003 - val_loss: 3.7390
Epoch 5/30

Epoch 00005: val_loss improved from 3.73898 to 3.64298, saving model to eng_to_fre_model.h5
63/63 - 131s - loss: 3.5847 - val_loss: 3.6430
Epoch 6/30

Epoch 00006: val_loss improved from 3.64298 to 3.62498, saving model to eng_to_fre_model.h5
63/63 - 131s - loss: 3.5111 - val_loss: 3.6250
Epoch 7/30

Epoch 00007: val_loss improved from 3.62498 to 3.61490, saving model to eng_to_fre_mod

<tensorflow.python.keras.callbacks.History at 0x7efe4c1e31c0>

# Testing model

In [None]:
# Imported here because the notebook's only other `load_model` import sits
# in a later cell; on a fresh top-to-bottom run this cell would otherwise
# fail with NameError.
from keras.models import load_model

# Reload the checkpoint file that ModelCheckpoint wrote during training
model = load_model('eng_to_fre_model.h5')

In [33]:
def word_for_id(integer, tokenizer):
    """Reverse-look-up the word that a tokenizer mapped to `integer`.

    Args:
        integer: Word id to look up.
        tokenizer: Object exposing a `word_index` mapping of word -> id.

    Returns:
        The first word whose id equals `integer`, or None when no word has
        that id.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [34]:
from keras.models import load_model
from numpy import array
from numpy import argmax

In [288]:
# NOTE(review): dead, commented-out cell. It repeats the tokenizer/encoding setup
# already done in earlier cells and still uses German (`ger_*`) names -- candidate for deletion.
# # prepare english tokenizer
# eng_tokenizer = create_tokenizer(dataset[:, 0])
# eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_length = max_length(dataset[:, 0])
# # prepare german tokenizer
# ger_tokenizer = create_tokenizer(dataset[:, 1])
# ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_length = max_length(dataset[:, 1])
# # prepare data
# trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

'er kann nicht fahren'

In [35]:
# Pick a training pair to use as a translation example
train[1]

array(['it refers to the special commission working group dealing with questions of equality',
       'elle concerne le groupe special de la commission qui s occupe des questions d egalite'],
      dtype='<U274')

In [45]:
# English side of train[1], wrapped in a list because encode_sequences expects a collection of sentences
to_test = ["it refers to the special commission working group dealing with questions of equality"]
# Encode with the same tokenizer and padded length used for the training inputs
temp = encode_sequences(eng_tokenizer, eng_length, to_test)
# Show the single encoded row (word ids followed by zero padding)
temp[0]

array([  14, 1667,    2,    1,  453,   22,  397,  150,  981,   23,  290,
          3, 1070,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [46]:
# Feed the first encoded sentence as a batch of one and keep that sentence's
# per-timestep probability vectors.
res = model.predict(temp[:1], verbose=0)[0]

In [47]:
# Most probable word id at every output timestep
integers = list(map(argmax, res))

In [48]:
# Map predicted ids back to French words, stopping at the first id that has
# no word (word_for_id returns None for it).
target = []
for token_id in integers:
    predicted_word = word_for_id(token_id, fre_tokenizer)
    if predicted_word is not None:
        target.append(predicted_word)
        continue
    break
print(' '.join(target))

je le la de de de de de de de de de de
