## Language Translator

In [1]:
import nltk

`nltk` lets us download corpora of sentences aligned between different languages. The example below uses **English and French**, but feel free to try other language combinations as well.

In [3]:
# fetch the 'comtrans' corpus package into the local nltk_data directory
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     /Users/jurajkapasny/nltk_data...


True

In [8]:
from nltk.corpus import comtrans
# show the first aligned sentence pair from the English-French alignment file
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [9]:
# number of aligned sentence pairs in the English-French alignment file
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

## Beginning of code

In [1]:
# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete file content as a str.
    """
    # the context manager closes the handle even if read() raises;
    # the previous version never closed the file at all
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [2]:
# split a loaded document into sentences
def to_pairs(doc):
    """Split a loaded document into rows of tab-separated fields.

    Leading/trailing whitespace of the whole document is stripped first,
    then every newline-separated row is split on tab characters.

    Returns:
        A list with one list of fields per row.
    """
    return [row.split('\t') for row in doc.strip().split('\n')]

- Remove all non-printable characters
- Remove all punctuation characters
- Normalize all Unicode characters to ASCII
- Normalize the case to lowercase
- Remove any remaining tokens that are not alphabetic

In [3]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as a numpy array.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and finally only
    purely alphabetic tokens are kept and re-joined with single spaces.

    Args:
        lines: iterable of pairs (iterables of strings).

    Returns:
        numpy array built from the list of cleaned pairs.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _scrub(text):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower()
            token = token.translate(strip_punct)
            token = non_printable.sub('', token)
            # discard empty tokens and tokens containing digits or other non-letters
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_scrub(text) for text in pair] for pair in lines])

In [4]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# save a list of clean sentences to file
def save_clean_data(sentences,filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: object to serialise (the notebook passes numpy arrays).
        filename: destination path; an existing file is overwritten.
    """
    # close (and thereby flush) the file deterministically instead of
    # leaving an anonymous handle for the garbage collector
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved %s' % filename)

filename = '../fra-eng/fra.txt'
doc = load_doc(filename)
# split into english-french pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so that the clean_pairs()
# function is not overwritten (re-running this cell used to fail because
# the name clean_pairs had been rebound to the resulting array)
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, './english_french.pkl')
# spot_check: print at most the first 50 pairs (fewer if the corpus is smaller)
for i in range(min(50, len(cleaned_pairs))):
    print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved ./english_french.pkl
[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]
[stop] => [ca suffit]
[stop] => [stop]
[stop] => [arretetoi]
[wait] => [attends]
[wait] => [attendez]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[hello] => [bonjour]
[hello] => [salut]
[i see] => [je comprends]
[i try] => [jessaye]
[i won] => [jai gagne]
[i won] => [je lai emporte]
[i won] => [jai gagne]
[oh no] => [oh non]
[attack] => [attaque]
[attack] => [attaquez]
[cheers] => [sante]
[cheers] => [a votre sante]
[cheers] => [merci]
[cheers] => [tchintchin]
[get up] => [levetoi]
[go now] => [va maintenant]
[go now] => [allezy maintenant]
[go now] => [vasy maintenant]
[got it] => [jai pige]
[got it] => [compris]
[got it] => [pige]
[got it] => [compris]
[got it] => [tas capte]
[hop in] => [monte]
[hop in] => [montez]
[hug me] => [serremoi dans tes bras]
[hug me] =>

In [6]:
from pickle import load
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files that are trusted (e.g. written by this notebook).
    """
    # context manager guarantees the handle is closed after loading
    with open(filename, 'rb') as in_file:
        return load(in_file)

# load the cleaned sentence pairs pickled by the earlier cleaning step
raw_dataset = load_clean_sentences('./english_french.pkl')

# reduce dataset size: keep only the first n_sentences rows
n_sentences = 10**4
# NOTE(review): if raw_dataset is a numpy array this slice is presumably a view,
# so the in-place shuffle below would also reorder those rows of raw_dataset — confirm
dataset = raw_dataset[:n_sentences,:]
# random shuffle (in place; the return value is not used)
# NOTE(review): no random seed is set in the visible cells, so the resulting
# train/test split is likely different on every run
shuffle(dataset)
# split into train/test: first 90% of the rows for training, the rest for testing
train_len = int(0.9*len(dataset))
train, test = dataset[:train_len], dataset[train_len:]
# save the full subset and both splits as separate pickle files
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

Saved english-french-both.pkl
Saved english-french-train.pkl
Saved english-french-test.pkl


## Train Neural Translation Model

In [10]:
from numpy import array
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
print('All packages loaded')

All packages loaded


In [11]:
# load clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files that are trusted (e.g. written by this notebook).
    """
    # context manager guarantees the handle is closed after loading
    with open(filename,'rb') as in_file:
        return load(in_file)

# load the combined subset plus its train and test splits, in that order
dataset, train, test = [
    load_clean_sentences(pickle_path)
    for pickle_path in (
        'english-french-both.pkl',
        'english-french-train.pkl',
        'english-french-test.pkl',
    )
]

In [12]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a Keras Tokenizer and fit it on the given sentences.

    Args:
        lines: the texts passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest number of whitespace-separated tokens in any line."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [14]:
# prepare english tokenizer
# English sentences are column 0 of the dataset
eng_tokenizer = create_tokenizer(dataset[:,0])
# one more than the number of known words; presumably index 0 is left free
# for the zero padding added in encode_sequences — confirm
eng_vocab_size = len(eng_tokenizer.word_index)+1
# longest English sentence, counted in whitespace-separated tokens
eng_length = max_length(dataset[:,0])
print('English Vocabulary size: %d' % (eng_vocab_size))
print('English Max length: %d' % (eng_length))
# prepare french tokenizer (French sentences are column 1 of the dataset)
fr_tokenizer = create_tokenizer(dataset[:,1])
# same +1 convention as for the English vocabulary size
fr_vocab_size = len(fr_tokenizer.word_index)+1
# longest French sentence, counted in whitespace-separated tokens
fr_length = max_length(dataset[:,1])
print('French Vocabulary size: %d' % (fr_vocab_size))
print('French Max length: %d' % (fr_length))

English Vocabulary size: 2158
English Max length: 5
French Vocabulary size: 4388
French Max length: 10


In [15]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-length integer sequences.

    Args:
        tokenizer: object providing texts_to_sequences().
        length: target sequence length handed to pad_sequences as maxlen.
        lines: the sentences to encode.

    Returns:
        The result of pad_sequences, padded at the end ('post') of each sequence.
    """
    as_integers = tokenizer.texts_to_sequences(lines)
    return pad_sequences(as_integers, maxlen=length, padding='post')

In [16]:
# one hot encode target sequence
def encode_ouput(sequences, vocab_size):
    """One-hot encode every integer sequence of the target data.

    Args:
        sequences: 2-D array-like exposing .shape; iterated row by row.
        vocab_size: number of classes passed to to_categorical.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([
        to_categorical(row, num_classes = vocab_size) for row in sequences
    ])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [17]:
# prepare training data
# model input: French sentences (column 1) as padded integer sequences
trainX = encode_sequences(fr_tokenizer, fr_length, train[:,1])
# model target: English sentences (column 0), integer encoded and then one-hot encoded
trainY = encode_sequences(eng_tokenizer, eng_length, train[:,0])
trainY = encode_ouput(trainY, eng_vocab_size)
# prepare validation data the same way from the test split
testX = encode_sequences(fr_tokenizer,fr_length,test[:,1])
testY = encode_sequences(eng_tokenizer,eng_length,test[:,0])
testY = encode_ouput(testY, eng_vocab_size)

In [24]:
#!pip install pydot
!pip install graphviz

Collecting graphviz
  Downloading graphviz-0.14.1-py2.py3-none-any.whl (18 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.14.1


In [25]:
import pydot

In [26]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence translation model.

    Args:
        src_vocab: input dimension of the Embedding layer.
        tar_vocab: number of units of the softmax output layer.
        src_timesteps: input_length of the Embedding layer.
        tar_timesteps: how many times the first LSTM's output is repeated.
        n_units: embedding size and number of units of both LSTM layers.

    Returns:
        A Sequential model; compiling is left to the caller.
    """
    layer_stack = [
        # zeros in the input are treated as padding and masked
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero = True),
        LSTM(n_units),
        # feed the single LSTM output vector to every target timestep
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        # one softmax over tar_vocab units per timestep
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layer_stack)

# define model: French is the source language, English the target, 256 units
model = define_model(fr_vocab_size, eng_vocab_size, fr_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output
model.summary()
# the plot_model import in the imports cell is commented out, so import it here;
# otherwise this cell raises NameError on a fresh kernel
from tensorflow.keras.utils import plot_model
# writes the architecture diagram to model.png (requires pydot and graphviz)
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 256)           1123328   
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2158)           554606    
Total params: 2,728,558
Trainable params: 2,728,558
Non-trainable params: 0
_________________________________________________________________
None
Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [None]:
# fit model
# path the checkpoint callback writes the model to
filename = 'model.h5'
# save only when the monitored validation loss reaches a new minimum
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# train for 30 epochs in batches of 64; the test split doubles as validation data
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX,testY), callbacks=[checkpoint], verbose=2)