## Language Translator

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk

From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [None]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     /Users/jurajkapasny/nltk_data...


True

In [None]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [None]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

## Beginning of code

In [1]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename,mode='rt',encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    return text

In [2]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

- Return all non-printable characters
- Remove all punctuation characters
- Normalize all Unicode characters to ASCII
- Normalize the case to lowercase
- Remove any reminaing tokens that are not alphabetic

In [3]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii','ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars from each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [4]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# save a list of clean sentences to file
def save_clean_data(sentences,filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved %s' % filename)

filename = './drive/My Drive/fra.txt'
doc = load_doc(filename)
# split into english-french pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, './english_french.pkl')
# spot_check
for i in range(50):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved ./english_french.pkl
[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]
[stop] => [ca suffit]
[stop] => [stop]
[stop] => [arretetoi]
[wait] => [attends]
[wait] => [attendez]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[hello] => [bonjour]
[hello] => [salut]
[i see] => [je comprends]
[i try] => [jessaye]
[i won] => [jai gagne]
[i won] => [je lai emporte]
[i won] => [jai gagne]
[oh no] => [oh non]
[attack] => [attaque]
[attack] => [attaquez]
[cheers] => [sante]
[cheers] => [a votre sante]
[cheers] => [merci]
[cheers] => [tchintchin]
[get up] => [levetoi]
[go now] => [va maintenant]
[go now] => [allezy maintenant]
[go now] => [vasy maintenant]
[got it] => [jai pige]
[got it] => [compris]
[got it] => [pige]
[got it] => [compris]
[got it] => [tas capte]
[hop in] => [monte]
[hop in] => [montez]
[hug me] => [serremoi dans tes bras]
[hug me] =>

In [5]:
from pickle import load
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# load dataset
raw_dataset = load_clean_sentences('./english_french.pkl')

# reduce dataset size
n_sentences = int(0.25*10**5)
dataset = raw_dataset[:n_sentences,:]
# random shuffle
shuffle(dataset)
# split into train/test
train_len = int(0.9*len(dataset))
train, test = dataset[:train_len], dataset[train_len:]
# save
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

Saved english-french-both.pkl
Saved english-french-train.pkl
Saved english-french-test.pkl


## Train Neural Translation Model

In [6]:
from numpy import array
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
print('All packages loaded')

All packages loaded


In [7]:
# load clean dataset
def load_clean_sentences(filename):
    return load(open(filename,'rb'))

# load datasets
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')

In [8]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [9]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index)+1
eng_length = max_length(dataset[:,0])
print('English Vocabulary size: %d' % (eng_vocab_size))
print('English Max length: %d' % (eng_length))
# prepare french tokenizer
fr_tokenizer = create_tokenizer(dataset[:,1])
fr_vocab_size = len(fr_tokenizer.word_index)+1
fr_length = max_length(dataset[:,1])
print('French Vocabulary size: %d' % (fr_vocab_size))
print('French Max length: %d' % (fr_length))

English Vocabulary size: 3969
English Max length: 6
French Vocabulary size: 7959
French Max length: 12


In [10]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [11]:
# one hot encode target sequence
def encode_ouput(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes = vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [12]:
# prepare training data
trainX = encode_sequences(fr_tokenizer, fr_length, train[:,1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:,0])
trainY = encode_ouput(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(fr_tokenizer,fr_length,test[:,1])
testY = encode_sequences(eng_tokenizer,eng_length,test[:,0])
testY = encode_ouput(testY, eng_vocab_size)

In [None]:
#!pip install pydot
!pip install graphviz

Collecting graphviz
  Downloading graphviz-0.14.1-py2.py3-none-any.whl (18 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.14.1


In [None]:
import pydot

In [13]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero = True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# define model
model = define_model(fr_vocab_size, eng_vocab_size, fr_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12, 256)           2037504   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 6, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 6, 256)            525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 6, 3969)           1020033   
Total params: 4,108,161
Trainable params: 4,108,161
Non-trainable params: 0
_________________________________________________________________
None


NameError: ignored

In [14]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=50, batch_size=64, validation_data=(testX,testY), callbacks=[checkpoint], verbose=2)

Epoch 1/50

Epoch 00001: val_loss improved from inf to 3.27278, saving model to model.h5
352/352 - 12s - loss: 3.7866 - val_loss: 3.2728
Epoch 2/50

Epoch 00002: val_loss improved from 3.27278 to 3.12541, saving model to model.h5
352/352 - 11s - loss: 3.1510 - val_loss: 3.1254
Epoch 3/50

Epoch 00003: val_loss improved from 3.12541 to 2.89511, saving model to model.h5
352/352 - 11s - loss: 2.9447 - val_loss: 2.8951
Epoch 4/50

Epoch 00004: val_loss improved from 2.89511 to 2.69216, saving model to model.h5
352/352 - 11s - loss: 2.6797 - val_loss: 2.6922
Epoch 5/50

Epoch 00005: val_loss improved from 2.69216 to 2.49845, saving model to model.h5
352/352 - 11s - loss: 2.4299 - val_loss: 2.4984
Epoch 6/50

Epoch 00006: val_loss improved from 2.49845 to 2.33349, saving model to model.h5
352/352 - 11s - loss: 2.1915 - val_loss: 2.3335
Epoch 7/50

Epoch 00007: val_loss improved from 2.33349 to 2.18844, saving model to model.h5
352/352 - 11s - loss: 1.9771 - val_loss: 2.1884
Epoch 8/50

Epoch

<tensorflow.python.keras.callbacks.History at 0x7fc7d1fc9588>

## Evaluate Neural Translation model

In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 1.6 MB/s eta 0:00:01
Collecting regex
  Downloading regex-2020.7.14-cp38-cp38-manylinux2010_x86_64.whl (672 kB)
[K     |████████████████████████████████| 672 kB 3.5 MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434674 sha256=74499cfb5f0215ba98189ec5a7cd920cf3acd74ad88082eadf4d2a8503e6c332
  Stored in directory: /home/jovyan/.cache/pip/wheels/ff/d5/7b/f1fb4e1e1603b2f01c2424dd60fbcc50c12ef918bafc44b155
Successfully built nltk
Installing collected packages: regex, nltk
Successfully installed nltk-3.5 regex-2020.7.14


In [15]:
from numpy import argmax
from tensorflow.keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load datasets
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index)+1
eng_length = max_length(dataset[:,0])
# prepare french tokenizer
fr_tokenizer = create_tokenizer(dataset[:,1])
fr_vocab_size = len(fr_tokenizer.word_index)+1
fr_length = max_length(dataset[:,1])
# prepare data
trainX = encode_sequences(fr_tokenizer, fr_length, train[:,1])
testX = encode_sequences(fr_tokenizer, fr_length, test[:,1])

# load model
model = load_model('model.h5')

In [None]:
#translation = model.predict(source,verbose=0)

In [16]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [17]:
# generate target given source sequence
def predict_sequence(model, tokenizer,source): 
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [18]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1,source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        #print(raw_dataset[i])
        raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate Bleu score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0,0,0)))
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5,0,0)))
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3,0.3,0)))
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25,0.25,0.25)))

In [20]:
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[ne demande pas je te prie], target=[please dont ask], predicted=[please dont ask]
src=[je vais vous attendre], target=[i can wait for you], predicted=[i will to you]
src=[fais ce que tu veux], target=[do what you want], predicted=[do what you want]
src=[il alla faire des emplettes], target=[he went shopping], predicted=[he went shopping]
src=[vous en avez termine], target=[youre through], predicted=[youre through]
src=[allumez le cierge], target=[light the candle], predicted=[light the candle]
src=[la vie nest pas facile], target=[life aint easy], predicted=[life is easy]
src=[cest tres collant], target=[its very sticky], predicted=[its very sticky]
src=[je vais y aller en premier], target=[ill go first], predicted=[ill go first]
src=[astu dormi], target=[have you slept], predicted=[have you slept]
BLEU-1: 0.908369
BLEU-1: 0.869415
BLEU-1: 0.820528
BLEU-1: 0.630555
test
src=[elles firent une promenade], target=[they took a walk], predicted=[they took a walk]
src=[je commence

In [23]:
pred = encode_sequences(fr_tokenizer, fr_length, ['ca fonctionne'])
predict_sequence(model, eng_tokenizer,pred)

'it works'