# CLEANING DATASET

Dataset selected from: http://www.manythings.org/anki/ \
Tutorial: https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/

In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from numpy.random import rand
from numpy.random import shuffle
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: path of the file to read.

    Returns:
        The complete file content as a single string.
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

# split a loaded document into sentences
def to_pairs(doc):
    """Split a loaded document into rows of tab-separated fields.

    Leading/trailing whitespace of the whole document is stripped first,
    then every newline-delimited row is split on tab characters.

    Returns:
        A list with one list of fields per row.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as an array.

    Each sentence is reduced to ASCII, lowercased, stripped of punctuation
    and non-printable characters, and tokens that are not purely alphabetic
    are dropped.

    Args:
        lines: iterable of pairs (iterables of sentence strings).

    Returns:
        A numpy array built from the list of cleaned pairs.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def clean_sentence(text):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        # tokenize on white space
        for token in ascii_text.split():
            # lowercase, remove punctuation, then remove non-printable chars
            token = non_printable.sub('', token.lower().translate(strip_punctuation))
            # drop tokens that are empty or contain anything but letters
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[clean_sentence(text) for text in pair] for pair in lines])

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle `sentences` to `filename` and report the saved path.

    Args:
        sentences: object to serialise with pickle.
        filename: destination path; the file is overwritten.
    """
    # the context manager flushes and closes the file before reporting success
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

In [3]:
# Cleaning and saving the dataset (execute this once)
# filename = "data/spa.txt"
# doc = load_doc(filename)
# pairs = to_pairs(doc)
# cleaned_pairs = clean_pairs(pairs)  # separate name so the clean_pairs function is not shadowed
# save_clean_data(cleaned_pairs, 'data/english-spanish.pkl')

# SPLITTING DATASET INTO TRAIN AND TEST

In [4]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled object from `filename`.

    Args:
        filename: path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this notebook or come from a trusted source.
    with open(filename, 'rb') as file:
        return load(file)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Serialise `sentences` with pickle into `filename`, then print the path.

    Args:
        sentences: object to pickle.
        filename: output path; an existing file is overwritten.
    """
    # close the handle deterministically instead of leaving it open
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

In [5]:
# load dataset
raw_dataset = load_clean_sentences('data/english-spanish.pkl')

print(raw_dataset.shape)

# dataset size: keep the first n_sentences rows, of which the first n_train
# (after shuffling) are used for training and the remainder for testing
n_sentences = 40000
n_train = 39000
# copy the slice: a basic slice is a view, so shuffling it directly would
# also reorder the first n_sentences rows of raw_dataset in place
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle (reorders rows in place)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'data/english-spanish-both.pkl')
save_clean_data(train, 'data/english-spanish-train.pkl')
save_clean_data(test, 'data/english-spanish-test.pkl')

(128084, 3)
Saved: data/english-spanish-both.pkl
Saved: data/english-spanish-train.pkl
Saved: data/english-spanish-test.pkl


In [6]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in `filename`.

    Args:
        filename: path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE(security): pickle can run arbitrary code while loading; use only
    # on files from a trusted source.
    with open(filename, 'rb') as file:
        return load(file)

In [7]:
# load the combined, training and test splits saved earlier
dataset, train, test = [
    load_clean_sentences(path)
    for path in (
        'data/english-spanish-both.pkl',
        'data/english-spanish-train.pkl',
        'data/english-spanish-test.pkl',
    )
]

In [8]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a new Tokenizer and fit it on `lines`.

    Args:
        lines: texts passed to `Tokenizer.fit_on_texts`.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among `lines`."""
    return max(map(lambda text: len(text.split()), lines))

In [9]:
# fit one tokenizer per language on the combined dataset
# (column 0 holds the English sentences, column 1 the Spanish ones)
eng_tokenizer = create_tokenizer(dataset[:, 0])
esp_tokenizer = create_tokenizer(dataset[:, 1])

# vocabulary size is the number of known words plus one
# (presumably to leave index 0 free for the padding value -- TODO confirm)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
esp_vocab_size = 1 + len(esp_tokenizer.word_index)

# longest sentence, in tokens, for each language
eng_length = max_length(dataset[:, 0])
esp_length = max_length(dataset[:, 1])

# report
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
print('Spanish Vocabulary Size: %d' % esp_vocab_size)
print('Spanish Max Length: %d' % (esp_length))

English Vocabulary Size: 5893
English Max Length: 7
Spanish Vocabulary Size: 11178
Spanish Max Length: 15


# TRAINING THE MODEL

In [10]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode `lines` and pad each sequence to `length`.

    Args:
        tokenizer: fitted tokenizer providing `texts_to_sequences`.
        length: value passed as `maxlen` to `pad_sequences`.
        lines: texts to encode.

    Returns:
        The result of `pad_sequences` with padding applied after each
        sequence (`padding='post'`).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every row of `sequences`.

    Args:
        sequences: 2-D array of integer ids (its first two `shape` entries
            are used for the final reshape).
        vocab_size: number of classes for `to_categorical`.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [29]:
# MODEL 3
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Layer stack, in order: Embedding -> LSTM -> RepeatVector -> LSTM
    (returning sequences) -> TimeDistributed softmax Dense.

    Args:
        src_vocab: input dimension of the Embedding layer.
        tar_vocab: number of units of the output Dense layer.
        src_timesteps: `input_length` of the Embedding layer.
        tar_timesteps: repetition count of the RepeatVector layer.
        n_units: embedding size and number of units of both LSTM layers.

    Returns:
        The Sequential model.
    """
    layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layers)

In [11]:
# MODEL 4
# define NMT model
# def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
#     model = Sequential()
#     model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
#     model.add(LSTM(n_units))
#     model.add(RepeatVector(tar_timesteps))
#     model.add(LSTM(n_units))
#     model.add(RepeatVector(tar_timesteps))
#     model.add(LSTM(n_units, return_sequences=True))
#     model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
#     return model

In [30]:
# define model (Spanish source -> English target, 256 units)
model = define_model(esp_vocab_size, eng_vocab_size, esp_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 15, 256)           2861568   
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 7, 5893)           1514501   
Total params: 5,426,693
Trainable params: 5,426,693
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# prepare training data: Spanish column -> padded ids, English column -> one-hot
trainX = encode_sequences(esp_tokenizer, esp_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)
# prepare validation data the same way
testX = encode_sequences(esp_tokenizer, esp_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [14]:
# fit model, checkpointing to `filename` only when val_loss improves
filename = 'models/model3.h5'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
model.fit(
    trainX,
    trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2,
)

Epoch 1/30
610/610 - 101s - loss: 3.7663 - val_loss: 3.4487

Epoch 00001: val_loss improved from inf to 3.44874, saving model to models\model4.h5
Epoch 2/30
610/610 - 98s - loss: 3.3674 - val_loss: 3.3099

Epoch 00002: val_loss improved from 3.44874 to 3.30992, saving model to models\model4.h5
Epoch 3/30
610/610 - 99s - loss: 3.1523 - val_loss: 3.1225

Epoch 00003: val_loss improved from 3.30992 to 3.12253, saving model to models\model4.h5
Epoch 4/30
610/610 - 99s - loss: 2.9889 - val_loss: 3.0369

Epoch 00004: val_loss improved from 3.12253 to 3.03691, saving model to models\model4.h5
Epoch 5/30
610/610 - 100s - loss: 2.8410 - val_loss: 2.8745

Epoch 00005: val_loss improved from 3.03691 to 2.87449, saving model to models\model4.h5
Epoch 6/30
610/610 - 99s - loss: 2.6199 - val_loss: 2.6736

Epoch 00006: val_loss improved from 2.87449 to 2.67359, saving model to models\model4.h5
Epoch 7/30
610/610 - 99s - loss: 2.3691 - val_loss: 2.5390

Epoch 00007: val_loss improved from 2.67359 to 2

<tensorflow.python.keras.callbacks.History at 0x21937247948>

# EVALUATING THE MODEL

In [15]:
# reload the three saved splits
dataset, train, test = [
    load_clean_sentences(path)
    for path in (
        'data/english-spanish-both.pkl',
        'data/english-spanish-train.pkl',
        'data/english-spanish-test.pkl',
    )
]

# refit both tokenizers on the combined dataset
# (column 0 is English, column 1 is Spanish)
eng_tokenizer = create_tokenizer(dataset[:, 0])
esp_tokenizer = create_tokenizer(dataset[:, 1])

# vocabulary sizes and maximum sentence lengths
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
esp_vocab_size = 1 + len(esp_tokenizer.word_index)
eng_length = max_length(dataset[:, 0])
esp_length = max_length(dataset[:, 1])

# encode the Spanish side of each split as model input
trainX = encode_sequences(esp_tokenizer, esp_length, train[:, 1])
testX = encode_sequences(esp_tokenizer, esp_length, test[:, 1])

In [16]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Reverse lookup in `tokenizer.word_index`.

    Returns:
        The first word whose index equals `integer`, or None if there is none.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for `source` into a sentence.

    Only the first item of the predicted batch is used. For each output
    vector the highest-scoring index is mapped back to a word; decoding
    stops at the first index that has no word in the tokenizer.

    Returns:
        The decoded words joined by single spaces.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        token = word_for_id(argmax(scores), tokenizer)
        if token is None:
            # no word for this index: treat it as the end of the sentence
            break
        words.append(token)
    return ' '.join(words)

In [17]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus BLEU-1..4 scores.

    Args:
        model: trained model handed to `predict_sequence`.
        tokenizer: target-language tokenizer used to decode predictions.
        sources: array of encoded source sequences, one row per sentence.
        raw_dataset: rows aligned with `sources`; column 0 is read as the
            reference translation and column 1 as the source text. Any
            further columns are ignored.

    The first ten (source, target, prediction) triples are printed as a
    sample before the scores.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # the model expects a batch, so reshape the single row to (1, timesteps)
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer that was passed in, not a notebook global
        translation = predict_sequence(model, tokenizer, source)
        # index the two needed columns so rows with 2 or 3+ fields both work
        raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu takes a list of reference lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9 rather than 1; kept unchanged so
    # previously reported BLEU-3 numbers stay comparable
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [18]:
# load model
model = load_model('models/model3.h5')
# evaluate on the training sequences first, then on the test sequences
for split_name, split_X, split_raw in (
    ('train', trainX, train),
    ('test', testX, test),
):
    print(split_name)
    evaluate_model(model, eng_tokenizer, split_X, split_raw)

train
src=[podemos sentarnos alli], target=[can we sit over there], predicted=[can we sit on later]
src=[conoci a tom en australia], target=[i met tom in australia], predicted=[i met tom boston]
src=[las mujeres lo adoraron], target=[women loved it], predicted=[women loved it]
src=[te gustan las ostras], target=[do you like oysters], predicted=[do you like oysters]
src=[salve al gato], target=[i rescued the cat], predicted=[i rescued the cat]
src=[era cierta su historia], target=[was her story true], predicted=[all the story true]
src=[no me hizo caso], target=[he ignored me], predicted=[it didnt me me]
src=[tom solo se encogio de hombros], target=[tom just shrugged], predicted=[tom never shrugged]
src=[cuentame acerca de tu hijo], target=[tell me about your son], predicted=[tell me what your team]
src=[oye tom abreme], target=[hey tom open up], predicted=[hey tom hurt up]
BLEU-1: 0.781127
BLEU-2: 0.690336
BLEU-3: 0.634418
BLEU-4: 0.495131
test
src=[tom odia las reglas], target=[tom ha

# SELF-CONTAINED FUNCTION 

In [22]:
def traductor(model, sentence):
    """Translate one raw Spanish sentence to English with `model`.

    The sentence is cleaned with `clean_text`, encoded with the notebook's
    `esp_tokenizer` / `esp_length`, and decoded with `eng_tokenizer`.

    Args:
        model: trained translation model.
        sentence: raw input text.

    Returns:
        The predicted translation as a string; an empty string when nothing
        is left of the sentence after cleaning.
    """
    cleaned = clean_text(sentence)
    # nothing to translate: avoid feeding an all-padding sequence to the model
    if not any(cleaned):
        return ''
    encoded = encode_sequences(esp_tokenizer, esp_length, cleaned)
    # clean_text yields a single sentence, so only the first row is needed;
    # reshape it to a batch of one, (1, timesteps)
    source = encoded[0].reshape((1, encoded.shape[1]))
    return predict_sequence(model, eng_tokenizer, source)

def clean_text(line):
    """Normalise a single sentence.

    The text is reduced to ASCII, split on whitespace, lowercased, stripped
    of punctuation and non-printable characters, and tokens that are not
    purely alphabetic are dropped.

    Returns:
        A one-element list containing the cleaned sentence.
    """
    # translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # decompose accented characters, then drop whatever is not ASCII
    ascii_text = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')
    kept = []
    for token in ascii_text.split():
        # lowercase, remove punctuation, then remove non-printable chars
        token = non_printable.sub('', token.lower().translate(strip_punctuation))
        # drop tokens that are empty or contain anything but letters
        if token.isalpha():
            kept.append(token)
    return [' '.join(kept)]

In [33]:
# load model
model = load_model('models/model3.h5')
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output
model.summary()
traductor(model, "ella le despertó")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 256)           2398976   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 7, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 7, 4889)           1256473   
Total params: 4,706,073
Trainable params: 4,706,073
Non-trainable params: 0
_________________________________________________________________
None


'theres the brother'