In [1]:
# Function that commits the contents of the dataset to memory.

def ds_commit(dataset):

  # Opening the file in a "read only" format and encoding it using the UTF-8 method.
  file = open(dataset, mode = "rt", encoding = "utf-8")

  # Reading the contents of the dataset.
  df_contents = file.read()

  # Close the dataset.
  file.close()

  return df_contents

In [2]:
 # Mounting the google drive onto Google collaboratory.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Splitting the text conjured in the function above into phrases.

def to_phrase(df):

  # Getting rid of white space and grouping the phrases.
  lines = df.strip().split("\n")
  tr_sets = [line.split("\t") for line in lines]
  return tr_sets

In [4]:
import string
import re
import numpy as np
from pickle import dump
from pickle import load
from numpy.random import rand
from numpy.random import shuffle
from unicodedata import normalize


def clean_pairs(lines):
	cleaned = list()
	# Creating ReGex instance for character filtering.
	re_print = re.compile('[^%s]' % re.escape(string.printable))
 
	# Forming a translation table for removing punctuation marks.
	table = str.maketrans('', '', string.punctuation)
 
	for pair in lines:
		clean_pair = list()
		for line in pair:
			
      
      
      # Normalizing unicode characters.
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
   
			# Tokenizing the phrases.
			line = line.split()
   
			# Changing all letters into to their lowercase form.
			line = [word.lower() for word in line]

			# Omitting the punctuation marks from each token.
			line = [word.translate(table) for word in line]

			# Omitting all non-printable characters from each token.
			line = [re_print.sub('', w) for w in line]

			# Omitting tokens with numbers.
			line = [word for word in line if word.isalpha()]

			# Storing the cleaned phrases in a list and converting it into an array.
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return np.array(cleaned)

In [5]:
# Saving the cleaned phrases to a new file.

def save_clean_data(phrases, filename):
	dump(phrases, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [6]:
# Importing dataset and extracting the contents using the "df_commit" function.
dataset = '/content/drive/MyDrive/deu.txt'
df = ds_commit(dataset)

In [7]:
# Splitting the contents into groups of english phrases and their corresponding german translations ("to_phrase" function).
EGpairs = to_phrase(df)

In [8]:
# Cleaning the phrases of unnecessary elements as described in the "clean_pairs" function.
clean_df = clean_pairs(EGpairs)

In [9]:
# Saving the cleaned phrases file (as a pkl file)
save_clean_data(clean_df, 'eng-german-transl.pkl')

Saved: eng-german-transl.pkl


In [None]:
# spot check 
for i in range(100):
	print('[%s] => [%s]' % (clean_df[i,0], clean_df[i,1]))

[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [das gibts doch nicht]
[no wa

In [10]:
# Fetching the cleaned data.

def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [11]:
# Loading the compiled and cleaned dataset
comp_dataset = load_clean_sentences('eng-german-transl.pkl')
 
# Reducing the dataset size to save computation time.
n_phrases = 35000
dataset = comp_dataset[:n_phrases, :]

# Randomly shuffling data in order to eliminate any sort of "order".
shuffle(dataset)

# Splitting the data in the training and testing set.
train, test = dataset[:31500], dataset[31500:]

# Saving the split data as pkl files.
save_clean_data(dataset, 'eng-german-comp.pkl')
save_clean_data(train, 'eng-german-train.pkl')
save_clean_data(test, 'eng-german-test.pkl')

Saved: eng-german-comp.pkl
Saved: eng-german-train.pkl
Saved: eng-german-test.pkl


In [None]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
 
# Loading the cleaned dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# Creating a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer
 
# Function to determine the length of the longest phrase (enabled by the .fit_on_texts function).
def max_length(lines):
	return max(len(line.split()) for line in lines)
 
# Encoding the sequences and then padding them with zeroes.
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X
 
# One hot encoding the output english phrases.
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y
 
# define translation model (LSTM)
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model
 
# Loading the training, testing and compiled datasets datasets
dataset = load_clean_sentences('eng-german-comp.pkl')
train = load_clean_sentences('eng-german-train.pkl')
test = load_clean_sentences('eng-german-test.pkl')
 
# Forming the tokenizers for the english and their corresponding german phrases.
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))
 
# Prepping the training and testing data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
 
# Defining the model.
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Model summary.
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

# Fitting the model.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 5416
English Max Length: 7
German Vocabulary Size: 9011
German Max Length: 17
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 256)           2306816   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 7, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 7, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 7, 5416)          1391912   
 ibuted)                                                         
                   

<keras.callbacks.History at 0x7f86e8c10a90>

In [15]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
 
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer
 
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X
 
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 
# load datasets
dataset = load_clean_sentences('eng-german-comp.pkl')
train = load_clean_sentences('eng-german-train.pkl')
test = load_clean_sentences('eng-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
 
# load model
model = load_model('Bestmodel.h5')
# test on some training sequences
#print('train')
#evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

test
src=[es ist spat], target=[its late], predicted=[its theyre]
src=[ich bin tvsuchtig], target=[im a tv addict], predicted=[im a]
src=[ich habe zu viele taschen], target=[i have too many bags], predicted=[i am used to]
src=[tom ist dreiig], target=[tom is thirty], predicted=[tom is a]
src=[bitte geht], target=[please go], predicted=[please go]
src=[er erwiderte kein wort], target=[he made no response], predicted=[he will not]
src=[habe ich das erwahnt], target=[did i mention that], predicted=[did i girls this]
src=[haben sie ihren brief bekommen], target=[did you get her letter], predicted=[have you run your told]
src=[ich ziehe mich aus], target=[im undressing], predicted=[im dreaming to]
src=[ich werde spater bezahlen], target=[ill pay later], predicted=[ill be miss hear]
BLEU-1: 0.364155
BLEU-2: 0.225690
BLEU-3: 0.178976
BLEU-4: 0.096772
