# **Preprocessing**

In [None]:
def load_data(filepath):
    """Read an entire UTF-8 text file into memory.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The complete file contents as a single ``str``.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filepath, mode='rt', encoding='utf-8') as file:
        return file.read()

In [None]:
def phrase_pairs(data):
    """Split tab-separated text into one list of fields per line.

    Leading and trailing whitespace is stripped from ``data`` first; each
    remaining line is split on tabs and only its first two fields are kept.

    Args:
        data: The raw text, one record per line.

    Returns:
        A list with one entry per line, each a list of at most two strings.
    """
    pairs = []
    for row in data.strip().split('\n'):
        columns = row.split('\t')
        # Anything after the second tab-separated column is discarded
        pairs.append(columns[:2])
    return pairs

In [None]:
def clean_data(lines):
    """Normalize every phrase in a collection of phrase pairs.

    Each phrase is reduced to ASCII (accents dropped after NFD
    decomposition), split on whitespace, lower-cased, stripped of
    punctuation and non-printable characters, and finally filtered so that
    only purely alphabetic tokens remain.

    Args:
        lines: Iterable of pairs (iterables of strings).

    Returns:
        A numpy array built from the list of cleaned pairs, where every
        phrase is a single space-joined string.
    """
    # Matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(text):
        # Decompose accented characters, then drop whatever is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # Tokens with digits (or emptied by the steps above) are dropped
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(text) for text in pair] for pair in lines])

In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [None]:
def save_clean_data(sentences, filepath):
    """Pickle ``sentences`` to ``filepath`` and print a confirmation line.

    Args:
        sentences: Any picklable object (the notebook passes numpy arrays
            of cleaned phrase pairs).
        filepath: Destination path; an existing file is overwritten.
    """
    # Use a context manager so the file is flushed and closed before the
    # function returns; the handle was previously left open.
    with open(filepath, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filepath)

In [None]:
# End-to-end preprocessing: read the raw tab-separated corpus, keep the first
# two columns of every line, normalize the text, and pickle the cleaned pairs.
filepath = 'fra.txt'
data = load_data(filepath)
pairs = phrase_pairs(data)
clean_pairs = clean_data(pairs)
save_clean_data(clean_pairs, 'french-english.pkl')


Saved: french-english.pkl


In [None]:
# Spot-check the first ten cleaned pairs, printed as [column 0] => [column 1]
for i in range(10):
  print('[%s] => [%s]' % (clean_pairs[i,0],clean_pairs[i,1]))

[go] => [va]
[go] => [marche]
[go] => [bouge]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[run] => [prenez vos jambes a vos cous]
[run] => [file]
[run] => [filez]


In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only open files that this
        notebook (or another trusted source) produced.
    """
    # Close the handle deterministically; it was previously left open
    with open(filename, 'rb') as handle:
        return load(handle)

# Reload the cleaned pairs pickled by the preprocessing step and report
# how many rows the full corpus contains
raw_dataset = load_clean_sentences('french-english.pkl')
print(len(raw_dataset))

192341


In [None]:
# Fix the RNG so the shuffle, and therefore the train/test split, is the
# same on every Restart & Run All
from numpy.random import seed
seed(1)

n_sentences = 20000
# Copy the slice: basic slicing returns a view and shuffle() works in place,
# so without the copy the first rows of raw_dataset would be reordered too
dataset = raw_dataset[:n_sentences, :].copy()
shuffle(dataset)
# The first 18000 shuffled rows are used for training, the rest for testing
train, test = dataset[:18000], dataset[18000:]

In [None]:
# Persist the reduced corpus and both partitions so later cells can reload
# them without repeating the shuffle
save_clean_data(dataset, 'french-reduced.pkl')
save_clean_data(train, 'french-train.pkl')
save_clean_data(test, 'french-test.pkl')

Saved: french-reduced.pkl
Saved: french-train.pkl
Saved: french-test.pkl



**Data splitting**

In [None]:
from pickle import load
from pickle import dump

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only open files that this
        notebook (or another trusted source) produced.
    """
    # Close the handle deterministically; it was previously left open
    with open(filename, 'rb') as handle:
        return load(handle)
 
# load datasets: the reduced corpus plus its train and test partitions,
# read back from the pickle files written by save_clean_data()
dataset = load_clean_sentences('french-reduced.pkl')
train = load_clean_sentences('french-train.pkl')
test = load_clean_sentences('french-test.pkl')

In [None]:
from keras.preprocessing.text import Tokenizer
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# max phrase length
def max_length(lines):
    """Return the largest whitespace-delimited token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest line, or 0 when ``lines`` is empty.
    """
    # default=0 keeps an empty collection from raising ValueError
    return max((len(line.split()) for line in lines), default=0)

In [None]:
# NOTE(review): this bare Tokenizer instance is not referenced by any later
# cell visible here; it looks like a leftover. Confirm before removing.
tokenizer = Tokenizer()
# Fit a tokenizer on the French column (index 1) of the reduced corpus
french_tokenizer = create_tokenizer(dataset[:, 1])
# +1 presumably leaves room for index 0, which pad_sequences uses as
# padding; TODO confirm against the Keras Tokenizer docs
french_vocab_size = len(french_tokenizer.word_index) + 1
# Longest French phrase, measured in whitespace-separated tokens
french_max_length = max_length(dataset[:, 1])
print("French vocabulary size: ", french_vocab_size)
print("French maximum phrase length: ", french_max_length)

French vocabulary size:  6977
French maximum phrase length:  11


In [None]:
# Fit a tokenizer on the English column (index 0) of the reduced corpus
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 presumably leaves room for index 0, which pad_sequences uses as
# padding; TODO confirm against the Keras Tokenizer docs
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English phrase, measured in whitespace-separated tokens
eng_max_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_max_length))

English Vocabulary Size: 3418
English Max Length: 5


In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical


In [None]:
def encode_sequences(tokenizer, length, lines):
    """Convert text lines into fixed-length integer sequences.

    The lines are mapped to integer sequences with
    ``tokenizer.texts_to_sequences`` and then passed to ``pad_sequences``
    with ``maxlen=length`` and ``padding='post'``.

    Args:
        tokenizer: A fitted tokenizer providing ``texts_to_sequences``.
        length: The ``maxlen`` handed to ``pad_sequences``.
        lines: The texts to encode.

    Returns:
        The padded sequences as returned by ``pad_sequences``.

    Side effects:
        Prints the padded array before returning it.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )
    print(encoded)
    return encoded

In [None]:
import numpy as np
def encode_output(sequences, vocab_size):
    """One-hot encode every row of ``sequences``.

    Args:
        sequences: 2-D array of integer ids (indexed via ``shape[0]`` and
            ``shape[1]``).
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        A numpy array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [
        to_categorical(row, num_classes=vocab_size) for row in sequences
    ]
    # Stack the per-row encodings, then force the expected 3-D layout
    return np.array(one_hot_rows).reshape(
        sequences.shape[0], sequences.shape[1], vocab_size
    )

In [None]:
# prepare training data
# Inputs: French phrases (column 1) as padded integer sequences
trainX = encode_sequences(french_tokenizer, french_max_length, train[:, 1])
# Targets: English phrases (column 0), first as padded integer sequences...
trainY = encode_sequences(eng_tokenizer, eng_max_length, train[:, 0])
# ...then one-hot encoded over the English vocabulary
trainY = encode_output(trainY, eng_vocab_size)

[[ 173    9  670 ...    0    0    0]
 [1814    0    0 ...    0    0    0]
 [  69  153  149 ...    0    0    0]
 ...
 [  14   91   58 ...    0    0    0]
 [   2   43 1192 ...    0    0    0]
 [   6  249  135 ...    0    0    0]]
[[  50   23  718    0    0]
 [ 127    0    0    0    0]
 [  29  497   29 1050    0]
 ...
 [   1   74   19    0    0]
 [ 441  203    0    0    0]
 [  13    5 1145    0    0]]


In [None]:
# Prepare the test data the same way as the training data:
# French inputs (column 1) as padded integer sequences
testX = encode_sequences(french_tokenizer, french_max_length, test[:, 1])
# English targets (column 0) as padded integer sequences...
testY = encode_sequences(eng_tokenizer, eng_max_length, test[:, 0])
# ...then one-hot encoded over the English vocabulary
testY = encode_output(testY, eng_vocab_size)

[[  14   47 2887 ...    0    0    0]
 [6637   15 3265 ...    0    0    0]
 [3671    0    0 ...    0    0    0]
 ...
 [  99  721   31 ...    0    0    0]
 [  14   44   11 ...    0    0    0]
 [  36  331    0 ...    0    0    0]]
[[   1   59 2212    0    0]
 [ 273   14 2190    0    0]
 [  33   57    0    0    0]
 ...
 [  28    1  143   67    0]
 [   1   42    5  123    0]
 [  84  110    0    0    0]]


**Defining the Model**

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.utils import plot_model
from keras.callbacks import ModelCheckpoint

In [None]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence Keras model.

    Layer stack, in order: Embedding (zero-masked) -> LSTM ->
    RepeatVector -> LSTM returning sequences -> TimeDistributed softmax Dense.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of units in the softmax output layer.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: How many times the first LSTM's output is repeated.
        n_units: Embedding size and number of units in both LSTM layers.

    Returns:
        The ``Sequential`` model; compiling is left to the caller.
    """
    input_side = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    output_side = [
        # Repeat the single LSTM output once per target timestep
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(input_side + output_side)

In [None]:
# Build the French -> English model; 256 is passed as n_units, which sizes
# the embedding and both LSTM layers in define_model()
model = define_model(french_vocab_size, eng_vocab_size, french_max_length, eng_max_length, 256)
# Categorical cross-entropy pairs with the one-hot targets from encode_output()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table
model.summary()
# Write a diagram of the layers, including tensor shapes, to model.png
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11, 256)           1786112   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 3418)          878426    
 ibuted)                                                         
                                                                 
Total params: 3,715,162
Trainable params: 3,715,162
Non-

In [None]:
# NOTE(review): `filepath` was bound to 'fra.txt' in the preprocessing cell;
# this rebinds the same module-level name to the checkpoint path.
filepath = 'model.h5'
# Save to model.h5 only when val_loss improves (save_best_only=True, mode='min')
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split is also used as validation data, so the
# checkpoint is selected on the same data later treated as held out;
# consider a separate validation split.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=1)

In [None]:
from keras.models import load_model

In [None]:
# Replace the in-memory model with the one stored in model.h5, the path
# the ModelCheckpoint callback was configured to write during training
model = load_model('model.h5')

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Args:
        integer: The index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word (in mapping order) with a matching index, or ``None``
        when no word has that index.
    """
    matches = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matches, None)

In [None]:
# generate target given source sequence
import numpy
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a text string.

    Only the first item of ``model.predict(source)`` is used. For each of
    its rows the arg-max index is mapped to a word via ``word_for_id``;
    decoding stops at the first index that has no word.

    Args:
        model: Object exposing ``predict(source, verbose=0)``.
        tokenizer: Tokenizer passed through to ``word_for_id``.
        source: Input handed unchanged to ``model.predict``.

    Returns:
        The decoded words joined by single spaces.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for token_id in (numpy.argmax(scores) for scores in step_scores):
        decoded = word_for_id(token_id, tokenizer)
        if decoded is None:
            # No word has this index: treat it as the end of the phrase
            break
        words.append(decoded)
    return ' '.join(words)

In [None]:
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source row and print the first 15 results.

    Args:
        model: Model passed through to ``predict_sequence``.
        tokenizer: Tokenizer used to decode the predictions.
        sources: Array of encoded source sequences, one row per phrase.
        raw_dataset: Rows aligned with ``sources``; each row unpacks to
            ``(target_text, source_text)``.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # Add a leading axis of size 1 so the row is passed as a batch of one
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer supplied by the caller; the global
        # eng_tokenizer was used here before, silently ignoring the argument
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 15:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # NOTE(review): `actual` and `predicted` are collected but never scored
    # or returned. Presumably intended for a corpus-level metric such as
    # BLEU; TODO confirm and either compute it or drop the lists.

In [None]:
# Translate the training inputs and print the first 15 source/target/prediction
# triples (this runs on the training split, not the held-out test split)
evaluate_model(model, eng_tokenizer, trainX, train)

# GUI

In [None]:
import tkinter as tk
from tkinter import *

In [None]:
# Build a fixed-layout translator window with absolute widget placement.
window=Tk()
# NOTE(review): neither button is given a command= callback in this cell,
# so clicking them does nothing as written.
btn=Button(window, text="Translate", fg='blue')
btn.place(x=220, y=275)
btn2=Button(window, text="Record Audio", fg='blue')
btn2.place(x=210, y=320)
# Title banner
lbl=Label(window, text="Welcome to the translator", fg='red', font=("Helvetica", 16))
lbl.place(x=130, y=50)
# Input field for the phrase to translate.
# NOTE(review): Entry has no documented `text` option; Tk may treat it as an
# abbreviation of `textvariable`. Confirm that "Type here" actually appears.
txtfld=Entry(window, text="Type here", bd=5)
txtfld.place(x=230, y=150)
lbl_1=Label(window, text="Type here:")
lbl_1.place(x=160, y=150)
lbl_2=Label(window, text="Translation:")
lbl_2.place(x=150, y=220)
# Output field, pre-filled with the placeholder text "Result".
# NOTE(review): created without an explicit parent; presumably it attaches
# to the default root window. Confirm.
resul=Entry()
resul.insert(END, str("Result"))
resul.place(x=230, y=220)
window.title('Translator')
# 500x500 pixels, offset 10 pixels from the screen's top-left corner
window.geometry("500x500+10+10")
# Hand control to the Tk event loop
window.mainloop()

# SPEECH TO TEXT CODE

In [None]:
import speech_recognition as sr

In [None]:
#to convert speech to text
def speech_recog():
    """Record one utterance from the microphone and transcribe it.

    Audio is captured with ``sr.Microphone()`` and sent to
    ``Recognizer.recognize_google``. Progress and errors are printed.

    Returns:
        The recognised text, or ``None`` when the audio could not be
        understood or the recognition request failed.
    """
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Speak:")
        audio = r.listen(source)

    try:
        txt = r.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print("Could not request results; {0}".format(e))
    else:
        print("You said:", txt)
        # Hand the text back so callers can use it; it was previously only
        # a local variable and was lost when the function returned
        return txt
    return None

In [None]:
# Run one microphone capture and print what was recognised
speech_recog()

In [None]:
#to translate the speech input
def speech_translate(text=None):
    """Translate a recognised phrase with the trained model.

    Args:
        text: The phrase to translate. When omitted, the module-level
            variable ``txt`` is used, as in the original implementation.

    Returns:
        The decoded translation string from ``predict_sequence``.

    NOTE(review): the model's inputs are encoded with ``french_tokenizer``,
    while ``speech_recog`` calls ``recognize_google`` without a language
    argument. Confirm the recognised language matches the model's source
    language.
    """
    if text is None:
        # Backward-compatible fallback; raises NameError if `txt` was
        # never defined at module level
        text = txt
    # predict_sequence feeds its input straight to model.predict, so the raw
    # string must first become a padded integer sequence like the training data
    source = encode_sequences(french_tokenizer, french_max_length, [text])
    return predict_sequence(model, eng_tokenizer, source)

# TEXT TO SPEECH CODE

In [None]:
#convert a given text to speech and saves file
from gtts import gTTS
def text_to_speech(mytext="chocolate", language='en', filename="1.mp3"):
    """Synthesize ``mytext`` with gTTS, save it as an MP3 and play it.

    All parameters default to the values that were previously hard-coded,
    so calling ``text_to_speech()`` with no arguments behaves as before.

    Args:
        mytext: The text to speak.
        language: Language code passed to gTTS as ``lang``.
        filename: Path of the MP3 file to write and then play.
    """
    import subprocess

    obj = gTTS(text=str(mytext), lang=language, slow=False)
    obj.save(filename)
    try:
        # Pass an argument list (no shell) so that an arbitrary filename
        # cannot be interpreted as shell syntax
        subprocess.run(["mpg321", filename], check=False)
    except FileNotFoundError:
        # Playback stays best-effort: the file is saved even without a player
        print("mpg321 not found; saved %s without playing it" % filename)

In [None]:
# Generate and play the demo clip using the function's built-in text
text_to_speech()