# French → Tamil Translator
This notebook uses a small slice of the <a href="https://opus.nlpl.eu/XLEnt/fr&ta/v1.2/XLEnt">https://opus.nlpl.eu/XLEnt/fr&ta/v1.2/XLEnt<a> to build a translation model.

## Loading the necessary Libraries

In [1]:
import collections
import numpy as np
import json

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, GRU, Bidirectional, Dropout, TimeDistributed
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9083487643978985882
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2238133044
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10033659875732934849
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


## Loading the Datasets

In [3]:
french_file = "fr-ta/XLEnt.fr-ta.fr"
tamil_file = "fr-ta/XLEnt.fr-ta.ta"


with open(french_file, "r", encoding="utf-8") as f:
    fr_sentences = f.read().splitlines()


with open(tamil_file, "r", encoding="utf-8") as f:
    tam_sentences = f.read().splitlines()

# Making a list that contains only 5-letter words
french_sentences = [sentence for sentence in fr_sentences if len(sentence.strip()) == 5]
tamil_sentences = [tam_sentences[fr_sentences.index(sentence)] for sentence in french_sentences]

# Verifying that the lists are equal in length 
print("No. of French sentences",len(french_sentences))
print("No. of Tamil sentences",len(tamil_sentences))

No. of French sentences 14810
No. of Tamil sentences 14810


In [4]:
# Verifying the the translation match up
print(french_sentences[0:5])
print(tamil_sentences[0:5])

['Jésus ', 'Moïse ', 'de US ', 'Chine ', 'Allah ']
['இயேசு ', 'மூஸா ', 'அமெரிக்க ', 'சீனா ', 'அல்லாஹ் ']


In [5]:
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])
tamil_words_counter = collections.Counter([word for sentence in tamil_sentences for word in sentence.split()])

print('{} French words'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

print()
print('{} Tamil words'.format(len([word for sentence in tamil_sentences for word in sentence.split()])))
print('{} unique Tamil words'.format(len(tamil_words_counter)))
print('10 Most common words in the Tamil dataset:')
print('"' + '" "'.join(list(zip(*tamil_words_counter.most_common(10)))[0]) + '"')

15393 French words
13471 unique French words
10 Most common words in the French dataset:
"de" "la" "Hôtel" "Certs" ")" "genre" "A" "du" "Allah" "ville"

16114 Tamil words
12136 unique Tamil words
10 Most common words in the Tamil dataset:
"சிலர்" "ஜெயலலிதா" "சென்னை" "பாபா" "பாகிஸ்தான்" "போலீஸ்" "ராஜா" ")" "ஹோட்டலில்" "Pearson"


## Preprocessing the Data

In [6]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x),tokenizer
def pad(x,length=None):
    if length==None:
        length = max(len(sentence) for sentence in x)
    return pad_sequences(x,maxlen=length,padding='post')

def preprocess(x,y):
    preprocess_x,tk_x = tokenize(x)
    preprocess_y,tk_y = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    return preprocess_x,preprocess_y,tk_x,tk_y


preproc_french_sentences,preproc_tamil_sentences,french_tokenizer,tamil_tokenizer = preprocess(french_sentences,tamil_sentences)

max_french_sequence_length = preproc_french_sentences.shape[1]
max_tamil_sequence_length = preproc_tamil_sentences.shape[1]
french_vocab_size = len(french_tokenizer.word_index)+1
tamil_vocab_size = len(tamil_tokenizer.word_index)+1

print('Data Preprocessed')
print("Max French sentence length:", max_french_sequence_length)
print("Max Tamil sentence length:", max_tamil_sequence_length)
print("French vocabulary size:", french_vocab_size)
print("Tamil vocabulary size:", tamil_vocab_size)

Data Preprocessed
Max French sentence length: 3
Max Tamil sentence length: 7
French vocabulary size: 12820
Tamil vocabulary size: 12011


## Model Building and Training

In [7]:
def bidirectional_embed_model(input_shape, output_sequence_length, french_vocab_size, tamil_vocab_size):
    
    # Hyperparameters
    learning_rate = 5e-3
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(french_vocab_size, 32, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(32, return_sequences=True)))
    model.add(TimeDistributed(Dense(256, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(tamil_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate, clipnorm=1.0),
                  metrics = ['accuracy'])
    
    return model


# Prepping the input layer 
tmp_x = pad(preproc_french_sentences, max_tamil_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_tamil_sentences.shape[-1]))

# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_tamil_sequence_length,
    french_vocab_size,
    tamil_vocab_size)

print(embed_rnn_model.summary())

embed_rnn_model.fit(tmp_x, preproc_tamil_sentences, batch_size=32, epochs=20, validation_split=0.20)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 32)             410240    
                                                                 
 bidirectional (Bidirectiona  (None, 7, 64)            12672     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 7, 256)           16640     
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 7, 256)            0         
                                                                 
 time_distributed_1 (TimeDis  (None, 7, 12011)         3086827   
 tributed)                                                       
                                                        

<keras.callbacks.History at 0x2463bb67a00>

## Model Testing

In [8]:
french_sentence = ["pomme"]
french_sentence = french_sentence[0].lower()

actual_translation = ["ஆப்பிள்"]

french_sentence = french_tokenizer.texts_to_sequences([french_sentence])
french_sentence = pad_sequences(french_sentence,max_tamil_sequence_length,padding="post")


tamil_sentence = embed_rnn_model.predict(french_sentence)[0]

tamil_sentence = [np.argmax(word) for word in tamil_sentence]
print(tamil_sentence)

tamil_sentence = tamil_tokenizer.sequences_to_texts([tamil_sentence])[0]

print("Predicted Tamil Sentences", tamil_sentence)
print("Actual Tamil Sentences", actual_translation[0])

[1362, 0, 0, 0, 0, 0, 0]
Predicted Tamil Sentences மதரஸா
Actual Tamil Sentences ஆப்பிள்


## Saving the model

In [9]:
embed_rnn_model.save('french_to_tamil_model')
# Serialize English Tokenizer to JSON
with open('french_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(french_tokenizer.to_json(), ensure_ascii=False))
    
# Serialize French Tokenizer to JSON
with open('tamil_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(tamil_tokenizer.to_json(), ensure_ascii=False))
    
# Save max lengths
max_tamil_sequence_length_json = max_tamil_sequence_length
with open('sequence_length.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(max_tamil_sequence_length_json, ensure_ascii=False))



INFO:tensorflow:Assets written to: french_to_tamil_model\assets


INFO:tensorflow:Assets written to: french_to_tamil_model\assets


## Run the GUI 

In [10]:
# This gui.py file using the above created model as the means of translation 
%run gui.py

In [1]:
# This gui_2.py file uses the googletrans library for the translation purposes
%run gui_2.py