## Model for translating sentences from French to English

## Model Creation

### Importing statements

In [1]:
#import
import io
import json
import os
import string

import numpy as np
import tensorflow as tf
from keras.models import load_model
from keras.models import Model, Sequential
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GRU, Input, Dense, TimeDistributed, Dropout

### Loading data

The sentences are loaded from the French and English data files. After that, we check how many sentences each file contains and print a few of them to see what the data looks like.

In [2]:
#method for loading data
def load_data(path, encoding='utf-8'):
    """Read a text file and return its content as a list of lines.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that accented characters are decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        list[str]: The file content split on newline characters. When the
        file ends with a newline, the last element is an empty string.
    """
    with open(path, "r", encoding=encoding) as file:
        data = file.read()
    return data.split('\n')

In [3]:
# read the English and the French sentence lists from their data files
english_sentences = load_data(path='english_data')
french_sentences = load_data(path='french_data')

In [4]:
# show how many sentences each list holds, followed by the first sentence of each
print(
    "English sentences: {}".format(len(english_sentences)),
    "French_sentences: {}".format(len(french_sentences)),
    english_sentences[0],
    french_sentences[0],
    sep='\n',
)

English sentences: 137861
French_sentences: 137861
new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [5]:
# translation table that deletes ASCII punctuation and the curly double quotes
input1 = str.maketrans('', '', string.punctuation + '“”')

# apply the table to every sentence of both languages
english_sentences = [sentence.translate(input1) for sentence in english_sentences]
french_sentences = [sentence.translate(input1) for sentence in french_sentences]

# first sentence of each language after the punctuation has been removed
print(english_sentences[0], french_sentences[0], sep='\n')

new jersey is sometimes quiet during autumn  and it is snowy in april 
new jersey est parfois calme pendant l automne  et il est neigeux en avril 


In [6]:
# keep only the sentence pairs in which both sides have at most 16 words
# positions are taken from the French list and reused to index the English list
short_pair_indices = [
    idx
    for idx in range(len(french_sentences))
    if max(len(english_sentences[idx].split()), len(french_sentences[idx].split())) <= 16
]
english_data = [english_sentences[idx] for idx in short_pair_indices]
french_data = [french_sentences[idx] for idx in short_pair_indices]

In [7]:
# number of English and French sentences left after dropping pairs longer than 16 words
print(len(english_data), len(french_data), sep='\n')

135594
135594


As can be seen, our data has been reduced, because a few sentences had more than 16 words.

In [8]:
# first English/French pair after punctuation removal and the 16-word cap
print(english_data[0], french_data[0], sep='\n')


new jersey is sometimes quiet during autumn  and it is snowy in april 
new jersey est parfois calme pendant l automne  et il est neigeux en avril 


### Tokenizer

In [9]:
# French side: fit a tokenizer on the sentences, convert them to integer
# sequences and pad every sequence at the end up to 16 entries
french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_data)
sequence_french_data = pad_sequences(
    french_tokenizer.texts_to_sequences(french_data),
    maxlen=16,
    padding='post',
)

# English side: same steps with its own tokenizer
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_data)
sequence_english_data = pad_sequences(
    english_tokenizer.texts_to_sequences(english_data),
    maxlen=16,
    padding='post',
)

# append a trailing axis of size 1 to the English sequences
sequence_english_data = sequence_english_data[..., np.newaxis]

In [10]:
# vocabulary sizes: number of entries in each tokenizer's word index plus one
french_vocab = 1 + len(french_tokenizer.word_index)
print(french_vocab)

english_vocab = 1 + len(english_tokenizer.word_index)
print(english_vocab)

# make the French input 2-D, with as many columns as the English data has time steps
french_input_shape = (-1, sequence_english_data.shape[-2])
sequence_french_data = sequence_french_data.reshape(french_input_shape)

345
200


### Model

We build the model with a GRU layer. It is a sequential model. The model is trained for 4 epochs, because it starts to overfit after 4 epochs. We use the ReLU activation function in the hidden layer and softmax in the output layer.

In [29]:
# assemble the network as an ordered list of layers:
# embedding -> GRU -> dense ReLU -> dropout -> dense softmax over the English vocabulary
model = Sequential([
    # turns each French word index into a 512-dimensional vector
    Embedding(
        french_vocab,
        512,
        input_length=sequence_french_data.shape[1],
        input_shape=sequence_french_data.shape[1:],
    ),
    # return_sequences=True keeps one output per time step
    GRU(512, return_sequences=True),
    TimeDistributed(Dense(1024, activation='relu')),
    Dropout(0.4),
    # one softmax distribution over the English vocabulary per time step
    TimeDistributed(Dense(english_vocab, activation='softmax')),
])

# print the layer-by-layer summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 512)           176640    
_________________________________________________________________
gru_1 (GRU)                  (None, 16, 512)           1575936   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 16, 1024)          525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 1024)          0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 16, 200)           205000    
Total params: 2,482,888
Trainable params: 2,482,888
Non-trainable params: 0
_________________________________________________________________


In [30]:
# configure training: Adam optimizer, sparse categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

### Model training

After training the model, it shows an accuracy of 84%. The accuracy can be improved by adding more data.

In [31]:
# train for four epochs (French sequences as input, English sequences as target);
# the notebook text states that the model overfits beyond that
model.fit(
    x=sequence_french_data,
    y=sequence_english_data,
    batch_size=1024,
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x14823c070>

### Saving model

The model is saved so that we don't have to train it again and again. We can use the saved model to predict outputs.

In [32]:
# write the trained model to an HDF5 file in the working directory
model.save(filepath='my_model.h5')

In [11]:
# create model tflite file
# reload the model that was saved as 'my_model.h5' and convert it to TensorFlow Lite
model = tf.keras.models.load_model('my_model.h5')
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# write through a context manager so the file handle is always closed
# (the previous open(...).write(...) left the handle open)
with open("my_model.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)

# display the size of the converted model in bytes as the cell output
len(tflite_model)



INFO:tensorflow:Assets written to: /var/folders/8t/y10bcz3j1qzbzvsz4s3wdtv80000gn/T/tmpkr92e1ci/assets


INFO:tensorflow:Assets written to: /var/folders/8t/y10bcz3j1qzbzvsz4s3wdtv80000gn/T/tmpkr92e1ci/assets


9954968

In [12]:
# create french tokenizer file
# `json` comes from the import cell at the top of the notebook rather than a mid-notebook import.
# NOTE(review): to_json() presumably already returns a JSON string, so json.dumps()
# stores it as a JSON-encoded string; a reader would then have to json.load() the file
# before rebuilding the tokenizer. The file format is left unchanged here - confirm
# against whatever consumes this file.
french_tokenizer_json = french_tokenizer.to_json()
with open('french_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(french_tokenizer_json, ensure_ascii=False))

In [13]:
# create english tokenizer file
# `json` comes from the import cell at the top of the notebook; previously this cell
# imported only `io` and depended on an earlier cell having imported `json`.
# NOTE(review): to_json() presumably already returns a JSON string, so json.dumps()
# stores it as a JSON-encoded string; the file format is left unchanged here - confirm
# against whatever consumes this file.
english_tokenizer_json = english_tokenizer.to_json()
with open('english_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(english_tokenizer_json, ensure_ascii=False))

In [14]:
# export the English word -> index mapping as a JSON list of single-entry objects
english_word_index = [
    {word: index} for word, index in english_tokenizer.word_index.items()
]

with open('english_word_index.json', 'w', encoding='utf-8') as out_file:
    json.dump(english_word_index, out_file, ensure_ascii=False)

In [15]:
# export the French word -> index mapping as a JSON list of single-entry objects
french_word_index = [
    {word: index} for word, index in french_tokenizer.word_index.items()
]

with open('french_word_index.json', 'w', encoding='utf-8') as out_file:
    json.dump(french_word_index, out_file, ensure_ascii=False)

In [16]:
# export the English index -> word mapping as a JSON list of single-entry objects
english_index_word = [
    {index: word} for index, word in english_tokenizer.index_word.items()
]

with open('english_index_word.json', 'w', encoding='utf-8') as out_file:
    json.dump(english_index_word, out_file, ensure_ascii=False)

In [17]:
# export the French index -> word mapping as a JSON list of single-entry objects
french_index_word = [
    {index: word} for index, word in french_tokenizer.index_word.items()
]

with open('french_index_word.json', 'w', encoding='utf-8') as out_file:
    json.dump(french_index_word, out_file, ensure_ascii=False)