In [None]:
# Import the libraries used throughout the notebook
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
            

In [None]:
#importing datasets
!wget http://www.manythings.org/anki/mar-eng.zip -O mar-eng.zip
!unzip mar-eng.zip

--2023-01-22 14:38:17--  http://www.manythings.org/anki/mar-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1458910 (1.4M) [application/zip]
Saving to: ‘mar-eng.zip’


2023-01-22 14:38:18 (6.01 MB/s) - ‘mar-eng.zip’ saved [1458910/1458910]

Archive:  mar-eng.zip
  inflating: mar.txt                 
  inflating: _about.txt              


In [None]:
# Reading data
# mar.txt is read with only two column names. As the relabelling below shows,
# pandas then treats the leading (English) column as the index, labels the
# Marathi column 'eng' and labels the trailing column 'mar'. The chain moves the
# index back into a regular column, gives every column its correct name and
# discards the trailing column, leaving a DataFrame with 'eng' and 'mar'.
lines = (
    pd.read_table( 'mar.txt' , names=[ 'eng' , 'mar' ] )
    .reset_index( level=0 )
    .rename( columns={ 'index' : 'eng' , 'eng' : 'mar' , 'mar' : 'c' } )
    # Use the keyword form: passing the axis positionally (drop('c', 1)) emits a
    # FutureWarning on older pandas and raises TypeError from pandas 2.0 onwards.
    .drop( columns='c' )
    # Work on a 10,000-row slice of the corpus (rows 10000-19999).
    .iloc[ 10000 : 20000 ]
)
lines.head()

  lines = lines.drop( 'c' , 1 )


Unnamed: 0,eng,mar
10000,Everything changes.,सगळं बदलतं.
10001,Everything is fine.,सगळं काही ठीक आहे.
10002,Everything is fine.,सर्व काही ठीक आहे.
10003,Exercise every day.,दररोज व्यायाम कर.
10004,Exercise every day.,दररोज व्यायाम करा.


In [None]:
# Preparing input data for the Encoder
# Tokenise the English sentences, post-pad them to the length of the longest one
# and record the vocabulary. Results:
#   encoder_input_data - padded integer sequences fed to the encoder
#   eng_word_dict      - mapping from English word to integer index
#   num_eng_tokens     - vocabulary size plus one (index 0 is used for padding)
eng_lines = [ sentence for sentence in lines.eng ]

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( eng_lines )
tokenized_eng_lines = tokenizer.texts_to_sequences( eng_lines )

# The longest tokenised sentence fixes the padded width.
length_list = [ len( token_seq ) for token_seq in tokenized_eng_lines ]
max_input_length = np.array( length_list ).max()
print( 'English max length is {}'.format( max_input_length ))

padded_eng_lines = preprocessing.sequence.pad_sequences(
    tokenized_eng_lines ,
    maxlen=max_input_length ,
    padding='post' ,
)
encoder_input_data = np.array( padded_eng_lines )
print( 'Encoder input data shape -> {}'.format( encoder_input_data.shape ))

eng_word_dict = tokenizer.word_index
num_eng_tokens = 1 + len( eng_word_dict )
print( 'Number of English tokens = {}'.format( num_eng_tokens))

English max length is 7
Encoder input data shape -> (10000, 7)
Number of English tokens = 2382


In [None]:
# Preparing input data for the Decoder
# Wrap every Marathi sentence in <START> ... <END> markers, tokenise, and post-pad
# to the length of the longest sequence. Results:
#   decoder_input_data - padded integer sequences fed to the decoder
#   mar_word_dict      - mapping from Marathi word to integer index
#   num_mar_tokens     - vocabulary size plus one (index 0 is used for padding)
# NOTE(review): with the Tokenizer's default filters and lower-casing the markers
# are presumably stored as 'start' / 'end' in mar_word_dict - confirm before
# looking them up at inference time.
mar_lines = [ '<START> ' + sentence + ' <END>' for sentence in lines.mar ]

# This rebinds `tokenizer`, replacing the English tokenizer fitted earlier;
# eng_word_dict still holds the English vocabulary.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( mar_lines )
tokenized_mar_lines = tokenizer.texts_to_sequences( mar_lines )

# The longest tokenised sentence (markers included) fixes the padded width.
length_list = [ len( token_seq ) for token_seq in tokenized_mar_lines ]
max_output_length = np.array( length_list ).max()
print( 'Marathi max length is {}'.format( max_output_length ))

padded_mar_lines = preprocessing.sequence.pad_sequences(
    tokenized_mar_lines ,
    maxlen=max_output_length ,
    padding='post' ,
)
decoder_input_data = np.array( padded_mar_lines )
print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))

mar_word_dict = tokenizer.word_index
num_mar_tokens = 1 + len( mar_word_dict )
print( 'Number of Marathi tokens = {}'.format( num_mar_tokens))

Marathi max length is 11
Decoder input data shape -> (10000, 11)
Number of Marathi tokens = 4771


In [None]:
# Preparing target data for the Decoder
# The target for each Marathi sentence is its token sequence with the first token
# dropped, i.e. the decoder input shifted left by one step. The shifted sequences
# are post-padded to max_output_length so they line up with the decoder input,
# then one-hot encoded over num_mar_tokens classes.
decoder_target_data = [ token_seq[ 1 : ] for token_seq in tokenized_mar_lines ]

# padded_mar_lines is rebound here to the padded *target* sequences.
padded_mar_lines = preprocessing.sequence.pad_sequences(
    decoder_target_data ,
    maxlen=max_output_length ,
    padding='post' ,
)
onehot_mar_lines = utils.to_categorical( padded_mar_lines , num_mar_tokens )
decoder_target_data = np.array( onehot_mar_lines )
print( 'Decoder target data shape -> {}'.format( decoder_target_data.shape ))

Decoder target data shape -> (10000, 11, 4771)


In [None]:
# Defining the Encoder-Decoder model
# Encoder: embeds the English token ids (256-dim, padding index 0 masked) and runs
# an LSTM with 128 units; only its final hidden and cell states are kept.
# Decoder: embeds the Marathi token ids the same way and runs a 128-unit LSTM that
# starts from the encoder states and returns the full output sequence; a softmax
# Dense layer maps every timestep to a distribution over the Marathi vocabulary.

# --- Encoder ---
encoder_inputs = layers.Input( shape=( None , ) )
encoder_embedding = layers.Embedding(
    num_eng_tokens , 256 , mask_zero=True
)( encoder_inputs )
encoder_lstm_result = layers.LSTM( 128 , return_state=True )( encoder_embedding )
encoder_outputs , state_h , state_c = encoder_lstm_result
encoder_states = [ state_h , state_c ]

# --- Decoder ---
decoder_inputs = layers.Input( shape=( None , ) )
decoder_embedding = layers.Embedding(
    num_mar_tokens , 256 , mask_zero=True
)( decoder_inputs )
decoder_lstm = layers.LSTM( 128 , return_state=True , return_sequences=True )
# The decoder's own final states are not needed while training.
decoder_outputs , _ , _ = decoder_lstm(
    decoder_embedding , initial_state=encoder_states
)
decoder_dense = layers.Dense( num_mar_tokens , activation=activations.softmax )
output = decoder_dense( decoder_outputs )

# --- Full training model: (English ids, Marathi ids) -> next-token distribution ---
model = models.Model( [ encoder_inputs , decoder_inputs ] , output )
model.compile(
    optimizer=tf.keras.optimizers.RMSprop() ,
    loss='categorical_crossentropy' ,
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    609792      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    1221376     ['input_2[0][0]']                
                                                                                              

In [None]:
# Training
# Fit the encoder-decoder model: the inputs are the padded English sequences and the
# padded Marathi sequences (with start/end markers); the target is the one-hot
# Marathi sequence with its first token removed. Training runs for 50 epochs with
# a batch size of 250, after which the model is written to 'model.h5'.
model.fit(
    x=[ encoder_input_data , decoder_input_data ] ,
    y=decoder_target_data ,
    batch_size=250 ,
    epochs=50 ,
)
model.save( 'model.h5' )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
