# Devanagari Phonetic Dictionary

#### GPU

In [None]:
#restricts the use of CUDA devices to only the first GPU in the system. This is useful when working with multi-GPU systems and wanting to limit the use of specific devices.

%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


#### Download Data

In [None]:

pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:

pip install tqdm

Note: you may need to restart the kernel to use updated packages.


#### Import Stuff

In [None]:
import nltk
import tqdm

# The collections module provides specialized container types; Counter is a dict
# subclass that counts how many times each element occurs in an iterable.
from collections import Counter

#The tqdm_notebook function is a version of tqdm specifically designed to work with Jupyter notebooks.
from tqdm import tqdm_notebook

# for working with multi-dimensional arrays and matrices.
import numpy as np

#for building and training deep neural networks.
import tensorflow as tf

#The seq2seq module contains a number of classes and functions for building sequence-to-sequence models, including encoder and decoder classes, attention mechanisms, and helper functions for training and inference.
from tensorflow.contrib import seq2seq

# tensorflow.contrib.rnn provides building blocks for recurrent neural network (RNN) models in TensorFlow.
# DropoutWrapper adds dropout to an RNN cell. Dropout is a regularization technique used to prevent overfitting: during training it randomly drops (i.e., sets to zero) some fraction of a layer's outputs.
from tensorflow.contrib.rnn import DropoutWrapper

#module is a built-in Python library that provides functions for generating random numbers and selecting random items from lists.
import random

In [None]:
tf.reset_default_graph()

In [None]:
#downloads the punkt dataset from the Natural Language Toolkit
#The punkt dataset contains pre-trained models and data for tokenizing natural language text into individual words and sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Global Parameters

In [None]:
#"MAX_SEQ_LEN" and "BATCH_SIZE". These constants are used later to define the maximum length of input
#and output sequences and the number of sentences per batch.

MAX_SEQ_LEN = 20
BATCH_SIZE = 64

### Language Vocabulary 
* (Vocab of characters, i.e. an Alphabet)

In [None]:
# The Lang class provides methods for encoding and decoding strings using a vocabulary of symbols (here: characters) learned from the text corpus.
# Its constructor takes a Counter object holding the frequency of each symbol in the corpus, and an integer vocab_size giving the maximum number of symbols to include in the vocabulary.
# The constructor initializes the symbol-to-id and id-to-symbol mappings, including the special tokens for padding, start-of-sequence, end-of-sequence, and unknown symbols.





class Lang:
    """Symbol vocabulary with integer encoding and decoding helpers.

    The vocabulary is built from a ``collections.Counter``: every key of the
    counter becomes one symbol. Strings are encoded by iterating over them
    character by character, so the symbols are expected to be single
    characters.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; regular symbols receive ids from 4 upwards, in
    order of decreasing frequency.
    """

    def __init__(self, counter, vocab_size):
        """Build the symbol <-> id lookup tables.

        Args:
            counter: ``Counter`` mapping each symbol to its frequency.
            vocab_size: maximum number of most-frequent symbols to keep
                (the four special tokens are not counted).
        """
        # Special tokens and their fixed ids.
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3

        self.word2id = {
            self.pad: self.ipad,
            self.sos: self.isos,
            self.eos: self.ieos,
            self.unk: self.iunk,
        }
        self.id2word = {idx: tok for tok, idx in self.word2id.items()}

        # Regular symbols follow the special tokens, most frequent first.
        for curr_id, (w, _) in enumerate(counter.most_common(vocab_size), start=4):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w

    def _encode(self, s):
        """Map each character of the stripped string to its id (or ``iunk``)."""
        return [self.word2id.get(w, self.iunk) for w in s.strip()]

    def encodeSentence(self, s, max_len=-1):
        """Encode ``s`` as a list of symbol ids.

        Args:
            s: string to encode; leading/trailing whitespace is stripped.
            max_len: ``-1`` (default) returns the raw ids with no EOS and no
                padding. Otherwise EOS is appended and the result is padded
                with ``ipad`` / truncated to exactly ``max_len`` entries.

        Returns:
            List of integer ids.
        """
        ids = self._encode(s)
        if max_len == -1:
            return ids
        return (ids + [self.ieos] + [self.ipad] * max_len)[:max_len]

    def encodeSentence2(self, s, max_len=-1):
        """Encode ``s`` and also report the unpadded sequence length.

        Args:
            s: string to encode; leading/trailing whitespace is stripped.
            max_len: fixed output length. With the default ``-1`` the sequence
                is returned unpadded and untruncated, EOS included.

        Returns:
            ``(length, ids)`` where ``length`` counts the symbols plus the EOS
            token, capped at ``max_len``, and ``ids`` is the id list.
        """
        ids = self._encode(s) + [self.ieos]
        if max_len == -1:
            # Previously this path returned a length of -1 and sliced the EOS
            # off again; report the true length instead.
            return len(ids), ids
        return min(max_len, len(ids)), (ids + [self.ipad] * max_len)[:max_len]

    def decodeSentence(self, id_seq):
        """Turn a sequence of ids back into a string.

        Decoding stops at the first EOS id (if any). The ``<UNK>`` token is
        rendered as ``UNK``.

        Args:
            id_seq: iterable of ids (a list or a NumPy array).

        Returns:
            The decoded string.

        Raises:
            KeyError: if an id before the first EOS is not in the vocabulary.
        """
        # list() makes NumPy arrays safe here: "array + [eos]" would add
        # element-wise instead of appending.
        ids = list(id_seq)
        if self.ieos in ids:
            ids = ids[:ids.index(self.ieos)]
        s = ''.join(self.id2word[x] for x in ids)
        return s.replace(self.unk, "UNK")

In [None]:
# Total number of samples to read
N = 7877

### Reading the data files
- Each line contains one Hindi word in two tab-separated forms: its Devanagari spelling and its Latin-script (romanized) transliteration

In [None]:
# Read the parallel corpus: every line holds one tab-separated pair. The first
# field is stored in en_sentences and the second in hi_sentences.
# NOTE: despite the variable names, the recorded output of this notebook shows
# the `en` side holding Devanagari text and the `hi` side holding its Latin
# transliteration.

hi_sentences = []
en_sentences = []
with open("sorted_mapped_output_female_16_20.txt", encoding="utf8") as f:
    for line in tqdm_notebook(f, total=N, desc="Reading file:"):
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # would otherwise crash the two-way unpacking below.
            continue
        en, hi = line.split("\t")
        hi_sentences.append(hi)
        en_sentences.append(en)

# Character frequencies per side. This runs after the file has been closed,
# since it only needs the in-memory lists.
hi_counter = Counter()
for line in tqdm_notebook(hi_sentences, desc="Processing inputs:"):
    hi_counter.update(line.strip())

en_counter = Counter()
for line in tqdm_notebook(en_sentences, desc="Processing inputs:"):
    en_counter.update(line.strip())

print(hi_counter)
print(en_counter)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


Reading file::   0%|          | 0/7877 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


Processing inputs::   0%|          | 0/7877 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


Processing inputs::   0%|          | 0/7877 [00:00<?, ?it/s]

Counter({'a': 11954, 'A': 7318, 'M': 2981, 'h': 2867, 'r': 2806, 't': 2630, 'k': 2584, 'E': 2515, 'y': 2508, 's': 2303, 'l': 2152, 'c': 2058, 'I': 1840, 'n': 1757, 'p': 1718, 'v': 1667, 'O': 1531, 'i': 1438, 'd': 1256, 'm': 1197, 'g': 1025, 'u': 914, 'b': 895, 'j': 760, 'D': 703, 'T': 690, 'N': 598, 'U': 490, 'L': 484, 'S': 111, 'o': 76, 'e': 26, 'H': 2, 'x': 2})
Counter({'ा': 6958, '्': 3439, 'ं': 2982, 'र': 2760, 'य': 2482, 'त': 2433, 'े': 2433, 'क': 2262, 'ल': 2152, 'च': 2052, 'ी': 1829, 'न': 1742, 'व': 1667, 'स': 1641, 'ो': 1510, 'प': 1498, 'ि': 1289, 'म': 1165, 'द': 976, 'ग': 784, 'ज': 705, 'ड': 692, 'श': 662, 'ु': 654, 'ट': 620, 'ण': 598, 'ब': 573, 'ह': 521, 'ू': 484, 'ळ': 484, 'आ': 368, 'भ': 322, 'ख': 322, 'ध': 280, 'अ': 258, 'घ': 241, 'उ': 238, 'थ': 197, 'फ': 194, 'ष': 111, 'ए': 82, 'ठ': 70, 'ॉ': 65, 'इ': 63, 'ॅ': 56, 'ै': 44, 'ृ': 43, 'झ': 40, 'ऱ': 40, 'ँ': 32, 'ओ': 23, 'ौ': 21, 'ञ': 15, 'ऑ': 12, 'ढ': 11, 'ई': 11, 'छ': 7, 'ऊ': 6, '़': 4, 'य़': 4, 'ः': 2, 'ऋ': 2, '।': 1, 'औ': 1}

In [None]:
# A few sample hindi characters
print("Most common hi characters in dataset:\n", hi_counter.most_common(10))

print("\nTotal (hi)characters gathered from dataset:",len(hi_counter))

# A few sample english characters
print("\nMost common en characters in dataset:\n", en_counter.most_common(10))

print("\nTotal (en)characters gathered from dataset:", len(en_counter))

Most common hi characters in dataset:
 [('a', 11954), ('A', 7318), ('M', 2981), ('h', 2867), ('r', 2806), ('t', 2630), ('k', 2584), ('E', 2515), ('y', 2508), ('s', 2303)]

Total (hi)characters gathered from dataset: 34

Most common en characters in dataset:
 [('ा', 6958), ('्', 3439), ('ं', 2982), ('र', 2760), ('य', 2482), ('त', 2433), ('े', 2433), ('क', 2262), ('ल', 2152), ('च', 2052)]

Total (en)characters gathered from dataset: 64


In [None]:
# Passed through the class to initiate the attributes
en_lang = Lang(en_counter, len(en_counter))
hi_lang = Lang(hi_counter, len(hi_counter))

In [None]:
print("Test en encoding:", en_lang.encodeSentence("शुक्रिया"))

print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence("शुक्रिया", 10)))

print("Test hindi encoding:", hi_lang.encodeSentence("Shukriya", 10))

print("Test hindi decoding:", hi_lang.decodeSentence((hi_lang.encodeSentence("Shukriya", 10))))

Test en encoding: [26, 27, 11, 5, 7, 20, 8, 4]
Test en decoding: शुक्रिया
Test hindi encoding: [33, 7, 25, 10, 8, 21, 12, 4, 2, 0]
Test hindi decoding: Shukriya


In [None]:
#The variables VE and VH represent the vocabulary sizes of the English and Hindi language models

VE = len(en_lang.word2id)
VH = len(hi_lang.word2id)

### The Seq2Seq architecture


#### Character Embedding Matrix

In [None]:
# Embedding matrices for the two vocabularies: one 300-dimensional row per
# vocabulary id (VE rows for the `en` vocabulary, VH rows for the `hi` one).
# Both vocabularies are character-level, because Lang encodes strings one
# character at a time, so these are character embeddings.
# tf.get_variable() creates a variable with the given name and shape; the dtype
# argument sets the type of the stored values (tf.float32 here).
# By default these variables are trainable, i.e. their values can be updated during training.

en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (VE, 300), dtype=tf.float32)
hi_word_emb_matrix = tf.get_variable("hi_word_emb_matrix", (VH, 300), dtype=tf.float32)

#### Placeholders
- Inputs are fed to a TensorFlow graph through placeholders

In [None]:
#keep_prob: A scalar placeholder with tf.float32 datatype, which represents the dropout keep probability. Dropout is a regularization technique used to prevent overfitting in neural networks. The keep_prob placeholder is used to pass the dropout keep probability as a feed_dict during training.

#input_ids: A 2D placeholder tensor with tf.int32 datatype and shape (None, MAX_SEQ_LEN). This placeholder is used to pass the input sequences to the model during training and inference. MAX_SEQ_LEN is the maximum sequence length of the input sequences and None indicates that the batch size can be variable.

#input_lens: A 1D placeholder tensor with tf.int32 datatype and shape (None,). This placeholder is used to pass the length of each input sequence to the model during training and inference. None indicates that the batch size can be variable.

#ph_target_ids: A 2D placeholder tensor with tf.int32 datatype and shape (None, MAX_SEQ_LEN). This placeholder is used to pass the target sequences to the model during training. MAX_SEQ_LEN is the maximum sequence length of the target sequences and None indicates that the batch size can be variable.

#target_lens: A 1D placeholder tensor with tf.int32 datatype and shape (None,). This placeholder is used to pass the length of each target sequence to the model during training. None indicates that the batch size can be variable.


keep_prob = tf.placeholder(tf.float32)

input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

ph_target_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
target_lens = tf.placeholder(tf.int32, (None, ))

In [None]:
# Prepend the SOS (a.k.a. GO) symbol to every target sequence.
# tf.fill() creates a tensor of shape [BATCH_SIZE, 1] filled with hi_lang.isos;
# concatenating it in front of ph_target_ids along the last axis gives
# target_ids MAX_SEQ_LEN + 1 columns.
# NOTE: the batch dimension is hard-coded to BATCH_SIZE here, so ph_target_ids
# must always be fed with exactly BATCH_SIZE rows.

target_ids = tf.concat([tf.fill([BATCH_SIZE,1], hi_lang.isos), ph_target_ids], -1)

#### Building the computation graph

In [None]:
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
target_emb = tf.nn.embedding_lookup(hi_word_emb_matrix, target_ids[:, :-1])

In [None]:
#  the shape of input_emb tensor would be (batch_size, max_seq_len, embedding_dim)
input_emb.shape

TensorShape([Dimension(None), Dimension(20), Dimension(300)])

#### Encoder - RNN based sequence encoder

In [None]:
encoder_cell = tf.nn.rnn_cell.GRUCell(128) # The 128 argument specifies the number of hidden units in the GRU cell.
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob) #  Dropout is a regularization technique that randomly drops out (sets to zero) some of the neuron activations during training to prevent overfitting

In [None]:
# enc_outputs: It is a tensor of shape (batch_size, max_seq_len, hidden_units), 
#where hidden_units is the number of hidden units in the GRU cell (128 in this case). 
#It contains the hidden state of the encoder at each time step for each input sequence in the batch. 
# enc_state: It is a tensor of shape (batch_size, hidden_units), which contains the final hidden state of the encoder for each input sequence in the batch. This final hidden state is typically used as the initial state of the decoder.

enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, # The encoder GRU cell
    input_emb, # Embedded input sequence
    sequence_length=input_lens, # Sequence lengths of individual inputs in a batch
    initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)



In [None]:
# Confirm the shape of the final hidden state
enc_state.shape

TensorShape([Dimension(64), Dimension(128)])

#### Decoder

In [None]:
#  The output_keep_prob parameter specifies the probability that each output element will be kept during training. In other words, it controls the dropout rate for the output of the cell.

decoder_cell = tf.nn.rnn_cell.GRUCell(128)
decoder_cell = DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)

#### Decoder to Output Vocab Projection Layer

In [None]:
# a fully connected layer is being created using the Dense class from the TensorFlow layers module.
output_projection = tf.layers.Dense(len(hi_lang.word2id))

#### Decoder Training Helper

In [None]:
# Training-time decoder: TrainingHelper reads the decoder inputs from target_emb
# (the embedded targets shifted right by the SOS column), and the decoder starts
# from the encoder's final state.
helper = seq2seq.TrainingHelper(target_emb, target_lens)
decoder = seq2seq.BasicDecoder(decoder_cell, helper, enc_state, output_projection)

# `outputs` holds the decoder outputs and `outputs_lens` the decoded length of each sequence in the batch.
outputs, _, outputs_lens = seq2seq.dynamic_decode(decoder, maximum_iterations=MAX_SEQ_LEN, 
                                                  impute_finished=False, swap_memory=True) # impute_finished=True would copy states through and zero the outputs for batch entries that are already finished; it is disabled here. swap_memory enables GPU-CPU memory swapping for the decoding loop.

# Longest decoded length in the batch; used below to size the loss mask and trim the targets.
output_max_len = tf.reduce_max(outputs_lens)



#### And Decoder Inference Helper

In [None]:
# Inference-time decoder. NOTE: this reuses the same DropoutWrapper-wrapped
# decoder_cell as training, so dropout is only inactive when keep_prob is fed
# as 1.0 (which the evaluation and transliteration code below does).
# GreedyEmbeddingHelper starts every sequence from SOS, feeds the embedding of
# the arg-max output back in as the next input, and uses EOS as the end token.
infer_helper = seq2seq.GreedyEmbeddingHelper(hi_word_emb_matrix, tf.fill([BATCH_SIZE, ], hi_lang.isos), hi_lang.ieos)
infer_decoder = seq2seq.BasicDecoder(decoder_cell, infer_helper, enc_state, output_projection)
infer_output = seq2seq.dynamic_decode(infer_decoder, maximum_iterations=MAX_SEQ_LEN, swap_memory=True)



#### Loss and Optimizers

In [None]:
# Sequence mask:
# 1.0 for positions inside each target's length and 0.0 beyond it, so that
# padded positions do not contribute to the loss (and hence to the gradients).
masks = tf.sequence_mask(target_lens, output_max_len, dtype=tf.float32, name='masks')

# Loss function - weighted softmax cross entropy.
# outputs[0] is the decoder's rnn_output (the projected logits); the targets
# drop the leading SOS column and are trimmed to the longest decoded length.
cost = seq2seq.sequence_loss(
    outputs[0],
    target_ids[:, 1:(output_max_len + 1)],
    masks)

# Optimizer: Adam with a learning rate of 1e-4.
optimizer = tf.train.AdamOptimizer(0.0001)

In [None]:
train_op = optimizer.minimize(cost)

In [None]:
# the global variables are added to a computation graph, but their values are not initialized until the graph is executed.
init = tf.global_variables_initializer()

#### Tensorflow Sessions

In [None]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [None]:
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)



#### Minibatch Training + Validation
- Performance Evaluation using BLEU scores

In [None]:
random.seed(41)

In [None]:
parallel = list(zip(en_sentences, hi_sentences))

In [None]:
random.shuffle(parallel)

In [None]:
parallel[1000]

('काडटना', 'kADaTanA')

In [None]:
train_n = int(0.95*N)
valid_n = N - train_n

In [None]:
train_pairs = parallel[:train_n].copy()
valid_pairs = parallel[train_n:]

In [None]:
def small_test():
    """Print the mean sentence-level BLEU score over the validation pairs.

    Validation pairs are processed in full batches of BATCH_SIZE (the graph
    has a fixed batch dimension); a trailing partial batch is ignored. Each
    source string is greedily decoded with dropout disabled and compared with
    its reference. Reference and prediction are passed to NLTK as plain
    strings, so the n-grams are presumably taken over characters.
    """
    bleu = nltk.translate.bleu_score
    smoother = bleu.SmoothingFunction().method7
    scores = []

    # Only start offsets that leave room for a complete batch.
    for start in range(0, valid_n - BATCH_SIZE + 1, BATCH_SIZE):
        ids_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for row in range(BATCH_SIZE):
            length, ids = en_lang.encodeSentence2(
                valid_pairs[start + row][0], MAX_SEQ_LEN)
            ids_batch[row, :] = ids
            lens_batch[row] = length

        # Targets are not fed: inference decodes greedily from the encoder state.
        sample_ids = sess.run(
            infer_output[0].sample_id,
            feed_dict={
                input_ids: ids_batch,
                input_lens: lens_batch,
                keep_prob: 1.0,
            })

        for row, predicted in enumerate(sample_ids):
            hypothesis = hi_lang.decodeSentence(list(predicted))
            reference = valid_pairs[start + row][1]
            try:
                score = bleu.sentence_bleu(
                    [reference],
                    hypothesis,
                    weights=[1/4]*4,
                    smoothing_function=smoother)
            except ZeroDivisionError:
                score = 0
            scores.append(score)

    print(f"BLEU Score: {np.mean(scores)}")

In [None]:
saver = tf.train.Saver()

# Train for 70 epochs over full batches of BATCH_SIZE pairs. A checkpoint is
# written after every epoch.
for _e in range(70):
    # Mix things up a bit.
    random.shuffle(train_pairs)
    pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
    batch_loss = 0
    bxi = 0
    for m in pbar:
        n = m + BATCH_SIZE
        if n > train_n:
            # The graph has a fixed batch dimension, so the incomplete final
            # batch of the epoch is skipped.
            continue

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b, a = en_lang.encodeSentence2(train_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i-m, :] = a
            input_lens_batch[i-m] = b

            b, a = hi_lang.encodeSentence2(train_pairs[i][1], MAX_SEQ_LEN)
            target_batch[i-m, :] = a
            target_lens_batch[i-m] = b

        feed_dict = {
            input_ids: input_batch,
            input_lens: input_lens_batch,
            ph_target_ids: target_batch,
            target_lens: target_lens_batch,
            keep_prob: 0.8
        }
        # Fetch the loss in the same run as the update step. Running `cost`
        # in a second sess.run repeats the forward pass (with a different
        # dropout sample) just to report a number.
        _, step_loss = sess.run([train_op, cost], feed_dict=feed_dict)
        batch_loss += step_loss
        bxi += 1
        pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/bxi:2.2F}:")
        # With about 117 batches per epoch this fires once per epoch, at the
        # 99th batch.
        if (1 + n//BATCH_SIZE) % 100 == 0:
            small_test()

    # NOTE(review): the first save uses "ModelOutputs" as a checkpoint prefix,
    # the second writes per-epoch checkpoints into a "ModelOutputs" directory
    # - confirm both are wanted.
    saver.save(sess, 'C:/Users/Vaibhav/Desktop/data/ModelOutputs')
    saver.save(sess, f'C:/Users/Vaibhav/Desktop/data/ModelOutputs/model_{_e}')



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.01955027078264301


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.060994858643850514


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.11542191180109107


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.1525390244916027


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.1714182470242307


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.18232115814426575


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.18626336626731255


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.1948035644019183


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2008163796729574


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.20869660749675253


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2162474553607648


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2192445568312429


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.22418407606930227


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.23753884846963388


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.23805692814003362


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.24232570962875846


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2511248081943391


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.24915593988385634


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.25787318497331047


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2614290358968383


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.27176945666940516


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2662785985991072


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.27364929501081403


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.27837869744617283


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.27998495465953216


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.2830538142660309


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.30318950254467586


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.308692604803602


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.31865452234291036


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.31286476038560235


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.32123687642250626


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.3331398113519698


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.32734761071338087


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.33638346704655686


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.3478019570414446


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.3564424163381694


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.36876354051191096


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.36813060255035374


  0%|          | 0/117 [00:00<?, ?it/s]

BLEU Score: 0.3788488470715002


  0%|          | 0/117 [00:00<?, ?it/s]

In [None]:
saver = tf.train.Saver()
saver.restore(sess, "C:/Users/Vaibhav/Desktop/data/ModelOutputs/model_69")


Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:/Users/Vaibhav/Desktop/data/ModelOutputs/model_69


### Let's see some real translation examples now!

In [None]:
def transliterate(s):
    """Run the trained model on a single string and return the decoded output.

    The graph has a fixed batch dimension, so ``s`` is placed in row 0 of an
    otherwise all-zero batch of BATCH_SIZE rows and only the first prediction
    is decoded. Dropout is disabled by feeding ``keep_prob`` as 1.0.

    Args:
        s: source string, encoded with ``en_lang``.

    Returns:
        The greedy prediction decoded with ``hi_lang``.
    """
    length, ids = en_lang.encodeSentence2(s, MAX_SEQ_LEN)

    batch_ids = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    batch_lens = np.zeros((BATCH_SIZE,), dtype=np.int32)
    batch_ids[0, :] = ids
    batch_lens[0] = length

    sample_ids = sess.run(
        infer_output[0].sample_id,
        feed_dict={
            input_ids: batch_ids,
            input_lens: batch_lens,
            keep_prob: 1.0,
        })
    return hi_lang.decodeSentence(list(sample_ids[0]))

In [None]:
x="भारतीय जनता पक्षाची तीन दिसांची चिंतन बसका शिमलाचे थंडसाणींत काल सोंपली. बसकेंतले चर्चे परस आदल्या संरक्षण मंत्र्याक धांवडावपाचे करणेचेच चड पडसाद उठले. बसकेचे दोन दीस पयलीं राष्ट्रीय स्वयंसेवक संघाचे सरसंघचालक मोहन भागवतान पक्षान तरणाट्या रगताक वाव दिवपाक जाय म्हूण सांगत लालकृष्ण अडवाणी आनी तांचे पिरोयेच्या फुडाऱयांक कावलांत वचपाची शिटकावणी दिल्ली. संघाच्या राजकारणांत दरेके चाली खातीर म्हूर्त सोदतनाय जतनाय घेतात. देखून भागवत हांणी सोदिल्ल्या म्हुर्ताक व्हड म्हत्व आसा. हालींच जाल्ले लोकसभा वेंचणुकेंत भाजपाचो जो पराभव जालो ताची मिमांसा जावची आनी ते खातीर जापसालदार आशिल्ल्यांक तांची सुवात दाखोवची अशी मागणी जाताली. त्या यत्नांत लालकृष्ण अडवाणीन आडमेळीं हाडलीं. ताचेय फुडें वचत तांणी पक्षाच्या प्रचाराचें नियंत्रण करतल्या अरूण जेटलीक राज्यसभेचो विरोधी पक्ष फुडारी करून प्रशस्तीच दिली. पराभवाची मिमांसा आपल्याच आंगलट येवपाक शकता हें वळखून अडवाणी आक्रमक जाल्यात आनी पक्षान तांचे मुखार दिमी घाल्या अशें चित्र दिसपाक लागिल्लें. तें चित्र पयस करपाक पक्षाध्यक्ष राजनाथ सिंगान राजस्थानांतल्यान आपलें वेगळें राजकारण चालीक लायलें आनी अडवाणींची शिश्या वसुंधरा राजेंचो राजिनामो मागलो. वेंचणुकेंतल्या अपेसा वयले भासाभासेचो प्रस्न अजून सुटावो जावंक ना हो संदेश अडवाणीं मेरेन पावोवपाचो तांचो हो यत्न आशिल्लो. अर्थात तो संकेत पावलो आनी एके पत्रकार परिशदेचें निमित्त करीत अडवाणीन चिंतन बसकेंत आमी वेंचणुके विशीं न्हय तर मुखा वयले वाटे विशीं उलयतले अशें सांगून राजनाथाच्या रथाक जमनीर हाडपाची चाल खेळ्ळे. तिका प्रतिशह दिलो मोहन भागवतान. चिंतन बसकेंत पराभवाचेरूय भासाभास जावपाक जाय म्हूण सांगत तांणी अडवाणींचो अॅजेंडा आनी पर्यायान खुद्द अडवाणीच कालबाह्म जावपाक लागल्यात म्हणपाचे संकेत दिल्यात"
# Transliterate the text one whitespace-separated word at a time; every
# transliterated word is followed by a single space.
y = x.split()

pieces = []
for i in y:
    pieces.append(transliterate(i) + ' ')
string1 = ''.join(pieces)

print(string1)
    

bhAratIya janatA pakSAcI tIna disAMcI ciMtana basakA shisavAcO thaMDajasAMtI kAla sOMpalIya basakEMtalE carcE parasa AdalyA saMrakSaNa maMtryAka dhAMvaDAvapArEM karaNEcacE caDa paDasAda uThalEM basakEcE dOna dIsa payalIM rASTrIya svayadAMtalETa saMghAcE sarasaMgalhEka mOhana bhAgavatAna pakSAna taraNATyA ragatAka vAva divapAka jAya mhUNa sAMgata lAlakrmal aDavANI AnI tAMcE prilyAM phuDAyakAM kAvalAMta vacapAcI shiTakAvaNI dillIM saMghAcyA rAjakAraNAMta darEkE cAlI khAtIra mhUrta sOdatanAya jatanAya ghEtAta dEkhUna bhAgavata hAMNI sOdillyA mhurAtakO vhaDa mhatva AsAna hAlIMca jAllE lOkasabhA vEMcaNukEMta bhAjapAcO jO parAbhava jAlO tAcI mimAMsA jAvacI AnI tE khAtIra jApasAlarAna AshillyAMka tAMcI suvAta dAkhOvacI ashI mAgaNI jAtAlIta tyA yatnAMta lAlakrmal aDavANIna ADamELIM hADalIMta tAcEya phuDEM vacata tAMNI pakSAcyA pracArAcEM niyaMtraNa karatalyA arUNa jETalIM rAjyasabhEcO virOdhI pakSa phuDArI karUna prashastIca dilIcya parAbhavAcI mimAMsA ApalyAca AMgalaTa yEvapAka shakatA hEM va