In [0]:
%matplotlib inline

debug = False

In [59]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
SOS_token = 0
EOS_token = 1
UNK_token = 2
# MAX_LENGTH = 1000

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"UNK"}
        self.n_words = 3  # Count SOS and EOS and UNK

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [0]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def Reverse(lst): 
    return [ele for ele in reversed(lst)]

In [0]:
def readLangs(lang1, lang2, reverse=False, model="dev"):
    
    if model == "dev":
        source = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/dev.en"
        target = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/dev.hi"
    else:
        source = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/train.en"
        target = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/train.hi"
    
    eng_lines = open(source, encoding='utf-8').read().strip().split('\n')
    hin_lines = open(target, encoding='utf-8').read().strip().split('\n')

    lines = []
    for i in range(len(eng_lines)):
        lines.append(eng_lines[i] + '\t' + hin_lines[i])

    # Split every line into pairs and normalize
    # pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    pairs = [[s for s in l.split('\t')] for l in lines]
    #print(pairs)
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [0]:
MAX_LENGTH = 20
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH 


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [0]:
def prepareData(lang1, lang2, reverse=False, model="dev", filter_sentence=False):
    
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse, model)
    #print(input_lang, output_lang, pairs)
    if(filter_sentence):
        pairs = filterPairs(pairs)
    
    print("Read %s sentence pairs" % len(pairs))
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    return input_lang, output_lang, pairs

In [66]:
input_lang, output_lang, pairs = prepareData('eng', 'hi', reverse=False, model="dev", filter_sentence=True)
print(random.choice(pairs))
orig_pairs = []
for p in pairs:
    temp = str(p[0]).split(' ')
    temp = Reverse(temp)
    listToStr = ' '.join([str(elem) for elem in temp]) 
#     print("listToStr")
#     print(listToStr)
    listToStr = str(listToStr)
    orig_pairs.append(listToStr+'\t'+str(p[1]))
#     print(orig_pairs[-1])
print("orig_pairs")
print(orig_pairs[0])
print(random.choice(orig_pairs))

Read 256 sentence pairs
Trimmed to 256 sentence pairs
Counting words...
Counted words:
eng 1117
hi 1170
['A person starts falling ill again and again and continuously goes on weakening .', 'व्यक्ति बार - बार बीमार पड़ने लगता है और निरन्तर कमजोर होता जाता है ।']
orig_pairs
. diseases eye from safe be also will you but	बल्कि आप नेत्ररोगों से भी बचे रहेंगे ।
. Darjeeling of mountains the among low very placed are places these Both	ये दोनों ही स्थान दार्जिलिंग की पहाड़ियों के बीच बहुत नीचे विराजमान हैं ।


The Encoder
-----------

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.





In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        # self.gru = nn.GRU(self.hidden_size, hidden_size, batch_first=False, bidirectional=True)
        self.gru = nn.GRU(self.hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Attention Decoder
---------------------





In [0]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)

        self.dropout = nn.Dropout(self.dropout_p)
        # self.gru = nn.GRU(self.hidden_size, self.hidden_size, batch_first=False, bidirectional=True)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

        self.attn_coverage = nn.Linear(self.max_length, self.hidden_size)
        self.attn_coverage_cat = nn.Linear(self.hidden_size*3, self.max_length)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
       
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)

        # Q4 Chnage
        lcl_cumulative_sum = torch.cumsum(attn_weights, -1)
        lcl_Y1 = self.attn_coverage(lcl_cumulative_sum)
        lcl_Y2 = torch.cat((torch.cat((embedded[0], hidden[0]), 1), lcl_Y1),1)
        attn_weights_lcl = self.attn_coverage_cat(lcl_Y2)
        attn_weights = F.softmax(attn_weights_lcl, dim = 1)
        # Q4 Chnage

        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we will need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the
EOS token to both sequences.




In [0]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_token for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but `when the trained
network is exploited, it may exhibit
instability <http://minds.jacobs-university.de/sites/default/files/uploads/papers/ESNTutorialRev.pdf>`__.

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can "pick
up" the meaning once the teacher tells it the first few words, but it
has not properly learned how to create the sentence from the translation
in the first place.

Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
``teacher_forcing_ratio`` up to use more of it.




In [0]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            # For Paper 2 
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            
            
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            # For Paper 2 
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.




In [0]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [0]:
loss_list = []
epoch_list = []

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [0]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            
            loss_list.append(print_loss_avg)
            epoch_list.append(iter)
            print('epoch = ',epoch_list[-1],'  loss = ',loss_list[-1])

            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.




In [0]:
def showPlot(loss_list, epoch_list):
    plt.plot(epoch_list, loss_list)
    plt.xticks(np.arange(0, 75000, 10000)) 
    plt.yticks(np.arange(0, 5, 0.5)) 
    plt.savefig("test.png")
    plt.show()
    plt.close('all')

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [0]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            try:
              if topi.item() == UNK_token:
                  decoded_words.append('<UNK>')
              if topi.item() == EOS_token:
                  decoded_words.append('<EOS>')
                  break
              else:
                  decoded_words.append(output_lang.index2word[topi.item()])
            except:
              continue
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [0]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(orig_pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [83]:

print(output_lang.n_words)
print(input_lang.n_words)
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 25000, print_every=1000)

1170
1117
epoch =  1000   loss =  4.617335979790813
0m 34s (- 13m 54s) (1000 4%) 4.6173
epoch =  2000   loss =  4.439340661458477
1m 9s (- 13m 20s) (2000 8%) 4.4393
epoch =  3000   loss =  3.812911378598869
1m 46s (- 12m 57s) (3000 12%) 3.8129
epoch =  4000   loss =  2.618166730167369
2m 22s (- 12m 28s) (4000 16%) 2.6182
epoch =  5000   loss =  1.4545907392377073
2m 59s (- 11m 58s) (5000 20%) 1.4546
epoch =  6000   loss =  0.5849378019418099
3m 38s (- 11m 31s) (6000 24%) 0.5849
epoch =  7000   loss =  0.20838278787206815
4m 16s (- 11m 0s) (7000 28%) 0.2084
epoch =  8000   loss =  0.08876294900185863
4m 55s (- 10m 28s) (8000 32%) 0.0888
epoch =  9000   loss =  0.042397074776221226
5m 33s (- 9m 52s) (9000 36%) 0.0424
epoch =  10000   loss =  0.03163008247664407
6m 12s (- 9m 18s) (10000 40%) 0.0316
epoch =  11000   loss =  0.026206732651063613
6m 50s (- 8m 42s) (11000 44%) 0.0262
epoch =  12000   loss =  0.021596345053253693
7m 29s (- 8m 6s) (12000 48%) 0.0216
epoch =  13000   loss =  0.0

Training and Evaluating
=======================

With all these helper functions in place (it looks like extra work, but
it makes it easier to run multiple experiments) we can actually
initialize a network and start training.

Remember that the input sentences were heavily filtered. For this small
dataset we can use relatively small networks of 256 hidden nodes and a
single GRU layer. After about 40 minutes on a MacBook CPU we'll get some
reasonable results.

.. Note::
   If you run this notebook you can train, interrupt the kernel,
   evaluate, and continue training later. Comment out the lines where the
   encoder and decoder are initialized and run ``trainIters`` again.




In [84]:
no_of_epoch = 25000
no_hidden_states = 256
model_type = "AttnDecoder"
model_name_dec =  model_type+"_Model_"+str(no_of_epoch)+"_"+str(no_hidden_states)
torch.save(attn_decoder1.state_dict(), model_name_dec)
# device = torch.device('cpu')
decoder_model = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
decoder_model.load_state_dict(torch.load(model_name_dec, map_location=device))

<All keys matched successfully>

In [0]:
datafile = "dev_"
# epochs
task = "Project_"
model_name = task+datafile+str(no_of_epoch)+"_"+str(no_hidden_states)+".encoder"

torch.save(encoder1.state_dict(), model_name)
device = torch.device('cpu')
# encoder_model = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# encoder_model.load_state_dict(torch.load(model_name, map_location=device))


model_name = task+datafile+str(no_of_epoch)+"_"+str(no_hidden_states)+".attndecoder"
torch.save(attn_decoder1.state_dict(), model_name)
device = torch.device('cpu')
# decoder_model = DecoderRNN(hidden_size, output_lang.n_words).to(device)
# decoder_model.load_state_dict(torch.load(model_name, map_location=device))

# showPlot(plot_losses, plot_epoch)

In [86]:
device = torch.device('cuda')
evaluateRandomly(encoder1, attn_decoder1)

> .
=  
< ये व्यक्ति को एकाग्रचित नहीं होते हैं जो दिमाग को अलग है । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस में हुआ था । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस में हुआ था । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस मिनट तक भी अच्छा होगा । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस में हुआ था । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस करने के लिए । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस का किराया 18 हजार रुपये से भी कम । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ इस बार तम्बाकू की जा सकती है । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस से बचें । <EOS>

> .
=  
< इसका निर्माण उन्नीस , सौ बीस में हुआ है । <EOS>



In [87]:
print(epoch_list)
print(loss_list)

[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000]
[4.617335979790813, 4.439340661458477, 3.812911378598869, 2.618166730167369, 1.4545907392377073, 0.5849378019418099, 0.20838278787206815, 0.08876294900185863, 0.042397074776221226, 0.03163008247664407, 0.026206732651063613, 0.021596345053253693, 0.018867887643336356, 0.016042343215096323, 0.015650530460540046, 0.013359331532879482, 0.011873091339624007, 0.010913109689459702, 0.010103366740719558, 0.009110849553838568, 0.008479037834634343, 0.008041112830916772, 0.007583009874306449, 0.007052608959219093, 0.006790911465345444]


In [0]:
s = "epoch_list_"+model_name_dec+'.pkl' 
with open(s, 'wb') as f:
    pickle.dump(epoch_list, f)

s = "loss_list"+model_name_dec+'.pkl' 
with open(s, 'wb') as f:
    pickle.dump(loss_list, f)

In [91]:
print("after loading pickles")
s = "epoch_list_"+model_name_dec+'.pkl' 
with open(s, 'wb') as f:
    mynewlist = pickle.load(f)
    print(mynewlist)

s = "loss_list"+model_name_dec+'.pkl' 
with open(s, 'wb') as f:
    mynewlist = pickle.load(f)
    print(mynewlist)

after loading pickles


UnsupportedOperation: ignored

In [0]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction    

def calculate_bleu(pred_trg, real_trg):
    smoothie = SmoothingFunction().method4
    score = sentence_bleu(real_trg, pred_trg, smoothing_function=smoothie)
    return score 

def calculate_Result(encoder, decoder,lcl_pairs, n=50):
    device = torch.device('cuda')

    result_value_bleu_score = []
    
    for i in range(n):
        pair = random.choice(lcl_pairs)
        if debug:
          print('>', pair[0])
          print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        if debug:
          print('<', output_sentence)
        reference = [pair[1].split()]
        if debug:
          print('--', reference)
        output_words = output_words[:-1]
        temp  = []
        for ow in output_words:
          if ow!='':
            temp.append(ow)
        output_words = temp
        target_predicted = output_words
        
        if debug:        
          print('<<', output_words)
        
        score = calculate_bleu(target_predicted,reference)
        
        if debug:
          print("---Value",score)
        
        result_value_bleu_score.append((pair[0],pair[1].split(),target_predicted,score))

    return result_value_bleu_score

In [0]:
def readLangsTest(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines  
    
    # lines1 = open(io.StringIO(uploaded['dev.en'].decode('utf-8')),encoding='utf-8').read().strip().split('\n')
    # lines2 = open(io.StringIO(uploaded['dev.hi'].decode('utf-8')),encoding='utf-8').read().strip().split('\n')

#     lines1 = open('/content/gdrive/My Drive/nlpa/test.en',encoding='utf-8').read().strip().split('\n')
#     lines2 = open('/content/gdrive/My Drive/nlpa/test.hi',encoding='utf-8').read().strip().split('\n')

    test_source = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/dev.en"
    test_target = "/content/gdrive/My Drive/NLPA/NLA S20 - Assignment 2 Data/enghin/dev.hi"

    lines1 = open(test_source,encoding='utf-8').read().strip().split('\n')
    lines2 = open(test_target,encoding='utf-8').read().strip().split('\n')

    print("len lines1 ",len(lines1))
    print("len lines2 ",len(lines2))
    print("line2 example")
    print(lines2[0])
 
    lines = []
    for i in range(0,len(lines1)):
        lines.append(lines1[i]+' \t '+lines2[i])
    print("len(lines) ",len(lines))
    print("lines[0] ",lines[0])
    pairs = [[s for s in l.split('\t')] for l in lines]
    print("len(pairs) ",len(pairs))
    reverse = False
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs



In [100]:
def prepareDataTest(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangsTest(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareDataTest('eng', 'hi', True)
print(random.choice(pairs))

Reading lines...
len lines1  401
len lines2  401
line2 example
बल्कि आप नेत्ररोगों से भी बचे रहेंगे ।
len(lines)  401
lines[0]  but you will also be safe from eye diseases . 	 बल्कि आप नेत्ररोगों से भी बचे रहेंगे ।
len(pairs)  401
Read 401 sentence pairs
Trimmed to 245 sentence pairs
Counting words...
Counted words:
eng 1071
hi 1117
['If two elders are there then children less then 15 years of age absolutely free . ', ' और अगर दो बड़े हों तो 15 साल से कम उम्र के बच्चे बिलकुल फ्री ।']


In [101]:
global debug
debug = 1
device = torch.device('cuda')
result_value_bleu_score = calculate_Result(encoder1, attn_decoder1,pairs)
result_value_bleu_score_dict = {}
result_value_bleu_score_dict['result'] = result_value_bleu_score 
# torch.save(result_value_bleu_score_dict, train_result_data_path)
for item in result_value_bleu_score:
  if (item[3]>0):
    print(" Source Language ",item[0])
    print(" Input Target",item[1])
    print(" Output Target",item[2])
    print(" Score ",item[3])

> AIDS spreads from these . 
=  एड्स इनसे फैलता है ।
< प्रवेश कतरन जाँच जानते रहेंगे <EOS>
-- [['एड्स', 'इनसे', 'फैलता', 'है', '।']]
<< ['प्रवेश', 'कतरन', 'जाँच', 'जानते', 'रहेंगे']
---Value 0
> Themain attractions of Rome are it 's fountains . 
=  रोम के खास आकर्षण इसके फाउंटेंस हैं ।
< प्रवेश मानसिक नजर बादाम एंटीआक्सीडेंट प्रयास यूनिवर्सिटी रखने उसे मूर्तिकार माइकल फायदेमंद ईस्वी दिलचस्प रहेंगे <EOS>
-- [['रोम', 'के', 'खास', 'आकर्षण', 'इसके', 'फाउंटेंस', 'हैं', '।']]
<< ['प्रवेश', 'मानसिक', 'नजर', 'बादाम', 'एंटीआक्सीडेंट', 'प्रयास', 'यूनिवर्सिटी', 'रखने', 'उसे', 'मूर्तिकार', 'माइकल', 'फायदेमंद', 'ईस्वी', 'दिलचस्प', 'रहेंगे']
---Value 0
> Prevention of AIDS and propagation . 
=  एड्स से बचाव एवं प्रचार प्रसार ।
< आडियो सुनकर आपके कविता सा सुविधाजनक कहते पहचाने रहेंगे <EOS>
-- [['एड्स', 'से', 'बचाव', 'एवं', 'प्रचार', 'प्रसार', '।']]
<< ['आडियो', 'सुनकर', 'आपके', 'कविता', 'सा', 'सुविधाजनक', 'कहते', 'पहचाने', 'रहेंगे']
---Value 0
> Also also learn about the rules of insurance or loss co

For a better viewing experience we will do the extra work of adding axes
and labels:


