<a href="https://colab.research.google.com/github/dine1717/END2/blob/session11/Session11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Sequence to Sequcen and Attention Mechanish
In this code we'll we will dig much deeper and try and come up with the model ourselves. We will see the steps required to do so.

For the actual model refer to this colab link.

The part we are going to borrow from the code above is the data-preprocessing as that is straight forward.

In [45]:
%matplotlib inline

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#  Download Data

In [None]:
# !wget https://download.pytorch.org/tutorial/data.zip
# !unzip data.zip

In [174]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs



In [175]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['ils sont tous la .', 'they re all here .']


*italicised text*### Randomly pick one of the sample  

In [180]:
sample = random.choice(pairs)
sample

['je suis desireux de vous aider .', 'i am willing to help you .']

In [208]:
input_sentence = sample[0]
output_sentence = sample[1]
## Get the index of the word 
input_lang.word2index["vous"]

118

### Get the indices of the words

In [209]:
input_indices = [input_lang.word2index[word] for word in input_sentence.split(' ')]
target_indices = [output_lang.word2index[word] for word in output_sentence.split(' ')]
input_indices,target_indices

([6, 11, 2997, 101, 118, 1926, 5], [2, 16, 1302, 532, 571, 129, 4])

In [210]:
input_indices.append(EOS_token)
target_indices.append(SOS_token)
input_indices,target_indices

([6, 11, 2997, 101, 118, 1926, 5, 1], [2, 16, 1302, 532, 571, 129, 4, 0])

Oh yes, these are still indeces (not words anymore), we first need to convert them to tensors.

In the original code we wrote these 3 functions to take care of all the work we did till now.


```

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]
 
 
def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)
 
 
def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

```

In [211]:
# input_indices.append(EOS_token)
# target_indices.append(SOS_token)
input_indices,target_indices

([6, 11, 2997, 101, 118, 1926, 5, 1], [2, 16, 1302, 532, 571, 129, 4, 0])

In [212]:
input_tensor = torch.tensor(input_indices,dtype=torch.long,device=device)
output_tensor = torch.tensor(target_indices,dtype=torch.long,device=device)

In [213]:
input_tensor.shape,output_tensor.shape

(torch.Size([8]), torch.Size([8]))

In [214]:
device

device(type='cuda')

## Encoder

In [215]:
input_size = input_lang.n_words
hidden_size = 256

In [216]:
embedding = nn.Embedding(input_size,hidden_size).to(device)
lstm = nn.LSTM(hidden_size,hidden_size).to(device)

In [217]:
embedded_input = embedding(input_tensor)
embedded_input.shape

torch.Size([8, 256])

In [218]:
input_tensor

tensor([   6,   11, 2997,  101,  118, 1926,    5,    1], device='cuda:0')

In [219]:
input_tensor.shape, input_tensor.view(-1,1).shape

(torch.Size([8]), torch.Size([8, 1]))

In [220]:
print(embedded_input.shape)
embedded_input = embedding(input_tensor.view(-1,1))
print(embedded_input.shape)

torch.Size([8, 256])
torch.Size([8, 1, 256])


In [221]:
print(embedded_input.shape)
embedded_input = embedding(input_tensor[0].view(-1,1)) ## sending each word separetly
print(embedded_input.shape)

torch.Size([8, 1, 256])
torch.Size([1, 1, 256])


In [222]:
hidden = torch.zeros(1,1,256,device =device)
cell = torch.zeros(1,1,256,device =device)

In [223]:
embedded_input = embedding(input_tensor[0].view(-1,1)) ## sending each word separetly
output,(hidden,cell) = lstm(embedded_input,(hidden,cell))

In [224]:
output.shape, output[0,0].shape

(torch.Size([1, 1, 256]), torch.Size([256]))

In [225]:
MAX_LENGTH = input_tensor.size()[0]

In [226]:
encoder_outputs = torch.zeros(MAX_LENGTH,256,device = device )
encoder_outputs.shape

torch.Size([8, 256])

In [227]:
input_tensor.size(),input_tensor.size()[0]

(torch.Size([8]), 8)

In [228]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
# >>> output[0, 0].shape
# torch.Size([256]))
encoder_hidden = torch.zeros(1, 1, 256, device=device)
encoder_cell = torch.zeros(1,1,256,device =device)

for i in range(input_tensor.size()[0]):
  # print(input_sentence.split(' ')[i]) why will this cause error?
  embedded_input = embedding(input_tensor[i].view(-1, 1))
  output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
  encoder_outputs[i] += output[0,0]


In [229]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
encoder_hidden = torch.zeros(1, 1, 256, device=device)
encoder_cell = torch.zeros(1,1,256,device =device)

### Feed Forward Encoder

In [230]:
embedded_input = embedding(input_tensor[0].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[0] += output[0,0]

In [231]:
embedded_input = embedding(input_tensor[1].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[1] += output[0,0]

In [232]:
embedded_input = embedding(input_tensor[2].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[2] += output[0,0]


In [233]:
embedded_input = embedding(input_tensor[3].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[3] += output[0,0]


In [234]:
embedded_input = embedding(input_tensor[4].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[4] += output[0,0]


In [235]:
embedded_input = embedding(input_tensor[5].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[5] += output[0,0]


In [236]:
embedded_input = embedding(input_tensor[6].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[6] += output[0,0]

In [237]:
embedded_input = embedding(input_tensor[7].view(-1, 1))
output, (encoder_hidden,encoder_cell) = lstm(embedded_input, (encoder_hidden,encoder_cell))
encoder_outputs[7] += output[0,0]

In [238]:
encoder_outputs[1:8]

tensor([[-0.2009, -0.2267, -0.1101,  ...,  0.1206, -0.1139, -0.0528],
        [-0.1905, -0.1416,  0.0205,  ..., -0.0568, -0.0424, -0.0385],
        [ 0.1225, -0.3578,  0.1118,  ...,  0.0467,  0.0356, -0.1140],
        ...,
        [ 0.0023, -0.4113, -0.0379,  ...,  0.2070,  0.0524,  0.0758],
        [-0.0339, -0.0920, -0.1736,  ...,  0.0192,  0.0396,  0.1170],
        [ 0.1266,  0.1399, -0.2356,  ..., -0.1354,  0.0479,  0.0124]],
       device='cuda:0', grad_fn=<SliceBackward>)

In [239]:
encoder_outputs.shape

torch.Size([8, 256])



Finally our Encoder is fully ready. Now let's look at the class we wrote in the last class to see what we missed!

```
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
```


In [240]:
attn_weight_layer = nn.Linear(256 * 2, 1).to(device)

In [241]:
def getattentioncontext(decoder_hidden ,  encoder_hidden_states , attn_weight_layer = attn_weight_layer):

  
  attn_weights = torch.zeros(encoder_hidden_states.size(0), device=device)
  for i in range(encoder_hidden_states.size(0)):
    
    attn_weight = attn_weight_layer(torch.cat((decoder_hidden[0],encoder_hidden_states[i].unsqueeze(0)), 1))
    attn_weights[i] += attn_weight[0,0]

  attn_weights = attn_weights.unsqueeze(0)
  attn_weights = F.softmax(attn_weights, dim = 1)
  applied_attn = torch.bmm(attn_weights.unsqueeze(0), encoder_hidden_states.unsqueeze(0))

  return applied_attn , attn_weights


Cool! Next let's build out Decoder where we have attention in-built.

# Decoder with Attention

Here is the plan. 

1. First input to the decoder will be SOS_token, later inputs would be the words it predicted (unless we implement teacher forcing)
2. decoder/GRU's hidden state will be initialized with the encoder's last hidden state
3. we will use gru's hidden state and last prediction to generate attention weight using a FC layer. 
4. this attention weight will be used to weigh the encoder_outputs using batch matric multiplication. This will give us a NEW view on how to look at encoder_states.
5. this attention applied encoder_states will then be concatenated with the input, and then sent a linear layer and _then_ sent to the GRU. 
6. GRU's output will be sent to a FC layer to predict one of the output_language words

Let's prepare all the inputs we need to do this


In [242]:
# first input
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell
decoded_words = []

In [243]:
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
embedded.shape

torch.Size([1, 1, 256])

In [244]:
#attn_weight_layer = nn.Linear(256 * 2, 10).to(device)

In [245]:
print(encoder_outputs)

tensor([[-0.1583,  0.0385, -0.2244,  ...,  0.0195, -0.0184,  0.1081],
        [-0.2009, -0.2267, -0.1101,  ...,  0.1206, -0.1139, -0.0528],
        [-0.1905, -0.1416,  0.0205,  ..., -0.0568, -0.0424, -0.0385],
        ...,
        [ 0.0023, -0.4113, -0.0379,  ...,  0.2070,  0.0524,  0.0758],
        [-0.0339, -0.0920, -0.1736,  ...,  0.0192,  0.0396,  0.1170],
        [ 0.1266,  0.1399, -0.2356,  ..., -0.1354,  0.0479,  0.0124]],
       device='cuda:0', grad_fn=<CopySlices>)


In [246]:
embedded.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [247]:
torch.cat((embedded,decoder_hidden),1).shape

torch.Size([1, 2, 256])

In [248]:
torch.cat((embedded[0],decoder_hidden[0]),1).shape

torch.Size([1, 512])

In [249]:
# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights

In [250]:
attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)
attn_applied.shape,attn_weights

(torch.Size([1, 1, 256]),
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [251]:
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
embedded.shape

torch.Size([1, 1, 256])

So, now we have this 256dm attn_applied encoder_outputs capturing what we should focus on on this step. We also have the input we already generated. That's 256dm again. GRU is gonna take 256 only. So we need to concatenate them, send to a linear layer to reduce dimensions, and then send to Gru
![image](https://static.wikia.nocookie.net/mycun-the-movie/images/c/c2/Gru-icon.png/revision/latest/scale-to-width-down/250?cb=20151223171656)

In [252]:
input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)

embedded.shape, attn_applied.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [253]:
torch.cat((embedded, attn_applied), 1).shape, torch.cat((embedded[0], attn_applied[0]), 1).shape

(torch.Size([1, 2, 256]), torch.Size([1, 512]))

In [254]:
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm.shape

torch.Size([1, 256])

In [255]:
lstm = nn.LSTM(256,256).to(device)
input_to_lstm.shape

torch.Size([1, 256])

In [256]:
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_lstm = input_to_lstm.unsqueeze(0)
decoder_hidden.shape,input_to_lstm.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [257]:
decoder_cell=encoder_cell

In [258]:
output,(decoder_hidden,decoder_cell) = lstm(input_to_lstm,(decoder_hidden,decoder_cell))
output.shape,input_to_lstm.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [259]:
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

In [260]:
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
output, output.shape

(tensor([[0.0003, 0.0003, 0.0003,  ..., 0.0004, 0.0003, 0.0003]],
        device='cuda:0', grad_fn=<SoftmaxBackward>), torch.Size([1, 2803]))

In [261]:
output.data.topk(1)

torch.return_types.topk(values=tensor([[0.0004]], device='cuda:0'), indices=tensor([[2798]], device='cuda:0'))

In [262]:
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()]

'compiling'

In [263]:
top_index.item()

2798

In [264]:
def getnextword( targetwordidx , predwordidx):

  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  if use_teacher_forcing:
    wordidx = targetwordidx
  else:
    wordidx = predwordidx

  print(use_teacher_forcing , wordidx)

  return wordidx

In [265]:
predicted_sentence =[]
teacher_forcing_ratio = 0.5
wordcounter = 0

In [266]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])
output_lang.index2word[top_index.item()],attn_weights

('potatoes',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [267]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1


False 2143


In [268]:
decoder_input = torch.tensor([[nextwordidx]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])
output_lang.index2word[top_index.item()],attn_weights

('oddly',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [269]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1

True 16


In [270]:
decoder_input = torch.tensor([[nextwordidx]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])
output_lang.index2word[top_index.item()],attn_weights

('resentful',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [271]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1

True 1302


In [272]:
decoder_input = torch.tensor([[nextwordidx]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])
output_lang.index2word[top_index.item()],attn_weights

('normal',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [273]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1

False 109


In [274]:
decoder_input = torch.tensor([[nextwordidx]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])

output_lang.index2word[top_index.item()],attn_weights

('trance',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [275]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1

False 2596


In [276]:
decoder_input = torch.tensor([[nextwordidx]], device=device)
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell

output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)

# attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
# attn_weights = F.softmax(attn_weights, dim = 1)
# attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

attn_applied ,attn_weights = getattentioncontext(decoder_hidden ,  encoder_outputs , attn_weight_layer = attn_weight_layer)

input_to_lstm_layer = nn.Linear(256 * 2, 256).to(device)
input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
lstm = nn.LSTM(256, 256).to(device)
input_to_lstm = input_to_lstm.unsqueeze(0)
output, (decoder_hidden, decoder_cell) = lstm(input_to_lstm, (decoder_hidden, decoder_cell))
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
predicted_sentence.append(output_lang.index2word[top_index.item()])
output_lang.index2word[top_index.item()],attn_weights

('beggar',
 tensor([[0.1322, 0.1255, 0.1174, 0.1298, 0.1230, 0.1209, 0.1258, 0.1253]],
        device='cuda:0', grad_fn=<SoftmaxBackward>))

In [277]:
nextwordidx = getnextword(target_indices[wordcounter], top_index.item())

wordcounter += 1

True 129


In [278]:
target_indices, output_sentence , ' '.join(predicted_sentence) , input_sentence


([2, 16, 1302, 532, 571, 129, 4, 0],
 'i am willing to help you .',
 'potatoes oddly resentful normal trance beggar',
 'je suis desireux de vous aider .')

In [279]:
pred_sentence = []

decoder_hidden = torch.zeros(1, 1, 256, device=device)
decoder_cell = torch.zeros(1, 1, 256, device=device)

for i in range(7):
  decoder_input = torch.tensor([[target_indices[i]]], device=device)
  
  output_size = output_lang.n_words
  embedded = embedding(decoder_input)
  
  attn_applied , attn_weights = getattentioncontext(decoder_hidden , encoder_hidden_states)
  
  input_to_lstm = input_to_lstm_layer(torch.cat((embedded[0], attn_applied[0]), 1))
  input_to_lstm = input_to_lstm.unsqueeze(0)
  output, (decoder_hidden,decoder_cell) = lstm(input_to_lstm, (decoder_hidden,decoder_cell))

  output = F.relu(output)
  output = F.softmax(output_word_layer(output[0]), dim = 1)
  top_value, top_index = output.data.topk(1)
  pred_sentence.append(output_lang.index2word[top_index.item()])
  print(output_sentence.split(" ")[i], target_indices[i], output_lang.index2word[top_index.item()], top_index.item() )
  print(attn_weights)

i 2 downtown 1044
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
am 16 obsessed 2357
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
willing 1302 obsessed 2357
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
to 532 beggar 786
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
help 571 beggar 786
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
you 129 beggar 786
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)
. 4 beggar 786
tensor([[0.1747, 0.1664, 0.1711, 0.1623, 0.1606, 0.1648]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)


In [280]:
target_indices, output_sentence , ' '.join(predicted_sentence) , input_sentence


([2, 16, 1302, 532, 571, 129, 4, 0],
 'i am willing to help you .',
 'potatoes oddly resentful normal trance beggar',
 'je suis desireux de vous aider .')