In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import LongTensor
from torch.autograd import Variable
from torch.nn import Embedding, LSTM
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [0]:
# Decoder
#credit: https://gist.githubusercontent.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec/raw/60dc6be30ba57aa5d0d036e6af8ff702782ded18/pad_packed_demo.py
## We want to run an LSTM on a batch of 3 word sequences (sentences of different lengths)
#
#     Step 1: Construct Vocabulary
#     Step 2: Load indexed data (list of instances, where each instance is list of word indices)
#     Step 3: Make Model
#  *  Step 4: Pad instances with 0s till max length sequence
#  *  Step 5: Sort instances by sequence length in descending order
#  *  Step 6: Embed the instances
#  *  Step 7: Call pack_padded_sequence with embedded instances and sequence lengths
#  *  Step 8: Forward with LSTM
#  *  Step 9: Call pad_packed_sequence if required / or just pick last hidden vector
#  *  Summary of Shape Transformations

In [0]:
# Toy batch for the LSTM: three whitespace-tokenised sentences of unequal length.
seqs = [
    'a small sentence',            # 3 words
    'little bit bigger sentence',  # 4 words
    'nothing to say',              # 3 words
]
# Extra example string; it is not referenced by the cells shown in this notebook.
test = 'new word'

In [0]:
## Step 1: Construct Vocabulary ##
##------------------------------##
# Special tokens are listed first so that '<pad>' is guaranteed to sit at index 0;
# the distinct corpus words follow in sorted order.
special_tokens = ['<pad>', '<start>', '<end>', '<unk>']
corpus_words = {word for seq in seqs for word in seq.split()}
vocab = special_tokens + sorted(corpus_words)

In [0]:
vocab

['<pad>',
 '<start>',
 '<end>',
 '<unk>',
 'a',
 'bigger',
 'bit',
 'little',
 'nothing',
 'say',
 'sentence',
 'small',
 'to']

In [0]:
## Step 2: Load indexed data (list of instances, where each instance is list of word indices) ##
##--------------------------------------------------------------------------------------------##
# Build a word -> index table once: a dict lookup is O(1) per token, whereas
# vocab.index() rescans the whole list for every token.
# setdefault keeps the first position of a word, matching list.index semantics.
tok2idx = {}
for position, token in enumerate(vocab):
    tok2idx.setdefault(token, position)
unk_idx = tok2idx['<unk>']

# Words that are not in the vocabulary map to '<unk>' instead of raising ValueError.
vectorized_seqs = [[tok2idx.get(tok, unk_idx) for tok in seq.split()] for seq in seqs]
vectorized_seqs

[[4, 11, 10], [7, 6, 5, 10], [8, 12, 9]]

In [0]:
## Step 3: Make Model ##
##--------------------##
EMBEDDING_DIM = 4  # size of each word vector (also the LSTM input size)
HIDDEN_DIM = 5     # size of the LSTM hidden state

# One embedding row per vocabulary entry.
embed = Embedding(num_embeddings=len(vocab), embedding_dim=EMBEDDING_DIM)
# Single-layer LSTM; batch_first=True means inputs are (batch, seq, feature).
lstm = LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, num_layers=1, batch_first=True)

In [0]:
## Step 4: Pad instances with 0s till max length sequence ##
##--------------------------------------------------------##

# Number of tokens in each sequence of the batch, as an int64 tensor.
seq_lengths = LongTensor([len(token_ids) for token_ids in vectorized_seqs])

In [0]:
seq_lengths

tensor([3, 4, 3])

In [0]:
# seq_lengths => [3, 4, 3]
# batch_sum_seq_len: 3 + 4 + 3 = 10
# max_seq_len: 4

In [0]:
# Allocate the padded batch directly as int64 zeros (0 is the '<pad>' index):
# shape (batch_size X max_seq_len). No Variable wrapper or float->long cast is needed.
max_seq_len = int(seq_lengths.max())
seq_tensor = torch.zeros((len(vectorized_seqs), max_seq_len), dtype=torch.long)
seq_tensor

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [0]:
# Copy each sequence's token ids into the left part of its row; the remaining
# positions keep the 0 ('<pad>') they were initialised with.
for row_idx, (token_ids, n_tokens) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[row_idx, :n_tokens] = LongTensor(token_ids)

In [0]:
seq_tensor

tensor([[ 4, 11, 10,  0],
        [ 7,  6,  5, 10],
        [ 8, 12,  9,  0]])

In [0]:
## Step 5: Sort instances by sequence length in descending order ##
##---------------------------------------------------------------##

# perm_idx records where each sorted row came from in the original batch order.
sorted_lengths, perm_idx = seq_lengths.sort(dim=0, descending=True)
seq_lengths = sorted_lengths
# Reorder the padded rows with the same permutation so lengths and rows stay aligned.
seq_tensor = seq_tensor[perm_idx]
seq_tensor

tensor([[ 7,  6,  5, 10],
        [ 4, 11, 10,  0],
        [ 8, 12,  9,  0]])

In [0]:
## Step 6: Embed the instances ##
##-----------------------------##

# Look up an embedding vector for every token index in the padded, sorted batch.
# Padding positions (index 0) are embedded as well: Embedding was built without
# a padding_idx, so '<pad>' maps to an ordinary vector.
# Result shape: (batch_size X max_seq_len X embedding_dim) = (3 X 4 X 4)
embedded_seq_tensor = embed(seq_tensor)
embedded_seq_tensor

tensor([[[-0.3227, -0.1044, -0.4612, -0.8055],
         [-0.6661, -1.5316,  0.6446, -1.3370],
         [-0.4653, -0.4362,  0.7046, -0.8728],
         [-0.3567, -0.0277,  1.1684,  0.8097]],

        [[ 0.6384,  0.5617,  0.6570,  1.0578],
         [-0.2879,  2.3274,  0.8726,  1.0885],
         [-0.3567, -0.0277,  1.1684,  0.8097],
         [ 0.5068, -0.1829, -0.0915, -1.0838]],

        [[-0.7129,  0.3673,  0.0192, -0.4796],
         [-0.1367, -0.2717, -0.2533, -1.3797],
         [ 0.9794, -0.4929, -1.6183, -0.6653],
         [ 0.5068, -0.1829, -0.0915, -1.0838]]], grad_fn=<EmbeddingBackward>)

In [0]:
embedded_seq_tensor.size()

torch.Size([3, 4, 4])

In [0]:
## Step 7: Call pack_padded_sequence with embedded instances and sequence lengths ##
##--------------------------------------------------------------------------------##

# The lengths must already be sorted in descending order (done in Step 5).
packed_input = pack_padded_sequence(
    embedded_seq_tensor,
    lengths=seq_lengths.tolist(),
    batch_first=True,
)
# packed_input is a PackedSequence (a NamedTuple); its main attributes are data and batch_sizes.
packed_input.data

tensor([[-0.3227, -0.1044, -0.4612, -0.8055],
        [ 0.6384,  0.5617,  0.6570,  1.0578],
        [-0.7129,  0.3673,  0.0192, -0.4796],
        [-0.6661, -1.5316,  0.6446, -1.3370],
        [-0.2879,  2.3274,  0.8726,  1.0885],
        [-0.1367, -0.2717, -0.2533, -1.3797],
        [-0.4653, -0.4362,  0.7046, -0.8728],
        [-0.3567, -0.0277,  1.1684,  0.8097],
        [ 0.9794, -0.4929, -1.6183, -0.6653],
        [-0.3567, -0.0277,  1.1684,  0.8097]],
       grad_fn=<PackPaddedSequenceBackward>)

In [0]:
packed_input.data.shape #(batch_wise_sum_seq_len X embedding_dim) = (10 X 4)

torch.Size([10, 4])

In [0]:
packed_input.batch_sizes

tensor([3, 3, 3, 1])

In [0]:
# visualization :
# little  bit    bigger    sentence
# a       small  sentence 
# nothing to     say
# 3  3  3  1

In [0]:
# tensor([[-0.3227, -0.1044, -0.4612, -0.8055], #little
#         [ 0.6384,  0.5617,  0.6570,  1.0578], #a
#         [-0.7129,  0.3673,  0.0192, -0.4796], #nothing

#         [-0.6661, -1.5316,  0.6446, -1.3370], #bit
#         [-0.2879,  2.3274,  0.8726,  1.0885], #small
#         [-0.1367, -0.2717, -0.2533, -1.3797], #to

#         [-0.4653, -0.4362,  0.7046, -0.8728], #bigger
#         [-0.3567, -0.0277,  1.1684,  0.8097], #sentence
#         [ 0.9794, -0.4929, -1.6183, -0.6653], #say

#         [-0.3567, -0.0277,  1.1684,  0.8097]]) #sentence

In [0]:
## Step 8: Forward with LSTM ##
##---------------------------##

# Feeding a PackedSequence makes the LSTM skip the padded positions.
packed_output, final_state = lstm(packed_input)
# final_state holds the last hidden state and the last cell state.
ht, ct = final_state
# packed_output is a PackedSequence (a NamedTuple) with attributes data and batch_sizes.

In [0]:
# ## Step 9: Call pad_packed_sequence if required / or just pick last hidden vector ##
# ##------------------------------------------------------------------------------------##

# # unpack your output if required
# output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)

# # output.shape : ( batch_size X max_seq_len X hidden_dim) = (3 X 4 X 5)

# # Or if you just want the final hidden state?
# print(ht[-1])

In [0]:
## Summary of Shape Transformations ##
##----------------------------------##

# (batch_size X max_seq_len X embedding_dim) --> Sort by seqlen ---> (batch_size X max_seq_len X embedding_dim)
# (batch_size X max_seq_len X embedding_dim) --->      Pack     ---> (batch_sum_seq_len X embedding_dim)
# (batch_sum_seq_len X embedding_dim)        --->      LSTM     ---> (batch_sum_seq_len X hidden_dim)
# (batch_sum_seq_len X hidden_dim)           --->    UnPack     ---> (batch_size X max_seq_len X hidden_dim)

In [0]:
################ calculate loss ##############
# there are two ways to calculate losses
# using CrossEntropyLoss() = logSoftmax + NLLLoss()
# using NLLLoss()

In [0]:
# Way 1: CrossEntropyLoss combines log-softmax and NLLLoss, so it is fed raw logits.
criterion = nn.CrossEntropyLoss()

In [0]:
# Let's assume, for the sake of the tutorial, that the targets are the input token ids
# themselves, i.e. the padded and length-sorted seq_tensor (not the packed input).
targets = seq_tensor

In [0]:
targets

tensor([[ 7,  6,  5, 10],
        [ 4, 11, 10,  0],
        [ 8, 12,  9,  0]])

In [0]:
# Pack the targets with the same lengths as the inputs so that targets.data lines up
# row-for-row with the packed LSTM output. Note: this rebinds `targets` from a Tensor
# to a PackedSequence.
targets = pack_padded_sequence(
    targets,
    lengths=seq_lengths.tolist(),
    batch_first=True,
)

In [0]:
targets

PackedSequence(data=tensor([ 7,  4,  8,  6, 11, 12,  5, 10,  9, 10]), batch_sizes=tensor([3, 3, 3, 1]), sorted_indices=None, unsorted_indices=None)

In [0]:
# Output projection: hidden_size -> vocab_size. The input width is read from the LSTM
# itself so the layer stays consistent if the hidden size is changed.
linear = nn.Linear(lstm.hidden_size, len(vocab))

In [0]:
# Project every packed time step to vocabulary logits. packed_output.data has one row
# per real (non-padded) token, so the result is
# (batch_sum_seq_len X vocab_size) = (10 X 13).
outputs = linear(packed_output.data)

In [0]:
outputs.size()

torch.Size([10, 13])

In [0]:
# Way 1: CrossEntropyLoss on the raw logits. targets.data holds the packed target ids,
# packed with the same lengths as the inputs, so it aligns with the rows of `outputs`.
loss = criterion(outputs, targets.data) 
loss

tensor(2.6565, grad_fn=<NllLossBackward>)

In [0]:
# Way 2: NLLLoss expects log-probabilities, so apply log_softmax over the vocabulary
# axis to the logits first. The value must equal CrossEntropyLoss on the raw logits.
criterion_2 = nn.NLLLoss()
loss = criterion_2(F.log_softmax(outputs, dim=1), targets.data)
loss

tensor(2.6565, grad_fn=<NllLossBackward>)

In [0]:
########### Generation #################
# When generating, you would produce one word at a time; for the tutorial's sake we reuse the outputs computed above
# to discuss the main difference between the two approaches.

In [0]:
# Deterministic (greedy): for each row of logits keep the index of the largest value.
# max over dim=1 returns (values, indices); only the indices are needed.
predicted = outputs.max(dim=1)[1]

In [0]:
predicted

tensor([ 6, 12,  6,  6,  2,  6,  6, 12,  6, 12])

In [0]:
# Stochastic: sample from the temperature-scaled softmax distribution.
# Dividing the logits by a temperature > 1 flattens the distribution, < 1 sharpens it;
# 1 leaves the logits unchanged.
temperature = 1
# Softmax over the vocabulary axis (dim=1): one probability distribution per row of logits.
probabilities = F.softmax(outputs / temperature, dim=1)

In [0]:
probabilities

tensor([[0.0848, 0.0966, 0.0742, 0.0749, 0.1030, 0.0428, 0.1172, 0.0500, 0.0893,
         0.0898, 0.0450, 0.0534, 0.0792],
        [0.0891, 0.0950, 0.0803, 0.0723, 0.1027, 0.0471, 0.0969, 0.0453, 0.0907,
         0.1023, 0.0546, 0.0536, 0.0700],
        [0.0878, 0.0987, 0.0765, 0.0682, 0.1032, 0.0438, 0.1025, 0.0504, 0.0965,
         0.0934, 0.0488, 0.0509, 0.0792],
        [0.0779, 0.0975, 0.0716, 0.0815, 0.1042, 0.0403, 0.1420, 0.0507, 0.0795,
         0.0838, 0.0392, 0.0562, 0.0756],
        [0.0884, 0.0920, 0.0820, 0.0677, 0.1129, 0.0442, 0.0857, 0.0464, 0.0889,
         0.1124, 0.0541, 0.0527, 0.0727],
        [0.0766, 0.0985, 0.0726, 0.0797, 0.1080, 0.0412, 0.1324, 0.0507, 0.0825,
         0.0835, 0.0402, 0.0549, 0.0793],
        [0.0782, 0.1004, 0.0735, 0.0798, 0.1036, 0.0413, 0.1389, 0.0502, 0.0805,
         0.0850, 0.0411, 0.0561, 0.0715],
        [0.0885, 0.0944, 0.0831, 0.0676, 0.1103, 0.0452, 0.0869, 0.0458, 0.0892,
         0.1116, 0.0556, 0.0529, 0.0687],
        [0.0745,

In [0]:
# Draw one vocabulary index per row from the categorical distribution.
# detach() gives a tensor without gradient tracking for the sampling step.
sampling_probs = probabilities.detach()
predicted = torch.multinomial(sampling_probs, num_samples=1)
predicted

tensor([[ 5],
        [ 2],
        [ 2],
        [ 3],
        [ 9],
        [11],
        [ 7],
        [ 6],
        [12],
        [ 7]])

In [0]:
# Second draw from the same distribution: no seed is set, so the sampled indices
# generally differ from the previous cell.
predicted = torch.multinomial(probabilities.detach(), num_samples=1)
predicted

tensor([[ 7],
        [12],
        [ 2],
        [ 2],
        [ 0],
        [11],
        [ 0],
        [12],
        [12],
        [ 8]])