## Inputs and Outputs of an LSTM

In [1]:
#Source: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global RNG so the torch.randn draws below are repeatable
# when the notebook is run top to bottom on a fresh kernel.
torch.manual_seed(1)

<torch._C.Generator at 0x1087f8a90>

### Uni-directional LSTM

#### Inputs: one by one

In [35]:
# One-direction LSTM: 3 input features per time step, hidden state of size 3.
lstm = nn.LSTM(3, 3, bidirectional=False)  # Input dim is 3, output dim is 3

In [36]:
# Five independent random (1, 3) tensors, one per time step of the sequence.
inputs = [torch.randn(1,3) for _ in range(5)] # make a sequence of length 5
inputs

[tensor([[-1.0945,  0.3069,  1.5775]]),
 tensor([[-0.6731,  1.7090,  0.2287]]),
 tensor([[ 0.7136, -1.3605,  0.3917]]),
 tensor([[-0.0278,  0.8739, -2.2294]]),
 tensor([[-0.8750, -2.0791,  0.6078]])]

In [37]:
# The LSTM is fed 3-D input, so every (1, 3) step is reshaped to (1, 1, 3)
# -- (seq_len=1, batch=1, input_size=3) -- before it is passed in.
print(inputs[0].shape)
print(inputs[0].view(1,1,-1).shape)

torch.Size([1, 3])
torch.Size([1, 1, 3])


In [38]:
# initialize the hidden state.
# The state is a tuple (h_0, c_0): hidden state and cell state, each of shape (1, 1, 3) here.
hidden = (torch.randn(1,1,3), torch.randn(1,1,3))
hidden

(tensor([[[-0.6445, -0.2960, -0.5345]]]),
 tensor([[[ 1.7612, -1.0814, -1.1957]]]))

In [39]:
for i in inputs:
    # Step through the sequence one element at a time.
    # Each call consumes one (1, 1, 3) step plus the previous state and returns
    # that step's output together with the updated (h, c) state, which is fed
    # back in on the next iteration. `out` is overwritten on every step.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [40]:
# Only the last step's output survives the loop, hence the (1, 1, 3) shape.
print(out)
print(out.shape)

tensor(1.00000e-02 *
       [[[-3.8379, -9.7416, -0.0669]]])
torch.Size([1, 1, 3])


In [41]:
# hidden[0] (h) is identical to the last step's `out`; hidden[1] is the cell state c.
hidden

(tensor(1.00000e-02 *
        [[[-3.8379, -9.7416, -0.0669]]]),
 tensor([[[-0.1412, -0.3126, -0.0045]]]))

#### Inputs: Entire seq at once

In [42]:
# Draw 5 random (1, 3) steps and stack them into a single (5, 1, 3) tensor so
# the whole sequence can be handed to the LSTM in one call.
inputs = torch.stack([torch.randn(1, 3) for _ in range(5)])

In [43]:
# Layout is (seq_len=5, batch=1, input_size=3).
print(inputs.shape)
inputs

torch.Size([5, 1, 3])


tensor([[[ 0.2728, -1.9361, -1.2940]],

        [[-0.4160, -0.2229,  0.2163]],

        [[ 0.5239,  0.6090,  0.7696]],

        [[ 0.7018, -0.1214, -1.4515]],

        [[-2.2306, -1.7935, -0.5610]]])

In [46]:
# Fresh random (h_0, c_0) for the whole-sequence call; same (1, 1, 3) shape as in the step-by-step case.
hidden = (torch.randn(1,1,3),
          torch.randn(1,1,3))
print(hidden)

(tensor([[[-0.4545,  1.5896,  1.5717]]]), tensor([[[-0.3484,  0.2624, -0.6198]]]))


In [47]:
# One call over all 5 steps: `out` collects the output of every step,
# `hidden` is the (h, c) state after the final step.
out, hidden = lstm(inputs, hidden)

In [48]:
# One output row per time step: (5, 1, 3).
print(out.shape)
out

torch.Size([5, 1, 3])


tensor([[[-0.0392,  0.1589, -0.1073]],

        [[-0.0339,  0.0287, -0.1048]],

        [[-0.0896, -0.1103, -0.1776]],

        [[ 0.0309, -0.2100, -0.2136]],

        [[ 0.2139, -0.0597,  0.0131]]])

In [49]:
# squeeze() removes the size-1 batch axis: (5, 1, 3) -> (5, 3).
out.squeeze().shape

torch.Size([5, 3])

In [50]:
# `hidden` is the 2-tuple (h, c) after the last step; h equals out[-1].
print(len(hidden))
print(hidden[0].shape)
print(hidden[1].shape)
hidden

2
torch.Size([1, 1, 3])
torch.Size([1, 1, 3])


(tensor([[[ 0.2139, -0.0597,  0.0131]]]),
 tensor([[[ 0.4685, -0.0998,  0.0767]]]))

In [39]:
# Flatten everything after the sequence axis so each of the 5 time steps becomes one row.
out.view(5, -1)

tensor([[-0.0392,  0.1589, -0.1073],
        [-0.0339,  0.0287, -0.1048],
        [-0.0896, -0.1103, -0.1776],
        [ 0.0309, -0.2100, -0.2136],
        [ 0.2139, -0.0597,  0.0131]])

In [19]:
# Final (h, c) state returned by the most recent lstm call.
hidden

(tensor([[[ 0.2139, -0.0597,  0.0131]]]),
 tensor([[[ 0.4685, -0.0998,  0.0767]]]))

### Bi-directional LSTM

In [51]:
# Bidirectional: the sequence is processed forwards and backwards and the two
# hidden states are concatenated, so every output step has 2 * 3 = 6 features.
lstm = nn.LSTM(3, 3, bidirectional=True)  # Input dim is 3, hidden dim is 3 per direction (output dim is 6)

#### Inputs: Entire seq at once

In [52]:
# Draw 5 random (1, 3) steps and stack them into a single (5, 1, 3) tensor so
# the whole sequence can be handed to the LSTM in one call.
inputs = torch.stack([torch.randn(1, 3) for _ in range(5)])

In [54]:
# (seq_len=5, batch=1, input_size=3).
print(inputs.shape)


torch.Size([5, 1, 3])


In [55]:
# Two directions need two sets of state, so the leading dimension of both
# h_0 and c_0 is 2 instead of 1.
hidden = (torch.randn(2,1,3),
          torch.randn(2,1,3))
print(hidden)

(tensor([[[ 0.6099, -1.1666,  1.1124]],

        [[ 0.3314,  2.9973, -0.2197]]]), tensor([[[-0.6007, -0.4284, -0.5745]],

        [[-0.5534, -0.5951, -0.4152]]]))


In [56]:
# Run the full sequence through both directions in a single call.
out, hidden = lstm(inputs, hidden)

In [64]:
# Each step has 6 features: the first 3 come from the forward direction,
# the last 3 from the backward direction.
out

tensor([[[ 0.0071, -0.1865, -0.0739, -0.2642, -0.0664,  0.1017]],

        [[ 0.2107, -0.1074, -0.0729, -0.1469, -0.0032,  0.1461]],

        [[ 0.2318, -0.0825, -0.0517, -0.2266,  0.1168,  0.0341]],

        [[ 0.2775, -0.1177, -0.0261, -0.3047, -0.0046, -0.0629]],

        [[ 0.3059,  0.0129, -0.0519, -0.2320, -0.0842, -0.0749]]])

In [65]:
# Indexing with a list selects whole time steps: here steps 1 and 3.
print(out.shape)
out[[1,3]]

torch.Size([5, 1, 6])


tensor([[[ 0.2107, -0.1074, -0.0729, -0.1469, -0.0032,  0.1461]],

        [[ 0.2775, -0.1177, -0.0261, -0.3047, -0.0046, -0.0629]]])

In [58]:
# Dropping the size-1 batch axis leaves (5, 6).
out.squeeze().shape

torch.Size([5, 6])

In [59]:
# hidden[0][0] is the forward direction's final state (the first half of out[-1]);
# hidden[0][1] is the backward direction's final state (the second half of out[0]).
print(len(hidden))
print(hidden[0].shape)
print(hidden[1].shape)
hidden

2
torch.Size([2, 1, 3])
torch.Size([2, 1, 3])


(tensor([[[ 0.3059,  0.0129, -0.0519]],
 
         [[-0.2642, -0.0664,  0.1017]]]),
 tensor([[[ 0.5001,  0.0386, -0.1360]],
 
         [[-0.4422, -0.1392,  0.1903]]]))

In [60]:
# (5, 1, 6) -> (5, 6): one row per time step.
out.view(5, -1)

tensor([[ 0.0071, -0.1865, -0.0739, -0.2642, -0.0664,  0.1017],
        [ 0.2107, -0.1074, -0.0729, -0.1469, -0.0032,  0.1461],
        [ 0.2318, -0.0825, -0.0517, -0.2266,  0.1168,  0.0341],
        [ 0.2775, -0.1177, -0.0261, -0.3047, -0.0046, -0.0629],
        [ 0.3059,  0.0129, -0.0519, -0.2320, -0.0842, -0.0749]])

### nn.Linear example:

In [40]:
# A Linear layer maps the last dimension from 20 to 30 features and leaves
# the leading dimension (128 rows here) untouched.
m = nn.Linear(in_features=20, out_features=30)
input_ex = torch.randn(128, 20)
output = m(input_ex)
print(output.shape)

torch.Size([128, 30])


In [43]:
# The module repr shows the layer's configuration.
m

Linear(in_features=20, out_features=30, bias=True)

## POS Tagging with LSTM example

In [23]:
#Prepare data:

def prepare_sequence(seq, to_ix):
    """Translate the items of ``seq`` into integer ids using the ``to_ix`` mapping.

    Returns a 1-D ``torch.long`` tensor holding one id per item, in the same
    order as ``seq``. An item that is missing from ``to_ix`` raises ``KeyError``.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)

# Toy corpus: every entry pairs a tokenised sentence with its gold POS tags.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Give each distinct word the next free integer id, in order of first
# appearance (case-sensitive, so "The" and "the" get different ids).
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [33]:
# Toy embedding table: 2 indices, each mapped to a 4-dimensional vector.
x = nn.Embedding(2, 4)

In [50]:
# Tokens of the first training sentence.
training_data[0][0]

['The', 'dog', 'ate', 'the', 'apple']

In [34]:
# Looking up index 0 returns the length-4 vector stored for that index.
x(torch.tensor(0, dtype=torch.long))

tensor([-0.4212, -0.5107, -1.5727, -0.1232])

In [47]:
class LSTMTagger(nn.Module):
    """LSTM part-of-speech tagger: word indices in, per-word tag log-probabilities out.

    Pipeline: embedding lookup -> LSTM -> linear projection to tag space ->
    log-softmax. One sentence is processed per call (batch size 1).

    The recurrent state lives on ``self.hidden``. Every ``forward`` call feeds
    it into the LSTM and overwrites it with the resulting state, so callers
    that want sentences treated independently must reset it first with
    ``model.hidden = model.init_hidden()``.

    Args:
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden and cell state.
        vocab_size: number of distinct word indices.
        tagset_size: number of distinct tags (output classes).
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector of size embedding_dim.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Must come after the layers exist: init_hidden reads their dtype/device.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h_0, c_0)`` pair, each of shape (1, 1, hidden_dim).

        The tensors take the dtype and device of the model's parameters, so the
        state stays compatible with the LSTM after ``model.to(...)`` or
        ``model.double()`` instead of always being float32 on the CPU.
        """
        ref = self.word_embeddings.weight
        shape = (1, 1, self.hidden_dim)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))

    def forward(self, sentence):
        """Score every word of one sentence.

        Args:
            sentence: 1-D tensor of word indices.

        Returns:
            Tensor of shape (len(sentence), tagset_size) whose rows are
            log-probabilities over the tags.

        Side effect: ``self.hidden`` is replaced by the LSTM state after the
        last word.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)
        # The LSTM is given (seq_len, batch=1, embedding_dim) input.
        lstm_out, self.hidden = self.lstm(embeds.view(seq_len, 1, -1), self.hidden)

        # converts hidden dimension to tag dimension (by linear combination)
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)
        
    

In [51]:
#Train the model:
# Vocabulary size and tag count are taken from the two lookup tables.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# The model returns log-probabilities (log_softmax), which is what NLLLoss takes as input.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

#Look at scores before training:
# no_grad: this is an inspection pass only, so no gradients are tracked.
# Row i of the result holds the tag log-probabilities for word i.
# Note that this forward pass still overwrites model.hidden.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.0424, -1.1267, -1.1292],
        [-1.0872, -1.0830, -1.1262],
        [-1.0664, -1.0748, -1.1570],
        [-1.0724, -1.0588, -1.1681],
        [-1.0673, -1.0743, -1.1567]])


In [56]:
# `inputs` holds the word indices of the first training sentence; its gold
# tags are encoded the same way, using tag_to_ix.
print(inputs)
tags = prepare_sequence(training_data[0][1], tag_to_ix)
print(tags)

tensor([ 0,  1,  2,  3,  4])
tensor([ 0,  1,  2,  0,  1])


In [57]:
# 300 passes over the two training sentences, with one parameter update per sentence.
for epoch in range(300):
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        # (forward() stores the state it ends with on model.hidden, so without
        # this reset the next sentence would start from the previous one's state.)
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        # tag_scores has one row of tag log-probabilities per word.
        tag_scores = model(sentence_in)

        #Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        

In [58]:
# Scores left over from the final training step (the 4-word second sentence);
# the largest entry in each row marks the predicted tag.
tag_scores

tensor([[-4.3092, -0.0325, -3.9881],
        [-2.7287, -3.7667, -0.0926],
        [-0.1361, -4.3053, -2.1736],
        [-4.3249, -0.0254, -4.4340]])

In [59]:
# Gold tag ids for the sentence used in the final training step.
targets

tensor([ 1,  2,  0,  1])