## Input, Outputs in a LSTM

In [1]:
#Source: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1117c1a70>

### Uni-directional LSTM

#### Inputs: one by one

In [2]:
lstm = nn.LSTM(3, 3, bidirectional=False)  # Input dim is 3, output dim is 3

In [3]:
inputs = [torch.randn(1,3) for _ in range(5)] # make a sequence of length 5
inputs

[tensor([[-0.5525,  0.6355, -0.3968]]),
 tensor([[-0.6571, -1.6428,  0.9803]]),
 tensor([[-0.0421, -0.8206,  0.3133]]),
 tensor([[-1.1352,  0.3773, -0.2824]]),
 tensor([[-2.5667, -1.4303,  0.5009]])]

In [4]:
print(inputs[0].shape)
print(inputs[0].view(1,1,-1).shape)

torch.Size([1, 3])
torch.Size([1, 1, 3])


In [5]:
# initialize the hidden state.
hidden = (torch.randn(1,1,3), torch.randn(1,1,3))
hidden

(tensor([[[ 0.5438, -0.4057,  1.1341]]]),
 tensor([[[-1.1115,  0.3501, -0.7703]]]))

In [39]:
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [40]:
print(out)
print(out.shape)

tensor(1.00000e-02 *
       [[[-3.8379, -9.7416, -0.0669]]])
torch.Size([1, 1, 3])


In [41]:
hidden

(tensor(1.00000e-02 *
        [[[-3.8379, -9.7416, -0.0669]]]),
 tensor([[[-0.1412, -0.3126, -0.0045]]]))

#### Inputs: Entire seq at once

In [8]:
inputs = [torch.randn(1,3) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view(len(inputs), 1, -1)

In [9]:
print(inputs.shape)
inputs

torch.Size([5, 1, 3])


tensor([[[ 0.5119, -0.6933, -0.1668]],

        [[-0.9999, -1.6476,  0.8098]],

        [[ 0.0554,  1.1340, -0.5326]],

        [[ 0.6592, -1.5964, -0.3769]],

        [[-3.1020, -0.0995, -0.7213]]])

In [10]:
hidden = (torch.randn(1,1,3),
          torch.randn(1,1,3))
print(hidden)

(tensor([[[ 1.2708, -0.0020, -1.0952]]]), tensor([[[ 0.6016,  0.6984, -0.8005]]]))


In [11]:
out, hidden = lstm(inputs)

In [12]:
print(out.shape)
out

torch.Size([5, 1, 3])


tensor([[[-0.0702, -0.0369, -0.0676]],

        [[-0.3659,  0.0107, -0.0394]],

        [[-0.1039,  0.0108, -0.0589]],

        [[-0.2986, -0.0103, -0.1454]],

        [[-0.1021,  0.0418, -0.1159]]])

In [49]:
out.squeeze().shape

torch.Size([5, 3])

In [13]:
print(len(hidden))
print(hidden[0].shape)
print(hidden[1].shape)
hidden

2
torch.Size([1, 1, 3])
torch.Size([1, 1, 3])


(tensor([[[-0.1021,  0.0418, -0.1159]]]),
 tensor([[[-0.7117,  0.1708, -0.1390]]]))

In [39]:
out.view(5, -1)

tensor([[ 0.2538, -0.1408, -0.0184,  0.1192, -0.1604,  0.3696],
        [-0.0012, -0.0242, -0.1707,  0.1013, -0.2605,  0.0976],
        [-0.0637,  0.0208, -0.2064,  0.0646, -0.1382,  0.0662],
        [-0.3097,  0.0479, -0.1550,  0.0351,  0.0174,  0.0412],
        [-0.3868, -0.2429, -0.0179,  0.0221,  0.2524, -0.0645]])

In [19]:
hidden

(tensor([[[-0.3868, -0.2429, -0.0179]],
 
         [[ 0.1192, -0.1604,  0.3696]]]),
 tensor([[[-0.6433, -0.4210, -0.0755]],
 
         [[ 0.4048, -0.5653,  0.6024]]]))

### Bi-directional LSTM

In [71]:
lstm = nn.LSTM(3, 3, bidirectional=True)  # Input dim is 3, output dim is 3

#### Inputs: Entire seq at once

In [72]:
inputs = [torch.randn(1,3) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view(len(inputs), 1, -1)

In [73]:
print(inputs.shape)
inputs

torch.Size([5, 1, 3])


tensor([[[ 0.0140,  1.1385, -1.4325]],

        [[ 0.2623,  1.2968,  1.0816]],

        [[ 0.4075, -0.9404, -1.3203]],

        [[ 1.6002,  1.3146, -0.6118]],

        [[-0.9419, -0.1675, -1.6990]]])

In [74]:
hidden = (torch.randn(2,1,3),
          torch.randn(2,1,3))
print(hidden)

(tensor([[[-2.0724,  1.5600, -0.5075]],

        [[-1.6533, -0.0907, -1.0677]]]), tensor([[[-0.4728, -0.0388, -0.0063]],

        [[-0.1100,  0.1423,  0.2453]]]))


In [75]:
out, hidden = lstm(inputs, hidden)

In [76]:
out

tensor([[[ 0.0693,  0.0419, -0.2024,  0.2507,  0.1534,  0.1960]],

        [[ 0.0568, -0.0754,  0.1704,  0.1111, -0.1453,  0.2441]],

        [[ 0.2176,  0.0963,  0.1359, -0.0114,  0.1027,  0.3792]],

        [[ 0.2683,  0.0519,  0.4203, -0.1651,  0.1405,  0.2349]],

        [[-0.1558,  0.0759,  0.0083, -0.5699,  0.2753,  0.0393]]])

In [77]:
print(out.shape)
out

torch.Size([5, 1, 6])


tensor([[[ 0.0693,  0.0419, -0.2024,  0.2507,  0.1534,  0.1960]],

        [[ 0.0568, -0.0754,  0.1704,  0.1111, -0.1453,  0.2441]],

        [[ 0.2176,  0.0963,  0.1359, -0.0114,  0.1027,  0.3792]],

        [[ 0.2683,  0.0519,  0.4203, -0.1651,  0.1405,  0.2349]],

        [[-0.1558,  0.0759,  0.0083, -0.5699,  0.2753,  0.0393]]])

In [78]:
out.squeeze().shape

torch.Size([5, 6])

In [79]:
print(len(hidden))
print(hidden[0].shape)
print(hidden[1].shape)
hidden

2
torch.Size([2, 1, 3])
torch.Size([2, 1, 3])


(tensor([[[-0.1558,  0.0759,  0.0083]],
 
         [[ 0.2507,  0.1534,  0.1960]]]),
 tensor([[[-0.1996,  0.1990,  0.0272]],
 
         [[ 0.3213,  0.3222,  0.5198]]]))

In [60]:
out.view(5, -1)

tensor([[ 0.0071, -0.1865, -0.0739, -0.2642, -0.0664,  0.1017],
        [ 0.2107, -0.1074, -0.0729, -0.1469, -0.0032,  0.1461],
        [ 0.2318, -0.0825, -0.0517, -0.2266,  0.1168,  0.0341],
        [ 0.2775, -0.1177, -0.0261, -0.3047, -0.0046, -0.0629],
        [ 0.3059,  0.0129, -0.0519, -0.2320, -0.0842, -0.0749]])

### nn.Linear example:

In [40]:
m = nn.Linear(20, 30)
input_ex = torch.randn(128, 20)
output = m(input_ex)
print(output.size())

torch.Size([128, 30])


In [43]:
m

Linear(in_features=20, out_features=30, bias=True)

## POS Tagging with LSTM example

In [26]:
#Prepare data:

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])]
word_to_ix = {}

for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [27]:
x = nn.Embedding(2, 4)

In [28]:
training_data[0][0]

['The', 'dog', 'ate', 'the', 'apple']

In [29]:
x(torch.tensor(0, dtype=torch.long))

tensor([ 0.7778,  0.1472,  0.0677, -0.2742])

In [30]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim), 
               torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
                            embeds.view(len(sentence), 1, -1) )
        
        #converts hidden dimension to tag_dimension (by linear combination)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        
        return tag_scores
        
    

In [36]:
#Train the model:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

#Look at scores before training:
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.0327, -1.2594, -1.0212],
        [-0.9935, -1.2533, -1.0667],
        [-0.9605, -1.3036, -1.0621],
        [-0.9812, -1.2666, -1.0690],
        [-0.9688, -1.3231, -1.0380]])


In [37]:
print(inputs)
tags = prepare_sequence(training_data[0][1], tag_to_ix)
print(tags)

tensor([ 0,  1,  2,  3,  4])
tensor([ 0,  1,  2,  0,  1])


In [38]:
for epoch in range(10):
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()
        
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
        
        #Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        

In [39]:
tag_scores

tensor([[-1.1036, -0.9739, -1.2354],
        [-0.9640, -1.1439, -1.2038],
        [-1.0250, -1.0419, -1.2434],
        [-1.0593, -0.9839, -1.2749]])

In [35]:
targets

tensor([ 1,  2,  0,  1])