# 基础层介绍

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## RNN

https://pytorch.org/docs/stable/generated/torch.nn.RNN.html#torch.nn.RNN

In [2]:
# Stacked bidirectional Elman RNN: 10 input features -> 20 hidden units, 2 layers.
rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, bidirectional=True)

# Random input laid out as (sequence length, batch size, input size) = (5, 3, 10).
input = torch.randn(5, 3, 10)
# Only the input is passed; no explicit initial hidden state is supplied.
output, hn = rnn(input)

In [3]:
# (seq_len, batch, 2 * hidden_size): the last dimension is 40 rather than 20
# because the RNN above was built with bidirectional=True.
output.shape

torch.Size([5, 3, 40])

## LSTM

https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM

In [4]:
# Two-layer unidirectional LSTM: 10 input features -> 20 hidden units.
rnn = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
# Input is (seq_len, batch, input_size) = (5, 3, 10).
input = torch.randn(5, 3, 10)
# Explicit initial hidden and cell states, each (num_layers, batch, hidden_size).
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
# The LSTM returns the per-step outputs plus the final (hidden, cell) pair.
output, (hn, cn) = rnn(input, (h0, c0))

## GRU

https://pytorch.org/docs/stable/generated/torch.nn.GRU.html#torch.nn.GRU

In [5]:
# Two-layer unidirectional GRU: 10 input features -> 20 hidden units.
rnn = nn.GRU(input_size=10, hidden_size=20, num_layers=2)
# Input is (seq_len, batch, input_size) = (5, 3, 10).
input = torch.randn(5, 3, 10)
# Initial hidden state, (num_layers, batch, hidden_size); a GRU has no cell state.
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

## Embedding

https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding

In [6]:
# Lookup table with 10 rows, each row a 3-dimensional vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
# Two samples of four indices each, so the lookup result has shape (2, 4, 3).
input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]], dtype=torch.long)
embedding(input)

tensor([[[-0.2817,  2.5020,  0.6570],
         [ 1.2463, -1.1309,  0.1036],
         [ 0.8586,  0.7305, -0.2988],
         [-2.5806,  0.8022,  0.9109]],

        [[ 0.8586,  0.7305, -0.2988],
         [-1.3265, -0.8797, -1.1366],
         [ 1.2463, -1.1309,  0.1036],
         [-0.5149, -1.4234, -0.0981]]], grad_fn=<EmbeddingBackward0>)

In [7]:
# Same lookup table, but index 0 is reserved as padding: its row is looked up
# as an all-zero vector (see the first and third rows of the result).
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
input = torch.tensor([[0, 2, 0, 5]], dtype=torch.long)
embedding(input)

tensor([[[ 0.0000,  0.0000,  0.0000],
         [ 0.3676, -0.2730,  0.1105],
         [ 0.0000,  0.0000,  0.0000],
         [-0.8949, -0.1575,  0.7414]]], grad_fn=<EmbeddingBackward0>)

In [8]:
# Example of overriding the `pad` vector.
# The row at padding_idx is overwritten in place with ones. The write happens
# inside torch.no_grad() so that it is not recorded by autograd.
# (A bare `embedding.weight` expression used to sit before the `with` block;
# it was never displayed because only a cell's last expression is shown, so it
# has been dropped.)
padding_idx = 0
embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
with torch.no_grad():
    embedding.weight[padding_idx] = torch.ones(3)
# Last expression of the cell: shows the weights after the override.
embedding.weight

Parameter containing:
tensor([[ 1.0000,  1.0000,  1.0000],
        [-1.4291, -0.7111,  2.0442],
        [-0.3766,  0.2660, -1.0601]], requires_grad=True)

## EmbeddingBag

https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag

Computes sums or means of ‘bags’ of embeddings, without instantiating the intermediate embeddings. For bags of constant length, no per_sample_weights, no indices equal to padding_idx, and with 2D inputs, this class

- with mode="sum" is equivalent to Embedding followed by torch.sum(dim=1),
- with mode="mean" is equivalent to Embedding followed by torch.mean(dim=1),
- with mode="max" is equivalent to Embedding followed by torch.max(dim=1).

input (Tensor) – Tensor containing bags of indices into the embedding matrix.

offsets (Tensor, optional) – Only used when input is 1D. offsets determines the starting index position of each bag (sequence) in input.


In [9]:
# EmbeddingBag over a 10-row table of 3-dimensional vectors; each bag is
# reduced by summation.
embedding_sum = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='sum')
# Flat 1D list of indices holding two bags of four indices each;
# `offsets` gives the start position of every bag inside `input`.
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long)
offsets = torch.tensor([0, 4], dtype=torch.long)
embedding_sum(input, offsets)

tensor([[ 6.7699e-04,  5.5135e-01, -1.9214e+00],
        [ 1.5817e+00,  2.1055e+00, -2.4439e+00]],
       grad_fn=<EmbeddingBagBackward0>)

In [10]:
# EmbeddingBag with a padding index: entries equal to padding_idx (2) are
# left out of the per-bag sum.
embedding_sum = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='sum', padding_idx=2)
input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long)
# Three bags: input[0:3] = [2, 2, 2], input[3:6] = [2, 4, 3], input[6:8] = [2, 9].
offsets = torch.tensor([0, 3, 6], dtype=torch.long)
embedding_sum(input, offsets)

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 1.0605, -0.1163,  1.4681],
        [-0.0485,  0.2389,  2.0982]], grad_fn=<EmbeddingBagBackward0>)

In [11]:
# Show the full weight table; the row at padding_idx (row 2) is all zeros.
embedding_sum.weight

Parameter containing:
tensor([[ 0.1235, -1.6323, -0.9337],
        [-1.2385,  1.9082, -0.9013],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.4862, -0.5019,  0.4076],
        [ 0.5743,  0.3856,  1.0605],
        [ 0.7429,  0.3365, -0.2873],
        [-0.3412,  1.2754,  0.2810],
        [-0.1451,  0.4013,  1.2669],
        [ 0.6047, -0.3546, -1.8041],
        [-0.0485,  0.2389,  2.0982]], requires_grad=True)

In [12]:
# Every selected row is the padding row (index 2), which is all zeros,
# so the sum is the zero vector.
embedding_sum.weight[[2, 2, 2, 2]].sum(0)

tensor([0., 0., 0.], grad_fn=<SumBackward1>)

In [13]:
# Indices 4, 3, 2, 9 are input[4:8]. With offsets [0, 3, 6] they straddle the
# second bag ([2, 4, 3]) and the third bag ([2, 9]), so this sum equals rows 2
# and 3 of the EmbeddingBag output added together, not any single output row.
embedding_sum.weight[[ 4, 3, 2, 9]].sum(0)

tensor([1.0120, 0.1226, 3.5663], grad_fn=<SumBackward1>)

# Tagging案例

In [14]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1D tensor of integer ids.

    Args:
        seq: Iterable of tokens; each token is looked up in ``to_ix``.
        to_ix: Mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor holding one id per token, in input order.

    Raises:
        KeyError: If a token is not present in a plain-dict ``to_ix``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus: each entry pairs a tokenised sentence with its per-word tags.
# Tag legend: DET - determiner; NN - noun; V - verb
# (e.g. the word "The" is a determiner).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

# Build the vocabulary: every distinct word gets the next free integer id,
# in order of first appearance. Lookup is case-sensitive.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # setdefault only inserts when the word has no id yet.
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

# Fixed id for each tag.
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# Real models typically use something like 32 or 64 dimensions; the sizes are
# kept tiny here so the weights are easy to inspect while training.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [15]:
class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> LSTM -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the three sub-layers.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-word log-probabilities over the tags.

        Args:
            sentence: Tensor of word indices; its length is taken as the
                sequence length.

        Returns:
            Tensor with ``len(sentence)`` rows, log-softmax normalised
            along dim 1.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)
        # Reshape to (seq_len, 1, -1): the sentence is fed as a batch of one.
        lstm_in = embeds.view(seq_len, 1, -1)
        lstm_out, _ = self.lstm(lstm_in)
        # Drop the batch dimension again before the linear projection.
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)

In [16]:
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Baseline: scores of the untrained model on the first training sentence.
# Entry (i, j) of the output is the score of tag j for word i.
# This is inference only, so gradient tracking is switched off.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

# 300 epochs is far more than one would normally run; it is fine here only
# because the data set is a two-sentence toy.
for epoch in range(300):
    for sentence, tags in training_data:
        # 1. PyTorch accumulates gradients, so reset them for every example.
        model.zero_grad()

        # 2. Encode the words and the gold tags as index tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # 3. Forward pass.
        tag_scores = model(sentence_in)

        # 4. Loss, backward pass, and parameter update.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Scores of the trained model on the same sentence ("The dog ate the apple").
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # Row i holds the scores for word i; the predicted tag is the column with
    # the highest score. In the recorded run the per-row maxima are at columns
    # 0 1 2 0 1, i.e. DET NN V DET NN, which matches the gold tags.
    print(tag_scores)

tensor([[-1.3294, -0.9177, -1.0908],
        [-1.2932, -0.8999, -1.1425],
        [-1.3017, -0.9165, -1.1148],
        [-1.3686, -0.8720, -1.1164],
        [-1.4069, -1.0429, -0.9096]])
tensor([[-0.0309, -3.6581, -5.3773],
        [-3.9614, -0.0268, -4.9020],
        [-4.3294, -4.0429, -0.0312],
        [-0.0606, -3.3325, -3.7691],
        [-4.6619, -0.0145, -5.3108]])
