In [1]:
import torch
import torch.nn as nn

In [2]:
# Seed the global RNG so that the randomly initialised weights, the random
# input and the inter-layer dropout masks (and therefore everything printed
# below) are identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# The constructor arguments have the same meaning as for nn.RNN.
lstm = nn.LSTM(input_size=100,
               hidden_size=15,
               dropout=0.5,
               bias=True,
               batch_first=False,
               num_layers=2,
               bidirectional=True)

# With batch_first=False the input is laid out as (T, N, C): T is the sequence
# length, N the mini-batch size and C the number of input features.
x = torch.randn(20, 3, 100)

# Initial states have shape (L*D, N, hidden_size), where L is the number of
# stacked layers and D is 2 for a bidirectional LSTM and 1 otherwise.
# The shape is derived from the module and the input instead of being typed in
# by hand, so it cannot drift out of sync with the hyper-parameters above.
# For this configuration it evaluates to (4, 3, 15).
num_directions = 2 if lstm.bidirectional else 1
state_shape = (lstm.num_layers * num_directions, x.size(1), lstm.hidden_size)
h_0 = torch.ones(state_shape)
c_0 = torch.ones(state_shape)

# Pass custom initial states; when hx is omitted both default to all zeros.
out, (h, c) = lstm(x, hx=(h_0, c_0))
# out:  hidden state h of the LAST layer at EVERY time step; for a
#       bidirectional LSTM both directions are concatenated along dim 2.
# h, c: final hidden / cell state of EVERY layer and direction; for a
#       bidirectional LSTM dim 0 is doubled (forward and reverse per layer).

print("*************************")
print(h)
print("*************************")
print(c)
print("*************************")
print(out.shape, h.shape, c.shape)  # out.shape=(T, N, hidden_size * (1 or 2))

*************************
tensor([[[-0.0098,  0.0012,  0.1691,  0.4326,  0.5942, -0.7486,  0.0080,
          -0.0923,  0.1352,  0.1408,  0.5962, -0.2057, -0.1318, -0.1806,
           0.4041],
         [-0.3637, -0.0946, -0.6212, -0.2414,  0.5027, -0.8230,  0.8895,
          -0.0685,  0.0331,  0.0682,  0.0448,  0.0039,  0.1407, -0.7683,
           0.0399],
         [ 0.0332,  0.1562, -0.0280,  0.0450, -0.3809,  0.2577, -0.0266,
          -0.0845,  0.1121,  0.0688, -0.2330, -0.0334, -0.1280,  0.1132,
           0.3145]],

        [[ 0.2470,  0.0552,  0.1643,  0.2234,  0.4384, -0.0057,  0.0944,
           0.7099, -0.0050, -0.1675, -0.2325, -0.0625,  0.5927, -0.6400,
          -0.0778],
         [-0.4883,  0.0447,  0.2443, -0.0943, -0.1532, -0.0668,  0.0751,
           0.0209,  0.1994, -0.0493, -0.1111, -0.4012,  0.1440, -0.1130,
          -0.5076],
         [-0.0334,  0.3452,  0.0634,  0.0654, -0.1726,  0.2377, -0.0068,
          -0.2936, -0.1943, -0.4787,  0.3279, -0.0997,  0.1684,  0.14

In [3]:
# Note the difference between the learnable parameters (W, b) and the
# recurrent state (h, c): only the parameters are listed below.
#
# Each parameter stacks the blocks of the four gates along dim 0 in the order
# input (i), forget (f), cell candidate (g), output (o), which is why the
# leading dimension is 4 * hidden_size = 60.
#
# weight_ih_l0:         layer 0, forward  direction, input-to-hidden  (W_ii|W_if|W_ig|W_io)
# weight_hh_l0:         layer 0, forward  direction, hidden-to-hidden (W_hi|W_hf|W_hg|W_ho)
# weight_ih_l0_reverse: layer 0, reverse direction, input-to-hidden  (W_ii|W_if|W_ig|W_io)
# weight_hh_l0_reverse: layer 0, reverse direction, hidden-to-hidden (W_hi|W_hf|W_hg|W_ho)
# weight_ih_l1:         layer 1, forward  direction, input-to-hidden; its input is the
#                       concatenated forward/reverse output of layer 0, hence 2 * hidden_size = 30 columns
# weight_hh_l1:         layer 1, forward  direction, hidden-to-hidden
# bias_ih_* / bias_hh_*: the matching biases, stacked in the same i, f, g, o order
for name, param in lstm.named_parameters():
    print(name, '  shape=', param.shape)

weight_ih_l0   shape= torch.Size([60, 100])
weight_hh_l0   shape= torch.Size([60, 15])
bias_ih_l0   shape= torch.Size([60])
bias_hh_l0   shape= torch.Size([60])
weight_ih_l0_reverse   shape= torch.Size([60, 100])
weight_hh_l0_reverse   shape= torch.Size([60, 15])
bias_ih_l0_reverse   shape= torch.Size([60])
bias_hh_l0_reverse   shape= torch.Size([60])
weight_ih_l1   shape= torch.Size([60, 30])
weight_hh_l1   shape= torch.Size([60, 15])
bias_ih_l1   shape= torch.Size([60])
bias_hh_l1   shape= torch.Size([60])
weight_ih_l1_reverse   shape= torch.Size([60, 30])
weight_hh_l1_reverse   shape= torch.Size([60, 15])
bias_ih_l1_reverse   shape= torch.Size([60])
bias_hh_l1_reverse   shape= torch.Size([60])


In [4]:
# Display the stacked input-to-hidden weight matrix of layer 0 (forward direction).
# All the weights and biases are initialized from U(-\sqrt{k}, \sqrt{k}), where
# k = 1/hidden_size; with hidden_size=15 every entry lies within about +/-0.258.
lstm.weight_ih_l0

Parameter containing:
tensor([[-0.1862, -0.0267,  0.0782,  ..., -0.2249,  0.2192, -0.0939],
        [-0.2106, -0.1479,  0.1631,  ...,  0.1549, -0.0248,  0.0265],
        [ 0.1605,  0.0817, -0.1525,  ...,  0.1590, -0.0721, -0.1663],
        ...,
        [-0.2537, -0.0684, -0.1438,  ..., -0.0392, -0.0655,  0.1848],
        [-0.0364, -0.1411, -0.0901,  ...,  0.1642,  0.2404,  0.2294],
        [ 0.2247, -0.0577,  0.0722,  ..., -0.1829, -0.0912,  0.0052]],
       requires_grad=True)