In [1]:
import torch
import torch.nn as nn

In [2]:
# Trainable lookup table: 10 vocabulary entries, each mapped to a 4-dimensional vector.
embedding = nn.Embedding(
    10,  # num_embeddings: number of entries in the vocabulary
    4,  # embedding_dim: size of each output word vector
    # padding_idx: entries at this index do not contribute to the gradient, so
    # the vector stored there is not updated during training and stays a fixed
    # "pad". For a newly constructed Embedding that row starts as all zeros,
    # but it can be overwritten with another value to use as the padding vector.
    padding_idx=0,
    # scale_grad_by_freq (default False): after back-propagation, the gradient
    # of each word vector is divided by how often that word occurs in the
    # mini-batch. Rare words therefore keep comparatively larger gradients,
    # which helps their vectors converge faster.
    scale_grad_by_freq=True,
    # sparse (default False): when True, the gradient w.r.t. the weight matrix
    # is a sparse tensor, which is very useful for huge vocabularies. Sparse
    # gradients require an optimizer that supports them, e.g. optim.SGD,
    # optim.SparseAdam or optim.Adagrad.
    sparse=True,
)

In [3]:
# Display the learnable weight matrix. In the output below, row 0 (the
# padding_idx row) is all zeros and the other rows hold random values.
embedding.weight  # initialised internally via torch.nn.init.normal_ (random normal values)

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [-0.1920, -1.2236,  0.4971, -0.3048],
        [-0.6763, -0.4996,  0.3797,  1.4260],
        [ 0.3425, -0.1098,  0.5617, -1.2866],
        [-0.7909, -1.4648,  0.6293, -0.0617],
        [-1.4245,  1.8285,  0.0571, -0.8648],
        [-0.6446,  1.3633,  1.8080, -0.5659],
        [ 1.1007,  0.2975, -1.1786,  0.3688],
        [-0.6017,  0.0611,  2.1240,  0.2194],
        [ 1.0645,  0.2728,  0.1876,  1.8023]], requires_grad=True)

In [4]:
# Show the table dimensions, one per line: vocabulary size, then vector width.
print(embedding.num_embeddings, embedding.embedding_dim, sep="\n")

10
4


In [5]:
# Stand-in for a pretrained embedding matrix: 10 rows x 4 columns holding
# 0..39, so row i is [4i, 4i+1, 4i+2, 4i+3] and is easy to spot in lookups.
pre_em = torch.arange(40, dtype=torch.float32).reshape(10, 4)

# from_pretrained is a classmethod that builds the Embedding module from an
# existing matrix; sparse and scale_grad_by_freq mean the same as in the
# constructor. Its `freeze` argument defaults to True, which is why the weight
# displayed below carries no `requires_grad=True`.
embedding_pre = nn.Embedding.from_pretrained(
    embeddings=pre_em,
    scale_grad_by_freq=True,
    sparse=True,
)
embedding_pre.weight

Parameter containing:
tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.],
        [20., 21., 22., 23.],
        [24., 25., 26., 27.],
        [28., 29., 30., 31.],
        [32., 33., 34., 35.],
        [36., 37., 38., 39.]])

In [6]:
# The dimensions are taken from the shape of the pretrained matrix:
# number of rows first, then number of columns, one per line.
print(embedding_pre.num_embeddings, embedding_pre.embedding_dim, sep="\n")

10
4


In [7]:
# A 2 x 3 batch of integer token indices to look up in the embedding table.
entry = torch.tensor(((0, 1, 2), (5, 6, 7)), dtype=torch.int64)

In [8]:
# Each index selects the matching row of the weight matrix, which is how a
# token's word vector is obtained.
entry_vectors = embedding_pre(entry)
print(entry_vectors)
# Shapes: indices (2, 3) looked up in table (10, 4) --> result (2, 3, 4)
print(entry_vectors.shape)

tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[20., 21., 22., 23.],
         [24., 25., 26., 27.],
         [28., 29., 30., 31.]]])
torch.Size([2, 3, 4])


In [9]:
# The same lookup with a 1-D index tensor: one output row per index.
entry1 = torch.tensor([1, 2])
entry1_vectors = embedding_pre(entry1)
print(entry1_vectors)
# Shapes: indices (2,) looked up in table (10, 4) --> result (2, 4)
print(entry1_vectors.shape)

tensor([[ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
torch.Size([2, 4])
