## 代码实现

In [19]:
import torch
import math
import numpy
import torch.nn as nn
import torch.nn.functional as F

## word embedding

以序列模型（如机器翻译）为例构建序列，序列中的每个字符用其在词表中的索引表示
考虑 source sentence 和 target sentence

在真实项目中，关键是先拿到词表大小和特征大小（model_dim），才能生成 embedding

In [2]:
# Build a toy batch of source/target index sequences and look up their word
# embeddings. Index 0 is reserved for padding; real words use indices >= 1.

# Alternative: sample the sentence lengths at random instead of hard-coding them.
# batch_size = 2
# src_len = torch.randint(2, 5, (batch_size,))  # randint(low, high, shape)
# tgt_len = torch.randint(2, 5, (batch_size,))

# Sentence lengths: one entry per sentence in the batch, so batch_size is
# implicitly hard-coded to 2 here.
src_len = torch.tensor([2, 4]).to(torch.int32)
tgt_len = torch.tensor([4, 3]).to(torch.int32)

# Vocabulary sizes
max_num_src_words = 8
max_num_tgt_words = 8

# Feature (embedding) dimension
model_dim = 8

# Maximum (padded) sequence lengths
max_src_seq_len = 5
max_tgt_seq_len = 5

# Generate the index sequences. Each sentence is a vector of word indices drawn
# from [1, max_num_*_words) (randint's upper bound is exclusive), e.g. [4, 7] is
# a sentence of length 2. F.pad with (0, max_len - length) adds nothing on the
# left and right-pads with zeros up to the maximum length.
# The lengths are converted to plain Python ints so that they can be used
# directly as a size and as a pad amount.
src_seq = [
    F.pad(torch.randint(1, max_num_src_words, (length,)), (0, max_src_seq_len - length))
    for length in src_len.tolist()
]
tgt_seq = [
    F.pad(torch.randint(1, max_num_tgt_words, (length,)), (0, max_tgt_seq_len - length))
    for length in tgt_len.tolist()
]
# torch.stack adds a new leading dimension and concatenates along it, giving a
# (batch_size, max_seq_len) index tensor that serves as the model input.
src_seq = torch.stack(src_seq)
tgt_seq = torch.stack(tgt_seq)

# Embedding tables. The "+1" row is for the padding index 0, which is not part
# of the vocabulary itself.
# The resulting word embeddings have shape (batch_size, max_seq_len, model_dim).
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_dim)
src_embedding = src_embedding_table(src_seq)
# The target embedding must be looked up from the target sequence (the original
# code passed src_seq here by mistake).
tgt_embedding = tgt_embedding_table(tgt_seq)

src_seq.shape, src_embedding.shape

(torch.Size([2, 5]), torch.Size([2, 5, 8]))

## position embedding

$p_{pos, 2i} = \sin(\frac {pos} {10000^{2i/d}}), p_{pos, 2i + 1} = \cos(\frac {pos} {10000^{2i/d}})$

In [72]:
# Build the sinusoidal position embedding table and look up the position
# embeddings for every sequence in the batch.

# Assume every sequence (source and target) has a maximum length of 5.
max_position_len = 5

# pos_mat is the numerator (position index, one per row) and i_mat is the
# denominator (10000^(2i/d), one per even feature index) inside the sin/cos of
# the formula. The even feature indices are derived from model_dim rather than
# a hard-coded 8, so the table stays consistent if model_dim changes.
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)

# Vectorized construction of the position embedding table: even columns get
# sin, odd columns get cos. pos_mat / i_mat relies on broadcasting of a
# (max_position_len, 1) tensor against a (1, model_dim / 2) tensor.
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# Reference: the same table built with plain Python loops.
# pe_embedding_table = torch.zeros(max_position_len, model_dim)
# for pos in range(max_position_len):
#     for i in range(0, model_dim, 2):
#         pe_embedding_table[pos][i] = math.sin(pos / math.pow(10000, i / model_dim))
#         pe_embedding_table[pos][i+1] = math.cos(pos / math.pow(10000, i / model_dim))

# Wrap the table in an nn.Embedding whose weight is fixed (requires_grad=False).
# Other ways of applying the position encoding (e.g. adding it directly) may exist.
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# One row of positions 0..max_position_len-1 per sentence in the batch.
src_pos = torch.stack([torch.arange(max_position_len) for _ in src_len]).to(torch.int32)
tgt_pos = torch.stack([torch.arange(max_position_len) for _ in tgt_len]).to(torch.int32)
src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

src_pe_embedding, tgt_pe_embedding

(torch.Size([2, 5, 8]),
 tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
            1.0000e+00,  0.0000e+00,  1.0000e+00],
          [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
            9.9995e-01,  1.0000e-03,  1.0000e+00],
          [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
            9.9980e-01,  2.0000e-03,  1.0000e+00],
          [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
            9.9955e-01,  3.0000e-03,  1.0000e+00],
          [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
            9.9920e-01,  4.0000e-03,  9.9999e-01]],
 
         [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
            1.0000e+00,  0.0000e+00,  1.0000e+00],
          [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
            9.9995e-01,  1.0000e-03,  1.0000e+00],
          [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
        