In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
import torch.nn.functional as F

In [4]:
import os

In [5]:
import requests

In [6]:
import tiktoken

In [7]:
import math

# 1、获取数据集-由于无法访问huggingface手动上传

In [8]:
# todo nothing

# 2、读取trans_instance.txt文件

In [9]:
# Load the whole training corpus into memory as a single string.
# The encoding is pinned to UTF-8 (assumes the file is saved as UTF-8 — confirm):
# the text is Chinese, and relying on the platform default (e.g. GBK or cp1252 on
# Windows) can raise UnicodeDecodeError or silently produce mojibake.
with open('trans_instance.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [10]:
text[0:1000]

'小沈阳江西演唱会邀请沈春阳，\n明星刀郎的歌火遍大江南北\n2002年的第一场雪比2001年来得\n大模型张老师的粉丝全是正能量'

In [11]:
len(text)

61

# 3、引入tiktoken 将文字token化

In [12]:
# Load tiktoken's "cl100k_base" encoding; it is used below to turn text into token ids and back.
encoding = tiktoken.get_encoding("cl100k_base")

In [13]:
# Encode the raw corpus text into a sequence of integer token ids.
tokenized_text = encoding.encode(text)

In [14]:
# Convert the token ids to an int64 tensor so they can be sliced, stacked and used as embedding indices.
# NOTE(review): this rebinds `tokenized_text` from the encoder output to a tensor.
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long)

In [15]:
# Largest token id present in the corpus, as a plain Python number; used later to size the embedding table.
max_token_value = tokenized_text.max().item()

In [16]:
len(tokenized_text)

70

In [17]:
max_token_value

92877

# 4、参数设置

In [18]:
# Number of consecutive tokens in each sampled training window (the sequence length).
context_length = 16

In [19]:
# Width of every token-embedding and position-encoding vector.
d_model = 64

In [20]:
# Number of windows sampled into one batch.
batch_size = 4

In [21]:
# Number of attention heads. The d_model-wide feature vector of each token is split
# into num_heads slices of d_model // num_heads values (64 // 4 = 16 per head);
# the tokens themselves are not divided among the heads.
num_heads = 4

In [22]:
# Alias for the token-id tensor; the sampling code below reads from `data`.
data = tokenized_text

In [23]:
# Exclusive upper bound for window start indices: with idx < high, both the input
# window data[idx:idx+context_length] and the one-token-shifted target window
# data[idx+1:idx+context_length+1] stay inside `data`.
high = len(data) - context_length

In [24]:
high

54

# 5、初始化张量

## 参数low随机整数，最小值为零

## 参数high为随机整数的上界（不包含在内）：high = len(data) - context_length = 54，因此最大起始索引为53

In [25]:
# Draw batch_size random window start positions from [0, high); torch.randint excludes `high`.
# NOTE(review): no torch.manual_seed call is visible in this notebook, so the sampled
# indices (and every output derived from them) presumably change from run to run.
idxs = torch.randint(low=0, high=high, size=(batch_size,))

In [26]:
#idxs tensor([53, 25, 13,  7])

## 说明初始化一维张量的目的

### 在trans_instance.txt文件中，字符数61（含3个换行符）， token数70

### 随机张量tensor([41, 1, 32, 50]) 以41，1，32，50为token索引， 获取分别41，1，32，50为起始索引的token

### 例如以41为起点的索引，取长度为context_length=16的文本，各索引的值为41，42，43，…，56，每个索引对应了具体的token

# 6、初始化4批数据

## 数据结构是4行16列， 即4行token， 每行16个token（batch_size=4，context_length=16）

In [27]:
# Input batch: one row per sampled start position, each row holding
# context_length consecutive token ids -> shape (batch_size, context_length).
x_batch = torch.stack(
    tuple(data[start:start + context_length] for start in idxs.tolist())
)

In [28]:
x_batch

tensor([[70277, 61786, 78256,   242, 84150,   109, 38093, 45932,   222, 15225,
         31106,   230, 11881,    98, 83175, 42553],
        [78519,  6701,   222, 31938,   236,  9554, 15722,   234, 80699, 30250,
           235, 27384, 70277, 59563, 49409,   198],
        [ 1049,    17,  8107,  9554, 30537, 15120, 83324, 25132,   103, 57106,
          1049,    16,  8107, 37507, 50928,   198],
        [   17,  8107,  9554, 30537, 15120, 83324, 25132,   103, 57106,  1049,
            16,  8107, 37507, 50928,   198, 27384]])

In [29]:
# Target batch: the same windows as the inputs, shifted one token to the right,
# so position t of a target row is the token that follows position t of the input row.
y_batch = torch.stack(
    tuple(data[start + 1:start + 1 + context_length] for start in idxs.tolist())
)

In [30]:
y_batch

tensor([[61786, 78256,   242, 84150,   109, 38093, 45932,   222, 15225, 31106,
           230, 11881,    98, 83175, 42553, 31958],
        [ 6701,   222, 31938,   236,  9554, 15722,   234, 80699, 30250,   235,
         27384, 70277, 59563, 49409,   198,  1049],
        [   17,  8107,  9554, 30537, 15120, 83324, 25132,   103, 57106,  1049,
            16,  8107, 37507, 50928,   198, 27384],
        [ 8107,  9554, 30537, 15120, 83324, 25132,   103, 57106,  1049,    16,
          8107, 37507, 50928,   198, 27384, 54872]])

# 7、引入pandas查看原始数据

In [31]:
import pandas as pd

In [32]:
encoding.decode(x_batch[0].numpy())

'江西演唱会邀请沈春阳，\n'

In [33]:
encoding.decode(x_batch[1].numpy())

'星刀郎的歌火遍大江南北\n'

In [34]:
encoding.decode(x_batch[2].numpy())

'2002年的第一场雪比2001年来得\n'

In [35]:
encoding.decode(x_batch[3].numpy())

'2年的第一场雪比2001年来得\n大'

In [36]:
encoding.decode(y_batch[0].numpy())

'西演唱会邀请沈春阳，\n明'

In [37]:
encoding.decode(y_batch[1].numpy())

'刀郎的歌火遍大江南北\n200'

In [38]:
encoding.decode(y_batch[2].numpy())

'2年的第一场雪比2001年来得\n大'

In [39]:
encoding.decode(y_batch[3].numpy())

'年的第一场雪比2001年来得\n大模'

# 8、input Embedding 初始化

In [40]:
encoding.decode([83175]) 

'阳'

## 创建一个Embedding table （92878， 64）即行数为92878（max_token_value + 1），列数为64

In [41]:
# Trainable lookup table mapping a token id to a d_model-wide vector.
# Token ids are 0-based, so max_token_value + 1 rows are needed for the largest
# id seen in the corpus (92877) to be a valid row index -> Embedding(92878, 64).
input_embedding_lookup_table = nn.Embedding(max_token_value + 1, d_model)

In [42]:
input_embedding_lookup_table

Embedding(92878, 64)

In [43]:
input_embedding_lookup_table.weight.data #初始化的权重，这些初始值在训练过程中修正

tensor([[-1.6119,  0.2164, -0.3254,  ..., -1.0184, -0.2164, -2.6727],
        [-0.1531, -1.4887, -0.0038,  ..., -0.5432,  0.2821, -0.2274],
        [-0.2000,  0.5862, -0.6680,  ..., -1.2948,  1.9357, -1.1341],
        ...,
        [-0.3051, -0.7145, -1.7872,  ...,  0.7477, -0.6250, -2.0174],
        [ 0.8880, -1.8811, -1.6866,  ...,  1.5089, -1.5790,  1.0241],
        [-0.5316, -0.1321,  0.9559,  ..., -0.2754,  0.6680,  0.1626]])

In [44]:
# Look up one d_model-wide vector per token id:
# (batch_size, context_length) -> (batch_size, context_length, d_model).
x_batch_embedding = input_embedding_lookup_table(x_batch)
# The shifted target ids are embedded through the same table.
# NOTE(review): a next-token loss would normally consume the raw ids in y_batch — confirm this embedding is needed.
y_batch_embedding = input_embedding_lookup_table(y_batch)

In [45]:
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [46]:
x_batch_embedding

tensor([[[ 2.2273,  0.2084, -0.8678,  ...,  0.5926,  0.0512,  1.3771],
         [ 0.3578,  0.4508, -0.0893,  ...,  0.6174, -1.6597,  1.0343],
         [-1.2180, -0.7694, -0.2508,  ...,  0.7185, -1.3977,  0.0856],
         ...,
         [-1.4082,  1.1521, -1.9055,  ..., -2.2631,  0.7061, -0.1041],
         [ 0.5288, -0.1233,  0.6985,  ...,  1.3189, -1.4232, -1.1154],
         [ 0.4350, -0.1887, -0.9183,  ...,  1.2932, -0.1872,  0.6981]],

        [[ 0.4197,  1.0569, -0.2055,  ..., -1.3404, -0.4475, -0.2596],
         [ 0.7627, -0.1299,  1.3250,  ..., -1.8814,  0.1129, -0.9940],
         [-0.5610, -0.6925,  0.0296,  ..., -1.6117, -1.4970, -0.1494],
         ...,
         [ 0.7085,  1.3735,  1.8415,  ..., -1.6378, -0.6527, -0.1728],
         [ 0.7814,  0.4007, -0.2219,  ...,  1.0320, -0.7807,  1.5704],
         [ 1.2776, -0.8719,  2.4681,  ..., -0.3040,  0.9005, -0.5709]],

        [[-0.6146, -0.1115, -0.8440,  ..., -1.5480,  0.3118, -1.0434],
         [ 0.7364,  0.0763,  0.5302,  ...,  0

## 4 代表的是批大小，即4条样本； 16是行数即每条样本16个token； 64即64列，每个token对应64个维度

## 现在来说明input_embedding_lookup_table(x_batch)业务含义 ；
1. 以某次运行（当时context_length=10）的样本为例，idxs为随机采样，每次运行结果不同：x_batch[3].numpy() = [31106,   230, 83175, 70277, 61786, 78256,   242, 84150,   109, 38093]
2. 第一个token = 31106 与第二个token = 230 共同表示中文字 “沈”（cl100k_base 是字节级BPE，一个汉字可能被拆成多个token）
3. input_embedding_lookup_table(x_batch) 将token=31106映射到embedding表的第31106行；表中第 i 行就是 token id 为 i 的token所对应的64维向量

In [47]:
y_batch_embedding.shape

torch.Size([4, 16, 64])

# 9、positional embedding 加入位置信息

In [48]:
# Empty (context_length, d_model) table; the sinusoidal position values are written into it in a later cell.
position_encoding_lookup_table = torch.zeros(context_length, d_model)

In [49]:
position_encoding_lookup_table

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## 目标是初始化一个16x64（context_length x d_model）的二维矩阵，目的就是给每一个位置（token在序列中的位置）分配一个64维的位置编码向量

In [50]:
# Sinusoidal positional encoding, built from scratch inside this cell.
#
# 1. Re-create the (context_length, d_model) table here. The last statement of
#    this cell rebinds the same name to a batched, expanded 3-D view; without a
#    fresh allocation, running the cell a second time would try to write the 2-D
#    sin/cos values into that view and fail.
position_encoding_lookup_table = torch.zeros(context_length, d_model)

# 2. Positions 0 .. context_length-1 as a column vector, shape (context_length, 1).
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)

# 3. Frequency scale per (sin, cos) column pair: 10000^(-2i / d_model), shape (d_model // 2,).
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

# 4. Interleave sine and cosine; position * div_term broadcasts to (context_length, d_model // 2).
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)  # even columns
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)  # odd columns

# 5. Add a leading batch dimension -> (batch_size, context_length, d_model).
#    expand() returns a view, so every batch entry shares the same underlying table.
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)

In [51]:
position_encoding_lookup_table

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  6.8156e-01,  ...,  1.0000e+00,
           1.3335e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  9.9748e-01,  ...,  1.0000e+00,
           2.6670e-04,  1.0000e+00],
         ...,
         [ 4.2017e-01,  9.0745e-01, -3.1822e-01,  ...,  1.0000e+00,
           1.7336e-03,  1.0000e+00],
         [ 9.9061e-01,  1.3674e-01, -8.7899e-01,  ...,  1.0000e+00,
           1.8669e-03,  1.0000e+00],
         [ 6.5029e-01, -7.5969e-01, -9.6821e-01,  ...,  1.0000e+00,
           2.0003e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  6.8156e-01,  ...,  1.0000e+00,
           1.3335e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  9.9748e-01,  ...,  1.0000e+00,
           2.6670e-04,  1.0000e+00],
         ...,
         [ 4.2017e-01,  9

## 增加位置编码

In [52]:
# Add position information to the token embeddings; both operands are (batch_size, context_length, d_model).
x = x_batch_embedding + position_encoding_lookup_table

In [53]:
x

tensor([[[ 2.2273,  1.2084, -0.8678,  ...,  1.5926,  0.0512,  2.3771],
         [ 1.1992,  0.9911,  0.5923,  ...,  1.6174, -1.6596,  2.0343],
         [-0.3087, -1.1855,  0.7467,  ...,  1.7185, -1.3974,  1.0856],
         ...,
         [-0.9880,  2.0595, -2.2237,  ..., -1.2631,  0.7078,  0.8959],
         [ 1.5194,  0.0135, -0.1805,  ...,  2.3189, -1.4213, -0.1154],
         [ 1.0853, -0.9484, -1.8865,  ...,  2.2932, -0.1852,  1.6981]],

        [[ 0.4197,  2.0569, -0.2055,  ..., -0.3404, -0.4475,  0.7404],
         [ 1.6042,  0.4104,  2.0066,  ..., -0.8814,  0.1130,  0.0060],
         [ 0.3483, -1.1086,  1.0271,  ..., -0.6117, -1.4968,  0.8506],
         ...,
         [ 1.1287,  2.2809,  1.5233,  ..., -0.6378, -0.6509,  0.8272],
         [ 1.7720,  0.5375, -1.1009,  ...,  2.0319, -0.7788,  2.5704],
         [ 1.9279, -1.6316,  1.4998,  ...,  0.6960,  0.9025,  0.4291]],

        [[-0.6146,  0.8885, -0.8440,  ..., -0.5480,  0.3118, -0.0434],
         [ 1.5779,  0.6166,  1.2118,  ...,  1

In [54]:
# Apply the same positional encoding to the embedded target tokens.
y = y_batch_embedding + position_encoding_lookup_table

In [55]:
y

tensor([[[ 0.3578,  1.4508, -0.0893,  ...,  1.6174, -1.6597,  2.0343],
         [-0.3766, -0.2291,  0.4308,  ...,  1.7185, -1.3976,  1.0856],
         [ 0.8002, -0.9211,  0.6341,  ...,  1.6799, -1.0962,  0.5426],
         ...,
         [ 0.9490,  0.7842,  0.3802,  ...,  2.3189, -1.4215, -0.1154],
         [ 1.4256, -0.0520, -1.7973,  ...,  2.2932, -0.1853,  1.6981],
         [-0.0983, -1.2139, -2.0702,  ...,  2.2897,  1.8374,  0.3341]],

        [[ 0.7627,  0.8701,  1.3250,  ..., -0.8814,  0.1129,  0.0060],
         [ 0.2805, -0.1522,  0.7111,  ..., -0.6117, -1.4969,  0.8506],
         [-0.2667, -0.5526,  0.4538,  ...,  1.2144,  1.7487,  2.1098],
         ...,
         [ 1.2015,  1.3082, -0.5401,  ...,  2.0320, -0.7789,  2.5704],
         [ 2.2682, -0.7351,  1.5891,  ...,  0.6960,  0.9024,  0.4291],
         [ 0.0357, -0.8712, -1.8122,  ..., -0.5480,  0.3138, -0.0434]],

        [[ 0.7364,  1.0763,  0.5302,  ...,  1.2749, -1.6080,  0.2292],
         [ 1.5710, -0.9185,  0.5799,  ...,  0

In [56]:
x.shape, y.shape

(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [57]:
pd.DataFrame(x[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,2.227319,1.208381,-0.867762,-0.291035,-0.343142,-1.160986,0.202412,0.810499,1.222083,1.075176,...,1.310896,-0.040836,-1.200476,1.212839,0.039117,-0.290366,2.635434,1.592633,0.051152,2.37705
1,1.199244,0.991056,0.592263,0.933342,0.045801,1.601769,-0.794732,1.100391,0.536851,0.342772,...,-1.421795,0.430002,-2.296274,0.682391,-1.125544,1.03978,-0.736582,1.617381,-1.659551,2.03428
2,-0.308749,-1.185501,0.746684,-0.780624,0.083542,1.153145,-0.036034,0.175532,-0.025557,1.172675,...,-1.304787,0.517356,-0.768915,2.453286,-0.132233,1.682389,-0.506289,1.718456,-1.397431,1.085632
3,0.032044,-1.494931,0.414858,-0.925142,2.629792,-1.14453,-0.685785,-0.326149,0.158879,-0.114806,...,0.345901,0.649032,2.280771,0.029028,0.074907,-0.605091,1.007647,1.679901,-1.096116,0.542577
4,-1.060789,-0.412721,-0.923882,-0.723193,-0.184083,0.67869,1.529256,0.018259,0.129565,0.68082,...,-0.431877,0.779191,0.585883,0.382702,-0.148482,2.479179,1.081517,0.404456,-1.075124,1.786806
5,-0.94892,-1.361446,-0.797091,-0.989228,0.189803,-2.075689,1.784138,-0.658245,1.799834,-1.738389,...,-1.264379,1.780471,-0.324576,0.633097,0.180765,0.485122,-0.88918,-0.075535,0.407052,1.805238
6,-0.597395,1.875761,-1.45078,0.821889,0.657613,-0.428291,-1.10694,-1.103467,2.903076,0.899298,...,0.127273,1.778618,0.187178,-0.191826,-0.566161,0.562522,0.79593,1.216661,0.23063,1.741593
7,0.359223,0.206921,1.131576,-1.144079,-1.432628,-1.236437,-0.190696,-1.427646,0.236537,-1.275339,...,-0.135128,-0.305889,1.485477,-0.562883,0.143539,0.067994,0.391188,1.431948,-0.940127,1.029929
8,0.428367,-0.837962,-0.250654,1.326563,0.97491,0.246996,0.599119,0.319862,1.503827,1.437559,...,-1.598417,-1.088883,1.673712,-0.45488,0.851233,2.68314,0.223131,-0.611667,-1.49596,0.850562
9,0.761187,0.178431,-0.929669,0.898708,0.026683,2.011811,0.097315,-1.31495,-1.103001,0.943769,...,-0.419356,0.280893,-0.413504,0.646797,-0.182306,3.690764,0.536161,2.294566,-0.245955,1.193938


In [58]:
pd.DataFrame(y[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.357773,1.450754,-0.089299,1.201581,-0.487368,1.75576,-1.204041,1.187995,0.225868,0.392357,...,-1.422216,0.430002,-2.29659,0.682391,-1.125781,1.03978,-0.73676,1.617381,-1.659685,2.03428
1,-0.376576,-0.229051,0.430766,-0.119812,-0.28542,1.567691,-0.373629,0.422995,-0.3057,1.316512,...,-1.305209,0.517357,-0.769232,2.453286,-0.13247,1.682389,-0.506467,1.718456,-1.397565,1.085632
2,0.800221,-0.921085,0.634066,-0.226267,2.53867,-0.597101,-0.892516,0.037816,-0.062642,0.109018,...,0.34548,0.649032,2.280455,0.029028,0.07467,-0.605091,1.007469,1.679901,-1.096249,0.542577
3,-0.162866,-0.74907,-0.287149,-0.361187,0.030699,1.190404,1.489609,0.434956,-0.011366,0.962436,...,-0.432298,0.779191,0.585567,0.382702,-0.148719,2.479179,1.081339,0.404456,-1.075257,1.786806
4,-0.746798,-2.298752,-0.084425,-1.158299,0.644339,-1.75729,1.918523,-0.261824,1.753469,-1.426909,...,-1.2648,1.780472,-0.324892,0.633097,0.180528,0.485122,-0.889358,-0.075535,0.406919,1.805238
5,-1.276904,1.199253,-1.044511,0.212444,1.211916,-0.401266,-0.822069,-0.79678,2.955874,1.209752,...,0.126851,1.778619,0.186862,-0.191826,-0.566398,0.562522,0.795752,1.216661,0.230497,1.741593
6,-0.577179,0.413189,1.013493,-1.866944,-0.949274,-1.509111,0.194748,-1.264426,0.383264,-0.996698,...,-0.13555,-0.305888,1.48516,-0.562882,0.143301,0.067994,0.39101,1.431948,-0.94026,1.029929
7,0.095995,0.06144,-0.82974,0.878079,1.238451,-0.241397,1.017604,0.311017,1.729931,1.656754,...,-1.598838,-1.088881,1.673396,-0.454879,0.850996,2.683141,0.222953,-0.611666,-1.496093,0.850562
8,1.338427,0.944061,-1.659091,0.965207,-0.010755,1.458114,0.475519,-1.49431,-0.819942,1.081781,...,-0.419777,0.280895,-0.41382,0.646798,-0.182543,3.690764,0.535984,2.294566,-0.246089,1.193938
9,0.837001,-0.865825,-1.15234,0.101857,1.312144,-0.793945,0.721348,-0.83736,-0.370198,-0.014134,...,-1.80527,1.317155,0.458655,2.032854,0.741658,0.862796,1.827389,1.268273,0.015452,0.840279


# 10、Q，K，V 初始化

In [59]:
# Learnable linear layer (weight and bias) that projects each d_model-wide input vector to its query vector.
Wq = nn.Linear(d_model, d_model)

In [60]:
Wq

Linear(in_features=64, out_features=64, bias=True)

# 使用Linear实现线性变换
## 数学应用方程：
$y = xW^{T} + b$，其中 $W$ 是权重矩阵，$b$ 是偏置向量。

## 业务意义在于：
给出多个样本特征，调整权重和偏置，输出新的特征

## 举例应用说明：
房价预测应用场景，给出多套房子的样本 1、[80.0, 2, 2010],   # 80㎡, 2室, 2010年建 2、[120.0, 3, 2015],  # 120㎡, 3室, 2015年建， 3、[95.0, 2, 2005],   # 95㎡, 2室, 2005年建，4、[150.0, 4, 2020]   # 150㎡, 4室, 2020年建；1，2，3，4 房屋对应的售价为[180.0, 320.0, 220.0, 450.0]（单位万元）

## 目标应用说明：
预测[110.0, 3, 2018] # 110㎡, 3室, 2018年建 房屋的售价

## Linear函数说明：
Wq = nn.Linear(输入特征数, 输出特征数) ， 在本实例中输入特征是3个，分别是1、房屋面积，2、房屋室数，3、房屋的建造时间；输出特征是1个，房屋售价，即 nn.Linear(3, 1)


# 在Transformer 应用中的说明
nn.Linear(d_model, d_model) 输入特征64个，输出特征64个；

In [61]:
# Learnable linear layer (weight and bias) that projects each d_model-wide input vector to its key vector.
Wk = nn.Linear(d_model, d_model)

In [62]:
Wk

Linear(in_features=64, out_features=64, bias=True)

In [63]:
# Learnable linear layer (weight and bias) that projects each d_model-wide input vector to its value vector.
Wv = nn.Linear(d_model, d_model)

In [64]:
Wv

Linear(in_features=64, out_features=64, bias=True)

In [65]:
# Queries: nn.Linear acts on the last dimension, so the shape stays (batch_size, context_length, d_model).
Q = Wq(x)

In [66]:
Q, Q.shape

(tensor([[[-0.3730,  0.2596,  0.0551,  ..., -1.1778, -0.4900,  0.2701],
          [ 0.0948, -0.0452, -1.4833,  ..., -1.0527,  0.3641, -0.9317],
          [ 1.3326, -0.1049,  0.7766,  ..., -0.7566,  1.0303, -1.4135],
          ...,
          [ 0.7365, -0.3461,  0.7025,  ...,  0.0836,  0.3765, -0.1521],
          [ 0.5027,  0.8307,  0.1504,  ..., -1.0211, -0.0636,  0.1331],
          [ 0.8030, -1.1224, -0.3981,  ..., -0.0471, -0.1667,  1.0940]],
 
         [[ 0.7128,  0.5455,  0.6433,  ...,  0.5088,  1.4039,  0.4895],
          [ 1.0800,  1.1886,  0.3522,  ..., -1.7433,  1.3344, -1.0574],
          [-0.0990,  0.0650,  0.3896,  ..., -2.1305, -0.0659, -0.4703],
          ...,
          [ 0.4792,  1.1859,  0.0338,  ...,  0.3888,  1.1047, -0.9642],
          [-0.6032, -1.5424,  0.1622,  ...,  0.2111, -0.2859, -0.1583],
          [ 1.8390,  0.9554, -0.0900,  ..., -1.2859, -0.2285, -0.3110]],
 
         [[-0.3380,  0.5130,  0.0695,  ..., -0.9158,  0.6343, -0.2454],
          [ 0.1843, -0.3056,

In [67]:
# Keys: projected from the same position-aware inputs as the queries, shape (batch_size, context_length, d_model).
K = Wk(x)

In [68]:
K, K.shape

(tensor([[[ 1.8046e-01,  1.1139e+00,  2.2179e-02,  ...,  1.0466e-01,
            4.3003e-01,  3.4449e-01],
          [ 3.5371e-01, -7.3697e-02,  2.1858e-01,  ...,  5.7132e-02,
            1.2799e+00,  3.1169e-01],
          [ 9.3529e-01,  4.0174e-01,  1.3239e+00,  ..., -6.9774e-01,
            2.2835e-01, -3.3770e-01],
          ...,
          [ 6.6475e-01,  6.4982e-01, -5.3610e-01,  ...,  1.2636e-01,
           -1.0674e-01, -1.0265e+00],
          [-1.8350e-01,  2.5860e-01,  7.2985e-01,  ..., -6.8019e-01,
            1.3254e+00,  1.1527e+00],
          [-1.2009e+00,  6.1653e-01, -3.5848e-01,  ...,  2.8922e-01,
           -1.3731e-01,  9.8321e-02]],
 
         [[-1.6129e-01,  4.3212e-01,  4.1891e-01,  ...,  2.1533e-01,
            5.2450e-01, -6.8731e-01],
          [ 2.8928e-01,  1.8965e+00,  3.4286e-01,  ..., -4.9676e-01,
           -4.6857e-01,  6.1583e-01],
          [-1.1168e-01,  2.0521e-01,  9.8527e-01,  ...,  1.0844e+00,
            7.2728e-01,  9.9427e-01],
          ...,
    

In [69]:
# Values: projected from the same position-aware inputs, shape (batch_size, context_length, d_model).
V = Wv(x)

In [70]:
V, V.shape

(tensor([[[ 0.2101, -0.2642, -0.0187,  ..., -0.3991,  1.3007,  0.2192],
          [ 0.9700,  0.2160,  0.9357,  ...,  0.0524, -0.2690, -0.7381],
          [ 0.8299,  1.0521, -0.3755,  ..., -0.5164,  0.2159,  0.9648],
          ...,
          [-0.5894, -0.2769,  0.2222,  ..., -1.0624,  0.3319, -0.3580],
          [ 0.6967, -0.2212,  0.1251,  ..., -0.9125,  1.0350, -0.2970],
          [ 0.4239, -0.9956,  0.5543,  ..., -0.8051,  0.7492, -0.1003]],
 
         [[ 1.2150, -0.0367,  0.3736,  ..., -0.1700,  0.3376,  0.7344],
          [ 0.8544,  0.0643, -0.3343,  ..., -0.1194, -0.0587, -0.2945],
          [ 0.8146, -0.7014,  0.8302,  ...,  0.9869, -0.4773, -0.4093],
          ...,
          [ 0.8314,  0.8159,  0.5155,  ..., -0.1332,  1.2528,  0.6942],
          [ 0.3311, -1.5904,  1.1245,  ..., -1.5752,  0.9364, -0.1469],
          [-0.2969,  0.9127,  0.0955,  ..., -0.4721,  1.1855,  0.9283]],
 
         [[ 0.9342, -1.3837,  0.4140,  ..., -0.7778,  0.6217,  0.7750],
          [ 1.2607,  0.1890,

# 矩阵相乘法则
左矩阵A:维度m x n (m行，n列)
右矩阵B:维度n x p（n行，p列）
乘积矩阵C:维度m x p（m行，p列） $C = A \times B$；可以参考https://www.processon.com/diagraming/689f04fbacb0cb4e0f75bbb1 实例

# 11、multi head Q,K,V多头注意力机制

In [71]:
# Split the queries into attention heads:
#   (batch_size, context_length, d_model)
#   -> view    -> (batch_size, context_length, num_heads, d_model // num_heads)
#   -> permute -> (batch_size, num_heads, context_length, d_model // num_heads)
# The projection is taken from Wq(x) rather than reshaping the existing Q in place:
# `Q = Q.view(...)` is self-referential, so re-running the cell would apply view()
# to the already permuted (non-contiguous, differently shaped) tensor.
Q = Wq(x).view(batch_size, context_length, num_heads, d_model // num_heads).permute(0, 2, 1, 3)

## 多头注意力业务含义说明
极其重要，注意深刻理解其实现的目标