In [1]:
pip install torch torchvision torchaudio

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import requests

In [3]:
#导入需要运用的训练数据集
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_fro_convincing_and_selling/resolve/main/sales_textbook.txt?download=true'
    r = requests.get(url)
    with open('sales_textbook.txt', 'wb') as f:
        f.write(r.content)

with open('sales_textbook.txt', 'r') as f:
    text = f.read()

In [4]:
pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
#hyperparameters超参数
batch_size = 4#一次训练的样本数量
context_length = 16#文本中的单词或短语的数量
d_model = 64#模型维度，即每个单词或短语的特征数量
num_heads = 4#注意力头的数量

In [6]:
#导入OpenAI的tiktoken
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

In [7]:
tokenized_text = encoding.encode(text)
#将文本数据转换为token(即文本中的单词或短语赋予一个唯一的数字)
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long)#将token转换为tensor
max_token_value = tokenized_text.max().item()#样本数据集中可能出现的文字所对应token的最大值

数据集中的数据会切成两份，80%-90%用于训练，10%-20%用于测试

In [8]:
#将数据集分为训练集和测试集
tokenized_text_train = tokenized_text[:int(len(tokenized_text)*0.9)]#取前90%
tokenized_text_test = tokenized_text[int(len(tokenized_text)*0.9):]#取后10%作为测试

In [9]:
data = tokenized_text_train
idxs = torch.randint(low=0, high=len(data)-context_length, size=(batch_size,))
#torch.randint 是 PyTorch 中的一个函数，用于生成一定范围内的随机整数
#low=0 表示随机数的最小值是 0
#high=len(data)-context_length 表示随机数的最大值是 len(data)-context_length。这里的 context_length 是一个超参数，表示每个子序列的长度。通过减去 context_length，确保生成的索引不会超出数据范围，从而避免在抽取子序列时出现越界错误。
#size=(batch_size,) 表示生成一个大小为 batch_size 的一维张量，其中 batch_size 是另一个超参数，表示每次训练所用的样本数量。
#因此，idxs 是一个包含 batch_size 个随机整数的张量，这些整数表示从 data 中随机抽取子序列的起始索引。



x_batch = torch.stack([data[idx:idx+context_length] for idx in idxs])
#torch.stack 是 PyTorch 中的一个函数，用于将一系列张量堆叠成一个更高维度的张量。
#x_batch 是一个包含 batch_size 个子序列的张量，每个子序列的长度为 context_length。
#idxs 是一个包含 batch_size 个随机整数的张量，这些整数表示从 data 中随机抽取子序列的起始索引。
#通过使用列表推导式，我们可以根据 idxs 中的索引从 data 中抽取子序列，并将这些子序列存储在 x_batch 中。
#x_batch.shape#查看x_batch的形状

y_batch = torch.stack([data[idx+1:idx+context_length+1] for idx in idxs])
#y_batch 是一个包含 batch_size 个子序列的张量，每个子序列的长度为 context_length。
#idxs 是一个包含 batch_size 个随机整数的张量，这些整数表示从 data 中随机抽取子序列的起始索引。
#通过使用列表推导式，我们可以根据 idxs 中的索引从 data 中抽取子序列，并将这些子序列存储在 y_batch 中。
#y_batch.shape#查看y_batch的形状


查看一下选出来的x_batch

In [10]:
import pandas as pd
pd.DataFrame(x_batch[0].numpy())
#将x_batch的第一个子序列转换为numpy数组，并使用pandas的DataFrame函数将其转换为数据框，以便于查看和可视化。
encoding.decode(x_batch[0].numpy())#将数字对应的文字解码出来

' trust in your ability to deliver.\n6. Communication and Active Listening: Effective communication'

In [11]:
#定义input embedding table
input_embedding_lookup_table = nn.Embedding(max_token_value+1, d_model)
#input_embedding_lookup_table.weight.data#这些数值就是weight，也就是后面我们需要更新的概率值

x_batch_embedded = input_embedding_lookup_table(x_batch)
#x_batch_embedded 是一个形状为 (batch_size, context_length, d_model) 的张量，其中每个元素都是对应的嵌入向量。
y_batch_embedded = input_embedding_lookup_table(y_batch)
#x_batch_embedded.shape，y_batch_embedded.shape

到此已经完成input embedding步骤

In [12]:
#对input embedding添加位置信息
import math
position_embedding_lookup_table = torch.zeros(context_length, d_model)#先建一个 16*64的全0矩阵
position = torch.arange(0,context_length,dtype=torch.float).unsqueeze(1)
#利用论文中的位置信息函数，将位置信息添加到position_embedding_lookup_table中
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_embedding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_embedding_lookup_table[:, 1::2] = torch.cos(position * div_term)
position_encoding_lookup_table = position_embedding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)



In [13]:
x = x_batch_embedded + position_encoding_lookup_table
y = y_batch_embedded + position_encoding_lookup_table
pd.DataFrame(x[0].detach().numpy())


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-1.631578,1.208147,1.066374,-0.972546,-0.073038,1.1184,1.228194,1.530867,-0.372193,0.453301,...,0.288694,2.137652,-0.955524,0.199224,1.807435,0.134501,-1.083578,1.46164,-0.113456,0.216361
1,1.769613,1.860013,0.687189,0.804351,1.192128,0.111826,-1.53433,1.699373,1.291381,-0.384467,...,0.213289,1.274977,1.178442,2.828774,0.405725,3.375164,0.038646,1.713534,0.419395,0.892609
2,0.902156,-0.105907,2.373974,-1.097951,1.714396,0.225811,1.618515,1.364705,0.563326,0.633518,...,-2.137517,1.462792,0.79149,-1.694142,1.925355,-0.138269,0.802537,2.137997,0.190213,-0.005623
3,-0.541566,-1.644048,0.839332,0.314808,0.496985,0.720602,1.251098,1.269858,3.854401,1.213707,...,-0.328406,1.117209,-2.315888,2.12185,-1.246513,0.435249,0.510102,0.015933,0.033678,1.914363
4,-1.145767,-0.783517,0.58097,-0.843287,-0.555262,1.211721,-0.398026,-0.738013,1.274947,0.844461,...,0.465796,1.748554,0.773532,-0.673086,-1.853015,-0.72134,0.029307,1.414728,1.515731,1.791397
5,0.000873,0.573091,-3.899822,-1.318832,0.821145,-2.036038,-1.315277,-0.147531,1.655343,1.411178,...,-0.473734,0.012276,0.615168,1.455012,0.231166,-0.556186,-0.301538,1.740819,-1.555349,0.448388
6,-1.409212,1.525051,-0.78159,-0.262635,-0.232667,-1.322772,-0.294316,-1.710057,0.265677,1.402672,...,-0.058053,-1.883985,1.47731,0.663658,0.462848,0.06333,-0.647197,1.843816,-0.924792,2.334642
7,-0.745395,0.65147,0.752452,0.150641,0.855287,0.204695,0.064684,1.403398,-0.219055,-0.361115,...,1.035372,0.410317,-0.211262,1.59267,-0.468504,0.736954,-0.729816,0.98982,-0.486954,0.624851
8,-0.265687,0.02439,0.265828,0.89207,-1.941305,0.417459,0.033597,0.100921,0.95981,-0.923657,...,-1.32717,0.836273,1.840666,0.094523,-0.093598,0.574309,0.209138,2.339784,-1.325809,2.078108
9,-0.721756,0.326487,1.209053,-0.079823,-2.108743,1.039866,-1.303569,-1.742439,-0.324238,-1.230549,...,-1.818912,0.43732,-1.455636,1.711977,0.149808,1.188665,-0.223703,3.268373,2.548481,-0.178433


竖行代表维度，横行代表对应文字的数字，其中的元素是对应的位置信息与概率之和。已经得到X

In [14]:
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)
#计算QKV
Q = Wq(x)
K = Wk(x)
V = Wv(x)

#Q.shape,K.shape,V.shape

In [15]:
#将QKV切成4份
Q = Q.view(batch_size, context_length, num_heads, d_model // num_heads).permute(0, 2, 1, 3)
#num_heads：注意力头的数量，在多头注意力机制中，查询矩阵会被分成多个头。
#d_model // num_heads：每个注意力头的维度，d_model 是模型的维度，通过除以 num_heads 来确定每个头的维度。
#permute(0, 2, 1, 3)：将 Q 的维度从 (batch_size, context_length, num_heads, d_model // num_heads) 转换为 (batch_size, num_heads, context_length, d_model // num_heads)。也即转置
K = K.view(batch_size, context_length, num_heads, d_model // num_heads).permute(0, 2, 1, 3)
V = V.view(batch_size, context_length, num_heads, d_model // num_heads).permute(0, 2, 1, 3)

scale步骤

In [16]:
output = Q @ K.transpose(-2, -1)/math.sqrt(d_model // num_heads)


mask蒙版步骤

In [17]:
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()
#torch.ones(context_length, context_length):这个函数创建一个形状为 (context_length, context_length) 的二维张量，其中所有元素都是 1。context_length 是一个整数，表示矩阵的行数和列数。
#torch.triu 函数返回一个上三角矩阵，即保留输入张量主对角线及其上方的元素，其他元素设为 0。参数 diagonal=1 表示保留主对角线上方一行（即次对角线）及其上方的元素，主对角线及其下方的元素设为 0。
#bool():这个方法将上三角矩阵中的所有元素转换为布尔类型。在 PyTorch 中，非零值会被转换为 True，零值会被转换为 False。
output.masked_fill_(mask, float('-inf'))
#masked_fill_：这是PyTorch张量的一个原地（in-place）操作方法。原地操作意味着这个方法会直接修改调用它的张量，而不是创建一个新的张量。
#float('-inf')：这是Python中表示负无穷大的方式。在数值计算中，负无穷大通常用于表示一个非常小的值，以确保在后续计算中这些位置的元素不会对结果产生影响。
pd.DataFrame(output[0,0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.42642,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
1,0.261584,0.789852,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
2,0.480668,0.547434,-0.626966,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
3,0.058816,0.494193,-0.286644,-0.498757,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,-0.203071,-0.536371,-0.376008,-0.56416,0.095143,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
5,0.613564,0.427384,0.419123,0.241304,0.411436,-0.105694,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
6,0.521915,0.111966,0.400483,0.59321,0.275912,0.283811,-0.014465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
7,0.287379,0.464065,-0.275591,0.000477,0.371527,-0.195241,0.846219,-0.145945,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
8,0.721783,0.044788,0.161681,0.12616,0.548578,-0.117208,0.048876,0.081696,-0.021005,-inf,-inf,-inf,-inf,-inf,-inf,-inf
9,0.235345,0.313546,0.101877,0.078027,0.150398,-0.648046,0.013173,-0.470336,-0.264434,-0.438374,-inf,-inf,-inf,-inf,-inf,-inf


In [18]:
attention_scroe = F.softmax(output, dim=-1)
#将上面矩阵通过softmax把inf值变成0
attention_scroe

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3709, 0.6291, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4168, 0.4455, 0.1377,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.1015, 0.0746, 0.0693,  ..., 0.0696, 0.0000, 0.0000],
          [0.1114, 0.0428, 0.0624,  ..., 0.0463, 0.0202, 0.0000],
          [0.1153, 0.0872, 0.0323,  ..., 0.0527, 0.0816, 0.0226]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.7123, 0.2877, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2690, 0.1894, 0.5416,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0448, 0.0583, 0.0299,  ..., 0.0979, 0.0000, 0.0000],
          [0.0680, 0.0453, 0.0546,  ..., 0.0382, 0.1680, 0.0000],
          [0.0448, 0.0932, 0.0238,  ..., 0.0261, 0.0301, 0.0375]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5003, 0.4997, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4000, 0.2539, 0.3461,  ..., 0

In [19]:
#和原矩阵相乘
A = attention_scroe @ V
A.shape

torch.Size([4, 4, 16, 16])

Contatenate步骤

In [20]:
A = A.transpose(1,2).reshape(batch_size, -1, d_model)

In [21]:
#创造权重矩阵
Wo = nn.Linear(d_model, d_model)
Output = Wo(A)
Output.shape

torch.Size([4, 16, 64])

残差连接1

In [22]:
Output = Output + x

In [30]:
mean = torch.mean(Output)
print(mean.item())

0.0005442940164357424


LayerNorm步骤 第一次归一化

In [24]:
layer_norm = nn.LayerNorm(d_model)
layer_norm_Output = layer_norm(Output)

前馈网络

In [31]:
mean = torch.mean(layer_norm_Output)
print(mean.item())

0.0


In [25]:
Output = nn.Linear(d_model, d_model*4)(layer_norm_Output)
Output = nn.ReLU()(Output)
Output = nn.Linear(d_model*4, d_model)(Output)
#残差连接2，这里连接的是LayerNorm后的输出
Output = Output + layer_norm_Output
Output.shape


torch.Size([4, 16, 64])

LayerNorm第二次归一化

In [26]:
Output = layer_norm(Output)

至此完成一个完整的Transformer Block，假设这是最后一次的Block，那么输出就是最终的输出。接下来进行Linear层和Softmax层，得到最终的输出。

In [27]:
Output = nn.Linear(d_model, max_token_value)(Output)
Output.shape

torch.Size([4, 16, 100069])

Softmax层,变成概率值

In [28]:
Logits = F.softmax(Output, dim=-1)
#Logits[0,0].sum() 验证保证概率和为1
#max(Logits[0,0])找到最大的概率值
predicted_index = torch.argmax(Logits[0,0]).item()
#torch.argmax 是 PyTorch 中的一个函数，用于返回输入张量中最大值的索引。
#由于 torch.argmax 返回的是一个张量，使用 .item() 可以将其转换为一个普通的 Python 整数。
encoding.decode([predicted_index])#解码该数字所对应的字符

'Replacing'