In [2]:
import math

# Transform从零实现
import torch
import torch.nn as nn
import os
import requests
import torch.nn.functional as F

In [3]:
# 获取数据

# 如果文件不存在则取路径出下载
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/resolve/2ed62cfc15e9698dd99db4470c1c18599704f710/sales_textbook.txt?download=true'
    with open('sales_textbook.txt', 'w') as f:
        f.write(requests.get(url).content)
# 读取文件
with open('sales_textbook.txt', 'r') as f:
    text = f.read()

In [4]:
text[:1000]

'Chapter 1: Building Rapport and Capturing Attention\nSubpoint: Understanding the Importance of Building Rapport\nBuilding rapport is a fundamental skill in sales that cannot be underestimated. It lays the foundation for establishing a connection with your potential customers, gaining their trust, and ultimately convincing them to make a purchase. Rapport can be defined as a harmonious relationship based on mutual understanding and empathy. When you build rapport with someone, you create a sense of familiarity, comfort, and shared interests, making it easier to communicate and influence their decision-making process.\nOne of the main reasons why building rapport is crucial in sales is that people are more likely to buy from someone they like and trust. By establishing a positive and genuine connection with your customers, you increase their confidence in you and your product or service. People want to do business with individuals they feel comfortable with, those who understand their n

In [19]:
# 超参数
# 数据的长度(16个单词)
context_length = 16
# 每个词扩大到64维
d_model = 64
batch_size = 4
# 多头注意力机制
num_heads = 4

In [6]:
# 将文本数据tokennize化

# 一种编码机制
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

In [7]:
# 将我们的texttokennize化
Tokennize_text = encoding.encode(text)
Tokennize_text = torch.tensor(Tokennize_text , dtype=torch.long)
# 获取编码的最大值
max_token_value = Tokennize_text.max().item()
len(Tokennize_text) 

77919

In [8]:
# 将数据集分为训练集和验证集
train_index = int(len(Tokennize_text)*0.9)
train_data = Tokennize_text[:train_index]
valid_data = Tokennize_text[train_index:]

In [9]:
data = train_data
# 随机获取四个索引，为了获得四个批量大小的数据(减16是因为防止左后几个索引必须大于16)
idxs = torch.randint(low=0 , high=len(train_data) - context_length , size=(batch_size , ))
idxs

tensor([62289, 36360,  9301,  7193])

In [10]:
# 获取四个批量
# x为训练数据，y为验证数据
x_batch = torch.stack([data[idx:idx+context_length]for idx in idxs])
y_batch = torch.stack([data[idx+1:idx+context_length+1]for idx in idxs])
x_batch.shape, y_batch.shape

(torch.Size([4, 16]), torch.Size([4, 16]))

In [11]:
# 数据可视化
import pandas as pd
pd.DataFrame(x_batch.numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,3619,430,54111,3629,19646,505,90700,826,477,6996,315,64784,13,15636,11,433
1,649,1977,7095,11,38769,11,323,13967,11,3345,279,6412,13,15957,220,18
2,7493,649,1101,1520,41468,279,7720,323,27375,315,701,2027,477,2532,11,3339
3,1115,1207,2837,24400,389,5370,15174,430,649,387,20011,311,13750,1893,264,5647


In [12]:
# 样本对用编码的最大值
max_token_value
# 现在要创建一个真正输出到模型的数据，100069*64维的矩阵,初始化参数都是随机的
input_embedding_lookup = nn.Embedding(max_token_value + 1, d_model)
input_embedding_lookup

Embedding(100070, 64)

In [13]:
# 这是真正要输出到模型的一个批量的数据
x_batch_embedding = input_embedding_lookup(x_batch)
y_batch_embedding = input_embedding_lookup(y_batch)
x_batch_embedding.shape, y_batch_embedding.shape

(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [14]:
# 加入位置信息
# 获取位置编码
import math
position_encoding_lookup_table = torch.zeros(context_length,d_model)
position = torch.arange(0, context_length).unsqueeze(1)
position.shape

torch.Size([16, 1])

In [15]:
div_term = torch.exp(torch.arange(0, d_model, 2).float() *(-math.log(10000.0) / d_model)) # (1 , 64)
position_encoding_lookup_table[:, 0::2] = torch.sin(div_term * position)
position_encoding_lookup_table[:, 1::2] = torch.cos(div_term * position)
position_encoding_lookup_table.shape # (16 , 64)
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)
# 得到位置编码，他的形状和一个批量数据的形状是一致的
position_encoding_lookup_table.shape

torch.Size([4, 16, 64])

In [16]:
x =x_batch_embedding + position_encoding_lookup_table
y =y_batch_embedding + position_encoding_lookup_table
pd.DataFrame(x[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,1.276481,1.839012,0.559645,2.45077,-1.029373,1.076318,-1.287502,0.032846,1.537491,1.011404,...,-1.107739,-0.37904,0.321741,1.742949,0.950201,0.663481,0.467162,1.649482,-0.21504,1.549865
1,1.484535,-0.457457,0.766868,0.81172,0.682822,0.031945,0.017044,0.138833,1.032876,1.845717,...,-0.416106,1.205028,-2.338741,0.547787,-1.306301,0.508856,0.15578,0.181872,2.165472,1.25103
2,0.8255,-0.205452,0.89776,-0.215477,0.816492,2.306995,1.363712,0.54143,2.271329,1.925077,...,-0.133777,0.346315,0.605186,2.934829,0.66832,3.11029,0.24589,0.650463,-0.712466,0.667542
3,0.157724,-0.708044,0.260668,-1.155751,3.040759,-3.002851,0.675826,0.985944,0.687211,-0.098874,...,0.608616,0.693763,1.556209,1.74255,-0.764934,-1.342643,0.567373,0.083828,-1.381561,0.483935
4,-1.521272,-0.451515,0.297309,-2.032782,-0.639166,1.105254,1.482915,0.582957,2.21605,0.117371,...,0.967035,1.904876,-0.352462,0.493314,1.052984,2.109513,-0.364296,2.337722,0.884573,1.007147
5,-1.081494,0.474884,-2.172528,-2.301731,0.727162,-0.699883,1.403226,-0.357652,1.787365,0.176613,...,-1.686214,-0.510234,0.287951,1.138397,0.631802,0.959266,-0.125287,0.860611,0.63962,1.43276
6,0.027183,0.693831,-1.99101,-0.062006,-0.128525,-0.540139,0.291514,-0.386952,0.080956,-0.088347,...,-0.782671,0.680809,-1.338567,0.789683,-0.247522,1.021406,0.660085,1.819066,-0.434189,0.673211
7,1.873141,-0.526163,-0.852715,0.851401,-2.371512,-0.398174,-1.270765,-0.878473,2.73856,-0.363268,...,2.099948,3.438209,0.22502,0.609418,-1.585478,1.45187,-0.221738,-0.33153,0.959764,0.391086
8,-1.17713,0.550239,0.789616,0.240691,-1.051917,-2.203415,-1.381389,-0.182029,-1.091455,-0.031889,...,-0.118357,1.796229,0.741282,1.091237,-0.773383,0.703869,1.442684,1.286311,0.243659,0.576277
9,0.673214,-1.161587,-1.714437,1.383661,-0.384133,-1.627306,-0.823563,-0.685881,-0.821943,-1.38694,...,0.136673,0.392303,0.359202,-0.138655,0.352661,-1.436725,-0.294376,0.171984,0.906031,1.33329


In [18]:
# 获取Q , K , V 
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(x)
K = Wk(y)
V = Wv(x)
Q.shape , K.shape , V.shape

(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [25]:
# 多头注意力
Q = Q.reshape(batch_size, context_length, num_heads ,d_model//num_heads ).permute(0,2,1,3)
K = K.reshape(batch_size, context_length, num_heads ,d_model//num_heads ).permute(0,2,1,3)
V = V.reshape(batch_size, context_length, num_heads ,d_model//num_heads ).permute(0,2,1,3)
Q.shape , K.shape , V.shape

(torch.Size([4, 4, 16, 16]),
 torch.Size([4, 4, 16, 16]),
 torch.Size([4, 4, 16, 16]))

In [29]:
# 注意力分数计算
output = Q@K.transpose(-2 , -1) / math.sqrt(d_model//num_heads)
output.shape

torch.Size([4, 4, 16, 16])

In [36]:
# mask
mask = torch.triu(torch.ones(context_length , context_length) , diagonal=1).bool()
output = output.masked_fill(mask , -float('inf'))
pd.DataFrame(output[0,0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.41891,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
1,0.242419,-0.344068,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
2,-0.321933,-0.479533,0.698267,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
3,-0.658239,-0.271404,-0.445759,0.274006,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,-0.082385,-0.224698,0.060744,0.089017,-0.780732,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
5,0.30494,-0.174152,0.107227,0.014535,-0.018124,-0.277364,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
6,-0.502951,-0.180806,-0.116237,0.266125,-0.556534,-0.397589,0.116003,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
7,0.790694,1.227676,0.145053,-0.0321,0.407161,0.880613,-0.949844,-1.30659,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
8,0.563889,0.201184,-0.078867,-0.085551,0.402837,-0.01495,-0.236816,-0.651463,0.501676,-inf,-inf,-inf,-inf,-inf,-inf,-inf
9,0.45628,-0.371287,-0.449853,0.435829,-0.215313,-0.259612,-0.836351,-0.717855,0.82789,0.094698,-inf,-inf,-inf,-inf,-inf,-inf


In [39]:
# 得到注意力分数
attention_score = torch.softmax(output,dim=-1)
pd.DataFrame(attention_score[0,0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.642559,0.357441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.216079,0.184573,0.599349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.160019,0.235598,0.197902,0.406481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.212513,0.184323,0.245214,0.252246,0.105705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.223691,0.138542,0.183562,0.167312,0.161936,0.124956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.100746,0.139039,0.148312,0.217388,0.09549,0.111941,0.187084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.179041,0.277161,0.093876,0.078635,0.122007,0.195886,0.031408,0.021984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.171201,0.119121,0.090025,0.089425,0.145735,0.095967,0.076872,0.050779,0.160875,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.152651,0.066726,0.061684,0.149561,0.077989,0.074609,0.04191,0.047182,0.221355,0.106333,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# 注意力分数和V再相乘
A = attention_score@V
A.shape

torch.Size([4, 4, 16, 16])

In [54]:
# 将输出转换为原来的形状（4 ， 16 ， 64）
A = A.transpose(1 , 2).reshape(batch_size , context_length , d_model)
A.shape

torch.Size([4, 16, 64])

In [86]:
Wo = nn.Linear(d_model, d_model)
output = Wq(A)
output.shape

torch.Size([4, 16, 64])

In [87]:
# 残差链接
output = output + x
# 归一化
layer_norm = nn.LayerNorm(d_model)
layer_output = layer_norm(output)

In [88]:
# 前馈网络Feed Forword
feed_forward = nn.Sequential(
    nn.Linear(d_model, d_model * 4),
    nn.ReLU(),
    nn.Linear(d_model * 4, d_model),    
)
output = feed_forward(output)
output = output + layer_output
output.shape

torch.Size([4, 16, 64])

In [89]:
# 再做一次层归一化
output = layer_norm(output)
output.shape

torch.Size([4, 16, 64])

In [90]:
# 最后的全连接层
Linear = nn.Linear(d_model, max_token_value+1)
output = Linear(output)
output.shape

torch.Size([4, 16, 100070])

In [93]:
# 最终输出
logits = F.softmax(output, dim=-1)
logits.shape

torch.Size([4, 16, 100070])

In [110]:
# 打印预测的结果
predict_index = torch.argmax(logits[0,0], dim=-1).item()
encoding.decode([predict_index])


'227'