In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
import torch.nn.functional as F

In [4]:
import os

In [5]:
import requests

In [6]:
import tiktoken

In [7]:
import math

# 1、获取数据集-由于无法访问huggingface手动上传

In [8]:
# todo nothing

# 2、读取trans_instance.txt文件

In [9]:
# Load the whole training corpus into memory as one string.
# The file contains Chinese text, so the encoding is pinned to UTF-8 instead of
# relying on the platform default (e.g. GBK/cp1252 on Windows), which would
# mis-decode the characters or raise UnicodeDecodeError.
with open('trans_instance.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [10]:
text[0:1000]

'小沈阳江西演唱会邀请沈春，\n明星刀郎的歌火遍大江南北\n2002年的第一场雪比2001年来得\n大模型张老师的粉丝全是正能量'

In [11]:
len(text)

60

# 3、引入tiktoken 将文字token化

In [12]:
# Load tiktoken's "cl100k_base" encoding; it is used below to turn text into
# token ids (encode) and token ids back into text (decode).
encoding = tiktoken.get_encoding("cl100k_base")

In [13]:
# Encode the raw text into a sequence of integer token ids.
tokenized_text = encoding.encode(text)

In [14]:
# Rebind tokenized_text to an int64 (torch.long) tensor holding the same token
# ids, so it can be sliced and stacked into batches later on.
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long)

In [15]:
# Largest token id that occurs in this text, as a plain Python number;
# it is used further down to size the embedding table.
max_token_value = tokenized_text.max().item()

In [16]:
len(tokenized_text)

69

In [17]:
max_token_value

92877

# 4、参数设置

In [18]:
# Number of consecutive tokens in each sampled window (sequence length).
context_length = 16

In [19]:
# Width of every token embedding / positional-encoding vector.
d_model = 64

In [20]:
# Number of windows sampled per batch.
batch_size = 4

In [21]:
# Alias for the token-id tensor; the batches below are sliced out of it.
data = tokenized_text

In [22]:
# Exclusive upper bound for random window start offsets. With start <= high - 1,
# the target window data[start+1 : start+context_length+1] ends at most at
# len(data), so it never runs past the end of the data.
high = len(data) - context_length

In [23]:
high

53

# 5、初始化张量

## 参数low随机整数，最小值为零

## 参数high：随机整数的上界（不包含），这里 high = 53，因此最大起始索引为52

In [24]:
# Draw batch_size random window start offsets from [0, high); `high` itself is
# never returned.
# NOTE(review): no RNG seed is set in the visible cells, so the sampled batch
# differs between runs — consider torch.manual_seed(...) in a config cell.
idxs = torch.randint(low=0, high=high, size=(batch_size,))

In [25]:
# Example idxs: tensor([52, 25, 13,  7]) -- start offsets lie in [0, high - 1] because randint's `high` is exclusive

## 说明初始化一维张量的目的

### 在trans_instance.txt文件中，字符数60（含换行符）， token数69

### 随机张量tensor([41, 1, 32, 50]) 以41，1，32，50为token索引， 获取分别41，1，32，50为起始索引的token

### 例如以41为起点，取长度为16（context_length）的token序列，对应的索引为41，42，…，56，每个索引对应了具体的token

# 6、初始化4批数据

## 数据结构是4行16列， 即4行token， 每行16个token（context_length = 16）

In [26]:
# Input batch: cut one context_length-token window out of `data` at every
# sampled start offset and stack the windows row-wise.
x_batch = torch.stack(
    tuple(data[start:start + context_length] for start in idxs.tolist())
)

In [27]:
x_batch

tensor([[  230, 11881,    98, 42553, 31958, 78519,  6701,   222, 31938,   236,
          9554, 15722,   234, 80699, 30250,   235],
        [70277, 59563, 49409,   198,  1049,    17,  8107,  9554, 30537, 15120,
         83324, 25132,   103, 57106,  1049,    16],
        [ 8107, 37507, 50928,   198, 27384, 54872, 25287, 87441, 92877, 13821,
           230,  9554, 90397,   231,  3574,   251],
        [  236,  9554, 15722,   234, 80699, 30250,   235, 27384, 70277, 59563,
         49409,   198,  1049,    17,  8107,  9554]])

In [28]:
# Target batch: the same windows as x_batch shifted one token to the right,
# so position i of a row holds the token that follows position i of x_batch.
y_batch = torch.stack(
    tuple(data[start + 1:start + 1 + context_length] for start in idxs.tolist())
)

In [29]:
y_batch

tensor([[11881,    98, 42553, 31958, 78519,  6701,   222, 31938,   236,  9554,
         15722,   234, 80699, 30250,   235, 27384],
        [59563, 49409,   198,  1049,    17,  8107,  9554, 30537, 15120, 83324,
         25132,   103, 57106,  1049,    16,  8107],
        [37507, 50928,   198, 27384, 54872, 25287, 87441, 92877, 13821,   230,
          9554, 90397,   231,  3574,   251, 37087],
        [ 9554, 15722,   234, 80699, 30250,   235, 27384, 70277, 59563, 49409,
           198,  1049,    17,  8107,  9554, 30537]])

# 7、引入pandas查看原始数据

In [30]:
import pandas as pd

In [31]:
# Decode input row 0 back to text. A leading replacement character can show up
# when the window starts in the middle of a character split across tokens.
encoding.decode(x_batch[0].tolist())

'�春，\n明星刀郎的歌火遍'

In [32]:
# Decode input row 1 back to text.
encoding.decode(x_batch[1].tolist())

'江南北\n2002年的第一场雪比2001'

In [33]:
# Decode input row 2 back to text.
encoding.decode(x_batch[2].tolist())

'年来得\n大模型张老师的粉丝'

In [34]:
# Decode input row 3 back to text.
encoding.decode(x_batch[3].tolist())

'�的歌火遍大江南北\n2002年的'

In [35]:
# Decode target row 0; it reads as input row 0 shifted one token to the right.
encoding.decode(y_batch[0].tolist())

'春，\n明星刀郎的歌火遍大'

In [36]:
# Decode target row 1 back to text.
encoding.decode(y_batch[1].tolist())

'南北\n2002年的第一场雪比2001年'

In [37]:
# Decode target row 2 back to text.
encoding.decode(y_batch[2].tolist())

'来得\n大模型张老师的粉丝全'

In [38]:
# Decode target row 3 back to text.
encoding.decode(y_batch[3].tolist())

'的歌火遍大江南北\n2002年的第'

# 8、input Embedding 初始化

In [39]:
# Sanity check: decode a single token id to see which character it stands for.
encoding.decode([83175]) 

'阳'

## 创建一个Embedding table （92878， 64）即行数为92878（最大token id 92877 + 1），列数为64

In [40]:
# Token-embedding lookup table: one learnable d_model-wide row per token id.
# It is sized from the largest id seen in this text, so ids above
# max_token_value cannot be looked up in it.
input_embedding_lookup_table = nn.Embedding(
    num_embeddings=max_token_value + 1,  # row count: ids are 0-based, so largest id + 1 (92877 + 1 = 92878)
    embedding_dim=d_model               # width of each embedding vector (64 dimensions)
)

In [41]:
input_embedding_lookup_table

Embedding(92878, 64)

In [42]:
input_embedding_lookup_table.weight.data # freshly initialised weights; these starting values are meant to be adjusted during training

tensor([[-0.5780,  1.6609,  0.1632,  ...,  0.2959, -0.2797,  0.9779],
        [ 0.7430, -1.3938,  0.6316,  ...,  1.3084,  0.2072, -0.5817],
        [ 0.4162, -0.3061,  0.1173,  ..., -0.2112,  0.2564,  0.4808],
        ...,
        [-0.1850, -2.2917, -2.3920,  ..., -1.1244,  0.5728,  0.3276],
        [ 1.1549, -1.0915, -1.2170,  ..., -0.4748, -0.5551, -0.9152],
        [-1.6030, -1.0925,  1.1802,  ...,  1.1817, -1.0096, -0.3890]])

In [43]:
# Replace every token id with its embedding row: (4, 16) ids -> (4, 16, 64) vectors.
x_batch_embedding = input_embedding_lookup_table(x_batch)
# NOTE(review): the targets are embedded as well; a training loss would usually
# consume the raw ids in y_batch instead — confirm how y_batch_embedding is
# used in later cells.
y_batch_embedding = input_embedding_lookup_table(y_batch)

In [44]:
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [45]:
x_batch_embedding

tensor([[[ 1.0707, -0.4468, -0.8520,  ...,  0.0923,  0.1597, -0.9516],
         [ 0.0337,  1.4017, -1.4048,  ..., -1.2651,  0.0346,  0.0758],
         [ 1.0686,  0.4825,  0.7246,  ...,  0.0688, -1.6042, -0.3969],
         ...,
         [-0.1253,  1.5412, -0.0727,  ...,  0.6935,  1.0645,  1.0084],
         [-1.7950, -0.3195, -0.2953,  ...,  1.0053, -0.1796,  0.4493],
         [ 1.6761,  0.1702, -0.1364,  ...,  0.1556, -0.3729, -0.6825]],

        [[-0.5167,  0.8715,  0.2613,  ..., -0.9310, -0.4571, -0.4582],
         [ 1.2399,  2.1289, -1.3575,  ...,  0.9277,  0.5227, -0.0994],
         [-0.6833, -0.0347, -1.3029,  ...,  0.2252, -0.9638,  0.1589],
         ...,
         [-1.5317,  1.3492,  1.4461,  ...,  0.0987,  0.3917, -0.4518],
         [ 0.4378,  0.4854, -0.8767,  ..., -0.4123, -0.0226, -1.7436],
         [ 0.7326,  0.4549, -0.0190,  ..., -1.2537, -1.1128, -0.6554]],

        [[-0.5713,  0.5345,  0.8107,  ...,  0.3134, -0.1900, -0.7947],
         [ 0.6898, -0.3686, -0.2822,  ..., -0

## 4 代表批大小（一个batch中有4个样本）， 16是行数即每个样本16个token， 64即64列，对应64个维度

## 现在来说明input_embedding_lookup_table(x_batch)业务含义 ；
1. x_batch[3].numpy() = [31106,   230, 83175, 70277, 61786, 78256,   242, 84150,   109, 38093]
2. 第一个token= 31106 代表的是中文字 “沈”
3. input_embedding_lookup_table(x_batch) 将token=31106映射到embedding表的第31106行；表中的每一行就是对应token的64维向量表示

In [58]:
y_batch_embedding.shape

torch.Size([4, 16, 64])

# 9、positional embedding 加入位置信息

In [47]:
# Start from an all-zero (context_length, d_model) table; a later cell fills
# it with the sin/cos positional values.
position_encoding_lookup_table = torch.zeros(context_length, d_model)

In [48]:
position_encoding_lookup_table

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## 目标是初始化一个16x64（context_length x d_model）的二维矩阵，目的就是给每一个位置的token加入位置信息

In [49]:
# Sinusoidal positional encoding.
#
# The table is built in a fresh local tensor instead of being written in place
# into `position_encoding_lookup_table`. The last statement rebinds that name
# to an expanded 3-D view, so an in-place version fails when the cell is run a
# second time (wrong rank, and writes into an expanded view). Building locally
# keeps the cell idempotent and yields the same final value.

# 1. Empty (context_length, d_model) table.
pe_table = torch.zeros(context_length, d_model)

# 2. Column vector of positions [0, 1, ..., context_length-1], shape (context_length, 1).
position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)

# 3. Frequency scale per column pair (exponential decay): 10000^(-2i / d_model).
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

# 4. Sine in the even columns, cosine in the odd columns.
pe_table[:, 0::2] = torch.sin(position * div_term)  # even columns
pe_table[:, 1::2] = torch.cos(position * div_term)  # odd columns

# 5. Add a leading batch dimension -> (batch_size, context_length, d_model).
#    expand() returns a view; the same table is shared by every batch entry.
position_encoding_lookup_table = pe_table.unsqueeze(0).expand(batch_size, -1, -1)

In [50]:
position_encoding_lookup_table

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  6.8156e-01,  ...,  1.0000e+00,
           1.3335e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  9.9748e-01,  ...,  1.0000e+00,
           2.6670e-04,  1.0000e+00],
         ...,
         [ 4.2017e-01,  9.0745e-01, -3.1822e-01,  ...,  1.0000e+00,
           1.7336e-03,  1.0000e+00],
         [ 9.9061e-01,  1.3674e-01, -8.7899e-01,  ...,  1.0000e+00,
           1.8669e-03,  1.0000e+00],
         [ 6.5029e-01, -7.5969e-01, -9.6821e-01,  ...,  1.0000e+00,
           2.0003e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  6.8156e-01,  ...,  1.0000e+00,
           1.3335e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  9.9748e-01,  ...,  1.0000e+00,
           2.6670e-04,  1.0000e+00],
         ...,
         [ 4.2017e-01,  9

## 增加位置编码

In [51]:
# Add the positional encoding element-wise to the input token embeddings;
# both operands are (4, 16, 64) here, and so is the result.
x = x_batch_embedding + position_encoding_lookup_table

In [52]:
x

tensor([[[ 1.0707,  0.5532, -0.8520,  ...,  1.0923,  0.1597,  0.0484],
         [ 0.8752,  1.9420, -0.7232,  ..., -0.2651,  0.0347,  1.0758],
         [ 1.9779,  0.0663,  1.7221,  ...,  1.0688, -1.6040,  0.6031],
         ...,
         [ 0.2948,  2.4487, -0.3909,  ...,  1.6935,  1.0662,  2.0084],
         [-0.8044, -0.1828, -1.1743,  ...,  2.0053, -0.1777,  1.4493],
         [ 2.3263, -0.5895, -1.1046,  ...,  1.1556, -0.3709,  0.3175]],

        [[-0.5167,  1.8715,  0.2613,  ...,  0.0690, -0.4571,  0.5418],
         [ 2.0814,  2.6692, -0.6760,  ...,  1.9277,  0.5228,  0.9006],
         [ 0.2260, -0.4509, -0.3054,  ...,  1.2252, -0.9635,  1.1589],
         ...,
         [-1.1115,  2.2567,  1.1279,  ...,  1.0987,  0.3934,  0.5482],
         [ 1.4284,  0.6221, -1.7557,  ...,  0.5877, -0.0207, -0.7436],
         [ 1.3829, -0.3048, -0.9872,  ..., -0.2537, -1.1108,  0.3446]],

        [[-0.5713,  1.5345,  0.8107,  ...,  1.3134, -0.1900,  0.2053],
         [ 1.5313,  0.1717,  0.3994,  ...,  0

In [53]:
# Same positional encoding added to the embedded targets, shape (4, 16, 64).
# NOTE(review): how `y` is consumed is not visible in this part of the
# notebook — confirm it is needed.
y = y_batch_embedding + position_encoding_lookup_table

In [54]:
y

tensor([[[ 0.0337,  2.4017, -1.4048,  ..., -0.2651,  0.0346,  1.0758],
         [ 1.9101,  1.0228,  1.4062,  ...,  1.0688, -1.6041,  0.6031],
         [ 2.0754,  0.4042,  2.2460,  ...,  1.4674,  0.1222,  1.8013],
         ...,
         [-1.3748,  0.5879, -0.6135,  ...,  2.0053, -0.1779,  1.4493],
         [ 2.6667,  0.3069, -1.0154,  ...,  1.1556, -0.3711,  0.3175],
         [-0.0609, -2.5063,  0.8733,  ...,  0.3753,  0.4652,  0.0864]],

        [[ 1.2399,  3.1289, -1.3575,  ...,  1.9277,  0.5227,  0.9006],
         [ 0.1582,  0.5056, -0.6213,  ...,  1.2252, -0.9637,  1.1589],
         [ 0.3071, -1.2332,  2.3207,  ...,  0.4757, -0.3268,  1.7862],
         ...,
         [ 0.8579,  1.3928, -1.1949,  ...,  0.5877, -0.0209, -0.7436],
         [ 1.7232,  0.5916, -0.8980,  ..., -0.2537, -1.1109,  0.3446],
         [ 0.0790, -0.2252, -0.1575,  ...,  1.3134, -0.1880,  0.2053]],

        [[ 0.6898,  0.6314, -0.2822,  ...,  0.1530, -0.2296,  1.7614],
         [ 1.0442,  0.7802, -1.0243,  ...,  0

In [55]:
x.shape, y.shape

(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [56]:
# Show sample 0 of x as a table (rows = token positions, columns = embedding
# dimensions). detach() drops the autograd graph so the tensor can be
# converted to a NumPy array.
pd.DataFrame(x[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,1.070652,0.553198,-0.851963,-0.319861,-0.52904,0.942364,-0.637911,0.79238,0.857789,0.770201,...,0.254611,0.332331,0.721469,1.69329,0.873619,-0.152157,0.844965,1.092265,0.159677,0.048377
1,0.875163,1.94204,-0.723217,1.371266,2.521662,1.866796,-0.078784,1.216764,0.681742,1.56105,...,0.736463,0.446045,0.908376,1.090495,1.206476,1.519725,0.447973,-0.265075,0.034708,1.075806
2,1.977931,0.066348,1.722088,-0.834678,1.011282,1.03394,0.565909,0.394565,0.759725,0.969948,...,-0.933124,-0.626546,-1.571673,-0.002953,0.140017,-0.134487,-0.922008,1.068821,-1.603981,0.603122
3,1.307241,-0.169625,2.02683,1.179281,0.741384,-0.456651,1.210368,-0.120287,1.574893,0.543,...,0.755719,-0.299773,1.435957,0.193814,-1.078188,0.248084,0.46728,1.467364,0.122304,1.801259
4,-1.179754,-0.649103,0.193952,-0.062931,1.547482,-0.580412,2.519892,1.52463,-0.282805,-1.636536,...,-0.686128,2.888756,0.379104,1.302801,-0.639535,0.322805,-0.314615,2.883026,2.323025,2.333615
5,-1.251513,-0.983003,0.315711,-2.662877,-0.477959,-2.529489,1.138389,1.163526,0.979777,2.072133,...,0.274205,1.914037,1.839754,-0.175969,-2.179759,1.569786,-0.948764,-0.457335,0.625957,2.535669
6,1.508204,1.013694,-1.31143,-0.23739,0.299233,-2.193677,0.935845,-1.332642,2.340673,-1.439728,...,-0.252182,0.123048,0.162652,0.611883,-0.036612,1.79633,0.582246,0.250248,-0.281787,-0.71289
7,0.537846,0.397206,-0.894905,0.617004,-0.45234,-0.881734,1.133551,-0.995513,1.049162,-0.262692,...,1.051534,1.186461,-1.067721,1.258784,0.967621,0.361644,0.97115,0.258724,0.065297,-0.884608
8,0.429165,0.324527,0.591671,0.749725,-0.737783,0.377465,0.041821,-0.579644,2.293993,1.157515,...,1.448025,1.539738,-2.641353,0.269691,-1.444083,0.611037,0.326495,1.737991,-0.541025,1.327647
9,2.077146,0.679966,-0.260202,0.684418,-1.286441,0.172257,-2.562258,0.396132,-0.075899,-1.021532,...,-1.753314,4.348763,0.895697,-0.445916,0.474179,2.449888,-0.534662,0.602241,0.757603,1.019817


In [57]:
# Show sample 0 of y as a table (rows = token positions, columns = embedding
# dimensions); detach() is needed before converting to NumPy.
pd.DataFrame(y[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.033692,2.401738,-1.404779,1.639505,1.988493,2.020787,-0.488093,1.304368,0.370758,1.610635,...,0.736042,0.446045,0.90806,1.090495,1.206239,1.519725,0.447795,-0.265075,0.034574,1.075806
1,1.910105,1.022797,1.406169,-0.173865,0.64232,1.448486,0.228314,0.642029,0.479582,1.113785,...,-0.933546,-0.626546,-1.57199,-0.002953,0.13978,-0.134487,-0.922185,1.068822,-1.604115,0.603122
2,2.075419,0.40422,2.246038,1.878156,0.650261,0.090778,1.003637,0.243678,1.353371,0.766825,...,0.755298,-0.299773,1.435641,0.193814,-1.078425,0.248084,0.467102,1.467364,0.122171,1.801259
3,-0.281832,-0.985452,0.830685,0.299075,1.762264,-0.068699,2.480245,1.941327,-0.423737,-1.35492,...,-0.68655,2.888757,0.378788,1.302802,-0.639772,0.322806,-0.314792,2.883026,2.322892,2.333615
4,-1.049392,-1.920309,1.028377,-2.831948,-0.023422,-2.211089,1.272774,1.559946,0.933411,2.383613,...,0.273783,1.914038,1.839438,-0.175968,-2.179996,1.569786,-0.948942,-0.457335,0.625824,2.535669
5,0.828695,0.337186,-0.905161,-0.846835,0.853536,-2.166653,1.220716,-1.025955,2.393471,-1.129274,...,-0.252604,0.123049,0.162336,0.611883,-0.03685,1.79633,0.582068,0.250248,-0.281921,-0.71289
6,-0.398556,0.603474,-1.012988,-0.105862,0.031014,-1.154408,1.518996,-0.832293,1.195889,0.015949,...,1.051112,1.186463,-1.068037,1.258785,0.967384,0.361645,0.970972,0.258725,0.065164,-0.884608
7,0.096793,1.22393,0.012586,0.301241,-0.474242,-0.110929,0.460306,-0.588488,2.520097,1.37671,...,1.447603,1.53974,-2.64167,0.269692,-1.44432,0.611038,0.326317,1.737991,-0.541158,1.327647
8,2.654386,1.445596,-0.989623,0.750917,-1.32388,-0.38144,-2.184054,0.216773,0.20716,-0.88352,...,-1.753736,4.348764,0.895381,-0.445915,0.473942,2.449888,-0.53484,0.602241,0.757469,1.019818
9,1.646483,-0.873697,1.107458,2.739313,-0.607212,1.000783,-2.103047,-2.81903,-0.994165,0.415559,...,0.611345,0.872226,0.237498,-0.580864,0.316403,0.248648,0.233967,1.302146,-1.055769,0.840174


# 10、Q，K，V 初始化

In [59]:
# Learnable linear layer (with bias) mapping d_model features to d_model
# features. Judging by its name and the section heading, this is presumably
# the query projection for attention — its use is not visible here.
Wq = nn.Linear(d_model, d_model)

In [60]:
Wq

Linear(in_features=64, out_features=64, bias=True)