In [54]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# get datasets
# Download the corpus once and cache it next to the notebook.
if not os.path.exists('sales_textbook.txt'):
  url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/resolve/main/sales_textbook.txt?download=true'
  # Fetch before opening the file: if the request fails, no empty or partial
  # file is left behind to be mistaken for a valid cache on the next run.
  response = requests.get(url, timeout=30)
  # Fail loudly on HTTP errors so an error page is never cached as the dataset.
  response.raise_for_status()
  with open('sales_textbook.txt','wb') as f:
    f.write(response.content)

# read content to memory
# Explicit encoding: the default text encoding is platform dependent.
# NOTE(review): assumes the downloaded file is UTF-8 - confirm against the dataset.
with open('sales_textbook.txt','r',encoding='utf-8') as f:
  text = f.read()


In [22]:
# Tokenize the raw corpus with tiktoken's "cl100k_base" encoding.
encoding = tiktoken.get_encoding("cl100k_base")

# Encode the text and store the token ids directly as a torch.long tensor.
tokenized_text = torch.tensor(encoding.encode(text), dtype=torch.long)
# Largest token id that actually occurs in this corpus.
max_token_value = torch.max(tokenized_text).item()
len(tokenized_text)

77919

In [18]:
# Hold out the tail of the token stream for validation:
# the first 90% is used for training, the remaining 10% for validation.
train_idx = int(0.9 * len(tokenized_text))
train_data, valid_data = tokenized_text[:train_idx], tokenized_text[train_idx:]
train_data

tensor([26072,   220,    16,  ...,  1501, 48451,  7119])

In [38]:
# hyperparameters
context_length = 16 # number of tokens in each input sequence
d_model = 64 # width of every token embedding / hidden state
batch_size = 4 # number of sequences processed in parallel per batch
num_heads = 4 # number of attention heads; each head gets d_model // num_heads dimensions
random_seed = 1337 # fixed seed so a fresh "Restart & Run All" reproduces the same batch and weights

# The per-head width is computed as d_model // num_heads, so the split must be exact.
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

# Seed torch's global RNG before any batch is sampled or any layer is initialised.
torch.manual_seed(random_seed);

In [25]:
# Draw one random mini-batch of token ids. x_batch and y_batch are each
# [batch_size, context_length]; the [4,16,64] layout only appears once the
# embedding lookup adds the d_model axis.
data = train_data
# One random start offset per sequence. randint's upper bound is exclusive, so
# start + context_length + 1 never runs past the end of data.
idxs = torch.randint(0, len(data) - context_length, (batch_size,))
x_batch = torch.stack([data[start:start + context_length] for start in idxs])
# Targets are the same windows shifted one token to the right.
y_batch = torch.stack([data[start + 1:start + 1 + context_length] for start in idxs])
y_batch

tensor([[ 1139,   279, 16188, 13189,   315,  8830,   279,  6444,     6,  3966,
           323,  6784,  3585,    13,  1115, 16996],
        [   13, 81745, 25363, 36870, 10758,  7512,   374,   264, 77975,  1920,
           430,  7612,  6725,    11, 11302,    11],
        [  449,   701, 10877,    11,   499,   649, 16988,   872, 21958,    11,
         51077,   872, 28899,    11,   323,  5387],
        [12207, 18885,  4642, 14624,  7512,   927,   892,   627,   644, 17102,
            11,  4642, 14624,   374,   264, 89328]])

In [29]:
# Token-embedding table: one learnable row of width d_model per token id.
# It has max_token_value + 1 rows so the largest id seen in the corpus is a valid index.
input_embedding_lookup_table = nn.Embedding(num_embeddings=max_token_value + 1, embedding_dim=d_model)

# Look up every token id: [batch_size, context_length] -> [batch_size, context_length, d_model]
x_batch_embedding, y_batch_embedding = (
  input_embedding_lookup_table(token_ids) for token_ids in (x_batch, y_batch)
)
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [30]:
# Sinusoidal positional encoding: even columns hold sin(position * div_term),
# odd columns hold cos(position * div_term).
position = torch.arange(context_length, dtype=torch.float)[:, None] # column vector [context_length, 1]
# One scaling factor per sin/cos column pair: exp(-2i * ln(10000) / d_model)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
angles = position * div_term # broadcasts to [context_length, d_model // 2]

position_encoding_lookup_table = torch.zeros(context_length, d_model)
position_encoding_lookup_table[:, 0::2] = angles.sin()
position_encoding_lookup_table[:, 1::2] = angles.cos()
# Prepend a batch axis and expand it to batch_size (expand creates a view, not a copy).
position_encoding_lookup_table = position_encoding_lookup_table[None].expand(batch_size, -1, -1)

In [33]:
# Inject position information by adding the positional table element-wise to the embeddings.
x = torch.add(x_batch_embedding, position_encoding_lookup_table)
y = torch.add(y_batch_embedding, position_encoding_lookup_table)
# Show the first sequence of the batch as a table (rows: tokens, columns: embedding dimensions).
first_sequence = x[0].detach().numpy()
pd.DataFrame(first_sequence)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.009514,-0.628647,0.453952,-0.693385,-0.392355,1.039178,0.393323,1.536048,-2.246852,1.840393,...,0.029328,0.325352,1.426127,3.196872,0.181568,2.140222,0.856732,2.35305,1.329569,1.651698
1,1.04488,1.463692,0.42404,0.575627,2.17273,2.429986,0.185697,1.63648,-0.394112,0.74141,...,0.840068,1.271349,-0.106082,0.573165,-1.227755,0.144123,-0.628551,-0.643179,1.833568,0.420245
2,0.186652,0.540543,1.420622,-0.821325,1.654497,2.48825,0.513854,-0.01221,2.487984,1.625387,...,-0.426523,0.695596,-1.201532,1.859443,-0.280295,1.759851,0.5362,1.185389,-0.409233,2.586292
3,-0.139578,-1.003603,1.547197,-1.083511,-0.713684,0.5461,-0.324615,-0.173023,0.566914,0.39026,...,-0.72641,1.038835,-2.591173,1.021517,1.101539,0.925657,-0.786015,-0.014163,-0.087064,1.004659
4,-3.049731,0.043569,-0.268855,-0.236893,0.258206,1.006418,2.750966,0.706177,2.160859,1.020835,...,0.421504,1.139157,0.258226,1.917334,-1.025656,2.881215,-0.037726,2.391368,-0.054268,1.35207
5,-0.329391,0.875787,-1.437143,0.17507,-0.779667,-0.3617,1.139137,0.736458,0.874199,1.042657,...,1.179075,-0.056524,-0.669338,1.493032,-1.126546,0.748562,0.533761,1.701273,-0.498121,0.812271
6,-0.817968,-0.275213,-0.938052,0.741385,-0.943942,-0.752198,0.279236,-2.37734,2.462071,0.851587,...,-0.110642,3.283916,2.338055,-0.062029,-1.372887,3.070702,-1.518887,0.345257,1.796947,0.146887
7,-0.065659,1.710592,-0.436171,-0.380824,0.038645,1.356357,-0.044468,-1.6592,2.697279,0.219371,...,-0.424414,0.695592,-1.199951,1.859441,-0.27911,1.75985,0.537089,1.185388,-0.408566,2.586291
8,0.504581,-1.383468,-2.202643,0.00855,-1.016984,0.619911,1.188565,-0.495199,1.78355,-1.712065,...,-0.261004,1.194813,0.72848,0.825866,0.950326,1.124104,-0.34964,-0.299419,-1.133275,2.203186
9,0.125427,0.285392,2.284018,2.042556,0.599608,0.54471,0.628323,-0.510135,1.131524,-1.588181,...,0.111559,2.023352,0.155175,1.203321,-0.119158,1.648694,0.761911,0.502889,-2.132461,0.572767


# Multi-Head Attention

In [34]:
# Query / key / value projections: three independent d_model -> d_model linear layers.
Wq = nn.Linear(in_features=d_model, out_features=d_model)
Wk = nn.Linear(in_features=d_model, out_features=d_model)
Wv = nn.Linear(in_features=d_model, out_features=d_model)
# nn.Linear acts on the last dimension only, so each [16,64] sequence is
# multiplied by the [64,64] weight and the batch dimension is carried along.
Q, K, V = Wq(x), Wk(x), Wv(x)
Q.shape,K.shape,V.shape


(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [51]:
# apply multi-head
def split_heads(projected):
  """Split the last dimension of a projection into attention heads.

  Reshapes `projected` to
  [batch_size, context_length, num_heads, d_model // num_heads] and then swaps
  the token and head axes, returning a tensor of shape
  [batch_size, num_heads, context_length, d_model // num_heads].
  """
  return projected.reshape(batch_size,context_length,num_heads,d_model//num_heads).permute(0,2,1,3)

# Q, K and V are always derived from the fresh projections of x, never from
# their own previous value: reshaping an already-split [4,4,16,16] tensor would
# succeed without an error and silently mix up heads and tokens, so this cell
# must be safe to re-run.
Q = split_heads(Wq(x)) # [4,16,64] -> [4,16,4,16] -> [4,4,16,16]
K = split_heads(Wk(x)) # [4,16,64] -> [4,16,4,16] -> [4,4,16,16]
V = split_heads(Wv(x)) # [4,16,64] -> [4,16,4,16] -> [4,4,16,16]
Q.shape



torch.Size([4, 4, 16, 16])

In [53]:
# Scaled dot-product scores: every query token against every key token within
# each head, divided by sqrt(head width).
output = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(d_model//num_heads)
# Causal mask: True strictly above the diagonal, i.e. at positions that lie in
# the future of the query token.
mask = torch.ones(context_length,context_length).triu(diagonal=1).bool()
# Future positions get -inf so they receive zero weight after softmax
# (for "i am tom", the token "am" may only look at "i am").
output = output.masked_fill(mask,-math.inf)
output

tensor([[[[-0.3606,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.3952, -0.7748,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.5949, -0.1296, -0.1801,  ...,    -inf,    -inf,    -inf],
          ...,
          [-0.4082, -0.0957, -0.1093,  ...,  0.3589,    -inf,    -inf],
          [-0.2295, -0.0230, -0.3824,  ...,  1.1017,  0.2625,    -inf],
          [ 0.4246,  0.2365, -0.1537,  ...,  0.9713,  0.3656, -0.0315]],

         [[-1.2565,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.6804,  0.0429,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.4348, -0.1344, -0.5002,  ...,    -inf,    -inf,    -inf],
          ...,
          [-0.6650, -0.9835, -1.3109,  ..., -1.0226,    -inf,    -inf],
          [ 0.5416,  0.1189,  0.1980,  ...,  1.0053, -0.1624,    -inf],
          [ 0.7271, -0.0145, -0.3190,  ...,  0.6860, -0.2814, -0.0558]],

         [[ 0.5502,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [ 0.5849,  0.3819,  

In [56]:
# Normalise the scores over the key axis: every row becomes non-negative
# weights in [0,1] that sum to 1.
attention_score = output.softmax(dim=-1)
attention_score

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5938, 0.4062, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2435, 0.3878, 0.3687,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0435, 0.0594, 0.0586,  ..., 0.0936, 0.0000, 0.0000],
          [0.0489, 0.0601, 0.0420,  ..., 0.1852, 0.0800, 0.0000],
          [0.0910, 0.0754, 0.0510,  ..., 0.1571, 0.0858, 0.0576]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3267, 0.6733, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3042, 0.4108, 0.2850,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0747, 0.0543, 0.0392,  ..., 0.0522, 0.0000, 0.0000],
          [0.1181, 0.0774, 0.0837,  ..., 0.1877, 0.0584, 0.0000],
          [0.1636, 0.0779, 0.0575,  ..., 0.1570, 0.0597, 0.0748]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5506, 0.4494, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3975, 0.3288, 0.2736,  ..., 0

In [65]:
# Weighted sum of the value vectors. No transpose is needed here: the last axis
# of attention_score indexes key tokens, which lines up with the token axis of V.
A = torch.matmul(attention_score, V)

In [68]:
# concatenate
# Merge the heads back together: move the head axis next to the per-head
# features with transpose(1,2), then flatten both into a single d_model axis.
# The weighted values are recomputed from attention_score and V instead of
# transforming A in place, so re-running this cell cannot transpose/reshape an
# already-merged tensor and silently scramble it.
A = (attention_score @ V).transpose(1,2).reshape(batch_size,-1,d_model)

# Output projection applied to the concatenated heads.
Wo = nn.Linear(d_model,d_model)
output = Wo(A)
output

tensor([[[ 0.4663, -0.0166, -0.4167,  ..., -0.0401, -0.1132,  0.3632],
         [-0.1466,  0.2371, -0.2531,  ...,  0.0874, -0.1336,  0.1388],
         [-0.0896,  0.0519, -0.0591,  ..., -0.0029, -0.0654, -0.3196],
         ...,
         [ 0.2636, -0.2210,  0.1402,  ..., -0.0442, -0.1109,  0.1679],
         [ 0.0358,  0.1989, -0.0560,  ...,  0.0591, -0.2251, -0.0618],
         [-0.4959,  0.5770, -0.3035,  ...,  0.1140, -0.3024,  0.1511]],

        [[ 0.2800,  0.0649, -0.3240,  ...,  0.0558,  0.0565,  0.0300],
         [-0.3127, -0.1461,  0.0296,  ..., -0.0818,  0.0963, -0.0012],
         [ 0.0286,  0.1108, -0.2235,  ..., -0.0060, -0.0615, -0.2877],
         ...,
         [ 0.0754, -0.1843,  0.1082,  ...,  0.0102, -0.0855, -0.1104],
         [ 0.1757,  0.1801, -0.1643,  ...,  0.0994, -0.3482,  0.0986],
         [-0.4449,  0.1440,  0.1089,  ..., -0.0631,  0.1413,  0.0937]],

        [[ 0.3564, -0.0333, -0.3294,  ...,  0.1741,  0.4237,  0.0613],
         [-0.0877, -0.2983,  0.1739,  ..., -0

In [69]:
# residual connection
# Add the attention block's input x back onto the projected attention output.
# The sum is built from Wo(A) rather than from `output` itself, so running this
# cell more than once does not add x a second time.
output = Wo(A) + x

In [74]:
# Layer normalization over the last (d_model) dimension of the residual output.
layer_norm = nn.LayerNorm(normalized_shape=d_model)
layer_norm_output = layer_norm(output)



In [76]:
# Position-wise feed-forward network: widen to 4 * d_model, apply ReLU, project
# back to d_model. It acts on the last dimension of its input.
ffn = nn.Sequential(
  nn.Linear(d_model, d_model * 4),
  nn.ReLU(),
  nn.Linear(d_model * 4, d_model),
)

# Residual connection around the feed-forward block.
output = ffn(layer_norm_output) + layer_norm_output

In [77]:
# layer normalization
# Normalizes the feed-forward residual output and overwrites `output` in place,
# so this cell is meant to be executed exactly once per pass.
# NOTE(review): this reuses the same `layer_norm` module (and therefore the same
# learnable weight/bias) that normalized the attention output - confirm that
# sharing parameters between the two norms is intended.
output = layer_norm(output)

In [78]:
# Final linear layer: project each d_model-wide hidden state to one score per
# token id (max_token_value + 1 outputs).
vocab_projection = nn.Linear(in_features=d_model, out_features=max_token_value + 1)
output = vocab_projection(output)
output.shape

torch.Size([4, 16, 100070])

In [82]:
# Turn the per-token scores into a probability distribution over the last axis.
# Despite its name, `logits` holds probabilities (post-softmax), not raw scores.
logits = output.softmax(dim=-1)
logits

tensor([[[6.1120e-06, 8.1475e-06, 2.9549e-06,  ..., 5.7906e-06,
          6.7421e-06, 8.4356e-06],
         [6.6594e-06, 6.1117e-06, 6.9420e-06,  ..., 1.8280e-05,
          1.1392e-05, 1.3180e-05],
         [3.7792e-06, 1.4917e-05, 7.2099e-06,  ..., 6.8160e-06,
          6.6577e-06, 2.0404e-05],
         ...,
         [5.9275e-06, 6.6291e-06, 3.4847e-06,  ..., 9.1963e-06,
          8.9139e-06, 1.6315e-05],
         [4.5611e-06, 6.1672e-06, 5.8078e-06,  ..., 8.7649e-06,
          1.1520e-05, 1.1059e-05],
         [6.7396e-06, 8.3190e-06, 1.5697e-05,  ..., 6.8150e-06,
          2.0611e-05, 1.6387e-05]],

        [[4.0020e-06, 2.8324e-06, 8.4202e-06,  ..., 2.5911e-05,
          1.1612e-05, 9.1395e-06],
         [3.1889e-06, 9.1816e-06, 1.9994e-06,  ..., 7.9454e-06,
          1.0197e-05, 1.9684e-05],
         [8.3090e-06, 2.8057e-06, 4.4327e-06,  ..., 2.5911e-05,
          4.4125e-06, 8.0696e-06],
         ...,
         [2.4293e-05, 5.5617e-06, 7.9825e-06,  ..., 6.1025e-06,
          3.710

In [89]:
# Sanity check: the probabilities for one position should add up to 1.
# Tensor.sum() reduces in a single vectorised call; the builtin sum() would
# iterate over every vocabulary entry in Python and chain one autograd add node
# per element.
print(logits[0,0].sum())

# Index of the highest-probability token for the first position of the first sequence.
predicted_idx = torch.argmax(logits[0,0]).item() # get max score index

# Map the token id back to text.
encoding.decode([predicted_idx])

tensor(1.0000, grad_fn=<AddBackward0>)


'关'