In [90]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F

In [91]:
# Download the dataset once; later runs reuse the cached local copy.
dataset_path = 'sales_textbook.txt'
if not os.path.exists(dataset_path):
  url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/resolve/main/sales_textbook.txt?download=true'
  # Fetch and validate the response BEFORE creating the file: a failed or
  # error-status request must not leave an empty/HTML file behind, because the
  # existence check above would then skip the download on every later run.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  with open(dataset_path,'wb') as f:
    f.write(response.content)

# Read the whole corpus into memory. The encoding is given explicitly so the
# text is decoded the same way on every platform instead of using the locale default.
with open(dataset_path,'r',encoding='utf-8') as f:
  text = f.read()


In [92]:
# Tokenize the raw text with the cl100k_base BPE vocabulary
encoding = tiktoken.get_encoding("cl100k_base")

# Encode and convert straight to a 1-D tensor of int64 token ids
tokenized_text = torch.tensor(encoding.encode(text), dtype=torch.long)
# Largest token id that occurs in the corpus (sizes the embedding table and the output layer later on)
max_token_value = tokenized_text.max().item()
len(tokenized_text)

77919

In [93]:
# Hold out the last 10% of the token stream for validation; the first 90% is for training
train_idx = int(0.9 * len(tokenized_text))
train_data, valid_data = tokenized_text[:train_idx], tokenized_text[train_idx:]
train_data

tensor([26072,   220,    16,  ...,  1501, 48451,  7119])

In [94]:
# Hyperparameters
batch_size = 4       # number of sequences processed in parallel per batch
context_length = 16  # number of tokens in each input sequence
d_model = 64         # embedding size per token; split evenly across the heads (d_model // num_heads)
num_heads = 4        # number of attention heads

In [95]:
# Sample one training batch of token ids, shape [batch_size, context_length] = [4, 16].
# (The [4, 16, 64] shape only appears after the embedding lookup.)
data = train_data
# Random start offsets, one per sequence; the upper bound leaves room for the target window shifted by one token
idxs = torch.randint(low=0,high=len(data)-context_length,size=(batch_size,))
# Inputs: context_length tokens beginning at each offset
x_batch = torch.stack([data[start:start+context_length] for start in idxs.tolist()])
# Targets: the same windows shifted right by one token (next-token prediction)
y_batch = torch.stack([data[start+1:start+context_length+1] for start in idxs.tolist()])
y_batch

tensor([[ 4443, 94066,   430, 11415,   701,  1866, 11704,   323,  1268,   814,
         29243,   311,   279,  6130,   596,  6671],
        [  315,   701, 10209,    11,   499,   649, 61705, 40017,   323,  1798,
           484, 12410,   304,   701,  4754,  6444],
        [ 3339,   264,  4062,  6412,   627,    19,    13, 67118, 36755,   323,
         61913,    25,  2057,  4726, 18885, 38769],
        [  892,    11,  5376, 26206,    11,  8108,  7194,    11,   477,  6493,
           904,  1023,  5199, 20124,   430,  5398]])

In [96]:
# Token embedding table: one learnable d_model-dimensional row per token id.
# It needs max_token_value + 1 rows because token ids start at 0.
input_embedding_lookup_table = nn.Embedding(num_embeddings=max_token_value + 1, embedding_dim=d_model)

# Look up the vectors: [batch_size, context_length] -> [batch_size, context_length, d_model]
x_batch_embedding = input_embedding_lookup_table(x_batch)
y_batch_embedding = input_embedding_lookup_table(y_batch)
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [97]:
# Sinusoidal positional encoding ("Attention Is All You Need"):
#   PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
#   PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
position_encoding_lookup_table = torch.zeros(context_length,d_model)
# Positions 0 .. context_length-1 as a column vector, shape [context_length, 1]
position = torch.arange(context_length, dtype=torch.float)[:, None]
# 1 / 10000^(2i / d_model), evaluated as exp(-2i * ln(10000) / d_model); one value per even column
div_term = torch.exp(
    torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
# Even feature columns take the sine, odd feature columns the cosine
position_encoding_lookup_table[:, ::2] = torch.sin(position * div_term)
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
# Prepend a batch axis and expand it to batch_size so the table lines up with the embedded batch
position_encoding_lookup_table = position_encoding_lookup_table[None].expand(batch_size, -1, -1)

In [98]:
# Inject position information by adding the encoding element-wise to the token embeddings
x = torch.add(x_batch_embedding, position_encoding_lookup_table)
y = torch.add(y_batch_embedding, position_encoding_lookup_table)
# Inspect the first sequence of the batch as a [context_length, d_model] table
pd.DataFrame(x.detach()[0].numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-0.322218,-0.409389,2.18854,0.902386,0.842244,1.357249,0.376062,1.452828,-0.147984,1.99345,...,-1.211121,0.208891,1.96057,0.402269,-0.091994,0.989842,0.460002,1.804185,-0.810581,1.048166
1,1.672771,0.354527,1.620251,-0.174871,1.348436,0.234261,0.614191,2.486705,-0.349069,1.929204,...,-0.773519,0.294397,-0.880421,0.194865,-0.433929,1.150474,0.672945,0.023306,-0.934006,2.711809
2,0.908097,-1.060022,1.584182,0.579031,1.20313,1.127968,1.182322,1.61096,0.619247,2.679089,...,0.450202,-0.293924,-1.25655,2.188431,-0.790716,-1.366608,-0.111572,1.116148,-0.669431,1.7165
3,0.487071,0.524812,0.870774,-0.572864,1.849003,0.477404,1.363102,0.873452,0.763274,0.746941,...,1.143813,0.537079,1.759755,1.016758,0.141407,-0.38131,0.001009,1.125835,-1.243398,0.474337
4,-1.51986,0.411349,1.014437,-1.386681,1.210713,0.377782,0.13428,-0.459338,1.058506,-0.503252,...,-1.076302,1.20998,0.884415,1.615863,-2.223127,2.164112,-0.214202,1.881279,-0.946214,1.283735
5,-0.396897,1.098407,-0.731618,-0.477528,-0.815435,-1.185984,1.537297,-0.309508,2.694764,-0.646601,...,-0.054133,0.790007,0.250778,2.261299,0.654142,2.518198,0.02484,1.194373,0.514472,1.918137
6,-0.392771,-0.373341,-0.25194,-0.014452,0.443205,-0.281049,0.683129,-1.289763,0.071701,0.881678,...,-1.230057,0.363818,-0.743048,1.470696,-0.453912,1.795517,1.661132,-0.178232,-1.31818,1.084545
7,1.43774,0.037849,-1.743038,0.640131,1.359085,0.614568,1.659745,1.367679,0.181814,1.297775,...,0.359023,-0.538674,-0.98143,2.012137,-1.642203,0.385722,1.761279,1.354689,-0.668779,1.353627
8,0.984924,0.349465,-1.364149,0.147823,-0.334512,-0.99661,-1.080562,-0.989027,-0.60967,-0.733291,...,0.704268,0.727815,-2.259962,-0.146118,-0.091895,1.799253,0.52122,2.500714,-1.53743,0.976813
9,0.509491,-0.941041,1.538565,0.585983,-0.444309,0.850889,1.327218,-1.755941,0.969801,-1.157845,...,-1.320791,2.203557,0.003226,1.589249,-1.114987,1.971091,-0.618653,1.556947,-0.296786,2.578598


# Multi-Head Attention

In [99]:
# Learnable query / key / value projections, each mapping d_model -> d_model.
# They are created in the order Wq, Wk, Wv.
Wq, Wk, Wv = (nn.Linear(d_model,d_model) for _ in range(3))
# nn.Linear acts on the last dimension, so [16, 64] @ [64, 64] -> [16, 64]
# is applied to every sequence in the batch automatically.
Q, K, V = Wq(x), Wk(x), Wv(x)
Q.shape,K.shape,V.shape


(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [100]:
# Split the model dimension into heads:
#   [batch, tokens, d_model] -> [batch, tokens, heads, head_dim] -> [batch, heads, tokens, head_dim]
#   i.e. [4, 16, 64] -> [4, 16, 4, 16] -> [4, 4, 16, 16]
# The projections are taken from Wq/Wk/Wv applied to x rather than reshaping Q/K/V in place.
# An in-place version is not safe to re-run: the already-permuted tensors have the same
# element count, so a second reshape succeeds without error and silently scrambles them.
Q = Wq(x).reshape(batch_size,context_length,num_heads,d_model//num_heads).permute(0,2,1,3)
K = Wk(x).reshape(batch_size,context_length,num_heads,d_model//num_heads).permute(0,2,1,3)
V = Wv(x).reshape(batch_size,context_length,num_heads,d_model//num_heads).permute(0,2,1,3)
Q.shape



torch.Size([4, 4, 16, 16])

In [101]:
# Scaled dot-product scores: every query token against every key token, per head,
# divided by sqrt(head_dim) to keep the magnitudes in check before the softmax
output = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(d_model//num_heads)
# Causal mask: True strictly above the diagonal, i.e. at positions that lie in the future
mask = torch.ones(context_length,context_length).triu(diagonal=1).bool()
# Future positions get -inf so they receive zero weight after the softmax
# (for "i am tom", the token "am" may only look at "i am")
output = output.masked_fill(mask,float('-inf'))
output

tensor([[[[-3.1179e-01,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-8.4912e-01, -2.9320e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-6.8314e-01, -6.0089e-01, -6.0636e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
          [-7.2487e-02, -3.3676e-01,  4.4331e-01,  ...,  9.9694e-01,
                  -inf,        -inf],
          [ 2.7911e-01, -2.3307e-01,  1.2442e+00,  ...,  1.1212e+00,
            6.8239e-01,        -inf],
          [-1.9782e-01, -8.2563e-02,  6.4668e-01,  ...,  1.0883e+00,
            1.0151e+00,  7.0050e-01]],

         [[ 5.3915e-01,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-6.1488e-02, -3.4297e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [ 7.7564e-02,  1.1729e-01,  3.6037e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
     

In [102]:
# Softmax over the key axis: each row becomes weights in [0, 1] that sum to 1; masked (-inf) entries become 0
attention_score = output.softmax(dim=-1)
attention_score

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3645, 0.6355, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3159, 0.3430, 0.3411,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0443, 0.0340, 0.0742,  ..., 0.1290, 0.0000, 0.0000],
          [0.0552, 0.0331, 0.1450,  ..., 0.1282, 0.0827, 0.0000],
          [0.0309, 0.0347, 0.0719,  ..., 0.1118, 0.1039, 0.0758]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5699, 0.4301, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2970, 0.3090, 0.3940,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0374, 0.0677, 0.0496,  ..., 0.0915, 0.0000, 0.0000],
          [0.2032, 0.0473, 0.0470,  ..., 0.0609, 0.0397, 0.0000],
          [0.0284, 0.0961, 0.0603,  ..., 0.0380, 0.0305, 0.0300]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4558, 0.5442, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2810, 0.3604, 0.3587,  ..., 0

In [103]:
# Weighted sum of the value vectors. No transpose is needed here:
# [.., tokens, tokens] @ [.., tokens, head_dim] -> [.., tokens, head_dim]
A = torch.matmul(attention_score, V)

In [104]:
# Concatenate the heads back into one d_model-wide vector per token:
#   [batch, heads, tokens, head_dim] -> [batch, tokens, heads, head_dim] -> [batch, tokens, d_model]
# The per-head result is taken from attention_score @ V rather than reshaping A in place.
# An in-place version is not safe to re-run: a second transpose/reshape of the already
# merged tensor succeeds without error and silently scrambles it.
A = (attention_score @ V).transpose(1,2).reshape(batch_size,-1,d_model)

# Output projection applied to the concatenated heads
Wo = nn.Linear(d_model,d_model)
output = Wo(A)
output

tensor([[[-0.0398,  0.4818,  0.0963,  ...,  0.1679,  0.2287,  0.6676],
         [-0.0863,  0.2418, -0.0479,  ..., -0.0519,  0.1836,  0.5052],
         [-0.1069,  0.3283,  0.1011,  ..., -0.0190,  0.0015,  0.6509],
         ...,
         [ 0.2268, -0.0513, -0.3399,  ..., -0.1393, -0.0869,  0.1384],
         [ 0.1988, -0.1359, -0.2928,  ..., -0.0974, -0.0921,  0.0950],
         [ 0.2695, -0.0724, -0.3124,  ..., -0.1781, -0.1108,  0.0394]],

        [[ 0.6502,  0.0718,  0.0830,  ..., -0.4850, -0.3659, -0.0209],
         [ 0.3004,  0.1372, -0.0650,  ..., -0.2453, -0.0962,  0.3371],
         [ 0.3175, -0.0472, -0.2013,  ..., -0.2026, -0.1292,  0.2658],
         ...,
         [ 0.2170, -0.1850, -0.4572,  ..., -0.0892, -0.1838, -0.0369],
         [ 0.2548, -0.2806, -0.5512,  ..., -0.0559, -0.1754, -0.0636],
         [ 0.2907, -0.2069, -0.4666,  ..., -0.0120, -0.1782, -0.1019]],

        [[ 0.3059,  0.0707, -0.3349,  ...,  0.0121, -0.2921,  0.3436],
         [ 0.2044,  0.3064, -0.6218,  ...,  0

In [105]:
# Residual connection: add the block input x back onto the attention output.
# Written as Wo(A) + x rather than output + x so that re-running this cell
# does not add x a second time.
output = Wo(A) + x

In [106]:
# Layer normalization over the feature (last) dimension of the attention + residual output
layer_norm = nn.LayerNorm(normalized_shape=d_model)
layer_norm_output = layer_norm(output)



In [107]:
# Position-wise feed-forward network: expand to 4 * d_model, apply ReLU, project back to d_model.
# The layers are built inline and not kept in a variable.
output = nn.Sequential(
    nn.Linear(d_model,d_model * 4),
    nn.ReLU(),
    nn.Linear(d_model * 4,d_model),
)(layer_norm_output)

# Residual connection around the feed-forward network
output = layer_norm_output + output

In [108]:
# Second layer normalization, applied after the feed-forward residual.
# A dedicated LayerNorm is used instead of reusing `layer_norm`: nn.LayerNorm carries its
# own learnable weight and bias, and the two normalization sites should not share them.
# With the default initialization (weight = 1, bias = 0) the result is numerically unchanged.
layer_norm_2 = nn.LayerNorm(d_model)
output = layer_norm_2(output)

In [109]:
# Final linear layer: project each token vector from d_model to one score per vocabulary entry
output = nn.Linear(in_features=d_model, out_features=max_token_value+1)(output)
output.shape

torch.Size([4, 16, 100070])

In [110]:
# Softmax over the vocabulary axis. Despite the variable name, these are probabilities
# (each row sums to 1), not raw logits.
logits = output.softmax(dim=-1)
logits

tensor([[[9.1726e-06, 6.8594e-06, 3.7497e-06,  ..., 1.5194e-05,
          8.9999e-06, 2.5022e-05],
         [7.5116e-06, 7.2905e-06, 2.7375e-06,  ..., 1.1020e-05,
          8.9646e-06, 1.1110e-05],
         [8.6013e-06, 6.7701e-06, 1.1040e-05,  ..., 1.6123e-05,
          2.4762e-05, 2.1542e-05],
         ...,
         [3.5195e-06, 4.5448e-06, 8.1129e-06,  ..., 5.1826e-06,
          1.0819e-05, 1.1436e-05],
         [5.4967e-06, 2.3233e-06, 3.5886e-06,  ..., 1.0309e-05,
          6.7655e-06, 4.0197e-06],
         [7.2078e-06, 4.1557e-06, 6.6817e-06,  ..., 1.5137e-05,
          2.0015e-05, 2.3652e-06]],

        [[6.5819e-06, 6.4126e-06, 3.9845e-06,  ..., 2.4906e-05,
          8.9351e-06, 8.9222e-06],
         [5.4351e-06, 7.4536e-06, 3.7279e-06,  ..., 1.6725e-05,
          6.8678e-06, 7.2662e-06],
         [7.1137e-06, 1.5165e-05, 3.8613e-06,  ..., 1.4368e-05,
          1.2998e-05, 1.2725e-05],
         ...,
         [7.7805e-06, 4.1522e-06, 4.7183e-06,  ..., 8.6784e-06,
          5.207

In [111]:
# Sanity check: the softmax output over the vocabulary should sum to 1.
# Tensor.sum() reduces in a single call; the built-in sum() would iterate over every
# vocabulary entry in Python and chain one autograd add node per element.
print(logits[0,0].sum())

# Id of the most probable next token for the first position of the first sequence
predicted_idx = torch.argmax(logits[0,0]).item()

encoding.decode([predicted_idx])

tensor(1.0000, grad_fn=<AddBackward0>)


'ocular'