In [2]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [None]:
"""
This notebook illustrates the Transformer architecture step by step.
The steps follow my article:
https://waylandzhang.github.io/en/let-s-code-llm.html
"""

In [None]:
# ---- Hyperparameters ----

# Data / batching
batch_size = 4        # Number of sequences sampled per training batch
context_length = 16   # Number of tokens in each sampled sequence

# Model
d_model = 64          # The vector size of the token embeddings
num_layers = 8        # Number of transformer blocks
num_heads = 4         # Number of attention heads; the code derives head_size as d_model // num_heads
dropout = 0.1         # Dropout rate

# Optimisation / evaluation
learning_rate = 0.001
max_iters = 500       # Total number of training iterations
eval_interval = 50    # How often to evaluate the model
eval_iters = 20       # How many iterations to average the loss over when evaluating

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the seed so the random draws below are reproducible.
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

In [5]:
# Download the sample corpus once and cache it locally.
# Source: https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt
data_path = 'data/sales_textbook.txt'
if not os.path.exists(data_path):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail loudly instead of caching an HTTP error page as the corpus
    # Write to the same path that is checked above and read below (the original wrote to
    # 'sales_textbook.txt' and then failed to open 'data/sales_textbook.txt').
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    with open(data_path, 'w', encoding='utf-8') as f:
        f.write(response.text)

with open(data_path, 'r', encoding='utf-8') as f:
    text = f.read()
    
    

In [None]:
# Tokenize the corpus with tiktoken's "cl100k_base" encoding and keep the ids as a tensor on `device`.
encoding = tiktoken.get_encoding("cl100k_base")
token_ids = encoding.encode(text)
tokenized_text = torch.tensor(token_ids, dtype=torch.long, device=device)
max_token_value = tokenized_text.max().item()  # largest token id that occurs in the text

print(f"Tokenized text size: {len(tokenized_text)}")
print(f"The maximum value in the tokenized text is: {max_token_value}")

In [None]:
# Largest token id present in the corpus (the same quantity stored in max_token_value).
tokenized_text.max().item()

In [None]:
# Illustration only: print the token ids produced for a mixed Chinese/ASCII string.
print(encoding.encode('测试aaa中国你好'))



In [None]:
# Illustration only: decode three token ids back into text.
# NOTE(review): the earlier remark on this line ("Rapport" is tokenized as "Rap"[23097] and
# "port"[403]) does not match the ids decoded below — confirm which example was intended.
print(encoding.decode([82805, 33746, 59795]))

In [10]:
# Hold out the final 10% of the token stream for validation; train on the first 90%.
split_idx = int(len(tokenized_text) * 0.9)
train_data, val_data = tokenized_text[:split_idx], tokenized_text[split_idx:]

In [None]:
# Inspect the training split (the first 90% of the token ids).
train_data

In [None]:
# Prepare one training batch of (input, target) sequences.
data = train_data
# Random start offsets; the upper bound leaves room for the one-token-shifted target window.
idxs = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
x_batch = torch.stack([data[idx:idx + context_length] for idx in idxs])
# Targets are the inputs shifted one token to the right.
y_batch = torch.stack([data[idx + 1:idx + context_length + 1] for idx in idxs])
# Show both shapes (the original printed x_batch.shape twice).
print(x_batch.shape, y_batch.shape)

In [None]:
# Illustration only: show the sampled input batch as a table, one row per sequence.
pd.set_option('display.expand_frame_repr', False)
x_batch_table = pd.DataFrame(x_batch.detach().cpu().numpy())
print("Our batches:\n", x_batch_table)

In [None]:
# Token embedding look-up table: one learnable d_model-sized vector per token id
# in [0, max_token_value]. It is moved to `device` so it sits on the same device as the
# token tensors, which are created with device=device.
token_embedding_lookup_table = nn.Embedding(max_token_value + 1, d_model).to(device)
print("Token Embedding Look-up table: ", token_embedding_lookup_table)

In [None]:
# Shape of the input batch. Copy to the CPU first: .numpy() raises on CUDA tensors.
x_batch.detach().cpu().numpy().shape

In [None]:
# Look up an embedding vector for every token id in the input and target batches.
# Each result has shape [batch_size, context_length, d_model], i.e. [4, 16, 64] here.
x_batch_embedding = token_embedding_lookup_table(x_batch)
y_batch_embedding = token_embedding_lookup_table(y_batch)

x_batch_embedding.shape, y_batch_embedding.shape

In [17]:
# Sinusoidal positional encoding:
#   P[pos, 2i]   = sin(pos * exp(-2i * ln(10000) / d_model))
#   P[pos, 2i+1] = cos(pos * exp(-2i * ln(10000) / d_model))
# Everything is created on `device` so it can be added to embeddings held on that device.
P = torch.zeros(context_length, d_model, device=device)
position = torch.arange(0, context_length, dtype=torch.float, device=device)
div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() *
                     (-math.log(10000) / d_model))
# One angle per (position, frequency) pair: [context_length, d_model // 2]
angles = position.unsqueeze(1) * div_term
P[:, 0::2] = torch.sin(angles)  # even dimensions
P[:, 1::2] = torch.cos(angles)  # odd dimensions
# Share the same table across the batch: [batch_size, context_length, d_model]
position_encoding_lookup_table = P.unsqueeze(0).expand(batch_size, -1, -1)



In [None]:
# Expect [context_length, d_model].
P.shape

In [None]:
# Positions as a column vector: [context_length, 1].
position.unsqueeze(1).shape

In [None]:
# Broadcasting [context_length, 1] * [d_model // 2] gives one angle per (position, frequency) pair.
a=position.unsqueeze(1) * div_term
a.shape

In [None]:
# Illustration only
def visualize_pe(pe):
    """Render a positional-encoding matrix as a heat map.

    Rows are drawn as position indices and columns as encoding dimensions.
    """
    fig, ax = plt.subplots()
    heatmap = ax.imshow(pe, aspect="auto")
    ax.set_title("Positional Encoding")
    ax.set_xlabel("Encoding Dimension")
    ax.set_ylabel("Position Index")
    fig.colorbar(heatmap, ax=ax)
    plt.show()

# Every batch entry holds the same table, so plot the first one.
position_encoding_lookup_table2_np = position_encoding_lookup_table[0].cpu().numpy()
visualize_pe(position_encoding_lookup_table2_np)

In [26]:
# Add the positional encoding to the token embeddings of both the input and the target batch.
input_embedding_x, input_embedding_y = (
    emb + position_encoding_lookup_table
    for emb in (x_batch_embedding, y_batch_embedding)
)

In [None]:
# Self-attention: queries, keys and values all start from the same input tensor.
X = input_embedding_x
query, key, value = X, X, X
query.shape

In [36]:
# Linear projections for queries, keys and values, placed on `device` like the token tensors.
Wq = nn.Linear(d_model, d_model).to(device)
Wk = nn.Linear(d_model, d_model).to(device)
Wv = nn.Linear(d_model, d_model).to(device)

head_size = d_model // num_heads  # width of each attention head

# Project, then split the last dimension into heads:
# [batch_size, context_length, d_model] -> [batch_size, context_length, num_heads, head_size]
Q = Wq(query).view(batch_size, context_length, num_heads, head_size)
K = Wk(key).view(batch_size, context_length, num_heads, head_size)
V = Wv(value).view(batch_size, context_length, num_heads, head_size)



In [None]:
# Expect [batch_size, context_length, num_heads, d_model // num_heads] after the split into heads.
Q.shape

In [41]:
# Swap the sequence-length and head axes (dims 1 and 2) of Q, K and V:
# [batch_size, context_length, num_heads, d_model // num_heads]
#   -> [batch_size, num_heads, context_length, d_model // num_heads]
# Note: re-running this cell swaps the axes back.
Q, K, V = (t.transpose(1, 2) for t in (Q, K, V))

In [None]:
# Expect [batch_size, num_heads, context_length, d_model // num_heads] after the transpose.
Q.shape

In [None]:

# Scaled dot-product scores, Q @ (K^T / sqrt(d_model // num_heads)):
# [batch_size, num_heads, context_length, context_length]
scale = math.sqrt(d_model // num_heads)
attenstion_score = Q @ (K.transpose(2, 3) / scale)

# Illustration only: heat map of the raw scores for the first sequence, first head.
first_head_scores = attenstion_score[0, 0].detach().cpu().numpy()
plt.imshow(first_head_scores, cmap="Accent", aspect="auto")


In [100]:
# Causal mask: True strictly above the diagonal, i.e. at key positions later than the query.
# Built on the scores' device so masked_fill receives both tensors on one device.
tr = torch.triu(
    torch.ones(attenstion_score.shape[-2:], device=attenstion_score.device),
    diagonal=1,
).bool()
# Masked entries become -inf so that softmax assigns them zero weight.
attention_score = attenstion_score.masked_fill(tr, float('-inf'))


In [None]:
# Masking keeps the shape: [batch_size, num_heads, context_length, context_length].
attention_score.shape

In [None]:

# Illustration only
# Masked entries are -inf because softmax exponentiates the scores, and exp(-inf) is 0.
plt.imshow(attention_score[1, 1].detach().cpu().numpy(), "Accent", aspect="auto")


In [103]:
# Normalise the *masked* scores. The original call passed the unmasked `attenstion_score`,
# which discarded the causal mask and let every position attend to later positions.
# Note: this overwrites attention_score, so re-run the masking cell before re-running this one.
attention_score = torch.softmax(attention_score, dim=-1)
# Weighted sum of the value vectors: [batch_size, num_heads, context_length, d_model // num_heads]
A = torch.matmul(attention_score, V)

In [None]:
# Attention weights of the first sequence, first head.
# Copy to the CPU before converting: .numpy() raises on CUDA tensors.
pd.DataFrame(attention_score[0,0].detach().cpu().numpy())

In [None]:
# Attention weights of the first sequence, first head, shown as a raw tensor.
attention_score[0,0]

In [None]:
# Expect [batch_size, num_heads, context_length, d_model // num_heads].
A.shape

In [107]:
# Merge the heads back into one vector per position:
# swap dims 1 and 2, make the memory contiguous, then flatten the last two dims into d_model.
# Result: [batch_size, context_length, d_model]
A = A.transpose(1, 2).contiguous().view(batch_size, -1, d_model)

A.shape

In [None]:
# Expect [batch_size, context_length, d_model] after merging the heads.
A.shape

In [109]:
# Output projection applied to the concatenated heads; moved to `device` like the token tensors.
Wo = nn.Linear(d_model, d_model).to(device)
output = Wo(A)


In [110]:
# Residual connection: add the attention input X back onto the projected attention output.
# Note: re-running this cell on its own adds X again.
output = output + X

In [None]:
# Inspect the attention output after the residual connection.
output

In [112]:
# Layer normalisation over the last (d_model) dimension; moved to `device` like the token tensors.
layer_norm = nn.LayerNorm(d_model).to(device)
output_layernorm = layer_norm(output)

In [None]:
# Position-wise feed-forward network: d_model -> 4 * d_model -> d_model with a ReLU in between.
# Fix: the first projection was assigned to a misspelled name (`outout`), so its result was
# never used and the second Linear (which expects 4 * d_model features) received a
# d_model-wide tensor.
output = nn.Linear(d_model, d_model * 4).to(device)(output_layernorm)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model).to(device)(output)
output = output + output_layernorm  # residual connection around the feed-forward network
# The same LayerNorm module as in the previous step is applied again here.
output = layer_norm(output)


In [None]:
# Inspect the result of the feed-forward cell.
output

In [None]:
# Final projection from d_model to one logit per token id in [0, max_token_value];
# moved to `device` like the token tensors.
logits = nn.Linear(d_model, max_token_value + 1).to(device)(output)
logits.shape

In [None]:
# Turn the logits into a probability distribution over token ids at every position,
# then show the first sequence as a table.
prob = logits.softmax(dim=-1)
pd.DataFrame(prob[0].detach().cpu().numpy())


In [None]:
# Greedy prediction from the first sequence: arg-max over the logits at the last position.
# Index -1 replaces the hard-coded 15 (= context_length - 1), so the cell still works
# when context_length changes.
predicted_index = torch.argmax(logits[0, -1]).item()
encoding.decode([predicted_index])

In [None]:
# Decode the first input sequence back into text to see what the model was given.
encoding.decode(x_batch[0].tolist())