In [None]:
import sys
sys.path.append("..")

from pathlib import Path
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader

from model.llm import LLM
from model.tokenizer import Tokenizer, train_tokenizer

from helpers.dataset import NextTokenPredictionDataset
from helpers.trainer import train
from helpers.config import LLMConfig, TrainingConfig, get_device

# Report the installed PyTorch version so that runs of this notebook can be compared.
print(f"pytorch version: {torch.__version__}")

In [None]:
# Model architecture hyper-parameters; later cells read them when building the
# tokenizer, the dataset and the model.
llm_config = LLMConfig(
    vocab_size=4096,
    seq_len=128,
    dim_emb=256,
    num_layers=4,
    num_heads=8,
    emb_dropout=0.0,
    ffn_dim_hidden=4 * 256,  # four times dim_emb
    ffn_bias=False,
)

# Training / runtime settings (device, batch size, optimiser hyper-parameters).
train_config = TrainingConfig(
    retrain_tokenizer=False,
    device=get_device(),
    batch_size=64,
    learning_rate=3e-4,
    weight_decay=1e-5,
    max_epochs=1,
    log_frequency=1,
)

In [None]:
# Training corpus, and the tokenizer model file expected next to it
# (same path with a ".model" suffix).
input_file = "../data/tinyshakespeare.txt"
output_file = Path(input_file).with_suffix(".model")

# Train the tokenizer when explicitly requested or when no model file exists yet.
# NOTE(review): train_tokenizer is presumably what writes output_file -- confirm.
if train_config.retrain_tokenizer or not output_file.exists():
    train_tokenizer(input_file, llm_config.vocab_size)

tokenizer = Tokenizer(str(output_file))

In [None]:
# Show how the tokenizer splits a sample sentence into sub-word pieces.
sentence = (
    "Before we proceed any further, hear me speak. "
    "Acquaintances. Dog Dogs. ance"
)
print(tokenizer.sp.EncodeAsPieces(sentence))

In [None]:
# Build the training dataset and a shuffling DataLoader over it.
ds_train = NextTokenPredictionDataset(input_file, llm_config.seq_len, tokenizer)
dl_train = DataLoader(ds_train, batch_size=train_config.batch_size, shuffle=True)

# Sanity check: print the tensor shapes of the first batch only.
for inputs, labels in dl_train:
    print(inputs.shape, labels.shape)
    break

# Use the device already selected in train_config instead of calling get_device()
# a second time: the model is moved to `device` while input tensors are moved to
# `train_config.device`, so both must be the same object.
device = train_config.device

# The vocabulary size comes from the tokenizer itself; every other dimension comes
# from llm_config.
model = LLM(
    vocab_size = tokenizer.vocab_size,
    seq_len = llm_config.seq_len,
    dim_emb = llm_config.dim_emb,
    num_layers = llm_config.num_layers,
    attn_num_heads = llm_config.num_heads,
    emb_dropout = llm_config.emb_dropout,
    ffn_hidden_dim = llm_config.ffn_dim_hidden,
    ffn_bias = llm_config.ffn_bias
)

In [None]:
# loss_history = train(
#     model,
#     dl_train,
#     train_config.device,
#     lr = train_config.learning_rate,
#     max_epochs = train_config.max_epochs,
#     weight_decay = train_config.weight_decay,
#     log_every = train_config.log_frequency
# )

Un-comment the block above to train the model (takes a few minutes on the AMD GPU cluster) **or** just load the pre-trained model using the block below (the weights file `shakespeare_llm.pt` must be uploaded to your home directory for this to work):

In [None]:
# Move the model to the target device, then load the pre-trained weights onto
# that same device. weights_only=True restricts unpickling to tensor data.
model.to(device)
model.load_state_dict(
    torch.load("shakespeare_llm.pt", map_location=device, weights_only=True)
)
# Switch to inference mode; the returned module is echoed as the cell output.
model.eval()

In [None]:
# "Empty" prompt: a single sequence of seq_len EOS tokens, used to generate
# random text without any conditioning.
prompt = torch.full(
    (1, llm_config.seq_len),
    tokenizer.eos_id,
    dtype=torch.int32,
).to(train_config.device)
out = model.generate(prompt, max_seq_len=64, top_p=1)
tokenizer.decode(out.tolist())

In [None]:
# Generate a continuation of a text prompt, padded to the model's sequence length.
prompt = tokenizer.encode(
    "KING HENRY VI:", beg_of_string=True, pad_seq=True, seq_len=llm_config.seq_len
)
inputs = torch.tensor(prompt, dtype=torch.int32).unsqueeze(0)  # add a batch dimension
inputs = inputs.to(train_config.device)
# NOTE(review): temperature=1000 is extreme. If generate() divides the logits by
# the temperature (as nextTokenProbs does), sampling becomes almost uniform --
# confirm this is the intended demonstration.
out = model.generate(inputs, max_seq_len=64, top_p=1, temperature=1000)
tokenizer.decode(out.tolist())

In [None]:
# Materialise the model's parameters as a list so they can be addressed by position.
parList = [*model.parameters()]
len(parList)  ## 35
# Shape of every parameter tensor, in the same order as parList.
parShapes = [list(param.shape) for param in parList]

The model's parameters (indexed by their position in `parList`) consist of:

- **0** :: weight matrix for **token embeddings**

<!-- -->

- **1** :: **RMSNorm** parameter vector
- **2** :: Q, K, V matrices (concatenated) for **MultiHeadAttention**
- **3** :: weight matrix for projout part of **MultiHeadAttention**
- **4** :: **RMSNorm** parameter vector
- **5** :: initial weight matrix for **FeedForward (SwiGLU)** part
- **6** :: **SwiGLU** weight matrices (concatenated)
- **7** :: **SwiGLU** bias vector
- **8** :: final weight matrix for **FeedForward (SwiGLU)** part

<!-- -->

-  **9-16** :: as 1-8 but for second TransformerBlock
- **17-24** ::        "       third         "
- **25-32** ::        "       fourth        "

<!-- -->

- **33** :: **RMSNorm** parameter vector
- **34** :: final **projection_head** bias vector

**NOTE**: there is no weight matrix for the final projection head b/c it is "weight-tied" to the token embeddings weight matrix (0 above)

In [None]:
import numpy as np
import pandas as pd

# Sub-word piece for every token id. The range is taken from tokenizer.vocab_size,
# the same value the model (and hence its embedding matrix) was built with, so the
# label array always has one entry per embedding row.
tokens = np.array([tokenizer.sp.id_to_piece(i)
                   for i in range(tokenizer.vocab_size)])

# Spot-check lookups in the opposite direction (piece -> id). Only the last
# expression of a cell is echoed, so these two lines display nothing.
tokenizer.sp.piece_to_id("▁perforce")
tokenizer.sp.piece_to_id("▁basilisk")

# Token-embedding weight matrix (parameter 0) as a table: one row per token,
# labelled with its piece. The tensor is detached and moved to the CPU before
# the explicit conversion to NumPy.
emb = pd.DataFrame(parList[0].detach().cpu().numpy(), index=tokens)
emb

In [None]:
# Most recent output of each hooked module, keyed by the name given to get_activation.
activation = dict()


def get_activation(name):
    """Build a forward hook that records the hooked module's output.

    The returned callable takes the three positional arguments a forward hook
    receives (module, inputs, output). Each time it is called it stores
    ``output.detach()`` in ``activation[name]``, replacing any previous value.
    It returns ``None``.
    """
    def _record(_module, _inputs, output):
        activation[name] = output.detach()

    return _record

# Re-running this cell must not stack duplicate hooks on the modules, so first
# remove any hooks that a previous run of the cell registered.
for _handle in globals().get("hook_handles", []):
    _handle.remove()

# Record the outputs of: the token embedding, the first block's attention norm,
# the first block's multi-head attention, and the whole transformer stack.
# register_forward_hook returns a handle whose .remove() detaches the hook.
hook_handles = [
    model.token_embedding.register_forward_hook(get_activation('token_embedding')),
    model.transformer[0].norm_attn.register_forward_hook(get_activation('rmsnorm0')),
    model.transformer[0].multihead_attn.register_forward_hook(get_activation('mha0')),
    model.transformer.register_forward_hook(get_activation('transformer')),
]

# One forward pass (on `inputs` from an earlier cell) to populate `activation`.
output = model(inputs)

In [None]:
# First comparison prompt (the "N" suffix presumably stands for "act" used as a noun).
promptN = tokenizer.encode(
    "Such an act",
    beg_of_string = True,
    pad_seq = True,
    seq_len = llm_config.seq_len
)
inputsN = torch.tensor(promptN, dtype=torch.int32).unsqueeze(0).to(train_config.device)
outputN = model(inputsN)
# The forward hooks overwrite `activation` on every forward pass, so the values
# for this prompt are read immediately. Index -1 selects the last sequence
# position (the one nextTokenProbs also reads) instead of the hard-coded 127,
# which is only correct while seq_len == 128.
en = activation['token_embedding'][0, -1, :].cpu().detach().numpy()
tn = activation['transformer'][0, -1, :].cpu().detach().numpy()

In [None]:
import plotnine as gg

gg.theme_set(gg.theme_bw())

# Scatter plot of the last-position token embedding against the last-position
# transformer output for the prompt "Such an act", one point per dimension.
gg.qplot(
    en,
    tn,
    xlab="token embedding ('Such an act', last position)",
    ylab="transformer output ('Such an act', last position)",
    main="Embedding vs. transformer output",
)

In [None]:
# Second comparison prompt (the "V" suffix presumably stands for "act" used as a verb).
promptV = tokenizer.encode(
    "I have act",
    beg_of_string = True,
    pad_seq = True,
    seq_len = llm_config.seq_len
)
inputsV = torch.tensor(promptV, dtype=torch.int32).unsqueeze(0).to(train_config.device)
outputV = model(inputsV)
# The forward hooks overwrite `activation` on every forward pass, so the values
# for this prompt are read immediately. Index -1 selects the last sequence
# position (the one nextTokenProbs also reads) instead of the hard-coded 127,
# which is only correct while seq_len == 128.
ev = activation['token_embedding'][0, -1, :].cpu().detach().numpy()
tv = activation['transformer'][0, -1, :].cpu().detach().numpy()

In [None]:
# Last-position token embeddings of the two prompts against each other,
# one point per embedding dimension.
gg.qplot(
    en,
    ev,
    xlab="token embedding ('Such an act', last position)",
    ylab="token embedding ('I have act', last position)",
    main="Token embeddings at the last position",
)

In [None]:
# Last-position transformer outputs of the two prompts against each other,
# one point per dimension.
gg.qplot(
    tn,
    tv,
    xlab="transformer output ('Such an act', last position)",
    ylab="transformer output ('I have act', last position)",
    main="Transformer outputs at the last position",
)

In [None]:
import numpy as np
import pandas as pd
import torch.nn.functional as F

def nextTokenProbs(inputString, temperature=0.6):
    """Return the model's next-token probabilities for a text prompt.

    The prompt is encoded and padded to ``llm_config.seq_len``, run through the
    notebook's ``model``, and the logits at the last sequence position are turned
    into probabilities with a temperature-scaled softmax.

    Parameters
    ----------
    inputString : str
        Prompt text to encode.
    temperature : float, default 0.6
        Divisor applied to the logits before the softmax. Must be positive.

    Returns
    -------
    pandas.Series
        One probability per vocabulary entry, indexed by the global ``tokens`` array.

    Raises
    ------
    ValueError
        If ``temperature`` is not strictly positive (the division would otherwise
        produce inf/NaN or an inverted distribution).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    prompt = tokenizer.encode(inputString,
                              beg_of_string = True,
                              pad_seq = True,
                              seq_len = llm_config.seq_len)
    inputs = torch.tensor(prompt, dtype=torch.int32)\
                  .unsqueeze(0)\
                  .to(train_config.device)
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(inputs)[:, -1, :]                 ## (bs, vocab_size)
        probs = F.softmax(logits / temperature, dim=-1)  ## (bs, vocab_size)
    return pd.Series(probs.cpu().numpy()[0, :], index=tokens)

In [None]:
# Ten most probable next tokens after "Such an act", in ascending order of probability.
nextTokenProbs("Such an act").sort_values(ascending=True).iloc[-10:]

In [None]:
# Ten most probable next tokens after "I have act", in ascending order of probability.
nextTokenProbs("I have act").sort_values(ascending=True).iloc[-10:]

In [None]:
# Recompute the output of the first transformer block's multi-head attention by
# hand, starting from the hooked output of that block's attention norm, so the
# result can be compared with the hooked activation['mha0'].
# NOTE(review): presumably this mirrors mha0.forward step by step -- confirm
# against the module's implementation.
# activation['rmsnorm0'] holds the value from the most recent forward pass,
# because the hooks overwrite their entry on every call.
mha0 = model.transformer[0].multihead_attn
qkv = mha0.proj_qkv(activation['rmsnorm0'])

# The fused projection is split into three equal parts along the last dimension.
q, k, v = qkv.chunk(3, dim=-1)
# (bs, seq_len, dim_k), (bs, seq_len, dim_k), (bs, seq_len, dim_v)

# split projections between heads -> (bs, num_heads, seq_len, dim_k/dim_v):
q = q.view(-1, mha0.seq_len, mha0.num_heads, mha0.dim_head).permute(0, 2, 1, 3)
k = k.view(-1, mha0.seq_len, mha0.num_heads, mha0.dim_head).permute(0, 2, 1, 3)
v = v.view(-1, mha0.seq_len, mha0.num_heads, mha0.dim_head).permute(0, 2, 1, 3)

# The module's positional encoding is applied to queries and keys only, not to values.
q = mha0.positional_encoding(q)  # (bs, num_heads, seq_len, dim_k)
k = mha0.positional_encoding(k)  # (bs, num_heads, seq_len, dim_k)

# Query-key dot products, scaled by 1/sqrt(mha0.dim_k).
attn_scores = (q @ k.permute(0, 1, 3, 2)) * mha0.dim_k**-0.5

# In-place: positions flagged by the module's causal mask are set to -inf so the
# softmax below gives them zero weight.
attn_scores.masked_fill_(mha0.causal_mask[None, None, ...], -torch.inf)

# attention scores are used to build a weighted linear combination of values vectors:
# (from here on attn_scores holds the normalised weights, not the raw scores)
attn_scores = torch.softmax(attn_scores, dim=-1)
out = attn_scores @ v

# merge heads:
out = out.permute(0, 2, 1, 3).contiguous().view(-1, mha0.seq_len, mha0.dim_v)
# project to output space:
out = mha0.proj_out(out)
# Echo the hand-computed result as the cell output.
out

In [None]:
# Output recorded by the forward hook on the first block's multi-head attention
# during the most recent forward pass.
activation["mha0"]