# Building a GPT Decoder

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# we always start with a dataset to train. let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

# read it in to inspect
with open('input.txt', 'r', encoding='utf-8') as f:
    text= f.read()

--2024-09-29 11:41:31--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-09-29 11:41:31 (23.3 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [None]:
print('Length of dataset in characters: ', len(text))

Length of dataset in characters:  1115394


In [None]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# here are all the unique characters that occur in this text
chars= sorted(list(set(text)))
vocab_size= len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


# Tokenization

Converting the raw text as a string to some sequences of integers according to some vocabulary of possible elements.

Tokenizer types:
- character level - small vocabularies, long sequences of integers
- sub-word level - very large vocabularies, short sequences of integers

In [None]:
# creating an example of a mapping from characters to integers - caracter level encoding
stoi= {ch:i for i, ch in enumerate(chars)}
itos= {i:ch for i, ch in enumerate(chars)}

encode= lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode= lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
!pip install tiktoken

In [None]:
# example of encoding using a sub-word level tokenizer with a large vocabulary
# (the 'gpt2' encoding shipped with tiktoken)
import tiktoken
enc= tiktoken.get_encoding('gpt2')
print(enc.n_vocab) # size of the vocabulary

# the same string as in the character-level example, for comparison of sequence lengths
print(enc.encode('hii there'))
print(enc.decode(enc.encode('hii there')))

50257
[71, 4178, 612]
hii there


In [None]:
# let's now encode the entire text dataset and store it into a tensor
data= torch.tensor(encode(text))
print(data.shape, data.dtype)
# the 1000 characters we looked at earlier will to the GPT look like this
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [None]:
# let's now split up the data into train and validation sets
n = int(0.9 * len(data)) # first 90% will be train, rest val
train_data= data[:n]
val_data= data[n:]

Creating batches. We never feed the entire text into the model at once. It would be computationally very expensive. We work on chunks of the dataset and we train the model basically sampling little random chunks out of the training set and train on just chunks at a time. These chunks have some kind of length and some kind of maximum length we are going to call block_size.

The targets of the training data are offset by one position relative to the inputs, because each target is the character that follows its context. We train this way so that the transformer gets used to seeing contexts of every length, from as little as one character all the way up to block_size. When generating, the transformer can use at most block_size characters of context; beyond that we have to start truncating, because it will never receive more than block_size inputs when predicting the next character.

In [None]:
block_size= 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
x= train_data[:block_size]
y= train_data[1:block_size+1]

for t in range(block_size):
    context= x[:t+1]
    target= y[t]
    print(f'When input is {context} the target: {target}')

When input is tensor([18]) the target: 47
When input is tensor([18, 47]) the target: 56
When input is tensor([18, 47, 56]) the target: 57
When input is tensor([18, 47, 56, 57]) the target: 58
When input is tensor([18, 47, 56, 57, 58]) the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


We feed transformer with many batches of multiple chunks of text that are all like stacked up in a single tensor for efficiency and keep GPUs busy because they are very good at parallel processing of data. The chunks are processed independently, they don't talk to each other.

In [None]:
torch.manual_seed(1337)
batch_size= 4 # how many independent sequences will we process in parallel?
block_size= 8 # what is the maximum context length for predictions?

def get_batch(split):
    """
    Sample a random mini-batch of input/target windows.

    `split` equal to 'train' draws from `train_data`; any other value draws from
    `val_data`. Uses the module-level `batch_size`, `block_size` and `device`.

    Returns a pair (x, y) of tensors stacked from `batch_size` windows of
    `block_size` tokens each, both moved to `device`; y is x shifted by one position.
    """
    source= train_data if split== 'train' else val_data
    # one random start offset per sequence in the batch
    starts= torch.randint(len(source) - block_size, (batch_size,))
    inputs= torch.stack([source[s:s+block_size] for s in starts])
    # targets are the same windows moved one token to the right
    targets= torch.stack([source[s+1:s+block_size+1] for s in starts])

    return inputs.to(device), targets.to(device)


xb, yb= get_batch('train')
print('Inputs')
print(xb.shape)
print(xb)
print('Target')
print(yb.shape)
print(yb)
print('------')

for b in range(batch_size):     # batch dimension
    for t in range(block_size): # time dimension
        context= xb[b, :t+1]
        target= yb[b, t]
        print(f'When input is {context.tolist()} the target is: {target}')

Inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
Target
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
------
When input is [24] the target is: 43
When input is [24, 43] the target is: 58
When input is [24, 43, 58] the target is: 5
When input is [24, 43, 58, 5] the target is: 57
When input is [24, 43, 58, 5, 57] the target is: 1
When input is [24, 43, 58, 5, 57, 1] the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
When input is [44] the target is: 53
When input is [44, 53] the target is: 56
When input is [44, 53, 56] the target is: 1
When input is [44, 53, 56, 1] the target is: 58
W

In [None]:
# our input to the transformer
xb

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')

# Building the Transformer

We implement a decoder-only transformer here. There is no encoder component and no cross-attention block (the in-between piece connecting decoder and encoder). The reason is that we are generating text according to a dataset, and the generation is not conditioned on anything.

A decoder block uses the triangular mask to mask out the attention and it can be used for language modeling.

The transformer paper (Attention is All You Need) presents an encoder-decoder transformer used for text translation. Unlike our model, an encoder-decoder transformer conditions the generation on some additional information: the original sentence that should be translated. The encoder reads the sentence to translate and creates tokens from it without a triangular mask, so all the tokens are allowed to talk to each other as much as they want. Once encoded, the output is connected to a cross-attention block. The queries are still generated from x, but now the keys and values come from the encoder.

In [None]:
torch.manual_seed(1337)

# ----- hyperparameters setting -----
# note: batch_size and block_size replace the small demo values set in an earlier cell
batch_size= 64 # how many independent sequences will we process in parallel?
block_size= 256 # what is the maximum context length for predictions?
max_iters= 5000 # number of training steps passed to the training loop
eval_interval= 500 # the training loop reports train/val loss every this many steps
learning_rate= 3e-4 # learning rate passed to the optimizer in the training loop
eval_iters= 200 # number of batches averaged per split when estimating the loss
n_embed= 384 # embedding dimension
n_heads= 6 # number of attention heads per decoder block
n_layers= 6 # number of decoder blocks
dropout= 0.1 # probability used by every nn.Dropout layer

In [None]:
class Head(nn.Module):
    """
    One head of scaled, causally masked self-attention.

    Reads the module-level `n_embed`, `block_size` and `dropout` settings.
    `forward` maps x of shape (B, T, n_embed) to a tensor of shape (B, T, head_size).
    """

    def __init__(self, head_size) -> None:
        super(Head, self).__init__()
        self.key  = nn.Linear(n_embed, head_size, bias=False)
        self.query= nn.Linear(n_embed, head_size, bias=False)
        self.value= nn.Linear(n_embed, head_size, bias=False)
        # tril is not a parameter of the module; so, we store it in a PyTorch register_buffer
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout= nn.Dropout(p=dropout)


    def forward(self, x):
        _, T, _= x.shape
        k= self.key(x)   # (B, T, hs) -- what do I contain?
        q= self.query(x) # (B, T, hs) -- what am I looking for?
        v= self.value(x) # (B, T, hs) -- private information to this token, what it will communicate
        # compute attention scores ("affinities") -- if the key and query are sort of aligned
        # they will interact to a very high amount and then we will get to learn more about that
        # specific token as opposed to any other token in the sequence.
        # The scale is 1/sqrt(head_size), i.e. the key/query width, NOT the embedding width of x:
        # that is what keeps the scores at unit variance before the softmax.
        wei= q @ k.transpose(-2,-1) * k.shape[-1]**(-0.5) # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # causal mask: a position may only attend to itself and earlier positions
        wei= wei.masked_fill(self.tril[:T, :T]== 0, float('-inf')) # (B, T, T)
        wei= F.softmax(wei, dim=-1) # (B, T, T)
        wei= self.dropout(wei)
        # perform the weighted aggregation of the values
        attn= wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)

        return attn


Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries (i.e., from x). In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module). So, cross-attention is used when there is a separate source of node we'd like to pull information from into our nodes, and in self-attention we just have nodes that would like to look at each other and talk to each other.
- "Scaled" attention additionally divides `wei` by $\sqrt{head\_size}$, which is the same as multiplying by $head\_size^{-\frac{1}{2}}$. This makes it so that when the inputs Q, K have unit variance, `wei` will have unit variance too, and Softmax will stay diffuse and not saturate too much.

In [None]:
class MultiHeadAttention(nn.Module):
    """
    Multiple heads of self-attention in parallel.

    Builds `n_heads` independent `Head(head_size)` modules, concatenates their outputs
    along the channel dimension and projects the result back to `n_embed` channels.
    Reads the module-level `n_embed` and `dropout` settings.
    """

    def __init__(self, n_heads, head_size) -> None:
        super(MultiHeadAttention, self).__init__()
        self.heads= nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        # the concatenated heads are n_heads * head_size wide; this equals n_embed only
        # when n_embed is divisible by n_heads, so size the projection from the real width
        self.proj = nn.Linear(n_heads * head_size, n_embed)
        self.dropout= nn.Dropout(p=dropout)


    def forward(self, x):
        # we run all heads in parallel into a list and simply concatenate all of the
        # outputs. we are concatenating over the channel dimension
        out= torch.cat([h(x) for h in self.heads], dim=-1)
        # project back to the embedding width, then dropout before the result is returned
        out= self.dropout(self.proj(out))

        return out


It helps to have multiple communication channels because, obviously, the tokens have a lot to talk about. They want to find the consonants, the vowels, they want to find the vowels just from certain positions, they want to find any kinds of different things. So, multiple parallel attention heads help to create multiple independent channels of communications, gather lots of different types of data, and then decode the output.

Self-attention is the communication; once the tokens have gathered all the data, they need to think about that data individually. That is what the feed-forward network does (computation), and that is why we add it right after the attention heads.

Dropout is something that we can add right before the connection back into the residual pathway. Using dropout is kind of training an ensemble of subnetworks and then, at test time, everything is fully enabled and all the sub networks are merged into a single ensemble.

In [None]:
class FeedForward(nn.Module):
    """
    Two-layer MLP: expand to 4 * n_embed, apply ReLU, project back to n_embed,
    then dropout (probability taken from the module-level `dropout`).
    """

    def __init__(self, n_embed) -> None:
        super().__init__()
        hidden_dim= 4 * n_embed
        layers= [
            nn.Linear(n_embed, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embed),
            nn.Dropout(p=dropout),
        ]
        self.ff_net= nn.Sequential(*layers)


    def forward(self, x):
        return self.ff_net(x)


# The Decoder Block

Communication and computation block. The communication is done in multi-headed self-attention and then the computation is done using a feed forward network on all the tokens independently.

In [None]:
class DecoderBlock(nn.Module):
    """
    Transformer decoder block: multi-head self-attention (communication) followed by
    a feed-forward network (computation), each applied to a layer-normalised input
    and added back onto the residual stream.
    """

    def __init__(self, n_embed, n_heads) -> None:
        # n_embed: embedding dimension, n_heads: the number of heads we'd like
        super().__init__()
        # the embedding width is divided (integer division) between the heads
        self.sa_heads= MultiHeadAttention(n_heads, n_embed// n_heads)
        self.ln1= nn.LayerNorm(n_embed)
        self.ffwd= FeedForward(n_embed)
        self.ln2= nn.LayerNorm(n_embed)


    def forward(self, x):
        # residual branch 1: normalise, communicate, add back
        attended= self.sa_heads(self.ln1(x))
        x= x + attended
        # residual branch 2: normalise, compute, add back
        computed= self.ffwd(self.ln2(x))

        return x + computed


# GPT Decoder Model

https://www.youtube.com/watch?v=zduSFxRajkE see 01:43:27

In [None]:
class BigramLanguageModel(nn.Module):
    """
    Decoder-only transformer language model.

    Despite the class name this is not a plain bigram lookup: token and position
    embeddings are summed, passed through `n_layers` DecoderBlocks and a final
    LayerNorm, and projected to `vocab_size` logits. Reads the module-level
    `vocab_size`, `n_embed`, `block_size`, `n_heads` and `n_layers` settings.
    """

    def __init__(self) -> None:
        super(BigramLanguageModel, self).__init__()
        # learned embedding per token id and per position in the context window
        self.token_embedding_table= nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table= nn.Embedding(block_size, n_embed)
        self.d_blocks= nn.Sequential(*[
            DecoderBlock(n_embed, n_heads) for _ in range(n_layers)
        ])
        self.ln_f= nn.LayerNorm(n_embed) # final layer norm
        self.lm_head= nn.Linear(n_embed, vocab_size)


    def forward(self, idx, targets=None):
        """
        Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        idx and targets are both (B, T) tensors of integers, with T <= block_size.
        Returns (logits, loss): without targets, logits is (B, T, vocab_size) and loss is
        None; with targets, logits is flattened to (B*T, vocab_size) and loss is a scalar.
        """
        B, T= idx.shape
        tok_emb= self.token_embedding_table(idx) # (B, T, C)
        # build the positions on the same device as the input instead of relying on the
        # global `device`, so the model also works after being moved elsewhere
        pos_emb= self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x= tok_emb + pos_emb # (B, T, C)
        x= self.d_blocks(x) # apply n_layers blocks of self-attention (B, T, C)
        x= self.ln_f(x) # (B, T, C)
        logits= self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss= None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets
            B, T, C= logits.shape
            logits= logits.view(B*T, C)
            targets= targets.view(B*T)
            loss= F.cross_entropy(logits, targets)

        return logits, loss


    def generate(self, idx, max_new_tokens):
        """
        Autoregressively sample `max_new_tokens` tokens after the context `idx` (B, T).

        Sampling runs in eval mode (dropout disabled) and without gradient tracking;
        the module's previous train/eval mode is restored afterwards.
        Returns the extended (B, T + max_new_tokens) tensor of indices.
        """
        was_training= self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # crop idx to the last block_size tokens
                    idx_cond= idx[:, -block_size:]
                    # get the predictions
                    logits, _= self(idx_cond)
                    # focus only on the last time step
                    logits= logits[:, -1, :] # becomes (B, C)
                    # apply softmax to get probabilities
                    probs= F.softmax(logits, dim=-1) # (B, C)
                    # sample from the distribution
                    idx_next= torch.multinomial(probs, num_samples=1) # (B, 1)
                    # append sampled index to the running sequence
                    idx= torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)

        return idx


In [None]:
# instantiate the model and move it to the selected device
m= BigramLanguageModel()
model= m.to(device)
# xb, yb are the batch sampled by the earlier get_batch('train') demo cell;
# with targets supplied, the returned logits are flattened to (B*T, vocab_size)
logits, loss= model(xb, yb)
print(logits.shape)
print(loss)

# sample 500 tokens from the untrained model, starting from a single token with id 0
context= torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

torch.Size([32, 65])
tensor(4.3802, device='cuda:0', grad_fn=<NllLossBackward0>)

Z&LUdQ:bIxuKyY3GGltHrgFSFMXfguc-aBcgteDOKVoSPdLDiScOxQkVIE?WBM?r,nQA;IPD bQSfWLqBnQZ-w!xZMQ:
$ZSv!MiDPMIDKoRok&rFpp,RKndLtiQscSKrgFc'LvTs'guooNT.Lx'RcQ!v tDVulf-Qsymd?qJj&ZrwKoQGvPVOJ Si;cpj!tVriK'AdlJg:XpqSbpkuswAfS,AJ&biflODzRfuS
ik??ucaWU;HDrMs!BsAOenjKDBF$uBvnperULxMon!cVnDFNE&IPTQ&KnHSKixFvPn$VsCJkCrGH,XSpj eXRWgrNcxk $OASQKnSf:B'sdD!hnH?qpPTnXnrn:H!H!ZRRLDB.nH!VTZdLUJ!qF$DrHecqjr,K;iTFvFk3SkmnzezqJVsKxVTDf.k3:HTpLKODOghtD.Dj,EKgU3TTBEQIIYoF?tnqBNDenqBrHp?XCfm!HTmXv,ACjgzd$S3I3yLxTwDpFvnHx-


In [None]:
total_params= sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of parameters: {total_params}')

Number of parameters: 10788929


# Pre-Training and Text Generation

In [None]:
@torch.no_grad()
def estimate_loss(model):
    """
    Estimate the mean loss of `model` on the 'train' and 'val' splits.

    For each split, `eval_iters` batches are drawn with `get_batch` and their losses
    averaged. The model is evaluated in eval mode; the train/eval mode it had on entry
    is restored on exit (even if evaluation raises), instead of always forcing train mode.

    Returns a dict {'train': mean_loss, 'val': mean_loss} of 0-dim tensors.
    """
    out= {}
    was_training= model.training
    model.eval()

    try:
        for split in ['train', 'val']:
            losses= torch.zeros(eval_iters)

            for k in range(eval_iters):
                X, Y= get_batch(split)
                _, loss= model(X, Y)
                losses[k]= loss.item()

            out[split]= losses.mean()
    finally:
        model.train(was_training)

    return out


In [None]:
def self_supervised_train(model, learning_rate=1e-3, max_iters=500, eval_interval=50):
    """
    Train `model` on next-token prediction with AdamW.

    Args:
        model: module called as model(xb, yb), returning (logits, loss).
        learning_rate: learning rate passed to AdamW.
        max_iters: number of optimizer steps.
        eval_interval: train/val losses are estimated and printed every this many
            steps, and additionally on the last step so the returned value is current.

    Returns:
        The dict produced by the most recent `estimate_loss` call, or an empty dict
        when `max_iters` is 0 (previously this case raised UnboundLocalError).
    """
    # create a PyTorch optimizer
    optimizer= torch.optim.AdamW(model.parameters(), lr=learning_rate)
    losses= {}

    # make sure dropout is active while training, whatever mode the caller left the model in
    model.train()

    for step in range(max_iters):

        # every once in a while (and on the final step) evaluate the loss on train and val sets
        if step % eval_interval== 0 or step== max_iters - 1:
            losses= estimate_loss(model)
            print(f"Steps {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb= get_batch('train')

        # evaluate the loss and take one optimizer step
        _, loss= model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    return losses


losses= self_supervised_train(model, learning_rate, max_iters, eval_interval)

# generate from the model
context= torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

Steps 0: train loss 4.2849, val loss 4.2823
Steps 500: train loss 1.9696, val loss 2.0683
Steps 1000: train loss 1.5614, val loss 1.7466
Steps 1500: train loss 1.3975, val loss 1.6065
Steps 2000: train loss 1.3029, val loss 1.5459
Steps 2500: train loss 1.2343, val loss 1.5188
Steps 3000: train loss 1.1791, val loss 1.5043
Steps 3500: train loss 1.1255, val loss 1.4962
Steps 4000: train loss 1.0793, val loss 1.5091
Steps 4500: train loss 1.0253, val loss 1.5137

And, prepare into reply a famous flesh
With worth your house to his pleasure?

THOMAS OF SAUMERS:
Could I tear, my lord, then are put t us both.

BUCKINGHAM:
He was not upon his truth, now boy, the day.

QUEEN MARGARET:
God forbear! whise glad I stood the prompt, my persecute,
Will be clostemal proopous by war.
I'll to fall out our headful tend body blood,
Which make me winnch and way still make doop:
The fond more honour lives grief death,
Is ere to deputy his day night's face.

FRIAR LAURENCE:


We did not implement any of the fine-tuning stages that typically go on top of a pre-trained model. If we are interested in something that is not just language modeling, but actually want the model to perform tasks, be aligned in a specific way, or detect sentiment or anything like that (basically, any time we want something more than a document completer), we have to implement further stages of fine-tuning, which can be supervised fine-tuning or something fancier such as training a reward model.

# The mathematical trick in self-attention

In [None]:
# consider the following toy example:
torch.manual_seed(1337)
B, T, C= 4,8,2 # batch, time, channels
x= torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# version 1: We want x[b,t] = mean_{i<=t} x[b,i]
xbow= torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev= x[b,:t+1] # (t,C)
        xbow[b,t]= torch.mean(xprev, 0)

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a= torch.tril(torch.ones(3, 3))
a= a / torch.sum(a, 1, keepdim=True)
b= torch.randint(0,10,(3,2)).float()
c= a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei= torch.tril(torch.ones(T, T))
wei= wei / wei.sum(1, keepdim=True) # normalise each row so that it sums to 1
xbow2= wei @ x # (T, T), broadcast over the batch, @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [None]:
# version 3: use Softmax
tril= torch.tril(torch.ones(T, T))
wei= torch.zeros((T,T))
wei= wei.masked_fill(tril == 0, float('-inf'))
wei= F.softmax(wei, dim=-1)
xbow3= wei @ x
torch.allclose(xbow, xbow3)

True

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C= 4,8,32 # batch, time, channels
x= torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size= 16
key= nn.Linear(C, head_size, bias=False)
query= nn.Linear(C, head_size, bias=False)
value= nn.Linear(C, head_size, bias=False)
k= key(x)   # (B, T, 16)
q= query(x) # (B, T, 16)
wei=  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril= torch.tril(torch.ones(T, T))
wei= wei.masked_fill(tril == 0, float('-inf'))
wei= F.softmax(wei, dim=-1)

v= value(x)
out= wei @ v

out.shape

torch.Size([4, 8, 16])

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [None]:
k= torch.randn(B,T,head_size)
q= torch.randn(B,T,head_size)
wei= q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var()

tensor(1.0449)

In [None]:
q.var()

tensor(1.0700)

In [None]:
wei.var()

tensor(1.0918)

In [None]:
# https://www.youtube.com/watch?v=kCc8FmEb1nY&t=5729s