In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# set random seed
torch.manual_seed(42)

# set device - GPU or CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
%%script false --no-raise-error

# Download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
with open('patent.txt', 'r', encoding='utf-8') as f:
    text=f.read()

print("dataset length (number of characters): ", len(text))

dataset length (number of characters):  5323567


In [4]:
print(text[100:500])

ll, a pond water receptacle integrated with said tank side wall at the exterior of the tank, the receptacle opening upwardly, there being a water passage through the tank side wall, whereby pond water in the receptacle has a gravity determined top level at approximately the same level as water in the tank, and a removable cover extending over water in the receptacle, protecting against contaminate


In [5]:
# get all unique characters on the dataset
chars = sorted(list(set(text)))
# number of unique characters in the dataset
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 "#$%&'()+,-./0123456789:;<>ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz®°±·¼½¾×é˜Δαθκμ–—“”′″™⅛−≧
109


In [6]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("pagode"))
print(decode(encode("Alcione")))

[73, 58, 64, 72, 61, 62]
Alcione


In [7]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[100:500]) # the 1000 characters we looked at earlier will to the GPT look like this

torch.Size([5323567]) torch.int64
tensor([69, 69, 11,  1, 58,  1, 73, 72, 71, 61,  1, 80, 58, 77, 62, 75,  1, 75,
        62, 60, 62, 73, 77, 58, 60, 69, 62,  1, 66, 71, 77, 62, 64, 75, 58, 77,
        62, 61,  1, 80, 66, 77, 65,  1, 76, 58, 66, 61,  1, 77, 58, 71, 68,  1,
        76, 66, 61, 62,  1, 80, 58, 69, 69,  1, 58, 77,  1, 77, 65, 62,  1, 62,
        81, 77, 62, 75, 66, 72, 75,  1, 72, 63,  1, 77, 65, 62,  1, 77, 58, 71,
        68, 11,  1, 77, 65, 62,  1, 75, 62, 60, 62, 73, 77, 58, 60, 69, 62,  1,
        72, 73, 62, 71, 66, 71, 64,  1, 78, 73, 80, 58, 75, 61, 69, 82, 11,  1,
        77, 65, 62, 75, 62,  1, 59, 62, 66, 71, 64,  1, 58,  1, 80, 58, 77, 62,
        75,  1, 73, 58, 76, 76, 58, 64, 62,  1, 77, 65, 75, 72, 78, 64, 65,  1,
        77, 65, 62,  1, 77, 58, 71, 68,  1, 76, 66, 61, 62,  1, 80, 58, 69, 69,
        11,  1, 80, 65, 62, 75, 62, 59, 82,  1, 73, 72, 71, 61,  1, 80, 58, 77,
        62, 75,  1, 66, 71,  1, 77, 65, 62,  1, 75, 62, 60, 62, 73, 77, 58, 60,
      

In [8]:
# split up the data into train and validation sets
train_size = 0.8

n = int(train_size*len(data)) 
train_data = data[:n]
val_data = data[n:]

In [9]:
# train the transformer with randomly sampled chunks of the dataset
block_size = 8 #size of the data chunk (in the case a group of letters)
# the block_size defines the amount of sequencial training examples in a chunk, so for a block_size
# of 8, 9 letters are needed because the last one isn't trained on
train_data[:block_size+1]

tensor([29, 78, 81, 66, 69, 66, 58, 75, 82])

In [10]:
# print the input and label of each training step of a block
x = train_data[:block_size] # block_size characters
y = train_data[1:block_size+1] # label characters - x offset by one because the first letter is never a label and the last letter is not an input

print('8 examples hidden in the chunk of 9 characters:')

# iterate throug the block
for t in range(block_size):
    context = x[:t+1] #all characters so far
    target = y[t] #the corresponding label in labels array
    print(f"when input is {context} the target: {target}")

8 examples hidden in the chunk of 9 characters:
when input is tensor([29]) the target: 78
when input is tensor([29, 78]) the target: 81
when input is tensor([29, 78, 81]) the target: 66
when input is tensor([29, 78, 81, 66]) the target: 69
when input is tensor([29, 78, 81, 66, 69]) the target: 66
when input is tensor([29, 78, 81, 66, 69, 66]) the target: 58
when input is tensor([29, 78, 81, 66, 69, 66, 58]) the target: 75
when input is tensor([29, 78, 81, 66, 69, 66, 58, 75]) the target: 82


In [11]:
# the encoder can never receive a input bigger than block_size, so if the input is bigger 
# it has to be split in block_size sized chunks

In [12]:
# get a batch of samples from the dataset
batch_size = 4 # how many independent sequences will we process in parallel in each forward/backward pass
block_size = 8 # what is the maximum context length for predictions

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    # selects between train anv validation data
    data = train_data if split == 'train' else val_data
    # generate a batch_size sized array of random offsets where the sequence will begin containing block_size + 1 characters
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # generate the batch input data - each 1 dimensional chunk is tacked in a 4(batch_size) x 8(block_size) matrix
    x = torch.stack([data[i:i+block_size] for i in ix])
    # label data
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('-----------')
print('targets:')
print(yb.shape)
print(yb)
print('-------------------------------')
for b in range(batch_size): # batch dimension
    print('block: ', xb[b])
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[73, 69, 82,  1, 69, 66, 71, 62],
        [77, 58, 68, 62, 76,  1, 73, 69],
        [62, 75,  1, 67, 62, 77, 11,  1],
        [71, 64,  1, 58, 71,  1, 66, 71]])
-----------
targets:
torch.Size([4, 8])
tensor([[69, 82,  1, 69, 66, 71, 62,  1],
        [58, 68, 62, 76,  1, 73, 69, 58],
        [75,  1, 67, 62, 77, 11,  1, 60],
        [64,  1, 58, 71,  1, 66, 71, 61]])
-------------------------------
block:  tensor([73, 69, 82,  1, 69, 66, 71, 62])
when input is [73] the target: 69
when input is [73, 69] the target: 82
when input is [73, 69, 82] the target: 1
when input is [73, 69, 82, 1] the target: 69
when input is [73, 69, 82, 1, 69] the target: 66
when input is [73, 69, 82, 1, 69, 66] the target: 71
when input is [73, 69, 82, 1, 69, 66, 71] the target: 62
when input is [73, 69, 82, 1, 69, 66, 71, 62] the target: 1
block:  tensor([77, 58, 68, 62, 76,  1, 73, 69])
when input is [77] the target: 58
when input is [77, 58] the target: 68
when input is [7

In [13]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        #An Embedding layer is essentially just a Linear layer. So you could define a your 
        # layer as nn.Linear(1000, 30), and represent each word as a one-hot vector, 
        # e.g., [0,0,1,0,...,0] (the length of the vector is 1,000).
        # As you can see, any word is a unique vector of size 1,000 with a 1 in a unique 
        # position, compared to all other words. Now giving such a vector v with v[2]=1 (cf. example vector above) 
        # to the Linear layer gives you simply the 2nd row of that layer.
        # nn.Embedding just simplifies this. Instead of giving it a big one-hot vector, you just give it an index. 

        # This index basically is the same as the position of the single 1 in the one-hot vector.        
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (batch_size, block_size) tensor of integers
        logits = self.token_embedding_table(idx) # (batch_size(B), block_size(T), vocab_size(C))
        
        # compute loss - predicting(no targets) or training(with targets)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# generate text before training
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 109])
tensor(4.9346, grad_fn=<NllLossBackward0>)


2l7%Fa:L>nWc″R5>3XGg<(¾'ID'GX−B25−h”2zSA9oLt⅛Q%”−“2f½KXq4v·W—S⅛a9t%v′/gw≧`q–a8FRJjH'
$H]—rd,33XμSα$


In [14]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
# training the model
batch_size = 32
training_steps = 100

for steps in range(training_steps):
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear the gradients from previous step
    optimizer.zero_grad(set_to_none=True)
    # propagate the error backwards to get the gradients
    loss.backward()
    # apply the weight changing
    optimizer.step()

print(loss.item())


4.86951208114624


In [16]:
# generate text after training
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


ΔdtMXUF4L .c′N7fμ″u≧c8l7”×%"GXG0¾J¾™nGG.6]bΔZ−z-bFgeκ¾g'¾κθ2`½0(a:V™″e6v9a$I50Q”® -F&OL>v±fα%μcgq′fd′%pXθhé±NU“nV·>J<.Kyt“Z&“0−H3(61éκXJj>“e−κj#0–QdDiyT)ud6lI[w˜T⅛tP,Dl
MEYHgα4iLL:z's9[7™˜M™)Δj]ur/·”
3κz+ΔXYHL,1t&,Q˜3K5MqUYAebxd/1DBWα."rJTtαF×méw7)sv“n–J®9ce7]±]′`··dZ
JT”5RA)44—'Δ;X%Z%Rk55P-E±⅛Iθ®#“J½z:H69yt/4]pI×″A$μF-(re9ZaYmPa—O`.·t;6Δ)nL'+<e™PxDJaIG5P"OI—O#“qcIU"ylΔ;(±0bHq–4zF,°¼αoIp$Uktl`é™9·4ivWL—9E>“#93qθk×én)P“g'ee9≧#%¼j`C:hDkd;wl%"6zTθl–F®i3fC'MU4v9—rT˜”Hn[®v7-A®9j6]°3>J,w'θ™M"–′vJ&&/U4


In [17]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
a = torch.tril(torch.ones(3, 3)) # lower triangular matrix of ones
a = a / torch.sum(a, 1, keepdim=True) # normalize - each row hve all equal elements in the triangular lower portion that sums one
b = torch.randint(0,10,(3,2)).float()
c = a @ b # each row is the mean of the original rows up to that row (including it)
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[7., 8.],
        [6., 0.],
        [9., 3.]])
--
c=
tensor([[7.0000, 8.0000],
        [6.5000, 4.0000],
        [7.3333, 3.6667]])


In [18]:
# consider the following toy example:
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# We want x[b,t] = mean_{i<=t}(x[b,i]) - each batch element to be the mean of the batch element up to that point - avoid to 'see the future'
xbow = torch.zeros((B,T,C))
# iterate the batches
for b in range(B):
    # iterate the blocks
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)


In [20]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [21]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)


True

In [22]:
# version 4: self-attention
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16

# FC layers
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # fill the triangular upper portion with -inf value
wei = F.softmax(wei, dim=-1) # apply softmax so the -inf will turn to 0 and the rows will sum to 1

v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

* Query: The query is a representation of the current word used to score against all the other words (using their keys). We only care about the query of the token we’re currently processing.
* Key: Key vectors are like labels for all the words in the segment. They’re what we match against in our search for relevant words.
* Value: Value vectors are actual word representations, once we’ve scored how relevant each word is, these are the values we add up to represent the current word.

In [25]:
print(wei.shape)
wei[0]

torch.Size([4, 8, 8])


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8091, 0.1909, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0940, 0.5435, 0.3625, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3758, 0.0312, 0.0457, 0.5472, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1813, 0.0628, 0.6858, 0.0282, 0.0418, 0.0000, 0.0000, 0.0000],
        [0.0768, 0.4553, 0.0526, 0.0483, 0.1728, 0.1942, 0.0000, 0.0000],
        [0.0186, 0.1602, 0.0035, 0.0762, 0.3133, 0.0220, 0.4062, 0.0000],
        [0.3171, 0.0785, 0.1855, 0.0416, 0.0139, 0.0282, 0.2623, 0.0729]],
       grad_fn=<SelectBackward0>)

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [26]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [27]:
print(k.var())
print(q.var())
print(wei.var())
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1))
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1))

tensor(0.9070)
tensor(1.0490)
tensor(0.9349)
tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


In [28]:
# Layer normalization - normalize each block features instead of each feature across the batch
class LayerNorm1d:
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [29]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs - mean != 0 and std != 1 - not normalized

(tensor(0.1469), tensor(0.8803))

In [30]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features - mean = 0 and std = 1 - normalized

(tensor(-9.5367e-09), tensor(1.0000))

In [31]:
# data loading - same as above
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# estimate the model loss as the mean of loss in eval_iters batches of data
@torch.no_grad()
def estimate_loss(model, eval_iters):
    out = {}
    model.eval() # model enter evaluation mode - no training
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # model back to train mode
    return out

In [32]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel
block_size = 64 # what is the maximum context length for predictions
max_iters = 1000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 2
n_layer = 3
dropout = 0.05
# ------------

# Individual Head of self-attention
class Head(nn.Module):
    """ one head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        # key, query and value layers
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # tril is not an atribute of nn.Module so it must be passed to a buffer in order to be used in GPU
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        # dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities") - C**-0.5 is for normalization
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

# Layer with multiple self-attention Heads
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # this linear layer is used to 'merge' the multiple heads acquired knowledge
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # concatenate the heads outputs in the C dimension
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # apply the projection and the dropout
        out = self.dropout(self.proj(out))
        return out

# Simple feed forward network to apply a linear computation over the output embeddings
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

# Block containing a multi head attention module and a feed forward linear computation
class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension
        # n_head: the number of heads in each multi head
        # (n_emb % n_head) must be 0
        super().__init__()
        head_size = n_embd // n_head # each head gets a portion of the embeddings so different relations can be learned
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # Multi head attention with layer norm
        x = x + self.sa(self.ln1(x))
        # feed forward with layer norm
        x = x + self.ffwd(self.ln2(x))
        return x

# simple bigram model
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        # get embeddings and compute the x by summing the embeddings
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        # make the input through the blocks
        x = self.blocks(x) # (B,T,C)
        # final normalization
        x = self.ln_f(x) # (B,T,C)
        # genearte vocab_size output representing one character
        logits = self.lm_head(x) # (B,T,vocab_size)

        # test or training
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# train the model
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model, eval_iters)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.167661 M parameters
step 0: train loss 4.7974, val loss 4.7950
step 100: train loss 2.4693, val loss 2.4730
step 200: train loss 2.3603, val loss 2.3608
step 300: train loss 2.2772, val loss 2.2838
step 400: train loss 2.1617, val loss 2.1643
step 500: train loss 2.0387, val loss 2.0459
step 600: train loss 1.9261, val loss 1.9323
step 700: train loss 1.8384, val loss 1.8466
step 800: train loss 1.7652, val loss 1.7716
step 900: train loss 1.7074, val loss 1.7102
step 999: train loss 1.6477, val loss 1.6513


In [33]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Exipenterior pentreddinater of gated artically toilels pering a for top thactiss of me flumpensoomerfs-semp
A prater ber connuted of hasigned bolly atwas wito theiculetse. A fiont of the combune. The it for ald toilet. The beposessited with the pabse witwer dis a revbow the he fitium. The t¼ol end it recaed otte santed by an aresertine. , in toilets. A the 2) an ack and ped end 5 a a the bally mecontort
Cof whels to the seland unfor whittee is flet detly exber an prod tecting she warr inclugh an usuid waper atper meyst has. Anwall luayed bowl theapersfent (12Hectl. Pening a formoner and and prodnedd spoxtentrectatunale., arge devat is ansies the fluashle valvely. Wither section ande sproldides a bowl.
A lelish and 968) is and bigusteds inculed ded end a paracedjy groce opers valve to abuttom and enoged byelltacced of the toilbe water messeats anver attong mattorally (n ch freaniquitime ressembide the wat indistuar of the na diond rol appan ad add h20) thre islong a of an a the toilet 

Apply residual sum in the layers to avoid gradient disappearing -> sum the input to the layers output before normalization - creates a 'shorter path' between input and output, reducing the disappearing gradient problem (specially in the beginning) - the sum operation propagates the gradients equally in the two 'branches' (the direct path - x sum, and the processed path - actual layer operations)

In [34]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 200
learning_rate = 6e-4
eval_iters = 200
n_embd = 256
n_head = 8
n_layer = 4
dropout = 0.2
# ------------

# practically the same
class Head(nn.Module):
    """ one head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

# same
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

# same
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

# add residual input sum after the attention and feed forward layers
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension
        # n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # layer norm than residual sum
        x = x + self.sa(self.ln1(x))
        # layer norm than residual sum
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    # init weights with low values so the residual path can be learned first
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model, eval_iters)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

3.245165 M parameters
step 0: train loss 4.7762, val loss 4.7746
step 200: train loss 2.0860, val loss 2.0928
step 400: train loss 1.3782, val loss 1.3720
step 600: train loss 1.2176, val loss 1.2216
step 800: train loss 1.1375, val loss 1.1458
step 1000: train loss 1.0951, val loss 1.1040
step 1200: train loss 1.0594, val loss 1.0670
step 1400: train loss 1.0302, val loss 1.0472
step 1600: train loss 1.0122, val loss 1.0231
step 1800: train loss 0.9931, val loss 1.0105
step 2000: train loss 0.9777, val loss 0.9999
step 2200: train loss 0.9718, val loss 0.9941
step 2400: train loss 0.9554, val loss 0.9791
step 2600: train loss 0.9486, val loss 0.9749
step 2800: train loss 0.9398, val loss 0.9654
step 3000: train loss 0.9316, val loss 0.9611
step 3200: train loss 0.9228, val loss 0.9530
step 3400: train loss 0.9185, val loss 0.9471
step 3600: train loss 0.9091, val loss 0.9425
step 3800: train loss 0.9006, val loss 0.9301
step 4000: train loss 0.8993, val loss 0.9321
step 4200: train lo

In [37]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
open('out_pt.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


A toilet tank valve assembly and a control mechanism for a lifting tank the toilet and the at lexistent valve assembly action to cover the bowl pipe. The bottom near the diaper valve member between the flushing part. The module fual flushing extends through the antify parallelation and support members aratus are movably slo rotected in, the use of this flush hand back is keywards. The bolt and by bacterial recessed delay features that interseccch to resty the vent preof useful at a tracket to th


10001