In [1]:
#----- imports --------
import tqdm
import torch
import wandb
import os
import tokenizers


device= 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)
assert device == 'cuda', "This notebook is not optimized for CPU"

config = {
    "learning_rate": 1e-3,
    "eval_interval": 300,
    "max_iters": 3000, 
    "H": 16,
    "B": 64,
    "T": 16,
    "C": 256,
    "feedforward_factor": 3,
    "n_heads": 6,
    "dropout": 0.4,
    "l2_penalty": 0.0,
    "n_layers": 6,
    "tokenizer_vocab_size": 2048,
    "git_hash": os.popen("git rev-parse HEAD").read().strip()
}

# initial
for k,v in config.items():
    locals ()[k] = v


# wandb.init(
#     project = "mini-shakespeare",
#     config = config
# )

In [2]:

with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
print("length of dataset in lines: ", len(text.split('\n')))

length of dataset in lines:  40001


In [5]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
paths = ['input.txt']
tokenizer = tokenizers.ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=tokenizer_vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model('.', 'shakespeare-bpe')



enc = tokenizer.encode("Romeo Romeo wherefore art thou Romeo?")
tokenizer.decode(enc.ids)


def encode(text):
    return tokenizer.encode(text).ids
def decode(encoded_text):
    return tokenizer.decode(encoded_text)

hello_encoded = encode("hello")
print(hello_encoded)
print(decode(hello_encoded))
vocab_size = tokenizer.get_vocab_size()




[262, 278, 83]
hello


In [7]:
print("length of dataset in characters: ", len(text))
print("length of dataset in tokens: ", len(encode(text)))
chars_per_token = len(text) / len(encode(text))
print("characters per token: ", chars_per_token)

length of dataset in characters:  1115394
length of dataset in tokens:  388693
characters per token:  2.8696014592493304


In [8]:

data = torch.tensor(encode(text), dtype=torch.long)
print(data.dtype)
print(data.size())
print(data.device)


torch.int64
torch.Size([388693])
cuda:0


In [9]:
n = int(0.9*len(data))

train_data = data[:n]
val_data = data[n:]

In [10]:
train_data[:T+1]

tensor([ 676, 1201,   30,  203,  779,  553,  336,  589, 1817,  807, 2008,  719,
          16,  679,  322,  621,   18], device='cuda:0')

In [11]:
x = train_data[:T]
y = train_data[1:T+1]
for t in range(T):
    context = x[:t+1]
    target = y[t]
    # print("when we see the text", context, "we predict the next character is", target)

In [12]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(0, data.size(0) - T, (B,)) # 4 random locations we can sample from
    x = torch.stack([data[i:i+T] for i in ix]) # random sequences
    y = torch.stack([data[i+1:i+T+1] for i in ix]) # next character for each random sequence

    return x, y

xb, yb = get_batch('train')

# for b in range(B):
#     for t in range(T): # for each of the characters in the sample
#         context = xb[b, :t+1]
#         target = yb[b, t]

# batch 1
print("First batch")
print("Input")
print(xb[0])
print("Target")
print(yb[0])



First batch
Input
tensor([  17, 1415,   18,  203,   45,  360,  589,   90,  356,  320,  332,  293,
          30,  965, 1659, 1115], device='cuda:0')
Target
tensor([1415,   18,  203,   45,  360,  589,   90,  356,  320,  332,  293,   30,
         965, 1659, 1115,   16], device='cuda:0')


In [13]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)


class Head(nn.Module):
    '''One Head of self-attention'''
    def __init__(self, H):
        super().__init__()
        self.query = nn.Linear(C, H, bias=False)
        self.key = nn.Linear(C, H, bias=False)
        self.value = nn.Linear(C, H, bias=False)
        # self.output = nn.Linear(H, C, bias=False) # output matrix
        self.register_buffer('tril', torch.tril(torch.ones(T, T)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Query and Key matrices for the attention mechanism
        # x: 8 tokens
        # Q: 16 tall (arbitrary), 32 long channels
        # K: 16 tall (arbitrary), 32 long channels

        query_vectors = self.query(x)
        key_vectors = self.key(x)


        # Attention masking(so we can't look into the past):

        tril = self.tril
        wei = torch.zeros(T, T) 
        wei = wei.masked_fill(tril == 0, float('-inf')) # set the upper triangular to -inf
        # xbow = wei @ x # apply the mask to the input, bag of words because simple avg.

        # multiply the two to get the attention weights
        attention_pattern = query_vectors @ key_vectors.transpose(-2, -1) # T, T
        attention_pattern = attention_pattern / (H ** 0.5) # scale the attention pattern for numerical stability
        attention_weights = F.softmax(attention_pattern + wei, dim=-1) # T, T (the row dimension is the query)
        attention_weights = self.dropout(attention_weights)

        value_vectors = self.value(x) # the direction we should go in the embedding space for each token (ie more blue) T, H

        # apply the attention weights to the value vectors
        context = attention_weights @ value_vectors # T, H

        # project back into original space from value space
        # return self.output(context)
        return context

x = torch.randn(B,T,C)
head = Head(H)
# head(x)

In [14]:
class MultiHeadAttention(nn.Module):
    '''Multiple heads of self-attention'''
    def __init__(self, H, C, n_heads): # H is head embedding space size, n_heads is number of heads
        super().__init__()
        self.heads = nn.ModuleList([Head(H) for _ in range(n_heads)])
        self.combine_heads = nn.Linear(H*n_heads, C)
        self.dropout = nn.Dropout(dropout)


    def forward(self,x):
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.combine_heads(x)  # T, C
        return self.dropout(x)

In [15]:
head = MultiHeadAttention(H, C, n_heads)
head.heads[0].forward(x).shape


torch.Size([64, 16, 16])

In [16]:
class FeedForward(nn.Module):
    '''Feed-forward neural network'''
    def __init__(self, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(C, C * feedforward_factor),
            nn.ReLU(),
            nn.Linear(C * feedforward_factor, C),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [17]:
class LayerNorm(nn.Module):
    '''Layer normalization'''
    def __init__(self, C, use_affine=True):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(C)) if use_affine else None
        self.beta = nn.Parameter(torch.zeros(C)) if use_affine else None

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        if self.gamma is not None and self.beta is not None:
            return self.gamma * (x - mean) / (std + 1e-6) + self.beta
        else:
            return (x - mean) / (std + 1e-6)

In [18]:
class Block(nn.Module):
    '''Transformer block'''
    def __init__(self, H, C, n_heads):
        super().__init__()
        self.attention = MultiHeadAttention(H, C, n_heads)
        self.ff = FeedForward(C)
        self.norm1 = LayerNorm(C, use_affine=True)
        self.norm2 = LayerNorm(C, use_affine=True)

    def forward(self, x):
        x = x + self.attention(self.norm1(x))
        x = x + self.ff(self.norm2(x))
        return x

In [19]:
class GPT(nn.Module):

    def __init__(self, n_layers):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, C) 
        self.position_embedding_table = nn.Embedding(T, C)
        self.lm_head = nn.Linear(C, vocab_size)
        self.layers = nn.ModuleList([Block(H, C, n_heads) for _ in range(n_layers)])
        self.block = nn.ModuleList([Block(H, C, n_heads)])
    
    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx) # batch_dim, sequence_dim, embedding_dim
        pos_emb = self.position_embedding_table(torch.arange(T))
        x = token_emb + pos_emb # token identities and positions contained

        for layer in self.layers:
            x = layer(x)

        logits = self.lm_head(x) # batch_dim, sequence_dim, vocab_size

        batch_dim, sequence_dim, embedding_dim = logits.size()

        # loss = F.cross_entropy(logits, targets) this won't work because we need 1d logits and 1d targets
        # one-hot-vectors are a line in the x-dimension, so the shape of shape of the logits should be (-1, vocab_size).

        if targets is None:
            return logits, None
        else:
            # a list of all the predictions, reguardles of batch.
            # xdim: probabilities of each character in the vocab (embedding_dim=vocab_size)
            # ydim: all predictions for all batches flattened (batch_dim*sequence_dim)
            logits_loss_view = logits.view(-1, vocab_size) 
            # targets loss view
            # xdim: all targets for all batches flattened (batch_dim*sequence_dim)
            # so this would be like, [1,4,5,1,2,3, ...]
            # where each number is the correct next index of the one hot vector
            targets_loss_view = targets.view(-1)
            loss = F.cross_entropy(logits_loss_view, targets_loss_view)
            return logits, loss


    def generate_next_token(self, idx, temperature=1.0):
        # ensure there is only one batch 
        assert idx.size(0) == 1
        # padd the idx tensor to the right with zeros to make it T long
        if idx.size(1) < T:
            padded_idx = F.pad(idx, (0, T - idx.size(1)), value=0)
        else:
            padded_idx = idx
        # logits, loss = self(idx[:,-T:]) # get the logits for the last T tokens
        logits, loss = self(padded_idx[:,-T:]) # get the logits for the last T tokens
        # get the predictions of the last token
        last_token_logits = logits[:, -1, :] / temperature # all batches, last token, all probabilities
        all_token_logits = logits

        # softmax to get probabilities for all tokens



        # softmax to get probabilities
        probabilities = F.softmax(last_token_logits, dim=-1)
        # sample from the probabilities
        next_token = torch.multinomial(probabilities, num_samples=1)
    

model = GPT(n_layers)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)




test_idx = torch.zeros(1, T).long()
model.forward(idx=test_idx)
# decode(model.generate(idx=test_idx, max_new_tokens=100)[0].tolist())

torch.Size([64, 16, 2048])
tensor(8.0612, device='cuda:0', grad_fn=<NllLossBackward0>)


(tensor([[[-2.5261,  0.4618, -0.3474,  ...,  0.1959,  0.5735, -2.1615],
          [-1.7736, -0.1223,  1.3924,  ...,  0.0889, -0.0904, -0.3881],
          [-1.9511, -0.8766,  0.7249,  ...,  0.9835, -0.3902, -0.3982],
          ...,
          [-0.9793, -1.3599,  1.6770,  ...,  0.9981, -1.0446, -1.3555],
          [-2.1104,  0.7004,  1.5802,  ...,  1.1575, -1.4306,  0.5271],
          [-1.4246, -0.3286,  2.2648,  ...,  1.9164,  0.5483,  0.0211]]],
        device='cuda:0', grad_fn=<ViewBackward0>),
 None)

In [20]:
idx = torch.tensor([[1,2,3]])
prediction_index = idx.size(1)-1
idx[:, prediction_index]
# pad the idx tensor to the right with zeros to make it T long
# padded_idx = F.pad(idx, (0, T - idx.size(1)), value=0)
# print(padded_idx)

tensor([3], device='cuda:0')

In [21]:
# logits, loss = self(idx[:,-T:])

idx = torch.zeros(1, 1).long()
idx[:,-T:]

tensor([[0]], device='cuda:0')

In [22]:
model.token_embedding_table.weight.device

device(type='cuda', index=0)

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



In [24]:
eval_iters = 10
eval_interval = 300
@torch.no_grad()
def estimate_loss(is_last=False):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        real_iters = eval_iters
        if is_last and split == 'val':  # increase last eval to mitigate noise
            real_iters *= 10 
        losses = torch.zeros(real_iters)
        for k in range(real_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean() / chars_per_token
    model.train()
    return out
    

In [25]:
# get the number of parameters
n_params = sum(p.numel() for p in model.parameters())
parameter_to_data_ratio = n_params / len(train_data)
print(f"{parameter_to_data_ratio=}")

parameters = []
for name, param in model.named_parameters():
    parameters.append({"name": name, "params": param.numel()})

# sort parameters by size
sorted_parameters = sorted(parameters, key=lambda x: x["params"], reverse=True)
for p in sorted_parameters:
    print(f"{p['name']}: {p['params']}")

parameter_to_data_ratio=12.896487652327034
token_embedding_table.weight: 524288
lm_head.weight: 524288
layers.0.ff.net.0.weight: 196608
layers.0.ff.net.2.weight: 196608
layers.1.ff.net.0.weight: 196608
layers.1.ff.net.2.weight: 196608
layers.2.ff.net.0.weight: 196608
layers.2.ff.net.2.weight: 196608
layers.3.ff.net.0.weight: 196608
layers.3.ff.net.2.weight: 196608
layers.4.ff.net.0.weight: 196608
layers.4.ff.net.2.weight: 196608
layers.5.ff.net.0.weight: 196608
layers.5.ff.net.2.weight: 196608
block.0.ff.net.0.weight: 196608
block.0.ff.net.2.weight: 196608
layers.0.attention.combine_heads.weight: 24576
layers.1.attention.combine_heads.weight: 24576
layers.2.attention.combine_heads.weight: 24576
layers.3.attention.combine_heads.weight: 24576
layers.4.attention.combine_heads.weight: 24576
layers.5.attention.combine_heads.weight: 24576
block.0.attention.combine_heads.weight: 24576
position_embedding_table.weight: 4096
layers.0.attention.heads.0.query.weight: 4096
layers.0.attention.heads.

In [26]:
num_params = sum([p.numel() for p in model.parameters()])

for steps in tqdm.tqdm(range(max_iters)):
    xb, yb = get_batch('train')
    # loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    # l2 regularization
    l2 = sum(p.pow(2).sum() for p in model.parameters()) / num_params
    loss = loss + l2 * l2_penalty

    loss.backward()
    optimizer.step()
    if steps % eval_interval == 0:
        losses = estimate_loss()
        # wandb.log({"train": losses['train'].item(), "val": losses['val'].item(), "l2":l2})

losses = estimate_loss(is_last=True)
# wandb.log({"train": losses['train'].item(), "val": losses['val'].item()})
# wandb.finish()


100%|██████████| 3000/3000 [01:09<00:00, 43.07it/s]


In [27]:
estimate_loss()

{'train': tensor(1.2042, device='cuda:0'),
 'val': tensor(1.4577, device='cuda:0')}

In [28]:
# test_idx = torch.zeros(1, T).long()

# quote = '''KING '''
# # I will slay thee Horatio!

# # HORATIO:

# # tokenize the quote
# quote_encoded = encode(quote)
# print("length of input", len(quote_encoded))

# quote_encoded = torch.tensor([quote_encoded], dtype=torch.long)
# print(quote_encoded)


# print(decode(
#     model.generate(idx=quote_encoded, temperature=.1, max_new_tokens=T)[0].tolist()
# )[T:])

In [64]:
# Predicting with right padding

newline_encoded = encode('\n')[0]
quote = '''ROMEO:'''
quote_encoded = torch.tensor([encode(quote)])
prediction_index = quote_encoded.size(1) - 1
print("prediction index", prediction_index)
print("quote encoded", quote_encoded)

# pad the idx tensor to the right with zeros to make it T long
quote_padded = F.pad(quote_encoded, (0, T - quote_encoded.size(1)), value=newline_encoded)
print("quote padded", quote_padded)
logits, loss = model(quote_padded)

prediction = logits[:, prediction_index, :]
probs = F.softmax(prediction, dim=-1)
next_token = torch.argmax(probs, dim=-1)
print(f"next token prediction: '{decode([next_token.item()])}'")

prediction index 1
quote encoded tensor([[863,  30]], device='cuda:0')
quote padded tensor([[863,  30, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203,
         203, 203]], device='cuda:0')
next token prediction: '
'


In [171]:
def predict_next_token_pad_right(quote_encoded):
    newline_encoded = encode('\n')[0]
    prediction_index = quote_encoded.size(1) - 1
    print("prediction index", prediction_index)
    print("quote encoded", quote_encoded)

    # pad the idx tensor to the right with zeros to make it T long
    quote_padded = F.pad(quote_encoded, (0, T - quote_encoded.size(1)), value=newline_encoded)
    print("quote padded", quote_padded)
    logits, loss = model(quote_padded)

    prediction = logits[:, prediction_index, :]
    probs = F.softmax(prediction, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    print(f"right-pad next token prediction: '{decode([next_token.item()])}'")
    return next_token


def predict_token_pad_left(quote_encoded):
    newline_encoded = encode('\n')[0]
    # pad on the left
    quote_padded = F.pad(quote_encoded, (T - quote_encoded.size(1), 0), value=newline_encoded)
    print("quote padded", quote_padded)
    logits, loss = model(quote_padded)
    prediction = logits[:, T - 1, :]
    probs = F.softmax(prediction, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    print(f"left-pad next token prediction: '{decode([next_token.item()])}'")
    return next_token

quote = "ROMEO:"
tokens = torch.tensor([encode(quote)])

predict_token_pad_left(tokens)
print("="*20)
predict_token_pad_left(tokens)

quote padded tensor([[203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203,
         863,  30]], device='cuda:0')
left-pad next token prediction: '
'
quote padded tensor([[203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203,
         863,  30]], device='cuda:0')
left-pad next token prediction: '
'


tensor([[203]], device='cuda:0')

quote padded tensor([[203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203,
         863,  30]], device='cuda:0')
next token prediction: '
'


tensor([203], device='cuda:0')