In [53]:
# Load the whole training corpus into memory as a single string.
with open("input.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [54]:
# Total number of characters in the corpus.
print(len(text))

1115394


In [55]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [56]:
# Character-level vocabulary: every distinct character that occurs in the
# corpus, sorted so the character -> id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars), "\n", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 
 65


In [57]:
# Lookup tables between characters and integer token ids; the id of a
# character is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s: str) -> list:
    """Convert a string into a list of integer token ids.

    Parameters:
    - s: text to encode, one token per character

    Returns:
    - a list holding one integer id per character of `s`

    Raises:
    - KeyError: if `s` contains a character that is not a key of `stoi`
    """
    return [stoi[c] for c in s]


def decode(l) -> str:
    """Convert a sequence of integer token ids back into a string.

    Parameters:
    - l: iterable of integer ids

    Returns:
    - the characters looked up in `itos`, concatenated in order

    Raises:
    - KeyError: if an id is not a key of `itos`
    """
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string gives the original back.
sample_str = "hi there"
print(encode(sample_str))
print(decode(encode(sample_str)))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


# Tokenisers
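- The example above is a character-level tokenizer: every character maps to one integer.
- SentencePiece = a subword tokenizer: it encodes text into integers using sub-word units rather than single characters.
- tiktoken = OpenAI's byte-pair-encoding (BPE) tokenizer; this is the one ChatGPT uses.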

In [58]:
import torch 

# Encode the whole corpus into a 1-D tensor of integer token ids (one per character).
data = torch.tensor(encode(text), dtype=torch.long)


In [59]:
# Train on the first 90% of the tokens and hold out the rest for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Chunking data
The transformer is never trained on the whole dataset at once, only on chunks of it. During training, chunks are sampled at random and fed to the transformer, each up to a maximum length of `block_size`. A chunk has multiple examples embedded in it, because every prefix of the chunk is a context whose target is the character that follows it. For example, with a block size of 8 we take a chunk of 9 characters, which contains 8 examples of predicting the next character (contexts of length 1 through 8).

In [60]:
# block_size is the maximum context length; the slice takes one extra token
# so that every input position has a next-token target.
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [61]:
# Inputs and targets are the same tokens, shifted by one position.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x is its own training example; its label is the token
# that comes right after that prefix.
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context}, the target is {[target]}")

when input is tensor([18]), the target is [tensor(47)]
when input is tensor([18, 47]), the target is [tensor(56)]
when input is tensor([18, 47, 56]), the target is [tensor(57)]
when input is tensor([18, 47, 56, 57]), the target is [tensor(58)]
when input is tensor([18, 47, 56, 57, 58]), the target is [tensor(1)]
when input is tensor([18, 47, 56, 57, 58,  1]), the target is [tensor(15)]
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is [tensor(47)]
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is [tensor(58)]


### So far we have looked at the time dimension (the position within a sequence — here, the character index); now we also need the batch dimension
The batch size is the number of independent sequences we process in parallel in a single pass.

In [62]:
torch.manual_seed(1337) # fix the RNG seed so the sampled batches are reproducible
batch_size = 4 # how many independent sequences to process at once in parallel
block_size = 8 # what is the maximum context length for predictions

def get_batch(split: str):
    """Sample a random mini-batch of input/target windows.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size` at call time.

    Parameters:
    - split: 'train' selects train_data; any other value selects val_data

    Returns:
    - x: a [batch_size, block_size] tensor of input tokens
    - y: a tensor of the same shape holding the same windows shifted one
      token to the right (the next-token targets)
    """
    source = train_data if split == 'train' else val_data
    # The largest start offset still leaves room for block_size inputs plus
    # the one extra token needed by the shifted targets.
    starts = torch.randint(len(source) - block_size, size=(batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)
xb, yb = get_batch('train')

# Print each half of the batch: its label, its shape, then its contents.
# The targets are what the loss function compares the predictions against.
for label, batch in (("inputs", xb), ("targets", yb)):
    print(label)
    print(batch.shape)
    print(batch)

print("---")

# Unroll the batch: every row (batch dimension) and every prefix length
# (time dimension) is a separate training example.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()}, the target is {[target]}")

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---
when input is [24], the target is [tensor(43)]
when input is [24, 43], the target is [tensor(58)]
when input is [24, 43, 58], the target is [tensor(5)]
when input is [24, 43, 58, 5], the target is [tensor(57)]
when input is [24, 43, 58, 5, 57], the target is [tensor(1)]
when input is [24, 43, 58, 5, 57, 1], the target is [tensor(46)]
when input is [24, 43, 58, 5, 57, 1, 46], the target is [tensor(43)]
when input is [24, 43, 58, 5, 57, 1, 46, 43], the target is [tensor(39)]
when input is [44], the target is [tensor(53)]
when input is [44, 53], the target is [tensor(56)]
when input is [44, 53, 56], t

In [63]:
print(xb) # our input to the transformer: one row per sequence, one column per time step

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


#### Conclusion
So far we have just implemented how to get the input batches for our LLM. The `y` tensor holds the desired targets, which are used in the loss function. Now we need to plug these batches into a language model. The simplest one we can use is the bigram language model.

In [64]:
from typing import List, Optional, Tuple

import torch.nn as nn
from torch.nn import functional as F

# Re-seed the global RNG before the model is built and used below.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token scores depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; looking
    up token `i` returns the non-normalized scores (logits) for whichever
    token comes next.
    """

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(
        self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        performs a forward pass of the model

        Parameters:
        - idx: a [B, T] tensor of integers representing the input sequence
        - targets: an optional [B, T] tensor of integers holding the expected
          next token for every position of `idx`

        Returns:
        - logits: non-normalized scores over the vocabulary; shaped [B, T, C]
          when targets is None, flattened to [B*T, C] when targets is given
        - loss: the scalar cross-entropy loss, or None when targets is None
        """
        logits = self.token_embedding_table(idx)  # [B, T, C]

        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so flatten the
        # batch and time dimensions: logits -> [B*T, C], targets -> [B*T].
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets are accepted as well
        targets = targets.reshape(B * T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building autograd graphs
    def generate(self, idx: torch.Tensor, max_new_tokens: int) -> torch.Tensor:
        """
        generates `max_new_tokens` additional tokens after an input sequence

        Parameters:
        - idx: a [B, T] tensor of integers representing the input sequence
        - max_new_tokens: the number of tokens to sample and append

        Returns:
        - a [B, T + max_new_tokens] tensor: `idx` followed by the sampled tokens
        """
        for _ in range(max_new_tokens):
            # get the predictions for the next token; call the module itself
            # (not .forward) so that registered hooks still run
            logits, _unused_loss = self(idx)
            # focus on the last time step
            logits = logits[:, -1, :]  # this becomes [B, C]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # a [B, C] tensor
            # sample the next token from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # a [B, 1] tensor
            # append the sampled token to the running sequence
            idx = torch.cat([idx, idx_next], dim=-1)  # becomes [B, T+1]
        return idx
            

m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
# Targets were supplied, so the logits come back flattened to [B*T, C].
print(out.shape)

torch.Size([32, 65])


In [65]:
# create a pytorch optimizer
# create a pytorch optimizer (AdamW over all model parameters, learning rate 1e-3)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [83]:
# training the model
batch_size = 64  # larger than in the demo above; get_batch reads this global
for step in range(1000):
    # draw a fresh random batch from the training split
    xb, yb = get_batch('train')

    # clear the gradients left over from the previous step, then run the
    # forward pass, backpropagate and apply the parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last mini-batch only
print(loss.item())

2.536496639251709


In [87]:
# Start from a single token with id 0, sample 1000 more tokens, then decode
# the only sequence in the batch back into text.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=start_tokens, max_new_tokens=1000)
print(decode(generated[0].tolist()))





O:
MARToth ar strofourres awe in t we N thnghel bo we asbld agenawsetouthengr oore,
Pamy sean.
AR:
Sw:
JUCoun fed; or y: ileme--ceves ieeste wind teld anomen po;
H: f hedmize ckndoth youckn ave CE:
CINoplld?
S:
I'thavisieasse?
Fofong yom heancknotherl my twiey, shoue.
ENRYothem bel itllin ulindit. blld we wicr, pasurean gagead, at thistwerslat, wsolau iato
'sh.
SI ins
Whan be by.

ENUSThe anourincth LEiller feng:

Hannewerely foutoinrevit spous.
Rea inou, r bedoou Inoraindongid ay ayowim:
Berday weavevemy s.
Hansots, hoacomowollupr m
BEd ru t hale u ierve,
Fr ay ino bremale l oth th de honorithanor f t bupong, Thehy ngagld cth hased,


BEMugor coreshat wio thegouris whome pllad,



SOK:
Myedan m mathe w
LEr:
TING I y wouray thet mane ambo
Why:

stheryofit
NS w how mso t f ghen.
LULICOfrd t y st o:
Pe? tille RENI m hat IOPOKINLOLLKAnier the,

HAndstittrere hem:
Hot me be iceis tofonge mpot?
WIOLOfe bur te.
ADI ithorlleupo thinttth!
With nclos ses RDr'd, aur d t I nd
Be
ARUK:
F r t, i