In [2]:
import wget
import tiktoken as tt
import torch
import torch.nn as nn
from torch.nn import functional as F
import wandb

In [3]:
#download data from the github repo of Angrej Karpathy
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
wget.download(url)

100% [......................................................] 1115394 / 1115394

'input (1).txt'

In [4]:
# reading the file
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [23]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [13]:
# hyperparameters
train_config = dict(
    framework = 'PyTorch',
    batch_size = 64, # how many independent sequences will we process in parallel?
    block_size = 256, # what is the maximum context length for predictions?
    max_iters = 5000,
    eval_interval = 500,
    learning_rate = 3e-4,
    device = 'mps' if torch.backends.mps.is_available() else 'cpu',
    eval_iters = 200,
    n_embd = 384,
    n_head = 6,
    n_layer = 6,
    dropout = 0.2
)


In [32]:
batch_size = 64, # how many independent sequences will we process in parallel?
block_size = 256, # what is the maximum context length for predictions?
max_iters = 5000,
eval_interval = 500,
learning_rate = 3e-4,
device = 'mps' if torch.backends.mps.is_available() else 'cpu',
eval_iters = 200,
n_embd = 384,
n_head = 6,
n_layer = 6,
dropout = 0.2

## Encoding
1. Since NNs cannot work with strings directly we will have to convert our data/text to a number format (int/float) which is done through the tiktoken library.
2. the enc object contains the encoding used for 'GPT-2', which we can apply to our data using encode attribute, and reverse it using decode.
3. The size of the encoding depends on the type of encoding, 'GPT-2' in this case.

In [33]:
enc = tt.get_encoding('gpt2')
data = torch.tensor(enc.encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([338025]) torch.int64
tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356])


## Splitting Data
1. Split the data into training and validation sets to track model performance on unseen/realistc data
2. This is done because we do not want the model to 'memorize' or overfit to the data. We want it to be able to generalize and capture the 'essence' or patterns of Shakespeare's work.

In [34]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

## block_size
1. block_size defines the size of the data that is fed to the model/NN 
2. block_size = 8 means this tensor consists of 8 examples to train the NN
3. In such a way that, prediction is done at every 'step' in this block - when the model gets input 5962 its task is to predict 22307 and so on...

In [35]:
block_size = 8
train_data[:block_size+1]

tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252])

In [36]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

## batch_size

In [37]:
def get_batch(split, bch_sze = 4, blk_sze = 8):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(blk_sze, len(data), (bch_sze,)) #get random indexes from our data of size batch_size
    x = torch.stack([data[i:i+blk_sze] for i in ix])
    y = torch.stack([data[i+1:i+blk_sze] for i in ix])
    x, y = x.to(train_config['device']), y.to(train_config['device'])
    return x,y

xb, yb = get_batch('train')

In [38]:
xb, yb #size (B,T) tensors

(tensor([[  818, 36797,   810,  1659,    11,   749,  2612,   813],
         [49275,  1677,    40,  2937,    25,   198,   464, 16339],
         [  440,  1296,    11,   198,  2437,  1690,   288,   455],
         [   13,  1867, 27563,   992,  5891,   338,   326,    30]],
        device='mps:0'),
 tensor([[36797,   810,  1659,    11,   749,  2612,   813],
         [ 1677,    40,  2937,    25,   198,   464, 16339],
         [ 1296,    11,   198,  2437,  1690,   288,   455],
         [ 1867, 27563,   992,  5891,   338,   326,    30]], device='mps:0'))

In [39]:
class Model(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
    
    def forward(self, idx, targets=None):
        B,T = idx.shape
        #idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.pos_embedding_table(torch.arange(T)) #(T,C)
        x = tok_emb+pos_emb # (B,T,C)+(T,C) --> (B,T,C) (broadcasting)
        x = self.sa_head(x)
        logits = self.lm_head(x) # (B,T, vocab_size)
        
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        #idx is a (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            #get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            probs = F.softmax(logits, dim=-1) # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1) #(B,1)
            idx = torch.cat((idx, idx_next), dim=1)
        
        return idx
    
class Head(nn.Module):
    """one head of self-attention"""
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd,head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        w = q @ k.transpose(-2,-1) * C**-0.5
        w = w.masked_fill(self.tril==0, float('-inf'))
        w = F.softmax(w, dim=-1)
        v = self.value(x)
        out = w@v
        return out
    

In [40]:
model = Model()

TypeError: empty(): argument 'size' must be tuple of SymInts, but found element of type tuple at pos 2