# GPT experiments

Building a character-level language model on the tinyshakespeare dataset.

### Data

### Get data (tinyshakespeare)

In [1]:
import requests
import os

# constants
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
DATA_DIR = "../data"

def download_tinyshakespeare() -> str:
    """Download the tinyshakespeare corpus into DATA_DIR and return its path.

    The download is skipped when a non-empty local copy already exists, so
    re-running this cell (or the whole notebook) does not hit the network again.

    Returns:
        Path of the local ``tinyshakespeare.txt`` file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeout.
    """
    # create dir if needed
    os.makedirs(DATA_DIR, exist_ok=True)
    data_file_path = os.path.join(DATA_DIR, "tinyshakespeare.txt")

    # reuse the cached copy instead of downloading on every run
    if os.path.isfile(data_file_path) and os.path.getsize(data_file_path) > 0:
        return data_file_path

    # a timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url=DATA_URL, timeout=30)
    # fail loudly rather than saving an HTTP error page as the dataset
    response.raise_for_status()
    with open(data_file_path, "wb") as f:
        f.write(response.content)

    return data_file_path

data_file_path = download_tinyshakespeare()

### Load data

In [8]:
# Read the whole corpus into memory as a single string
with open(data_file_path, encoding="utf-8") as corpus_file:
    tinyshakespeare_text = corpus_file.read()

# Sanity check: corpus size and a short preview
print(f"Total # of characters in dataset {len(tinyshakespeare_text)}")
print()
print(f"First 200 characters {tinyshakespeare_text[:200]}")


Total # of characters in dataset 1115394

First 200 characters First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


### Vocabulary (tokenization)

This notebook uses a simple character-level tokenizer. Sub-word tokenizers such as SentencePiece or tiktoken (byte-pair encoding, BPE) could be used instead.

In [22]:
from typing import List

# Character-level vocabulary: every distinct character of the corpus, in sorted order
unique_characters = sorted(set(tinyshakespeare_text))
vocab_size = len(unique_characters)
print(f"Number of unique characters in vocabulary: {vocab_size} \n")
# for comparison: tiktoken's GPT-2 encoding has ~50k sub-word tokens, versus 65 characters here

# Lookup tables in both directions: id -> character and character -> id
int_to_char_mapping = dict(enumerate(unique_characters))
char_to_int_mapping = {ch: i for i, ch in int_to_char_mapping.items()}

def encode(s: str) -> List[int]:
    """Map every character of `s` to its integer id in the vocabulary.

    Raises KeyError for a character that is not in the vocabulary.
    """
    token_ids = []
    for character in s:
        token_ids.append(char_to_int_mapping[character])
    return token_ids

def decode(l: List[int]) -> str:
    """Turn a list of integer ids back into the string they stand for.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return "".join(int_to_char_mapping[token_id] for token_id in l)

# Try both directions: text -> ids, then ids -> text
sample_ids = encode("hii :3")
print(sample_ids)
sample_text = decode([50, 43, 51, 51, 43, 1, 41, 53, 53, 49])
print(sample_text)

Number of unique characters in vocabulary: 65 

[46, 47, 47, 1, 10, 9]
lemme cook


In [23]:
import torch
# Encode the whole corpus once and keep it as a single 1-D tensor of token ids
data = torch.tensor(encode(tinyshakespeare_text))
print(data.size(), data.dtype)
print(data[0:200])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


### Train/validation split

In [25]:
# first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [39]:
BLOCK_SIZE = 8  # maximum context length, in tokens

# A chunk of BLOCK_SIZE + 1 tokens packs BLOCK_SIZE (context, target) examples
first_chunk = train_data[:BLOCK_SIZE + 1]
chunk_tokens = first_chunk.tolist()

for i, target in enumerate(chunk_tokens[1:], start=1):
    context = chunk_tokens[:i]
    print(f"in the context of {context} ---> {target} is the target (comes next in the sequence)")

# Training uses all BLOCK_SIZE examples of a chunk, with context lengths from 1 up to BLOCK_SIZE.
# This gets the transformer network used to short contexts, long contexts and everything in between.
# The chunks of a batch are processed independently of each other by the GPU.


in the context of [18] ---> 47 is the target (comes next in the sequence)
in the context of [18, 47] ---> 56 is the target (comes next in the sequence)
in the context of [18, 47, 56] ---> 57 is the target (comes next in the sequence)
in the context of [18, 47, 56, 57] ---> 58 is the target (comes next in the sequence)
in the context of [18, 47, 56, 57, 58] ---> 1 is the target (comes next in the sequence)
in the context of [18, 47, 56, 57, 58, 1] ---> 15 is the target (comes next in the sequence)
in the context of [18, 47, 56, 57, 58, 1, 15] ---> 47 is the target (comes next in the sequence)
in the context of [18, 47, 56, 57, 58, 1, 15, 47] ---> 58 is the target (comes next in the sequence)


In [52]:
torch.manual_seed(1337)  # fixed seed so the random batch offsets drawn below are reproducible
BATCH_SIZE = 4  # number of sequences sampled per batch
BLOCK_SIZE = 8  # length of each sampled sequence (same value as set in the earlier cell)

def get_batch(split, batch_size=None, block_size=None):
    """Sample a random batch of inputs x and next-token targets y.

    Args:
        split: "train" samples from train_data; any other value samples
            from val_data.
        batch_size: number of sequences in the batch. Defaults to the
            notebook-level BATCH_SIZE, looked up at call time.
        block_size: length of each sequence. Defaults to the notebook-level
            BLOCK_SIZE, looked up at call time.

    Returns:
        Tuple (x, y) of tensors, each of shape (batch_size, block_size).
        y is x shifted by one position, so y[b, t] is the token that
        follows x[b, :t+1] in the source data.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if block_size is None:
        block_size = BLOCK_SIZE

    # local name chosen so the notebook-level `data` tensor is not shadowed
    source = train_data if split == "train" else val_data

    # randint's upper bound is exclusive, so offsets lie in
    # [0, len(source) - block_size - 1]; that leaves room for the target
    # slice, which ends one position later than the input slice
    offsets = torch.randint(len(source) - block_size, (batch_size,)).tolist()

    x = torch.stack([source[i:i + block_size] for i in offsets])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in offsets])
    return x, y

# Draw one training batch; both tensors have shape [BATCH_SIZE, BLOCK_SIZE]
xb, yb = get_batch("train")
for label, batch in (("X (inputs):", xb), ("Y (targets):", yb)):
    print(label, batch.shape, "\n", batch)
    print()

# Spell out every (context, target) pair packed into the batch
for b in range(BATCH_SIZE):      # batch dimension
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(BLOCK_SIZE):  # time (block) dimension
        context = inputs_row[:t + 1]
        target = targets_row[t]
        print("For X (input):", context, "the target is:", target)


X (inputs): torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

Y (targets): torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

For X (input): tensor([24]) the target is: tensor(43)
For X (input): tensor([24, 43]) the target is: tensor(58)
For X (input): tensor([24, 43, 58]) the target is: tensor(5)
For X (input): tensor([24, 43, 58,  5]) the target is: tensor(57)
For X (input): tensor([24, 43, 58,  5, 57]) the target is: tensor(1)
For X (input): tensor([24, 43, 58,  5, 57,  1]) the target is: tensor(46)
For X (input): tensor([24, 43, 58,  5, 57,  1, 46]) the target is: tensor(43)
For X (input): tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target is: tensor(39)
For X (input): tensor([44]) the target is: tensor(5

## Bigram Language Model

In [64]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)  # reseed so the model initialisation and sampling in this cell are reproducible

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameters are a (vocab_size, vocab_size) embedding table; the
    row looked up for a token is used directly as the logits over the token
    that follows it.
    """

    def __init__(self, vocab_size) -> None:
        """
        Args:
            vocab_size: number of distinct tokens; used both as the number of
                embeddings and as the embedding dimension.
        """
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token indices.
            targets: optional (B, T) integer tensor of next-token indices.

        Returns:
            Tuple (logits, loss). Without targets, logits has shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C)
            to match the flattened targets passed to F.cross_entropy, and
            loss is the resulting cross-entropy.
        """
        # batch, time, channel (C = vocab_size)
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so that non-contiguous targets are accepted too
        targets = targets.reshape(-1)  # B*T
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend idx with max_new_tokens sampled tokens in a bigram fashion (context=1).

        Gradient tracking is disabled for the whole call: sampling never runs
        a backward pass, so no autograd graph needs to be recorded.

        Args:
            idx: (B, T) integer tensor of token indices that start each sequence.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get logits (preds) for every position
            logits, _loss = self(idx)  # (B, T, C)
            # enforce bigram = only the last timestep predicts the next token
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

bigram_model = BigramLanguageModel(vocab_size=vocab_size)
# xb holds BATCH_SIZE random sequences of BLOCK_SIZE token ids each
logits, loss = bigram_model(xb, yb)
print(xb.shape)
expected_shape = f"(B*T, C) = ({BATCH_SIZE * BLOCK_SIZE}, {vocab_size})"
print(expected_shape, logits.shape, loss)

# Generation starts from token id 0; the check below shows whether that is "\n"
print(unique_characters[0] == "\n")
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated_ids = bigram_model.generate(start_idx, max_new_tokens=32)[0].tolist()
print(decode(generated_ids))

torch.Size([4, 8])
(B*T, C) = (32, 65) torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward0>)
True

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdh


## Training

In [75]:
learning_rate, num_steps = 1e-3, 10000
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=learning_rate)

for steps in range(num_steps):
    # sample a fresh random training batch
    xb, yb = get_batch("train")

    # clear old gradients, then forward pass, backward pass and parameter update
    optimizer.zero_grad()
    logits, loss = bigram_model(xb, yb)
    loss.backward()
    optimizer.step()

# loss on the last sampled batch only
print(loss.item())

2.636704444885254


In [78]:
# Sample 300 new tokens from the trained model, starting from token id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated_ids = bigram_model.generate(start_idx, max_new_tokens=300)[0].tolist()
print(decode(generated_ids))
# still not Shakespeare :D


INNUGo aked-
ANEEOnge ho igere y m sp'd uthe hantle pp es spe thom, tedopuschous, fo w HE:
ANRERDWhed, pr g I t me t cal be g bevive Whise:
Thoullf cunkir, elthryot;

Ancan MAREDUTh lt,
FFon whancher compeny'd:
I lak: spe.
We gur. CESTo tu ngesg stheeincerngr cown:


Ye, akn m tawigmaprind? m.

Wan 
