In [11]:
import pathlib
import torch
from torch.utils.data import DataLoader

from gpt_builder.tokenizer import Tokenizer
from gpt_builder.dataset import BigramDataset
from gpt_builder.model import BigramLLM
from gpt_builder.utils import bigram_crossentropy_loss, train_step, get_loss

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read in raw text

In [2]:
data_dir = pathlib.Path("data")
with open(data_dir / "wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()
print("Text length: ", len(text))
print(text[:200])

Text length:  232284
DOROTHY AND THE WIZARD IN OZ

BY

L. FRANK BAUM

AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

ILLUSTRATED BY JOHN R. NEILL

BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


[Illu


# Create tokenizer

In [3]:
# Get unique characters
chars = sorted(set(text))
print("Number of unique characters: ", len(chars))
print(chars)

Number of unique characters:  80
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
tokenizer = Tokenizer(chars)
hello_tokens = tokenizer.encode("Hello")
print("Encoded hello: ", hello_tokens)
hello_decoded = "".join(tokenizer.decode(hello_tokens))
print("Decoded hello: ", hello_decoded)

Encoded hello:  [32, 58, 65, 65, 68]
Decoded hello:  Hello


In [5]:
# Tokenize Wizard of Oz
data = tokenizer.encode(text, return_tensors=True)
print(data[:200])

tensor([28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0, 26, 49,  0,  0, 36, 11,
         1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0, 25, 45, 44, 32, 39,
        42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 39, 30,
         1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25, 38, 28,  1, 39, 30,  1, 39,
        50,  9,  1, 39, 50, 37, 25,  1, 39, 30,  1, 39, 50,  9,  1, 29, 44, 27,
        11,  0,  0, 33, 36, 36, 45, 43, 44, 42, 25, 44, 29, 28,  1, 26, 49,  1,
        34, 39, 32, 38,  1, 42, 11,  1, 38, 29, 33, 36, 36,  0,  0, 26, 39, 39,
        35, 43,  1, 39, 30,  1, 47, 39, 38, 28, 29, 42,  1, 47, 33, 36, 36, 33,
        25, 37,  1, 37, 39, 42, 42, 39, 47,  1,  4,  1, 27, 39, 11,  9,  1, 33,
        38, 27, 11,  1, 38, 29, 47,  1, 49, 39, 42, 35,  0,  0,  0, 51, 33, 65,
        65, 74])


# Create Bigram dataset

In [6]:
dataset = BigramDataset(data)
in_bigram, out_bigram = dataset[0]
print("In bigram: ", in_bigram)
print("Out bigram: ", out_bigram)

In bigram:  tensor([28, 39, 42, 39, 44, 32, 49,  1])
Out bigram:  tensor([39, 42, 39, 44, 32, 49,  1, 25])


# Examine model

In [9]:
vocab_size = len(chars)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
llm = BigramLLM(vocab_size).to(device)
x_out = llm(in_bigram.to(device)).cpu()
x_out.shape

torch.Size([8, 80])

In [10]:
# Compute cros entropy lloss
bigram_crossentropy_loss(x_out, out_bigram)

tensor(5.2320, grad_fn=<NllLossBackward0>)

In [124]:
# Generate new tokens
llm.eval()
x_new = llm.generate(in_bigram.to(device), 10)

In [125]:
# Decode new sequence
print("Context: ", tokenizer.decode(in_bigram.tolist()))
print("New sequence: ", tokenizer.decode(x_new[0].cpu().tolist()))

Context:  ['D', 'O', 'R', 'O', 'T', 'H', 'Y', ' ']
New sequence:  ['D', 'O', 'R', 'O', 'T', 'H', 'Y', ' ', 'O', '[', 'B', 'Y', '\n', 'L', '.', '.', ' ', 'O']


## Training loop

In [16]:
N_ITERS =10_000
LEARNING_RATE = 3e-4
TEST_SPLIT = 0.2
EVAL_ITERS = 250

n_test = int(len(data)*TEST_SPLIT)
train_dataset = BigramDataset(data[:-n_test])
test_dataset = BigramDataset(data[-n_test:])

train_dl = DataLoader(train_dataset, batch_size=64)
test_dl = DataLoader(test_dataset, batch_size=64)
optim = torch.optim.AdamW(llm.parameters(), lr=LEARNING_RATE)

for i in range(N_ITERS):
    inputs, targets = next(iter(train_dl))
    inputs, targets = inputs.to(device), targets.to(device)

    if i % EVAL_ITERS == 0:
        train_loss = get_loss(train_dl, llm, device)
        test_loss = get_loss(test_dl, llm, device)
        print(f"Train step: {i}")
        print(f"Training loss: {train_loss:3f}")
        print(f"Test loss: {test_loss:3f}")
        print()
    loss = train_step(llm, inputs, targets, optim)

print("Final training loss: ", loss)

Train step: 0
Training loss: 4.919087
Test loss: 4.926863

Train step: 250
Training loss: 4.917075
Test loss: 4.924778

Train step: 500
Training loss: 4.915348
Test loss: 4.922974

Train step: 750
Training loss: 4.913933
Test loss: 4.921480

Train step: 1000
Training loss: 4.912862
Test loss: 4.920325

Train step: 1250
Training loss: 4.912166
Test loss: 4.919541

Train step: 1500
Training loss: 4.911876
Test loss: 4.919157

Train step: 1750
Training loss: 4.912023
Test loss: 4.919205

Train step: 2000
Training loss: 4.912633
Test loss: 4.919711

Train step: 2250
Training loss: 4.913730
Test loss: 4.920697

Train step: 2500
Training loss: 4.915331
Test loss: 4.922182

Train step: 2750
Training loss: 4.917449
Test loss: 4.924178

Train step: 3000
Training loss: 4.920085
Test loss: 4.926686

Train step: 3250
Training loss: 4.923234
Test loss: 4.929701

Train step: 3500
Training loss: 4.926878
Test loss: 4.933208

Train step: 3750
Training loss: 4.930994
Test loss: 4.937181

Train step: 40

## Evaluate after training

In [123]:
in_test = data[-n_test:-n_test+100]
llm.eval()
pred_test = llm.generate(in_test.to(device), 100).cpu()[0]

print("TEST CONTEXT")
print("".join(tokenizer.decode(in_test.tolist())))
print()
print("TEST GENERATED")
print("".join(tokenizer.decode(pred_test.tolist())))

TEST CONTEXT
, with drawn gravey poured over it.

"Fish!" cried Jim, with a sniff. "Do you take me for a tom-cat?

TEST GENERATED
, with drawn gravey poured over it.

"Fish!" cried Jim, with a sniff. "Do you take me for a tom-cat?"Z
o]Ua
&4yD WI,D ITyM
n0dPZMoyuvHY WIuCND(UTHY
AUM
Xg-FRAUTH*IsXeHBY
AU2nOF WIZARAUTHEb9[?Y BARD WI
