My copy of the code explored in "Let's build GPT: from scratch, in code, spelled out." by Andrej Karpathy

TODO:
implement vocabulary handling of numbers

In [3]:
import torch
import os
from datasets import load_dataset
import re
import random
import numpy as np
from unidecode import unidecode
import sentencepiece as spm
from transformer import BigramLanguageModel

In [4]:
vocab_size = 10000

In [6]:
dataset = {"train": [], "test": []}
with open("../data/train_data.txt", 'r', encoding='utf-8') as file:
    dataset["train"] = file.read()
with open("../data/test_data.txt", 'r', encoding='utf-8') as file:
    dataset["test"] = file.read()

In [7]:
retrain_vocab = False

if retrain_vocab:
    spm.SentencePieceTrainer.Train(input="../data/train_data.txt", model_prefix='bpe', vocab_size=vocab_size, model_type='bpe', user_defined_symbols='A:,B:')

sp = spm.SentencePieceProcessor(model_file='../weights/bpe.model')


In [8]:
encode = lambda s: sp.encode(s, out_type=int)
decode = lambda s: sp.decode(s)
encoded = encode(""""Hello" there""")
print(encoded)

decoded = decode(encoded)
print(decoded)

[45, 9974, 154, 9945, 9965, 238]
"Hello" there


In [9]:
dataset["train"] = torch.tensor(encode(dataset["train"]), dtype=torch.long, device='cuda')
dataset["test"] = torch.tensor(encode(dataset["test"]), dtype=torch.long, device='cuda')

print(dataset["train"].shape)
print(dataset["train"][:100])

torch.Size([3774798])
tensor([2163,  108, 9941,    0, 1533,  720, 9963,  125,  534,  853, 9960, 3118,
        9963,  673, 9963, 2278, 1709,   75, 9960,  182, 1550, 9963, 9941,    0,
         144, 9960, 9941,    0,  128, 9963,  558,  239, 9752,   28, 1098,   63,
         212,  342,  920,  358, 3901,   11, 9126,   31,  103, 8843,  474,  157,
          59,  131, 3822,   77,  422, 3229, 1112, 9962,  271,  431, 9963,   48,
        2093,  493, 1956, 9960,   30,  127,  466, 5235,  113,   28, 3554,  127,
         660, 1078,   31,  127, 9215,   30, 9366, 3052,   42,   11, 2444,   31,
         127, 3837,  492, 9963,   48,  263, 1007, 1047, 9876,   31, 2574, 9960,
          30,   84,   48, 1499], device='cuda:0')


In [10]:
device='cuda'

In [11]:
l_train_data = len(dataset['train'])
l_test_data = len(dataset['test'])
print(l_train_data)
print(l_test_data)

3774798
939997


In [12]:
batch_size = 32 #how many independent sequences we process in parallel
block_size = 256 #maximum context length for predictions

def get_batch(split):
    data = dataset[split]
    l_data = l_train_data if split == 'train' else l_test_data
    ix = torch.randint(l_data - block_size, (batch_size,)) #choosing random position to start for batch_size dimensions
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x = x.to(device)
    y = y.to(device)
    return x,y


xb, yb = get_batch('train')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target =  yb[b,t]
        print(f'when input is {context.tolist()} the target is {target}')


torch.Size([32, 256])
tensor([[  32,   89, 5466,  ...,   63, 2750,  600],
        [ 239, 8616, 3723,  ...,   45, 9993,  246],
        [ 213,  603,  902,  ..., 9957, 9963,   45],
        ...,
        [ 230, 7754, 3690,  ...,  661,  556,  107],
        [1479,   28,   11,  ...,  194, 1452, 2885],
        [ 291,  384,   31,  ..., 9963,  182, 9968]], device='cuda:0')
torch.Size([32, 256])
tensor([[  89, 5466,   42,  ..., 2750,  600, 9493],
        [8616, 3723, 1076,  ..., 9993,  246,   72],
        [ 603,  902,  632,  ..., 9963,   45, 9967],
        ...,
        [7754, 3690, 1670,  ...,  556,  107, 3086],
        [  28,   11,  640,  ..., 1452, 2885,   64],
        [ 384,   31,  174,  ...,  182, 9968,  708]], device='cuda:0')
when input is [32] the target is 89
when input is [32, 89] the target is 5466
when input is [32, 89, 5466] the target is 42
when input is [32, 89, 5466, 42] the target is 7
when input is [32, 89, 5466, 42, 7] the target is 1562
when input is [32, 89, 5466, 42, 7, 1562] 

when input is [3527, 9960, 8239, 534, 36, 186, 9960, 7, 8042, 527, 9960, 306, 3012, 33, 31, 2464, 5179, 9960, 668, 908, 89, 1282, 42, 11, 2341, 30, 4723, 72, 1270, 871, 77, 91, 9960, 30, 101, 96, 90, 1330, 9960, 69, 1092, 194, 9960, 7, 1329, 2170, 8609, 9969, 4254, 3293, 9960, 908, 9968, 9949, 2827, 42, 69, 2123, 9969, 9948, 1657, 9949, 9963, 139, 1369, 64, 7, 309, 4805, 201, 9996, 3735, 3527, 64, 42, 69, 540, 4183, 4100, 9960, 30, 64, 450, 6347, 162, 8239, 534, 36, 186, 9968, 9949, 5794, 255, 9963, 8239, 534, 36, 186, 9960, 992, 69, 5794, 255, 64, 5302, 392, 30, 2027, 9960, 420, 11, 575, 31, 76, 9963, 101, 96, 90, 1330, 601, 857, 7, 5101, 30, 413, 9969, 6298, 3976, 28, 1271, 31, 285, 976, 9963, 45, 8654, 9960, 306, 121, 123, 8239, 534, 36, 186, 9960, 1113, 895, 69, 825, 166, 7, 2665, 6879, 42, 11, 229, 6792, 9969, 1199, 9960, 45, 1889, 7, 4319, 1233, 64, 471, 527, 5472, 1753, 1911, 9963, 48, 9968, 9954, 83, 1671, 31, 866, 1871, 9960, 86, 32, 9968, 9949, 4969, 9963, 829, 11, 344, 32, 6

In [14]:
m = BigramLanguageModel(vocab_size)
m.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([8192, 10000])
tensor(9.3896, device='cuda:0', grad_fn=<NllLossBackward0>)
 ⁇ or decision gam threwam excuse staredumbled different burned carvedaced reass avoiding Bed awoke jail student Havisham conject French compre unreasonatychNonsense unworthy prison races attempt hastened voluntelkilt Varenka common relating fought bord his mans snowepotonsieurduII taught considerable pretend badly bodies adj generals isol wh whiteness rubbed fierceivityding washing own fer warnMrs stoveK awful whenational vest cig knockathcliff poserred inspector extended rockshor granted immedi Bezukhov nevertheless transl wheelfast property innerat triumphantenedettoions England belong surievIL nodding speechattered


In [17]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [18]:
#m.load_state_dict(torch.load('../weights/pretrained_weights.pth'))

In [19]:
batch_size = 32

test_loss = 0
steps = 0
total_train_loss = 0
train_steps = 0
test_train_loss_ratio = 0

while test_train_loss_ratio < 1.15 or steps < 7000:
    #sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits, loss = m(xb, yb)
    total_train_loss += loss.item()
    train_steps += 1

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    steps += 1
    if steps % 300 == 299:
        avg_train_loss = total_train_loss / train_steps
        print(f"{steps+1}: Train loss: {avg_train_loss}")
        total_train_loss = 0
        train_steps = 0
        test_loss = 0

        m.eval()
        num_batches = 10
        with torch.no_grad():
            for i in range(num_batches):
                xb, yb = get_batch('test')
                logits, loss = m(xb, yb)
                test_loss += loss.item()

        test_loss /= num_batches
        test_train_loss_ratio = test_loss / avg_train_loss
        print(f"{steps+1}: Test loss: {test_loss}")
        with torch.no_grad():
            idx = torch.tensor((encode("He said")), dtype=torch.long, device=device).unsqueeze(0)
            print(decode(m.generate(idx, max_new_tokens=30)[0].tolist()))
        m.train()

    if steps % 10000 == 9999:
        print(f"Saving weights. Iteration {steps+1}")
        torch.save(m.state_dict(), '../weights/pretrained_weights.pth')

300: Train loss: 5.919365051996748
300: Test loss: 5.387801933288574
He said, and asked the world. But he found him in his heart, and was a large house. The little head of his wife, and he was
600: Train loss: 5.152271300951639
600: Test loss: 5.076488971710205
He said, "I know what is not to be sure." The man. "Well, I don't know him to give me." "No,
900: Train loss: 4.886018233299255
900: Test loss: 4.906076097488404
He said, "and I have been so much to go and that you should like a woman is very soon as to be no longer. That's the
1200: Train loss: 4.729386890729268
1200: Test loss: 4.75801362991333
He said, "I do not know," he added. He was in the room, and his presence of this evening he had not to be the money.
1500: Train loss: 4.600904774665833
1500: Test loss: 4.711012268066407
He said, "and I am not to you." He took her, and looked at him. "I have only been so glad to be it. I
1800: Train loss: 4.508720094362895
1800: Test loss: 4.6402641296386715
He said, "I'll say it. I sh

KeyboardInterrupt: 

In [20]:
torch.save(m.state_dict(), '../weights/pretrained_weights.pth')

In [21]:
#idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = torch.tensor((encode("The count said")), dtype=torch.long, device=device).unsqueeze(0)
print(''.join(decode(m.generate(idx, max_new_tokens=500, temperature=0.5)[0].tolist())))

The count said, and the baroness had not his appearance. "You are very happy, then," said he to Albert. "I am very sorry; but it is not to be said that I am sure of you," he added, "that is a very interesting man." Albert could not help exclaiming the count. "And why did you expect to be?" said Albert, "I have done so much of it." The count was with a smile. "What?" said he, in astonishment; and when the count's eyes were visible. "I have come to one of your friends, and you will see I am going to the countess." "But that is it possible?" said he, fixing his eyes with a deep sigh. "I have no time to get rid of him," said he. "I have not noticed it." Albert gave the word of his lordship; he had uttered and the words with which Franz would have turned out his hand. "You are a very good man," said the count, and he did not know what was said. "I have you told me," replied Albert, with a smile which he was not at all. "I am sure, I have only a good deal of your family." "But I have not per