My copy of the code explored in "Let's build GPT: from scratch, in code, spelled out." by Andrej Karpathy

TODO:
implement vocabulary handling of numbers

In [1]:
import tensorflow as tf
import torch
import os
from datasets import load_dataset
import re
import random
import numpy as np
from unidecode import unidecode
import sentencepiece as spm
from transformer import BigramLanguageModel

2024-11-24 17:56:25.554946: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
sp = spm.SentencePieceProcessor(model_file='bpe.model')


In [3]:
encode = lambda s: sp.encode(s, out_type=int)
decode = lambda s: sp.decode(s)
encoded = encode(""""Hello" there""")
print(encoded)

decoded = decode(encoded)
print(decoded)

[45, 9974, 154, 9945, 9965, 238]
"Hello" there


In [4]:
device='cuda'

In [5]:
dataset = load_dataset("daily_dialog")

In [6]:
def dialog_to_tokens():
    """Turns immutable dialog dataset into tokens. Adds tokens to indicate who is speaking"""
    tokenized_dataset = {'train': [], 'test': []}
    speaker = {0 : 'A', 1 : 'B'}

    for ds in ['train', 'test']:
        for i,conversation in enumerate(dataset[ds]):
            c = []
            for j, statement in enumerate(conversation['dialog']):
                statement = unidecode(statement)
                allowed_characters_pattern = r"""[^a-zA-Z0-9\s.,!?;:'"-]"""
                statement = re.sub(allowed_characters_pattern, "", statement)
                statement = re.sub(r'\s+([,.!?;:])', r'\1', statement)
                c += encode(f"""{speaker[j % 2]}: """ + statement)
            tokenized_dataset[ds].append(c)
    return tokenized_dataset
dataset = dialog_to_tokens()

In [7]:
from statistics import mean, median
ls = [len(item) for item in dataset['train']]
print(mean(ls))
print(median(ls))
print(max(ls))
print(min(ls))

144.4313725490196
123.0
1025
10


In [8]:
print(decode(dataset['train'][0]))

A: Say, Jim, how about going for a few beers after dinner? B: You know that is tempting but is really not good for our fitness. A: What do you mean? It will help us to relax. B: Do you really think so? I don't. It will just make us fat and act silly. Remember last time? A: I guess you are right.But what shall we do? I don't feel like sitting at home. B: I suggest a walk over to the gym where we can play singsong and meet some of our friends. A: That's a good idea. I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them. B: Sounds great to me! If they are willing, we could ask them to go dancing with us.That is excellent exercise and fun, too. A: Good.Let ' s go now. B: All right.


In [9]:
print(dataset['train'][0])

[9941, 3, 8893, 9960, 432, 53, 9960, 373, 307, 635, 86, 7, 1149, 50, 203, 401, 1369, 9975, 9941, 4, 558, 242, 63, 113, 4650, 25, 133, 113, 1091, 83, 413, 86, 471, 2746, 381, 9963, 9941, 3, 787, 161, 59, 628, 9975, 372, 239, 1126, 353, 28, 9880, 9963, 9941, 4, 1099, 59, 1091, 428, 126, 9975, 48, 520, 9968, 9943, 9963, 372, 239, 514, 586, 353, 2241, 30, 989, 4861, 9963, 8124, 565, 344, 9975, 9941, 3, 48, 2914, 59, 224, 640, 9963, 781, 225, 545, 230, 161, 9975, 48, 520, 9968, 9943, 552, 322, 1526, 93, 871, 9963, 9941, 4, 48, 5331, 7, 1499, 365, 28, 11, 7794, 9954, 547, 230, 321, 1197, 10, 431, 241, 30, 1189, 299, 31, 471, 1106, 9963, 9941, 3, 955, 9968, 9949, 7, 413, 1374, 9963, 48, 1098, 986, 30, 125, 340, 1311, 248, 238, 28, 1197, 39, 25, 9961, 241, 9963, 5595, 230, 321, 586, 7, 1114, 5120, 77, 236, 9963, 9941, 4, 125, 2270, 427, 28, 109, 9978, 1082, 227, 224, 5835, 9960, 230, 272, 1157, 236, 28, 248, 5174, 77, 353, 9963, 1172, 113, 3154, 6098, 30, 2789, 9960, 293, 9963, 9941, 3, 3787, 

In [10]:
block_size = 256
n_embd = 64*4
l_train_data = len(dataset['train'])
l_test_data = len(dataset['test'])

In [11]:
batch_size = 32 #how many independent sequences we process in parallel
block_size = 256 #maximum context length for predictions

def get_dialog_batch(split):
    l_data = l_train_data if split == 'train' else l_test_data
    dialog_number = torch.randint(l_data-1, (batch_size,))
    x = []
    y = []

    for d in dialog_number:
        len_conv = len(dataset[split][d])

        start = random.randint(0,max(0,len_conv-block_size-1))
        conv = dataset[split][d][start:min(len_conv-1, start+block_size)]
        target = dataset[split][d][start+1:min(len_conv, start+block_size+1)]
        
        x.append(conv + [9999 for i in range(block_size - len(conv))]) #padding EOS token (optimizable)
        y.append(target + [9999 for i in range(block_size - len(target))]) #padding EOS token

    x = torch.tensor(x, dtype=torch.long)
    y = torch.tensor(y, dtype=torch.long)

    x = x.to(device)
    y = y.to(device)
    
    return x,y



xb, yb = get_dialog_batch('train')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target =  yb[b,t]
        print(f'when input is {context.tolist()} the target is {target}')


torch.Size([32, 256])
tensor([[9941,    3, 1193,  ..., 9999, 9999, 9999],
        [ 594,  213, 2092,  ..., 9941,    4,   92],
        [9941,    3,  372,  ..., 9999, 9999, 9999],
        ...,
        [9941,    3, 3787,  ..., 9999, 9999, 9999],
        [9941,    3,   48,  ..., 9999, 9999, 9999],
        [9941,    3,  125,  ..., 9999, 9999, 9999]], device='cuda:0')
torch.Size([32, 256])
tensor([[   3, 1193,   64,  ..., 9999, 9999, 9999],
        [ 213, 2092,   74,  ...,    4,   92, 9966],
        [   3,  372, 7687,  ..., 9999, 9999, 9999],
        ...,
        [   3, 3787,  969,  ..., 9999, 9999, 9999],
        [   3,   48, 2245,  ..., 9999, 9999, 9999],
        [   3,  125,  140,  ..., 9999, 9999, 9999]], device='cuda:0')
when input is [9941] the target is 3
when input is [9941, 3] the target is 1193
when input is [9941, 3, 1193] the target is 64
when input is [9941, 3, 1193, 64] the target is 213
when input is [9941, 3, 1193, 64, 213] the target is 466
when input is [9941, 3, 1193, 64, 

In [12]:
vocab_size = 10000
m = BigramLanguageModel(vocab_size)
m.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=100, training=False)[0].tolist()))

torch.Size([8192, 10000])
tensor(9.3855, device='cuda:0', grad_fn=<NllLossBackward0>)
 ⁇  streng pmers formedoting blessing assass evenings divided gardens sell curlsorable dreaming set couple grownulated suffer slbility cruel grief impulse embroidIV towards indifferent subord prolong steps shook idea Musketeer dec reality comes turningmight Geor argumentmerdyakov anywhere intelligent wrappedrum answ fem Mondher renewed preferred rig gazing prospectsrison pains Kalganov hon Pavl jawurdayasticallyatever ante kissedcca ridden interrog incess deemed mountedumrating thousandLL armsknown Valentine gos underst pockets lance Pavlovna queer deciderumm rum MiYourop mowing firacy paid Ilyin leanangu introdu


In [13]:
optimizer = torch.optim.AdamW(m.parameters(), lr=6e-5)

In [14]:
#torch.save(m.state_dict(), 'weights_2.pth')
m.load_state_dict(torch.load('pretrained_weights.pth'))

<All keys matched successfully>

In [28]:
batch_size = 32
test_loss = 0
steps = 0
#for steps in range(10000):
total_train_loss = 0
train_steps = 0

for i in range(1500):
    #sample a batch of data
    xb, yb = get_dialog_batch('train')

    #evaluate the loss
    logits, loss = m(xb, yb)
    total_train_loss += loss.item()
    train_steps += 1

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    steps += 1
    if steps % 300 == 299:
        avg_train_loss = total_train_loss / train_steps
        print(f"{steps+1}: Train loss: {avg_train_loss}")
        total_train_loss = 0
        train_steps = 0
        test_loss = 0

        m.eval()
        with torch.no_grad():
            for i in range(10):
                xb, yb = get_dialog_batch('test')
                logits, loss = m(xb, yb)
                test_loss += loss.item()
        test_loss /= 10
        print(f"{steps+1}: Test loss: {test_loss}")
        with torch.no_grad():
            idx = torch.tensor((encode("A: What does that mean? ") + encode("B:")), dtype=torch.long, device=device).unsqueeze(0)
            print(decode(m.generate(idx, max_new_tokens=30, training=False)[0].tolist()))
        m.train()

    if steps % 10000 == 9999:
        print(f"Saving weights. Iteration {steps+1}")
        torch.save(m.state_dict(), 'weights_chat.pth')

300: Train loss: 2.955721692496718
300: Test loss: 3.020609998703003
A: What does that mean? B: I'm going to take a look at things. A:
600: Train loss: 2.940930296579997
600: Test loss: 3.0983957767486574
A: What does that mean? B: It's a bad idea. I think it is one of the most important guests. A:
900: Train loss: 2.928585325082143
900: Test loss: 3.042486000061035
A: What does that mean? B: Well, it's a good idea. A:
1200: Train loss: 2.9203366303443907
1200: Test loss: 2.9882644176483155
A: What does that mean? B: I don't know. You can try it on, but I'm not sure. A:
1500: Train loss: 2.898792018890381
1500: Test loss: 2.9800052642822266
A: What does that mean? B: It's a real headache. I don ' t know what's wrong with that, but I don ' t know what to do.


In [26]:
torch.save(m.state_dict(), 'weights_chat.pth')

In [25]:
#idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = torch.tensor((encode("A: What time is dinner? B:")), dtype=torch.long, device=device).unsqueeze(0)
print(''.join(decode(m.generate(idx, max_new_tokens=500, temperature=0.5, training=False)[0].tolist())))

A: What time is dinner? B: I ' m going to order now. Now you can get back a walk in the park. A:


In [17]:
m.save_weights("transformer_weights.h5")

AttributeError: 'BigramLanguageModel' object has no attribute 'save_weights'