In [1]:
import os

import torch
import torch.nn as nn
from torch.nn import functional as F

from src.model import BigramLanguageModel
from src.utils import get_files_from_folder, open_txt

In [2]:
# List the entries of the "books" folder, then read each one into memory.
# NOTE(review): the order returned by get_files_from_folder decides which text
# ends up in the validation tail later on -- confirm it is deterministic.
books = get_files_from_folder("books")

books_string = []
for file_name in books:
    books_string.append(open_txt(f"books/{file_name}"))

# Preview the first 500 characters of the first book as a sanity check.
print(books_string[0][:500])

NOTES FROM THE UNDERGROUND[*]
A NOVEL


* The author of the diary and the diary itself are, of course,
imaginary. Nevertheless it is clear that such persons as the writer of
these notes not only may, but positively must, exist in our society,
when we consider the circumstances in the midst of which our society is
formed. I have tried to expose to the view of the public more
distinctly than is commonly done, one of the characters of the recent
past. He is one of the representatives of a generatio


In [3]:
# Merge every book into a single newline-separated training corpus.
all_books = "\n".join(books_string)
corpus_length = len(all_books)
print(f"All books have a lenght of: {corpus_length}")

# Character-level vocabulary: each distinct character, in sorted order.
unique_chars = set(all_books)
vocab = sorted(unique_chars)
vocab_size = len(vocab)
print(vocab)
print(f"The vocabulary has a lenght of: {vocab_size}")

All books have a lenght of: 7113352
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'À', 'Æ', 'É', 'à', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'Œ', 'œ', '‐', '—', '‘', '’', '“', '”']
The vocabulary has a lenght of: 104


In [4]:
# Lookup tables between characters and their integer ids (position in vocab).
itos = dict(enumerate(vocab))
stoi = {char: index for index, char in itos.items()}


def encode(x):
    """Return the list of integer ids for the characters of ``x``.

    Raises KeyError if ``x`` contains a character that is not in ``vocab``.
    """
    return [stoi[char] for char in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids ``x``."""
    return "".join(itos[token] for token in x)


# Round-trip check: encoding then decoding must give back the input.
sample_ids = encode("Dostoyevsky")
print(sample_ids)
print(decode(sample_ids))

[27, 67, 71, 72, 67, 77, 57, 74, 71, 63, 77]
Dostoyevsky


In [5]:
# Encode the whole corpus into a single 1-D tensor of int64 token ids
# (one id per character), then show its shape, dtype and first ten ids.
data = torch.tensor(encode(all_books), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:10])

torch.Size([7113352]) torch.int64
tensor([37, 38, 43, 28, 42,  1, 29, 41, 38, 36])


In [6]:
# Sequential (unshuffled) split: the first 90% of the token stream is used for
# training and the remaining tail is held out for validation.
train_fraction = 0.9
train_size = int(len(data) * train_fraction)
train_data, val_data = data[:train_size], data[train_size:]

In [7]:
# Both values are read as globals by get_batch at call time.
batch_size = 4  # sequences drawn per batch (rebound to 32 by the training cell)
block_size = 8  # number of tokens in each sampled sequence


def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: "train" selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(batch_size, block_size)``. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next-token targets.

    Reads the module-level ``batch_size`` and ``block_size`` when called.
    """
    source = train_data if split == "train" else val_data
    # Largest valid start leaves room for block_size inputs plus one target.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    input_rows = []
    target_rows = []
    for start in starts:
        stop = start + block_size
        input_rows.append(source[start:stop])
        target_rows.append(source[start + 1 : stop + 1])

    return torch.stack(input_rows), torch.stack(target_rows)


# Draw one demo batch and show the raw input and target tensors.
xb, yb = get_batch("train")
for label, batch_tensor in (("inputs:", xb), ("targets:", yb)):
    print(label)
    print(batch_tensor.shape)
    print(batch_tensor)

print("----")

# Spell out every training example hidden in the batch: each prefix of a row
# of xb is a context, and the target is the token that follows that prefix.
for row_inputs, row_targets in zip(xb, yb):  # batch dimension
    for end, target in enumerate(row_targets, start=1):  # time dimension
        context = row_inputs[:end]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[70,  1, 72, 60, 61, 70, 72, 77],
        [57,  1, 72, 67,  1, 58, 53, 55],
        [57, 10,  1, 46, 60, 53, 72,  1],
        [72,  1, 72, 67,  1, 65, 57,  1]])
targets:
torch.Size([4, 8])
tensor([[ 1, 72, 60, 61, 70, 72, 77,  1],
        [ 1, 72, 67,  1, 58, 53, 55, 57],
        [10,  1, 46, 60, 53, 72,  1, 65],
        [ 1, 72, 67,  1, 65, 57,  1, 53]])
----
when input is [70] the target: 1
when input is [70, 1] the target: 72
when input is [70, 1, 72] the target: 60
when input is [70, 1, 72, 60] the target: 61
when input is [70, 1, 72, 60, 61] the target: 70
when input is [70, 1, 72, 60, 61, 70] the target: 72
when input is [70, 1, 72, 60, 61, 70, 72] the target: 77
when input is [70, 1, 72, 60, 61, 70, 72, 77] the target: 1
when input is [57] the target: 1
when input is [57, 1] the target: 72
when input is [57, 1, 72] the target: 67
when input is [57, 1, 72, 67] the target: 1
when input is [57, 1, 72, 67, 1] the target: 58
when input is [57, 1, 72

In [8]:
# Build an untrained model and run one forward pass on the demo batch to
# inspect the logits shape and the initial loss.
# NOTE(review): BigramLanguageModel() is constructed without a vocabulary size,
# so it is presumably fixed inside src.model -- confirm it matches vocab_size.
m = BigramLanguageModel()
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens, seeded with token id 0 (the first vocabulary entry).
# Sampling never needs gradients, so run it under no_grad to avoid recording
# an autograd graph for every generated token.
start_idx = torch.zeros((1, 1), dtype=torch.long)
with torch.no_grad():
    generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 104])
tensor(5.3029, grad_fn=<NllLossBackward0>)

xw'äŒë91ŒF*LKZ“‐q!ChêYêlR‘wJ?97"y7lH-zôdœgrPp.'9BKQx_èh7yOOh‐”,[VzfW6bs”ç0.90S.k0écü—ÉpA‐cöfy:lDî'(R


In [9]:
# AdamW optimizer over all parameters of `m`, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [10]:
# get_batch reads this global at call time, so rebinding it here switches the
# training batches to 32 sequences each.
batch_size = 32
max_steps = 10000  # increase number of steps for good results...
log_every = 1000

for steps in range(max_steps):
    # Forward pass on a freshly sampled training batch.
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)

    # Backward pass and parameter update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the (single-batch, hence noisy) loss every `log_every` steps.
    if steps % log_every != 0:
        continue
    print(f"Loss for {steps} : {loss.item()}")

# Loss of the very last step, which the periodic logging above does not cover.
print(f"Loss for {steps} : {loss.item()}")

Loss for 0 : 5.211520195007324
Loss for 1000 : 4.098145961761475
Loss for 2000 : 3.318737506866455
Loss for 3000 : 2.8467729091644287
Loss for 4000 : 2.706305980682373
Loss for 5000 : 2.4431138038635254
Loss for 6000 : 2.5919318199157715
Loss for 7000 : 2.4955062866210938
Loss for 8000 : 2.6086950302124023
Loss for 9000 : 2.5684008598327637
Loss for 9999 : 2.492804527282715


In [11]:
# Sample 100 new tokens from the trained model, seeded with token id 0.
# Sampling never needs gradients, so run it under no_grad to avoid recording
# an autograd graph for every generated token.
start_idx = torch.zeros((1, 1), dtype=torch.long)
with torch.no_grad():
    generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


kio "_'
Yogg or‐_ thase t ngh, l, onvef a an t The w
s hivenouthelfome tist bîud But trkie m;Thalsto


In [12]:
# Persist the trained weights, wrapped in a dict under the "state_dict" key.
checkpoint_path = "model_store/fyodor.pt"

# torch.save does not create missing parent directories; make sure the target
# folder exists so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

torch.save(
    {
        "state_dict": m.state_dict(),
    },
    checkpoint_path,
)

In [13]:
# Build a second, untrained model and display its embedding weights as a
# baseline to compare against once the saved checkpoint has been loaded.
nlp = BigramLanguageModel()
nlp.token_embedding_table.weight

Parameter containing:
tensor([[ 0.4405, -0.2665,  1.5802,  ...,  0.5035, -0.0892,  0.2469],
        [ 0.0377, -0.8076,  0.5226,  ..., -0.9246, -0.3201,  0.0784],
        [ 1.2510, -0.6916, -0.3453,  ...,  0.6187,  0.8984, -1.0661],
        ...,
        [ 0.4359,  1.9608,  0.8644,  ...,  0.1105,  1.6918,  1.2047],
        [ 0.5749,  0.8414,  0.7786,  ...,  1.2773,  0.9308, -0.8005],
        [-1.6228,  2.4729, -0.9491,  ..., -0.0866,  0.0571, -0.1878]],
       requires_grad=True)

In [14]:
# Restore the weights written by the save cell into the fresh model.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all this checkpoint holds, so a tampered file cannot execute arbitrary
# code. map_location="cpu" makes the load independent of the device the
# checkpoint was saved from.
load = torch.load("model_store/fyodor.pt", map_location="cpu", weights_only=True)
nlp.load_state_dict(load["state_dict"])

<All keys matched successfully>

In [15]:
# Display the embedding weights again; after load_state_dict they should differ
# from the random initial values shown before loading.
nlp.token_embedding_table.weight

Parameter containing:
tensor([[ 2.4199, -1.6624, -5.6457,  ..., -5.1911,  1.6332, -4.8237],
        [-4.0091, -1.7476, -6.4224,  ..., -4.5966, -0.7685, -3.9140],
        [-0.4226,  2.2950, -3.2307,  ..., -0.5295, -3.9646,  1.0517],
        ...,
        [-1.2289,  1.1327, -1.8753,  ..., -5.1657, -3.7434, -1.3443],
        [-3.4093, -1.8516, -4.1773,  ..., -4.5596, -3.1489, -4.8970],
        [ 2.4127,  2.5272, -5.1438,  ..., -5.5503, -3.9697, -3.3135]],
       requires_grad=True)

In [16]:
# Sample 100 new tokens from the reloaded model, seeded with token id 0.
# Put the module in evaluation mode and disable gradient tracking: this is
# pure inference, so no autograd graph should be recorded while sampling.
nlp.eval()
start_idx = torch.zeros((1, 1), dtype=torch.long)
with torch.no_grad():
    generated = nlp.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


ucond has
lvert ce Bly, æ3çYo thourtheceng heron y f oreeramanorthanere out ithlemy totrere omend e

