In [1]:
from src.utils import get_files_from_folder, open_txt
from src.model import BigramLanguageModel
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Read every file that get_files_from_folder reports for "books" into
# memory, one entry per file, via the project helper open_txt.
books = get_files_from_folder("books")
books_string = list(map(lambda name: open_txt(f"books/{name}"), books))

# Show the first 500 characters of the first book as a quick sanity check.
print(books_string[0][0:500])

NOTES FROM THE UNDERGROUND[*]
A NOVEL


* The author of the diary and the diary itself are, of course,
imaginary. Nevertheless it is clear that such persons as the writer of
these notes not only may, but positively must, exist in our society,
when we consider the circumstances in the midst of which our society is
formed. I have tried to expose to the view of the public more
distinctly than is commonly done, one of the characters of the recent
past. He is one of the representatives of a generatio


In [3]:
# Merge all books into one corpus string, with a newline between books.
all_books = "\n".join(books_string)
print(f"All books have a lenght of: {len(all_books)}")

# Character-level vocabulary: every distinct character of the corpus,
# in sorted order so that the character -> id mapping is deterministic.
vocab = list(set(all_books))
vocab.sort()
vocab_size = len(vocab)
print(vocab)
print(f"The vocabulary has a lenght of: {vocab_size}")

All books have a lenght of: 7113352
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'À', 'Æ', 'É', 'à', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'Œ', 'œ', '‐', '—', '‘', '’', '“', '”']
The vocabulary has a lenght of: 104


In [4]:
# Lookup tables between characters and integer ids; a character's id is its
# position in the sorted `vocab` list.
itos = dict(enumerate(vocab))
stoi = {c: i for i, c in itos.items()}


def encode(x):
    """Convert a string into the list of integer ids of its characters.

    Args:
        x: the text to encode; every character must be present in `vocab`.

    Returns:
        A list of ints, one id per character of `x`, in order.

    Raises:
        KeyError: if `x` contains a character that is not in `vocab`.
    """
    return [stoi[c] for c in x]


def decode(x):
    """Convert an iterable of integer ids back into the string they spell.

    Args:
        x: iterable of ids, each a valid key of `itos`.

    Returns:
        The concatenation of the characters the ids stand for.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return "".join(itos[i] for i in x)


# Round-trip check: decoding an encoded string must give the string back.
print(encode("Dostoyevsky"))
print(decode(encode("Dostoyevsky")))

[27, 67, 71, 72, 67, 77, 57, 74, 71, 63, 77]
Dostoyevsky


In [5]:
# Encode the whole corpus once and keep it as an int64 tensor of token ids.
data = torch.tensor(
    encode(all_books),
    dtype=torch.long,
)
print(data.size(), data.dtype)
print(data[0:10])

torch.Size([7113352]) torch.int64
tensor([37, 38, 43, 28, 42,  1, 29, 41, 38, 36])


In [6]:
# Train on the first 90% of the tokens; hold out the remaining tail for
# validation.
train_size = int(0.9 * len(data))
train_data, val_data = data[:train_size], data[train_size:]

In [7]:
# Batch geometry read by get_batch() at call time:
#   batch_size - number of sequences drawn per batch
#   block_size - number of tokens in each sequence (context length)
batch_size, block_size = 4, 8


def get_batch(split):
    """Sample a random batch of (input, target) sequences for next-token training.

    Reads the module-level ``train_data`` / ``val_data`` tensors and the
    module-level ``batch_size`` / ``block_size`` values at call time, so
    reassigning those globals changes what later calls return.

    Args:
        split: ``"train"`` to sample from ``train_data`` or ``"val"`` to
            sample from ``val_data``.

    Returns:
        A tuple ``(x, y)``. For the 1-D token tensors used in this notebook
        both have shape ``(batch_size, block_size)``, and ``y`` holds the same
        windows as ``x`` shifted one position forward, i.e. ``y[b, t]`` is the
        token that follows ``x[b, t]`` in the source data.

    Raises:
        ValueError: if ``split`` is neither ``"train"`` nor ``"val"``.
    """
    if split == "train":
        source = train_data
    elif split == "val":
        source = val_data
    else:
        # An unrecognised name (e.g. a typo) used to fall through silently to
        # the validation set; fail loudly instead.
        raise ValueError(f"split must be 'train' or 'val', got {split!r}")

    # Start offsets are drawn from [0, len(source) - block_size) so that the
    # target window, which ends one token later than the input window, still
    # fits inside the data.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i : i + block_size] for i in ix])
    y = torch.stack([source[i + 1 : i + block_size + 1] for i in ix])
    return x, y


# Draw one demonstration batch and show the raw input and target tensors.
xb, yb = get_batch("train")
for heading, tensor in (("inputs:", xb), ("targets:", yb)):
    print(heading)
    print(tensor.shape)
    print(tensor)

print("----")

# Each row packs block_size training examples: the prefix of the input row
# ending at position t is the context, and the target row's entry at t is the
# token to predict.
for b in range(batch_size):  # batch dimension
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):  # time dimension
        context = row_inputs[: t + 1]
        target = row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[43, 60, 57, 70, 57,  1, 61, 71],
        [60, 57,  1, 72, 67, 64, 56,  1],
        [60, 53, 72,  1, 55, 67, 66, 59],
        [71, 61, 71, 72, 57, 56,  8,  1]])
targets:
torch.Size([4, 8])
tensor([[60, 57, 70, 57,  1, 61, 71,  1],
        [57,  1, 72, 67, 64, 56,  1, 60],
        [53, 72,  1, 55, 67, 66, 59, 70],
        [61, 71, 72, 57, 56,  8,  1, 54]])
----
when input is [43] the target: 60
when input is [43, 60] the target: 57
when input is [43, 60, 57] the target: 70
when input is [43, 60, 57, 70] the target: 57
when input is [43, 60, 57, 70, 57] the target: 1
when input is [43, 60, 57, 70, 57, 1] the target: 61
when input is [43, 60, 57, 70, 57, 1, 61] the target: 71
when input is [43, 60, 57, 70, 57, 1, 61, 71] the target: 1
when input is [60] the target: 57
when input is [60, 57] the target: 1
when input is [60, 57, 1] the target: 72
when input is [60, 57, 1, 72] the target: 67
when input is [60, 57, 1, 72, 67] the target: 64
when input is [60

In [8]:
# Build the model and run one forward pass on the demo batch to inspect the
# logits shape and the loss before any training.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate from the untrained model, starting from a single token with id 0,
# and decode the resulting ids back to text.
seed_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=seed_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 104])
tensor(5.4505, grad_fn=<NllLossBackward0>)

0UQ6Is]éÀxç5Lcæ8'v5“8?F
îKFp_Ébhaî]nn;"æüÆô8R‐eé;ê7Æék_HtÀPhuYŒüz‘HNö ‘Œ5”èX
(S3G‐N’z;ÀëhXxp—8'.'bMO


In [9]:
# AdamW over all parameters of the model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=0.001)

In [10]:
# get_batch() reads the module-level batch_size, so rebinding it here makes
# every batch drawn below contain 32 sequences.
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...
    # draw a fresh random training batch
    xb, yb = get_batch("train")

    # forward pass, then drop stale gradients, backpropagate and update
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # report the loss of the current batch every 1000 steps
    if not steps % 1000:
        print(f"Loss for {steps} : {loss.item()}")
# also report the loss of the very last step
print(f"Loss for {steps} : {loss.item()}")

Loss for 0 : 5.316340923309326
Loss for 1000 : 4.188947677612305
Loss for 2000 : 3.433223247528076
Loss for 3000 : 2.8472392559051514
Loss for 4000 : 2.7882323265075684
Loss for 5000 : 2.693650484085083
Loss for 6000 : 2.564865827560425
Loss for 7000 : 2.62357759475708
Loss for 8000 : 2.478428363800049
Loss for 9000 : 2.5481741428375244
Loss for 9999 : 2.480616807937622


In [12]:
# Generate from the trained model, starting from a single token with id 0,
# and decode the resulting ids back to text.
seed_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=seed_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))


“I hed Pasrye
thapef.. cumeit in m! wanlvis we
w fild Zö?’st, Ithe pexct wherontha. mu pocuakerothoo
