In [9]:
from pathlib import Path

import torch
import torch.nn as nn
from torch.nn import functional as F

from src.model import BigramLanguageModel
from src.utils import get_files_from_folder, open_txt

In [10]:
# Collect the entries of the "books" folder and load each one as text.
# NOTE(review): get_files_from_folder presumably returns bare file names
# (they are joined onto "books/" below) -- confirm against src.utils.
books = get_files_from_folder("books")
books_string = [
    open_txt(f"books/{filename}")
    for filename in books
]
# Sanity check: show the first 500 characters of the first loaded book.
print(books_string[0][:500])

NOTES FROM THE UNDERGROUND[*]
A NOVEL


* The author of the diary and the diary itself are, of course,
imaginary. Nevertheless it is clear that such persons as the writer of
these notes not only may, but positively must, exist in our society,
when we consider the circumstances in the midst of which our society is
formed. I have tried to expose to the view of the public more
distinctly than is commonly done, one of the characters of the recent
past. He is one of the representatives of a generatio


In [11]:
# Merge every book into a single corpus string, one newline between books.
all_books = "\n".join(books_string)
print(f"All books have a lenght of: {len(all_books)}")

# Character-level vocabulary: every distinct character, in sorted order so
# that the character -> id assignment is stable between runs.
vocab = list(set(all_books))
vocab.sort()
vocab_size = len(vocab)
print(vocab)
print(f"The vocabulary has a lenght of: {vocab_size}")

All books have a lenght of: 7113352
['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'À', 'Æ', 'É', 'à', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ö', 'ü', 'Œ', 'œ', '‐', '—', '‘', '’', '“', '”']
The vocabulary has a lenght of: 104


In [12]:
# Lookup tables between characters and their integer ids (the id of a
# character is its position in the sorted vocabulary).
stoi = {char: idx for idx, char in enumerate(vocab)}
itos = dict(enumerate(vocab))


def encode(x):
    """Return the list of integer ids for the characters of string ``x``.

    Raises:
        KeyError: if ``x`` contains a character that is not in ``vocab``.
    """
    return [stoi[c] for c in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids ``x``.

    Raises:
        KeyError: if ``x`` contains an id that is not in ``itos``.
    """
    return "".join(itos[c] for c in x)


# Round-trip check: decode(encode(s)) must give back s.
print(encode("Dostoyevsky"))
print(decode(encode("Dostoyevsky")))

[27, 67, 71, 72, 67, 77, 57, 74, 71, 63, 77]
Dostoyevsky


In [13]:
# Encode the whole corpus as one flat tensor of 64-bit integer token ids
# (torch.int64 is the same dtype as torch.long).
data = torch.tensor(encode(all_books), dtype=torch.int64)
print(data.size(), data.dtype)
# Show the first ten token ids.
print(data[:10])

torch.Size([7113352]) torch.int64
tensor([37, 38, 43, 28, 42,  1, 29, 41, 38, 36])


In [14]:
# Sequential 90/10 split: the first 90% of the token stream is used for
# training, the remaining tail is held out for validation.
train_size = int(0.9 * len(data))
train_data, val_data = data[:train_size], data[train_size:]

In [15]:
# Seed torch's global RNG before the first random draw so that batch
# sampling (and everything random that runs after this cell) is repeatable
# under Restart Kernel -> Run All.
SEED = 1337
torch.manual_seed(SEED)

batch_size = 4  # number of independent sequences sampled per batch
block_size = 8  # number of tokens in each sampled context window


def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    ``batch_size`` and ``block_size`` are read from the notebook globals at
    call time, so reassigning them in a later cell changes the batches that
    this function returns.

    Args:
        split: ``"train"`` to sample from ``train_data`` or ``"val"`` to
            sample from ``val_data``.

    Returns:
        A tuple ``(x, y)``. Each is ``batch_size`` stacked windows of
        ``block_size`` consecutive elements of the chosen split; ``y`` holds
        the same windows as ``x`` shifted one position to the right, so
        ``y[b, t]`` is the element that follows ``x[b, t]`` in the data.

    Raises:
        ValueError: if ``split`` is not ``"train"`` or ``"val"``, or if the
            chosen split is too short to hold one window plus its target.
    """
    # Reject unknown names explicitly: falling back to the validation data
    # would silently turn a typo such as "trian" into evaluation batches.
    if split == "train":
        source = train_data
    elif split == "val":
        source = val_data
    else:
        raise ValueError(f"split must be 'train' or 'val', got {split!r}")

    # A window needs block_size inputs plus one extra element for the last
    # target, so the split must be strictly longer than block_size.
    if len(source) <= block_size:
        raise ValueError(
            f"split {split!r} has {len(source)} tokens; "
            f"need more than block_size={block_size}"
        )

    # randint's upper bound is exclusive, so the largest start index still
    # leaves room for the shifted target window (i + block_size + 1 <= len).
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i : i + block_size] for i in ix])
    y = torch.stack([source[i + 1 : i + block_size + 1] for i in ix])
    return x, y


# Draw one training batch and show the raw input / target tensors.
xb, yb = get_batch("train")
print("inputs:")
print(xb.size())
print(xb)
print("targets:")
print(yb.size())
print(yb)

print("----")

# Spell out the individual training examples packed into the batch: for
# every row and every position t, the context is the row's first t + 1
# inputs and the target is the row's t-th target.
for b in range(batch_size):  # batch dimension
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(block_size):  # time dimension
        context = inputs_row[: t + 1]
        target = targets_row[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[57, 10,  1, 43, 60, 57, 77,  1],
        [66, 56,  1, 61, 65, 68, 70, 57],
        [71, 61, 64, 57, 66, 55, 57,  1],
        [67, 70, 72, 60, 64, 57, 71, 71]])
targets:
torch.Size([4, 8])
tensor([[10,  1, 43, 60, 57, 77,  1, 65],
        [56,  1, 61, 65, 68, 70, 57, 71],
        [61, 64, 57, 66, 55, 57,  1, 75],
        [70, 72, 60, 64, 57, 71, 71,  2]])
----
when input is [57] the target: 10
when input is [57, 10] the target: 1
when input is [57, 10, 1] the target: 43
when input is [57, 10, 1, 43] the target: 60
when input is [57, 10, 1, 43, 60] the target: 57
when input is [57, 10, 1, 43, 60, 57] the target: 77
when input is [57, 10, 1, 43, 60, 57, 77] the target: 1
when input is [57, 10, 1, 43, 60, 57, 77, 1] the target: 65
when input is [66] the target: 56
when input is [66, 56] the target: 1
when input is [66, 56, 1] the target: 61
when input is [66, 56, 1, 61] the target: 65
when input is [66, 56, 1, 61, 65] the target: 68
when input is [66, 56

In [16]:
# Untrained baseline: a single forward pass on the sample batch, printing the
# shape of the logits and the loss before any optimisation step.
m = BigramLanguageModel()
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate 100 new tokens starting from a single token with id 0 and decode
# them to text. Sampling needs no gradients, so it runs under no_grad to
# skip autograd bookkeeping; the forward pass above is left untouched.
with torch.no_grad():
    start_idx = torch.zeros((1, 1), dtype=torch.long)
    generated = m.generate(idx=start_idx, max_new_tokens=100)
# generate is called with a batch of one sequence; take that sequence.
print(decode(generated[0].tolist()))

torch.Size([32, 104])
tensor(5.3681, grad_fn=<NllLossBackward0>)

[ö]çJéäëACIööPEKfh;é6ÆTu!mIJ](EŒôw'e‘[Càôwä9fœäà'04(?*IRbœâîIQ4‘ÀAcvèîä9Àœê",3mKfvéGIöDXqfà‘löXL4]’œ


In [17]:
# AdamW over every parameter of the model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(
    params=m.parameters(),
    lr=1e-3,
)

In [18]:
# get_batch reads the notebook-level batch_size when it is called, so this
# reassignment makes every training batch hold 32 sequences.
batch_size = 32
max_steps = 10000  # increase number of steps for good results...
log_interval = 1000  # print the loss every this many steps

for steps in range(max_steps):
    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report periodically and once more on the very last step. Doing the
    # final report inside the loop avoids reading `steps` / `loss` after a
    # loop that never ran, and avoids printing the last step twice when it
    # already falls on a logging interval.
    if steps % log_interval == 0 or steps == max_steps - 1:
        print(f"Loss for {steps} : {loss.item()}")

Loss for 0 : 5.1526103019714355
Loss for 1000 : 4.051868915557861
Loss for 2000 : 3.2067062854766846
Loss for 3000 : 2.944689989089966
Loss for 4000 : 2.745990037918091
Loss for 5000 : 2.646265983581543
Loss for 6000 : 2.4835376739501953
Loss for 7000 : 2.3692567348480225
Loss for 8000 : 2.6100475788116455
Loss for 9000 : 2.6028037071228027
Loss for 9999 : 2.5197811126708984


In [19]:
# Generate 100 new tokens from the model, starting from a single token with
# id 0, and decode them to text. Sampling needs no gradients, so it runs
# under no_grad to skip autograd bookkeeping.
with torch.no_grad():
    start_idx = torch.zeros((1, 1), dtype=torch.long)
    generated = m.generate(idx=start_idx, max_new_tokens=100)
# generate is called with a batch of one sequence; take that sequence.
print(decode(generated[0].tolist()))


Whall me Dofimortmangil, ithe, ouly nt aler fZqustliedeprevetheled watitarlprortherald wngheged wath


In [20]:
# Persist the model's state_dict (not the module object itself).
checkpoint_path = Path("model_store/fyodor.pth")
# torch.save does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail here after training.
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(m.state_dict(), checkpoint_path)