In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [24]:
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)
q = query(x)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) = (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v

out.shape  # (4, 8, 16)

torch.Size([4, 8, 16])

In [25]:
k = key(x)
q = query(x)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) = (B, T, T)

In [26]:
k.var()

tensor(0.3224, grad_fn=<VarBackward0>)

In [27]:
q.var()

tensor(0.3580, grad_fn=<VarBackward0>)

In [28]:
wei.var()

tensor(1.6897, grad_fn=<VarBackward0>)

In [29]:
k = key(x)
q = query(x)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [30]:
q.var()

tensor(0.3580, grad_fn=<VarBackward0>)

In [31]:
wei.var()

tensor(0.1056, grad_fn=<VarBackward0>)

In [32]:
wei[0][0]

tensor([-0.5909,  0.1981,  0.0335, -0.0970,  0.0424,  0.1537,  0.2449,  0.0473],
       grad_fn=<SelectBackward0>)

# Train model

In [41]:
import torch.nn as nn
from data import FyodorDataset
from model import BigramLanguageModel
from train import get_batch
from utils import get_files_from_folder, open_txt
from tqdm import trange

batch_size = 32
block_size = 8
max_iters = 5000
eval_iters = 10
learning_rate = 1e-3
device = "cuda" if torch.cuda.is_available() else "cpu"
n_embd = 32

books = get_files_from_folder("../books")
books_string = [open_txt(f"../books/{i}") for i in books]
books = "\n".join(books_string)

train_dataset = FyodorDataset(books[: int(len(books) * 0.8)])
val_dataset = FyodorDataset(books[int(len(books) * 0.8) :])

model = BigramLanguageModel(n_embd=n_embd, block_size=block_size, device=device)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

t = trange(max_iters)
for steps in t:
    # sample a batch of data
    xb, yb = get_batch(
        train_dataset.data, block_size=block_size, batch_size=batch_size, device=device
    )

    model.train()
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # evalute on valditation set
    model.eval()
    val_loss = torch.zeros(eval_iters)
    with torch.no_grad():
        for i in range(eval_iters):
            xb, yb = get_batch(
                val_dataset.data,
                block_size=block_size,
                batch_size=512,
                device=device,
            )
            logits, loss = model(xb, yb)
            val_loss[i] = loss
        val_loss = val_loss.mean()

    t.set_description(f"train_loss: {loss.item():.4f} | val_loss: {val_loss:.4f}")

print("Done")

train_loss: 2.4303 | val_loss: 2.4147: 100%|██████████| 5000/5000 [04:05<00:00, 20.34it/s]

Done



