# 🐣 Baseline Bigram Model

#### 📚 Libraries
Import libraries and configure the environment.

In [47]:
import json
import torch
import torch.nn as nn
from torch.nn import functional as F

#### 📂 Data
Load the training/validation tensors and the encoder dictionary from local disk.

In [48]:
# Load the training and validation splits that were saved to disk
# (presumably tensors of token ids; get_batch slices and stacks them).
# NOTE(review): torch.load unpickles the file, so only load files you trust;
# consider passing weights_only=True where the installed PyTorch supports it.
train_data = torch.load("data/train_data.pt")
val_data = torch.load("data/val_data.pt")

In [49]:
# Load the vocabulary mappings that were saved as JSON.
with open("data/encoder_dict.json", "r") as f:
    encoder_dict = json.load(f)

# stoi: presumably token string -> integer id; its length is used as the
# vocabulary size further down.
stoi = encoder_dict["stoi"]
# itos: integer id -> token string. JSON object keys are always strings, so
# they are converted back to int here; otherwise lookups with integer token
# ids (as done in decode) would raise KeyError.
itos = {int(k): v for k, v in encoder_dict["itos"].items()}

### 🦮 Data batching

In [50]:
batch_size = 4  # number of sequences drawn per batch
block_size = 8  # context length: number of tokens in each sequence


def get_batch(split):
    """Draw a random batch of input windows and their next-token targets.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size``.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked windows, each with ``batch_size`` rows of
        ``block_size`` elements. Row ``r`` of ``y`` is the window of row ``r``
        of ``x`` shifted one position to the right.
    """
    source = val_data if split != "train" else train_data
    # The exclusive upper bound keeps start + block_size + 1 within the data,
    # which the shifted target window needs.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1 : stop + 1])
    return torch.stack(inputs), torch.stack(targets)

In [51]:
def decode(integers: list, itos: dict = itos) -> str:
    """Turn a sequence of token ids back into the text they encode.

    Args:
        integers: token ids to decode, in order.
        itos: mapping from token id to its string; defaults to the
            module-level ``itos`` as it was when this function was defined.

    Returns:
        The concatenation of the strings for each id.

    Raises:
        KeyError: if an id is not present in ``itos``.
    """
    pieces = []
    for token_id in integers:
        pieces.append(itos[token_id])
    return "".join(pieces)

### 🤖 Neural Network

In [52]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``(vocab_size, vocab_size)`` embedding table; row
    ``i`` holds the unnormalised scores (logits) for the token following
    token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; sets both the number of
                rows and the number of logits per row.
        """
        super().__init__()
        # each token directly reads the logits for the next token
        # from the lookup table
        self.lookup = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, C) with
            C == vocab_size, and ``loss`` is a scalar tensor, or ``None`` when
            ``targets`` is not given.
        """
        # (B, T) -> (B, T, C)
        logits = self.lookup(idx)

        if targets is None:
            return logits, None

        # cross_entropy takes (N, C) scores and (N,) class ids, so fold the
        # batch and time axes together. reshape is used instead of view so
        # that non-contiguous targets (e.g. a transposed tensor) are accepted
        # rather than raising a RuntimeError.
        num_classes = logits.shape[-1]
        loss = F.cross_entropy(
            logits.reshape(-1, num_classes),  # (B*T, C)
            targets.reshape(-1),  # (B*T,)
        )
        return logits, loss

    def generate(self, idx, length):
        """Extend ``idx`` by sampling ``length`` new tokens per sequence.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            length: number of tokens to append to each sequence.

        Returns:
            Integer tensor of shape (B, T + length): the context followed by
            the sampled tokens. No gradients are tracked.
        """
        with torch.no_grad():
            for _ in range(length):
                # A bigram prediction depends only on the most recent token,
                # so look up just the last column instead of re-embedding the
                # whole, ever-growing history. Calling the module (not
                # .forward) keeps any registered hooks working.
                logits, _ = self(idx[:, -1:])  # (B, 1, C)
                logits = logits[:, -1, :]  # (B, C)
                # apply softmax to get the probabilities
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample the next token
                next_token = torch.multinomial(probs, 1)  # (B, 1)
                # append the next token to the sequence
                idx = torch.cat([idx, next_token], dim=-1)  # (B, T+1)
        return idx

In [57]:
# Fix the RNG seed so the batch sampled below is reproducible.
torch.manual_seed(42)
# One (inputs, targets) batch from the training split, used to try out the model.
xb, yb = get_batch("train")

In [58]:
# The model emits one logit per vocabulary entry, so its size comes from the vocabulary.
vocab_size = len(stoi)
model = BigramLanguageModel(vocab_size)
# Forward pass on the sample batch; passing the targets also returns the loss.
logits, loss = model(xb, yb)
print(logits.shape)  # (batch, time, vocab_size)
# Loss before any training. For reference, a uniform guess over the
# vocabulary would score ln(vocab_size).
print(loss)

torch.Size([4, 8, 96])
tensor(4.9825, grad_fn=<NllLossBackward0>)


In [59]:
# Sample before training: start from a single sequence holding token id 0
# and draw 100 further tokens.
tokens = model.generate(torch.zeros((1, 1), dtype=torch.long), 100)
# Drop the batch dimension and map the ids back to text.
tokens_decoded = decode(tokens.squeeze().tolist())
print(tokens_decoded)

	BOa,	Hes$Hj(yq>145f4lLq5.7Oad3]f5kwYXX3+.$NNs4\$:z:{bLmbHbz=kxV
4c	*zqL0_^I+{)oS6t.Co wh;jgj>'zGL3


### 🧪 Training

In [60]:
# AdamW over every parameter of the model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [61]:
# get_batch reads the module-level batch_size, so it is reassigned here to
# train with larger batches than the 4 used earlier in the notebook.
batch_size = 32
max_steps = 50000  # total number of optimizer updates
log_interval = 1000  # print the loss every this many steps

for steps in range(max_steps):
    # sample a batch of data
    xb, yb = get_batch("train")
    # Clear gradients *before* the backward pass. If this cell is interrupted
    # between backward() and the end of an iteration, re-running it then
    # starts from clean gradients instead of accumulating onto stale ones.
    optimizer.zero_grad(set_to_none=True)
    # forward pass
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()
    if steps % log_interval == 0:
        # Loss of the current mini-batch only, so the printed values are noisy.
        print(f"Step: {steps}, Loss: {loss.item():.3f}")

Step: 0, Loss: 4.912
Step: 1000, Loss: 3.836
Step: 2000, Loss: 2.960
Step: 3000, Loss: 2.724
Step: 4000, Loss: 2.561
Step: 5000, Loss: 2.371
Step: 6000, Loss: 2.379
Step: 7000, Loss: 2.429
Step: 8000, Loss: 2.304
Step: 9000, Loss: 2.257
Step: 10000, Loss: 2.334
Step: 11000, Loss: 2.371
Step: 12000, Loss: 2.310
Step: 13000, Loss: 2.230
Step: 14000, Loss: 2.277
Step: 15000, Loss: 2.182
Step: 16000, Loss: 2.365
Step: 17000, Loss: 2.320
Step: 18000, Loss: 2.177
Step: 19000, Loss: 2.230
Step: 20000, Loss: 2.323
Step: 21000, Loss: 2.306
Step: 22000, Loss: 2.201
Step: 23000, Loss: 2.272
Step: 24000, Loss: 2.287
Step: 25000, Loss: 2.422
Step: 26000, Loss: 2.217
Step: 27000, Loss: 2.266
Step: 28000, Loss: 2.407
Step: 29000, Loss: 2.334
Step: 30000, Loss: 2.275
Step: 31000, Loss: 2.198
Step: 32000, Loss: 2.261
Step: 33000, Loss: 2.277
Step: 34000, Loss: 2.277
Step: 35000, Loss: 2.286
Step: 36000, Loss: 2.360
Step: 37000, Loss: 2.179
Step: 38000, Loss: 2.284
Step: 39000, Loss: 2.421
Step: 40000, 

In [64]:
# Sample again after the training loop: start from a single sequence holding
# token id 0 and draw 200 further tokens.
tokens = model.generate(torch.zeros((1, 1), dtype=torch.long), 200)
# Drop the batch dimension and map the ids back to text.
tokens_decoded = decode(tokens.squeeze().tolist())
print(tokens_decoded)

	s en l ento de viosider ca ortes lasestecuil fa prdejo cus cu la dellion steca ecoresa dela cun ran sentona s dero roprmeser.gio l l 16471 los naden y deconacarono 1.a da 1901974 stosio. do a al de (0
