[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dougc333/Colab-Notebooks/blob/main/char_pytorch_trans.ipynb)


In [4]:
#char_pytorch_trans.ipynb
# Colab-only: mount Google Drive at /content/drive so later cells can read
# and write files stored there.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
%cd /content/drive/MyDrive/'Colab Notebooks'

/content/drive/MyDrive/Colab Notebooks


In [None]:
#!/usr/bin/env python
import math
import time
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F


# ============================================================
# 1. Model definition: 2-layer Transformer LM
# ============================================================

@dataclass
class ModelConfig:
    """Hyperparameters consumed by ``TinyTransformerLM``."""

    vocab_size: int          # rows of the token embedding / width of the logits
    d_model: int = 128       # embedding width used throughout the model
    n_heads: int = 4         # attention heads per encoder layer
    n_layers: int = 2        # number of stacked encoder layers
    d_ff: int = 256          # feed-forward width inside each encoder layer
    max_seq_len: int = 128   # rows of the learned positional-embedding table
    dropout: float = 0.1     # dropout rate handed to each encoder layer


class TinyTransformerLM(nn.Module):
    """Small causal Transformer language model.

    Token embeddings and learned positional embeddings are summed, passed
    through an ``nn.TransformerEncoder`` stack under a causal attention mask,
    normalised by a final ``LayerNorm`` and projected to vocabulary logits.
    """

    def __init__(self, cfg: "ModelConfig"):
        """Build the layers described by ``cfg``.

        ``cfg`` must expose ``vocab_size``, ``d_model``, ``n_heads``,
        ``n_layers``, ``d_ff``, ``max_seq_len`` and ``dropout``.
        """
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.max_seq_len, cfg.d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=cfg.d_model,
            nhead=cfg.n_heads,
            dim_feedforward=cfg.d_ff,
            dropout=cfg.dropout,
            batch_first=True,  # (B, T, C)
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=cfg.n_layers,
        )

        self.ln_f = nn.LayerNorm(cfg.d_model)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise both embeddings and the output head with N(0, 0.02)."""
        nn.init.normal_(self.tok_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        """
        idx: LongTensor [B, T], token indices (T <= cfg.max_seq_len)
        returns: logits [B, T, vocab_size]

        Raises:
            ValueError: if T exceeds ``cfg.max_seq_len``.
        """
        _, T = idx.shape
        if T > self.cfg.max_seq_len:
            # The positional table only has max_seq_len rows; fail early with
            # a readable message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds max_seq_len "
                f"{self.cfg.max_seq_len}"
            )
        device = idx.device

        # Token + positional embeddings
        tok = self.tok_emb(idx)                      # [B, T, C]
        pos_ids = torch.arange(T, device=device)     # [T]
        pos = self.pos_emb(pos_ids)[None, :, :]      # [1, T, C]
        x = tok + pos                                # [B, T, C]

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere,
        # so position t cannot attend to positions > t.
        mask = torch.triu(
            torch.full((T, T), float("-inf"), device=device),
            diagonal=1,
        )

        x = self.transformer(x, mask)                # [B, T, C]
        x = self.ln_f(x)
        logits = self.lm_head(x)                     # [B, T, V]
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens: int, temperature: float = 1.0):
        """
        Autoregressive sampling.

        idx: [B, T] initial context (T >= 1 when tokens are requested)
        max_new_tokens: number of tokens appended to every row
        temperature: > 0 rescales the logits before sampling; exactly 0
            selects the arg-max token (greedy decoding)
        returns: [B, T + max_new_tokens] context followed by generated ids

        Raises:
            ValueError: if ``temperature`` is negative, or if the context is
                empty while ``max_new_tokens`` > 0.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        if max_new_tokens > 0 and idx.size(1) == 0:
            raise ValueError("generate() needs at least one context token")

        for _ in range(max_new_tokens):
            # Crop if longer than max_seq_len
            idx_cond = idx[:, -self.cfg.max_seq_len:]
            logits = self(idx_cond)[:, -1, :]        # [B, V], last step only
            if temperature == 0:
                # Dividing by zero would give inf/NaN probabilities.
                next_token = logits.argmax(dim=-1, keepdim=True)   # [B, 1]
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # [B, 1]
            idx = torch.cat([idx, next_token], dim=1)
        return idx


# ============================================================
# 2. Tiny char-level dataset utilities
# ============================================================

class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in ``text``."""

    def __init__(self, text: str):
        # Sorting makes the id assignment deterministic for a given corpus.
        alphabet = sorted(set(text))
        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)

    def encode(self, s: str):
        """Map every character of ``s`` to its id (unknown characters raise ``KeyError``)."""
        table = self.stoi
        return [table[ch] for ch in s]

    def decode(self, ids):
        """Join the characters for ``ids``; each id is passed through ``int()`` first."""
        table = self.itos
        return "".join(table[int(token)] for token in ids)


def make_batches(data_ids, block_size, batch_size, device):
    """
    Yield an endless stream of random (x, y) batches where y is the
    next-token target for x.

    data_ids: 1-D sequence of token ids (at least block_size + 1 long)
    block_size: number of tokens per training window
    batch_size: number of windows per batch
    device: device the yielded tensors are moved to

    Each yielded pair holds two LongTensors of shape [batch_size, block_size];
    y is x shifted one position to the right in the corpus.

    Raises:
        ValueError: on the first ``next()`` if ``block_size`` is not positive
            or the corpus is too short to hold one window plus its target.
    """
    n = len(data_ids) - block_size
    if block_size <= 0 or n <= 0:
        raise ValueError(
            f"need block_size >= 1 and at least block_size + 1 tokens, "
            f"got block_size={block_size} and {len(data_ids)} tokens"
        )

    # Convert the corpus once instead of rebuilding tensors from Python list
    # slices for every sample of every batch.
    data = torch.as_tensor(data_ids, dtype=torch.long)
    offsets = torch.arange(block_size)               # [T]
    while True:
        ix = torch.randint(0, n, (batch_size,))      # window start positions
        window = ix[:, None] + offsets               # [B, T] gather indices
        x = data[window]
        y = data[window + 1]
        yield x.to(device), y.to(device)


# ============================================================
# 3. Demo: train + inference benchmark
# ============================================================

def main():
    """Train the tiny LM on a toy corpus, print a sample, then time inference."""
    # Toy training text (replace with any corpus you like)
    text = (
        "To be, or not to be, that is the question:\n"
        "Whether 'tis nobler in the mind to suffer\n"
        "The slings and arrows of outrageous fortune,\n"
        "Or to take arms against a sea of troubles\n"
        "And by opposing end them.\n"
    )
    tokenizer = CharTokenizer(text)
    data_ids = tokenizer.encode(text)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    def wait_for_device():
        # Only CUDA needs an explicit sync before reading the wall clock.
        if device == "cuda":
            torch.cuda.synchronize(device)

    def timed(fn):
        # Wall-clock seconds for fn() under no_grad, bracketed by device syncs.
        wait_for_device()
        started = time.perf_counter()
        with torch.no_grad():
            fn()
        wait_for_device()
        return time.perf_counter() - started

    block_size = 64
    batch_size = 32

    cfg = ModelConfig(
        vocab_size=tokenizer.vocab_size,
        d_model=128,
        n_heads=4,
        n_layers=2,
        d_ff=256,
        max_seq_len=block_size,
        dropout=0.1,
    )
    model = TinyTransformerLM(cfg).to(device)

    n_params = sum(p.numel() for p in model.parameters())
    print("Model parameters:", n_params / 1e6, "M")

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    train_iter = make_batches(data_ids, block_size, batch_size, device)

    # --------- Training loop (very small, just to show it learns) ----------
    model.train()
    n_steps = 300
    # zip stops on the exhausted range first, so exactly n_steps batches are drawn.
    for step, (x, y) in zip(range(1, n_steps + 1), train_iter):
        logits = model(x)                            # [B, T, V]
        flat_logits = logits.view(-1, cfg.vocab_size)
        flat_targets = y.view(-1)
        loss = F.cross_entropy(flat_logits, flat_targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if not step % 50:
            print(f"step {step}/{n_steps}, loss={loss.item():.4f}")

    # ========================================================
    # 4. Qualitative sample
    # ========================================================
    model.eval()
    prompt_ids = torch.tensor([tokenizer.encode("To be")], device=device)
    sample = model.generate(prompt_ids, max_new_tokens=80, temperature=0.8)[0]
    print("\n--- Sample generation ---")
    print(tokenizer.decode(sample.tolist()))

    # ========================================================
    # 5. Inference performance benchmark
    # ========================================================
    print("\n--- Inference benchmark ---")
    B = 8
    T = block_size
    dummy_input = torch.randint(
        0, cfg.vocab_size, (B, T), device=device
    )

    # Warmup
    with torch.no_grad():
        for _ in range(10):
            model(dummy_input)

    # Measure forward pass throughput
    n_iters = 100

    def forward_loop():
        for _ in range(n_iters):
            model(dummy_input)

    total_time = timed(forward_loop)
    avg_time = total_time / n_iters
    toks_per_sec = (B * T) / avg_time

    print(f"Forward pass: B={B}, T={T}")
    print(f"Avg latency: {avg_time*1e3:.3f} ms")
    print(f"Throughput:  {toks_per_sec:.0f} tokens/sec")

    # Autoregressive generation timing
    gen_len = 64
    B_gen = 1
    start_ids = torch.randint(
        0, cfg.vocab_size, (B_gen, 1), device=device
    )

    gen_time = timed(
        lambda: model.generate(
            start_ids, max_new_tokens=gen_len, temperature=1.0
        )
    )
    print(
        f"Generate {gen_len} tokens (B={B_gen}): "
        f"{gen_time*1e3:.2f} ms "
        f"({gen_len/gen_time:.0f} tok/sec)"
    )


# Run the demo when this code executes as a script or as a notebook cell.
if __name__ == "__main__":
    main()

Using device: cuda
Model parameters: 0.281344 M
step 50/300, loss=2.1429
step 100/300, loss=1.5502
step 150/300, loss=0.9286
step 200/300, loss=0.4686
step 250/300, loss=0.3410
step 300/300, loss=0.2422

--- Sample generation ---
To be, the mind to suffer
The slings and arows of orageous fortune,
Or to take arms a

--- Inference benchmark ---
Forward pass: B=8, T=64
Avg latency: 1.106 ms
Throughput:  463084 tokens/sec
Generate 64 tokens (B=1): 144.67 ms (442 tok/sec)


In [1]:
!pip install -q transformers

In [5]:
from transformers import PretrainedConfig, PreTrainedModel

class TinyConfig(PretrainedConfig):
    """Hugging Face config carrying the tiny Transformer LM hyperparameters.

    Every named argument is stored as an attribute of the same name; any extra
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.
    Dropout is not part of this config.
    """

    model_type = "tiny_transformer_lm"

    def __init__(self,
                 vocab_size: int = 0,        # optional; pass the tokenizer's vocab size
                 d_model: int = 128,
                 n_heads: int = 4,
                 n_layers: int = 2,
                 d_ff: int = 256,
                 max_seq_len: int = 128,
                 **kwargs):
        # Call base class to let HF handle extra generation kwargs
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.d_ff = d_ff
        self.max_seq_len = max_seq_len


class TinyHFModel(PreTrainedModel):
    """``PreTrainedModel`` wrapper that delegates to a ``TinyTransformerLM`` held in ``self.inner``."""

    config_class = TinyConfig

    def __init__(self, config: TinyConfig):
        super().__init__(config)
        # Translate the HF config into the dataclass the inner model expects.
        # TinyConfig has no dropout field, so ModelConfig's default applies.
        inner_cfg = ModelConfig(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            n_heads=config.n_heads,
            n_layers=config.n_layers,
            d_ff=config.d_ff,
            max_seq_len=config.max_seq_len,
        )
        self.inner = TinyTransformerLM(inner_cfg)

    def forward(self, input_ids, labels=None, **kwargs):
        """Return ``{"logits", "loss"}``; ``loss`` is ``None`` unless ``labels`` is given.

        Labels are compared position-by-position with the logits (no shifting
        happens here). Extra keyword arguments are accepted and ignored.
        """
        logits = self.inner(input_ids)
        if labels is None:
            loss = None
        else:
            flat_logits = logits.view(-1, self.config.vocab_size)
            loss = F.cross_entropy(flat_logits, labels.view(-1))
        return {"logits": logits, "loss": loss}

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0, **kwargs):
        """Sample with the inner model's ``generate``; extra keyword arguments are ignored."""
        return self.inner.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )

In [6]:
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in ``text``."""

    def __init__(self, text: str):
        # Sorting makes the id assignment deterministic for a given corpus.
        alphabet = sorted(set(text))
        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)

    def encode(self, s: str):
        """Map every character of ``s`` to its id (unknown characters raise ``KeyError``)."""
        table = self.stoi
        return [table[ch] for ch in s]

    def decode(self, ids):
        """Join the characters for ``ids``; each id is passed through ``int()`` first."""
        table = self.itos
        return "".join(table[int(token)] for token in ids)


def make_batches(data_ids, block_size, batch_size, device):
    """
    Yield an endless stream of random (x, y) batches where y is the
    next-token target for x.

    data_ids: 1-D sequence of token ids (at least block_size + 1 long)
    block_size: number of tokens per training window
    batch_size: number of windows per batch
    device: device the yielded tensors are moved to

    Each yielded pair holds two LongTensors of shape [batch_size, block_size];
    y is x shifted one position to the right in the corpus.

    Raises:
        ValueError: on the first ``next()`` if ``block_size`` is not positive
            or the corpus is too short to hold one window plus its target.
    """
    n = len(data_ids) - block_size
    if block_size <= 0 or n <= 0:
        raise ValueError(
            f"need block_size >= 1 and at least block_size + 1 tokens, "
            f"got block_size={block_size} and {len(data_ids)} tokens"
        )

    # Convert the corpus once instead of rebuilding tensors from Python list
    # slices for every sample of every batch.
    data = torch.as_tensor(data_ids, dtype=torch.long)
    offsets = torch.arange(block_size)               # [T]
    while True:
        ix = torch.randint(0, n, (batch_size,))      # window start positions
        window = ix[:, None] + offsets               # [B, T] gather indices
        x = data[window]
        y = data[window + 1]
        yield x.to(device), y.to(device)

In [7]:
# Only show errors from the transformers library in this cell's output.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# toy training text
text = (
    "To be, or not to be, that is the question:\n"
    "Whether 'tis nobler in the mind to suffer\n"
    "The slings and arrows of outrageous fortune,\n"
    "Or to take arms against a sea of troubles\n"
    "And by opposing end them.\n"
)

# Character-level vocabulary built from the corpus itself.
tokenizer = CharTokenizer(text)
data_ids = tokenizer.encode(text)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

block_size = 64   # training window length; also used as max_seq_len below
batch_size = 32

cfg = TinyConfig(
    vocab_size=tokenizer.vocab_size,
    d_model=128,
    n_heads=4,
    n_layers=2,
    d_ff=256,
    max_seq_len=block_size,
)
model = TinyHFModel(cfg).to(device)

print("Params (M):", sum(p.numel() for p in model.parameters()) / 1e6)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# Endless generator of random (input, next-token target) batches.
train_iter = make_batches(data_ids, block_size, batch_size, device)

model.train()
n_steps = 200

for step in range(1, n_steps + 1):
    x, y = next(train_iter)
    # TinyHFModel.forward returns a dict; "loss" is set because labels are given.
    out = model(x, labels=y)
    loss = out["loss"]

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 50 == 0:
        print(f"step {step}/{n_steps}, loss={loss.item():.4f}")

# quick sample
model.eval()
ctx = "To be"
ctx_ids = torch.tensor([tokenizer.encode(ctx)], device=device)
with torch.no_grad():
    # [0] selects the single sequence of the batch.
    gen_ids = model.generate(ctx_ids, max_new_tokens=80, temperature=0.8)[0]
print("\nSample:")
print(tokenizer.decode(gen_ids.tolist()))

# ---- Save as HF checkpoint ----
save_dir = "tiny_tlm"
model.save_pretrained(save_dir)

# Save the char -> id mapping next to the weights for your own use
# (plain JSON, NOT a full HF tokenizer).
import json
import os

os.makedirs(save_dir, exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 instead of relying on the platform default.
with open(os.path.join(save_dir, "char_vocab.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer.stoi, f, ensure_ascii=False, indent=2)

print(f"\nSaved TinyHFModel + char vocab to: {save_dir}")

Using device: cuda
Params (M): 0.281344
step 50/200, loss=2.1444
step 100/200, loss=1.5407
step 150/200, loss=0.9207
step 200/200, loss=0.5086

Sample:
To be, to to thetis nond sufeagertous founeage,
Ans f by ound traropos es f os oslesu

Saved TinyHFModel + char vocab to: tiny_tlm


In [9]:
from transformers import pipeline
import torch

# TinyConfig / TinyHFModel are imported from tiny_model_def.py. That file is
# created by the %%writefile cell; run it first, otherwise this import fails
# with ModuleNotFoundError.
from tiny_model_def import TinyConfig, TinyHFModel  # your code file

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load config + model from the saved folder
config = TinyConfig.from_pretrained("tiny_tlm")
model = TinyHFModel.from_pretrained("tiny_tlm", config=config).to(device)

# You also need a tokenizer; for char-level, use the one you built earlier:
from transformers import PreTrainedTokenizerFast
import json
import os

# Read the vocabulary as UTF-8 rather than with the platform default
# encoding, since it may contain non-ASCII characters.
with open(os.path.join("tiny_tlm", "char_vocab.json"), encoding="utf-8") as f:
    stoi = json.load(f)

# Build a simple tokenizer around the char vocab
itos = {v: k for k, v in stoi.items()}   # inverse mapping: id -> character
vocab = stoi.copy()

# NOTE(review): this cell stopped at the tiny_model_def import, so the lines
# below have not actually run. Constructing PreTrainedTokenizerFast without a
# backend (tokenizer_object=None) and then assigning `vocab` / `_tokenizer`
# by hand may raise - verify against the transformers documentation.
# The encode/decode helpers in this cell do not use `tok`.
tok = PreTrainedTokenizerFast(
    tokenizer_object=None,  # we’ll use custom encode/decode
    bos_token="<bos>",
    eos_token="<eos>",
    unk_token="<unk>",
    pad_token="<pad>",
)
tok.vocab = vocab
tok._tokenizer = None  # (for simple demo; for real use you’d wire up `tokenizers`)

def encode(s):
    """Convert ``s`` to a list of ids using the module-level ``vocab`` mapping.

    Characters missing from ``vocab`` map to the id of ``"<unk>"`` when the
    vocabulary defines one.

    Raises:
        KeyError: if a character is unknown and ``vocab`` has no ``"<unk>"``
            entry.
    """
    # Resolve the fallback with .get(): indexing vocab["<unk>"] as the default
    # argument is evaluated on every lookup and raises KeyError whenever the
    # vocabulary has no "<unk>" entry, even for known characters.
    unk_id = vocab.get("<unk>")
    ids = []
    for ch in s:
        token_id = vocab.get(ch, unk_id)
        if token_id is None:
            raise KeyError(
                f"character {ch!r} is not in the vocabulary and no '<unk>' "
                f"entry is defined"
            )
        ids.append(token_id)
    return ids

def decode(ids):
    """Join the characters for ``ids`` via the module-level ``itos`` map, silently skipping unknown ids."""
    pieces = []
    for raw in ids:
        key = int(raw)
        if key in itos:
            pieces.append(itos[key])
    return "".join(pieces)

# Use the model directly (simplest) - no pipeline:
model.eval()
prompt = "To be"
# Wrap the id list in an outer list to get a batch of one sequence.
ids = torch.tensor([encode(prompt)], device=device)
with torch.no_grad():
    # temperature is left at generate()'s default; [0] takes the only sequence.
    out_ids = model.generate(ids, max_new_tokens=80)[0]
print(decode(out_ids.tolist()))

ModuleNotFoundError: No module named 'tiny_model_def'

In [None]:
%%writefile tiny_model_def.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from transformers import PretrainedConfig, PreTrainedModel

@dataclass
class ModelConfig:
    """Hyperparameters consumed by ``TinyTransformerLM``."""

    vocab_size: int          # rows of the token embedding / width of the logits
    d_model: int = 128       # embedding width used throughout the model
    n_heads: int = 4         # attention heads per encoder layer
    n_layers: int = 2        # number of stacked encoder layers
    d_ff: int = 256          # feed-forward width inside each encoder layer
    max_seq_len: int = 128   # maximum sequence length given to the model
    dropout: float = 0.1     # dropout rate (not set by TinyHFModel, so this default applies)

class TinyTransformerLM(nn.Module):
    """Small causal Transformer language model.

    Token embeddings and learned positional embeddings are summed, passed
    through an ``nn.TransformerEncoder`` stack under a causal attention mask,
    normalised by a final ``LayerNorm`` and projected to vocabulary logits.
    """

    def __init__(self, cfg: "ModelConfig"):
        """Build the layers described by ``cfg``.

        ``cfg`` must expose ``vocab_size``, ``d_model``, ``n_heads``,
        ``n_layers``, ``d_ff``, ``max_seq_len`` and ``dropout``.
        """
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.max_seq_len, cfg.d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=cfg.d_model,
            nhead=cfg.n_heads,
            dim_feedforward=cfg.d_ff,
            dropout=cfg.dropout,
            batch_first=True,  # (B, T, C)
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=cfg.n_layers,
        )

        self.ln_f = nn.LayerNorm(cfg.d_model)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise both embeddings and the output head with N(0, 0.02)."""
        nn.init.normal_(self.tok_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        """
        idx: LongTensor [B, T], token indices (T <= cfg.max_seq_len)
        returns: logits [B, T, vocab_size]

        Raises:
            ValueError: if T exceeds ``cfg.max_seq_len``.
        """
        _, T = idx.shape
        if T > self.cfg.max_seq_len:
            # The positional table only has max_seq_len rows; fail early with
            # a readable message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds max_seq_len "
                f"{self.cfg.max_seq_len}"
            )
        device = idx.device

        # Token + positional embeddings
        tok = self.tok_emb(idx)                      # [B, T, C]
        pos_ids = torch.arange(T, device=device)     # [T]
        pos = self.pos_emb(pos_ids)[None, :, :]      # [1, T, C]
        x = tok + pos                                # [B, T, C]

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere,
        # so position t cannot attend to positions > t.
        mask = torch.triu(
            torch.full((T, T), float("-inf"), device=device),
            diagonal=1,
        )

        x = self.transformer(x, mask)                # [B, T, C]
        x = self.ln_f(x)
        logits = self.lm_head(x)                     # [B, T, V]
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens: int, temperature: float = 1.0):
        """
        Autoregressive sampling.

        idx: [B, T] initial context (T >= 1 when tokens are requested)
        max_new_tokens: number of tokens appended to every row
        temperature: > 0 rescales the logits before sampling; exactly 0
            selects the arg-max token (greedy decoding)
        returns: [B, T + max_new_tokens] context followed by generated ids

        Raises:
            ValueError: if ``temperature`` is negative, or if the context is
                empty while ``max_new_tokens`` > 0.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        if max_new_tokens > 0 and idx.size(1) == 0:
            raise ValueError("generate() needs at least one context token")

        for _ in range(max_new_tokens):
            # Crop if longer than max_seq_len
            idx_cond = idx[:, -self.cfg.max_seq_len:]
            logits = self(idx_cond)[:, -1, :]        # [B, V], last step only
            if temperature == 0:
                # Dividing by zero would give inf/NaN probabilities.
                next_token = logits.argmax(dim=-1, keepdim=True)   # [B, 1]
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # [B, 1]
            idx = torch.cat([idx, next_token], dim=1)
        return idx





class TinyConfig(PretrainedConfig):
    """Hugging Face config carrying the tiny Transformer LM hyperparameters.

    Every named argument is stored as an attribute of the same name; any extra
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.
    Dropout is not part of this config.
    """
    model_type = "tiny_transformer_lm"
    def __init__(self,
                 vocab_size: int = 0,   # optional; pass the tokenizer's vocab size
                 d_model=128,
                 n_heads=4,
                 n_layers=2,
                 d_ff=256,
                 max_seq_len=128,
                 **kwargs):
        # Extra keyword arguments go to the base class.
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.d_ff = d_ff
        self.max_seq_len = max_seq_len

class TinyHFModel(PreTrainedModel):
    """``PreTrainedModel`` wrapper that delegates to a ``TinyTransformerLM`` held in ``self.inner``."""

    config_class = TinyConfig

    def __init__(self, config):
        super().__init__(config)
        # Translate the HF config into the dataclass the inner model expects.
        # TinyConfig has no dropout field, so ModelConfig's default applies.
        inner_cfg = ModelConfig(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            n_heads=config.n_heads,
            n_layers=config.n_layers,
            d_ff=config.d_ff,
            max_seq_len=config.max_seq_len,
        )
        self.inner = TinyTransformerLM(inner_cfg)

    def forward(self, input_ids, labels=None, **kwargs):
        """Return ``{"logits", "loss"}``; ``loss`` is ``None`` unless ``labels`` is given.

        Labels are compared position-by-position with the logits (no shifting
        happens here). Extra keyword arguments are accepted and ignored.
        """
        logits = self.inner(input_ids)
        if labels is None:
            loss = None
        else:
            flat_logits = logits.view(-1, self.config.vocab_size)
            loss = F.cross_entropy(flat_logits, labels.view(-1))
        return {"logits": logits, "loss": loss}

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0, **kwargs):
        """Sample with the inner model's ``generate``; extra keyword arguments are ignored."""
        return self.inner.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )

In [1]:
#!/usr/bin/env python
import math
import time
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F


# ============================================================
# 1. Model definition: 2-layer Transformer LM
# ============================================================

@dataclass
class ModelConfig:
    """Hyperparameters consumed by ``TinyTransformerLM``."""

    vocab_size: int          # rows of the token embedding / width of the logits
    d_model: int = 128       # embedding width used throughout the model
    n_heads: int = 4         # attention heads per encoder layer
    n_layers: int = 2        # number of stacked encoder layers
    d_ff: int = 256          # feed-forward width inside each encoder layer
    max_seq_len: int = 128   # rows of the learned positional-embedding table
    dropout: float = 0.1     # dropout rate handed to each encoder layer


class TinyTransformerLM(nn.Module):
    """Small causal Transformer language model.

    Token embeddings and learned positional embeddings are summed, passed
    through an ``nn.TransformerEncoder`` stack under a causal attention mask,
    normalised by a final ``LayerNorm`` and projected to vocabulary logits.
    """

    def __init__(self, cfg: "ModelConfig"):
        """Build the layers described by ``cfg``.

        ``cfg`` must expose ``vocab_size``, ``d_model``, ``n_heads``,
        ``n_layers``, ``d_ff``, ``max_seq_len`` and ``dropout``.
        """
        super().__init__()
        self.cfg = cfg

        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.max_seq_len, cfg.d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=cfg.d_model,
            nhead=cfg.n_heads,
            dim_feedforward=cfg.d_ff,
            dropout=cfg.dropout,
            batch_first=True,  # (B, T, C)
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=cfg.n_layers,
        )

        self.ln_f = nn.LayerNorm(cfg.d_model)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise both embeddings and the output head with N(0, 0.02)."""
        nn.init.normal_(self.tok_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        """
        idx: LongTensor [B, T], token indices (T <= cfg.max_seq_len)
        returns: logits [B, T, vocab_size]

        Raises:
            ValueError: if T exceeds ``cfg.max_seq_len``.
        """
        _, T = idx.shape
        if T > self.cfg.max_seq_len:
            # The positional table only has max_seq_len rows; fail early with
            # a readable message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds max_seq_len "
                f"{self.cfg.max_seq_len}"
            )
        device = idx.device

        # Token + positional embeddings
        tok = self.tok_emb(idx)                      # [B, T, C]
        pos_ids = torch.arange(T, device=device)     # [T]
        pos = self.pos_emb(pos_ids)[None, :, :]      # [1, T, C]
        x = tok + pos                                # [B, T, C]

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere,
        # so position t cannot attend to positions > t.
        mask = torch.triu(
            torch.full((T, T), float("-inf"), device=device),
            diagonal=1,
        )

        x = self.transformer(x, mask)                # [B, T, C]
        x = self.ln_f(x)
        logits = self.lm_head(x)                     # [B, T, V]
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens: int, temperature: float = 1.0):
        """
        Autoregressive sampling.

        idx: [B, T] initial context (T >= 1 when tokens are requested)
        max_new_tokens: number of tokens appended to every row
        temperature: > 0 rescales the logits before sampling; exactly 0
            selects the arg-max token (greedy decoding)
        returns: [B, T + max_new_tokens] context followed by generated ids

        Raises:
            ValueError: if ``temperature`` is negative, or if the context is
                empty while ``max_new_tokens`` > 0.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        if max_new_tokens > 0 and idx.size(1) == 0:
            raise ValueError("generate() needs at least one context token")

        for _ in range(max_new_tokens):
            # Crop if longer than max_seq_len
            idx_cond = idx[:, -self.cfg.max_seq_len:]
            logits = self(idx_cond)[:, -1, :]        # [B, V], last step only
            if temperature == 0:
                # Dividing by zero would give inf/NaN probabilities.
                next_token = logits.argmax(dim=-1, keepdim=True)   # [B, 1]
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # [B, 1]
            idx = torch.cat([idx, next_token], dim=1)
        return idx


# ============================================================
# 2. Tiny char-level dataset utilities
# ============================================================

class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in ``text``."""

    def __init__(self, text: str):
        # Sorting makes the id assignment deterministic for a given corpus.
        alphabet = sorted(set(text))
        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)

    def encode(self, s: str):
        """Map every character of ``s`` to its id (unknown characters raise ``KeyError``)."""
        table = self.stoi
        return [table[ch] for ch in s]

    def decode(self, ids):
        """Join the characters for ``ids``; each id is passed through ``int()`` first."""
        table = self.itos
        return "".join(table[int(token)] for token in ids)


def make_batches(data_ids, block_size, batch_size, device):
    """
    Yield an endless stream of random (x, y) batches where y is the
    next-token target for x.

    data_ids: 1-D sequence of token ids (at least block_size + 1 long)
    block_size: number of tokens per training window
    batch_size: number of windows per batch
    device: device the yielded tensors are moved to

    Each yielded pair holds two LongTensors of shape [batch_size, block_size];
    y is x shifted one position to the right in the corpus.

    Raises:
        ValueError: on the first ``next()`` if ``block_size`` is not positive
            or the corpus is too short to hold one window plus its target.
    """
    n = len(data_ids) - block_size
    if block_size <= 0 or n <= 0:
        raise ValueError(
            f"need block_size >= 1 and at least block_size + 1 tokens, "
            f"got block_size={block_size} and {len(data_ids)} tokens"
        )

    # Convert the corpus once instead of rebuilding tensors from Python list
    # slices for every sample of every batch.
    data = torch.as_tensor(data_ids, dtype=torch.long)
    offsets = torch.arange(block_size)               # [T]
    while True:
        ix = torch.randint(0, n, (batch_size,))      # window start positions
        window = ix[:, None] + offsets               # [B, T] gather indices
        x = data[window]
        y = data[window + 1]
        yield x.to(device), y.to(device)


# ============================================================
# 3. Demo: train + inference benchmark
# ============================================================

def main():
    """Train the tiny LM on a toy corpus, print a sample, then time inference."""
    # Toy training text (replace with any corpus you like)
    text = (
        "To be, or not to be, that is the question:\n"
        "Whether 'tis nobler in the mind to suffer\n"
        "The slings and arrows of outrageous fortune,\n"
        "Or to take arms against a sea of troubles\n"
        "And by opposing end them.\n"
    )
    tokenizer = CharTokenizer(text)
    data_ids = tokenizer.encode(text)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    def wait_for_device():
        # Only CUDA needs an explicit sync before reading the wall clock.
        if device == "cuda":
            torch.cuda.synchronize(device)

    def timed(fn):
        # Wall-clock seconds for fn() under no_grad, bracketed by device syncs.
        wait_for_device()
        started = time.perf_counter()
        with torch.no_grad():
            fn()
        wait_for_device()
        return time.perf_counter() - started

    block_size = 64
    batch_size = 32

    cfg = ModelConfig(
        vocab_size=tokenizer.vocab_size,
        d_model=128,
        n_heads=4,
        n_layers=2,
        d_ff=256,
        max_seq_len=block_size,
        dropout=0.1,
    )
    model = TinyTransformerLM(cfg).to(device)

    n_params = sum(p.numel() for p in model.parameters())
    print("Model parameters:", n_params / 1e6, "M")

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    train_iter = make_batches(data_ids, block_size, batch_size, device)

    # --------- Training loop (very small, just to show it learns) ----------
    model.train()
    n_steps = 300
    # zip stops on the exhausted range first, so exactly n_steps batches are drawn.
    for step, (x, y) in zip(range(1, n_steps + 1), train_iter):
        logits = model(x)                            # [B, T, V]
        flat_logits = logits.view(-1, cfg.vocab_size)
        flat_targets = y.view(-1)
        loss = F.cross_entropy(flat_logits, flat_targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if not step % 50:
            print(f"step {step}/{n_steps}, loss={loss.item():.4f}")

    # ========================================================
    # 4. Qualitative sample
    # ========================================================
    model.eval()
    prompt_ids = torch.tensor([tokenizer.encode("To be")], device=device)
    sample = model.generate(prompt_ids, max_new_tokens=80, temperature=0.8)[0]
    print("\n--- Sample generation ---")
    print(tokenizer.decode(sample.tolist()))

    # ========================================================
    # 5. Inference performance benchmark
    # ========================================================
    print("\n--- Inference benchmark ---")
    B = 8
    T = block_size
    dummy_input = torch.randint(
        0, cfg.vocab_size, (B, T), device=device
    )

    # Warmup
    with torch.no_grad():
        for _ in range(10):
            model(dummy_input)

    # Measure forward pass throughput
    n_iters = 100

    def forward_loop():
        for _ in range(n_iters):
            model(dummy_input)

    total_time = timed(forward_loop)
    avg_time = total_time / n_iters
    toks_per_sec = (B * T) / avg_time

    print(f"Forward pass: B={B}, T={T}")
    print(f"Avg latency: {avg_time*1e3:.3f} ms")
    print(f"Throughput:  {toks_per_sec:.0f} tokens/sec")

    # Autoregressive generation timing
    gen_len = 64
    B_gen = 1
    start_ids = torch.randint(
        0, cfg.vocab_size, (B_gen, 1), device=device
    )

    gen_time = timed(
        lambda: model.generate(
            start_ids, max_new_tokens=gen_len, temperature=1.0
        )
    )
    print(
        f"Generate {gen_len} tokens (B={B_gen}): "
        f"{gen_time*1e3:.2f} ms "
        f"({gen_len/gen_time:.0f} tok/sec)"
    )


# Run the demo when this code executes as a script or as a notebook cell.
if __name__ == "__main__":
    main()

Using device: cuda
Model parameters: 0.281344 M
step 50/300, loss=2.1031
step 100/300, loss=1.5847
step 150/300, loss=0.9563
step 200/300, loss=0.4899
step 250/300, loss=0.3331
step 300/300, loss=0.2214

--- Sample generation ---
To be, or to to be, that is the question:
Whether 'tis nobler in the mind to sufer
Th

--- Inference benchmark ---
Forward pass: B=8, T=64
Avg latency: 2.430 ms
Throughput:  210730 tokens/sec
Generate 64 tokens (B=1): 218.02 ms (294 tok/sec)
