In [None]:
# import math
# import torch
# import torch.nn as nn
# import torch.nn.functional as F


# def causal_mask(T: int, device=None):
#     """Returns a bool mask where True means *masked* (disallowed).
#     Shape: (1, 1, T, T) suitable for broadcasting with (B, heads, T, T).
#     """
#     m = torch.triu(torch.ones((T, T), dtype=torch.bool, device=device), diagonal=1)

#     return m.view(1, 1, T, T)

# """1.1 Positional encodings (absolute learned + sinusoidal)."""


# class LearnedPositionalEncoding(nn.Module):
#     def __init__(self, max_len: int, d_model: int):
#         super().__init__()
#         self.emb = nn.Embedding(max_len, d_model)

#     def forward(self, x: torch.Tensor):
#         # x: (B, T, d_model) — we only need its T and device
#         B, T, _ = x.shape
#         pos = torch.arange(T, device=x.device)
#         pos_emb = self.emb(pos)  # (T, d_model)
#         return x + pos_emb.unsqueeze(0)  # broadcast over batch

# class SinusoidalPositionalEncoding(nn.Module):
#     def __init__(self, max_len: int, d_model: int):
#         super().__init__()
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         self.register_buffer('pe', pe)  # (max_len, d_model)

#     def forward(self, x: torch.Tensor):
#         B, T, _ = x.shape
#         return x + self.pe[:T].unsqueeze(0)


# class SingleHeadSelfAttention(nn.Module):
#     """1.3 Single-head attention (explicit shapes)."""
#     def __init__(self, d_model: int, d_k: int, dropout: float = 0.0, trace_shapes: bool = False):
#         super().__init__()
#         self.q = nn.Linear(d_model, d_k, bias=False)
#         self.k = nn.Linear(d_model, d_k, bias=False)
#         self.v = nn.Linear(d_model, d_k, bias=False)
#         self.dropout = nn.Dropout(dropout)
#         self.trace_shapes = trace_shapes

#     def forward(self, x: torch.Tensor):  # x: (B, T, d_model)
#         B, T, _ = x.shape
#         q = self.q(x)  # (B,T,d_k)
#         k = self.k(x)  # (B,T,d_k)
#         v = self.v(x)  # (B,T,d_k)
#         if self.trace_shapes:
#             print(f"q {q.shape}  k {k.shape}  v {v.shape}")
#         scale = 1.0 / math.sqrt(q.size(-1))
#         attn = torch.matmul(q, k.transpose(-2, -1)) * scale  # (B,T,T)
#         mask = causal_mask(T, device=x.device)
#         attn = attn.masked_fill(mask.squeeze(1), float('-inf'))
#         w = F.softmax(attn, dim=-1)
#         w = self.dropout(w)
#         out = torch.matmul(w, v)  # (B,T,d_k)
#         if self.trace_shapes:
#             print(f"weights {w.shape}  out {out.shape}")
#         return out, w
#   # from attn_mask import causal_mask

# class MultiHeadSelfAttention(nn.Module):
#     """1.4 Multi-head attention with explicit shape tracing.

#     Dimensions (before masking):
#       x:      (B, T, d_model)
#       qkv:    (B, T, 3*d_model)
#       view→   (B, T, 3, n_head, d_head)   where d_head = d_model // n_head
#       split→  q,k,v each (B, T, n_head, d_head)
#       swap→   (B, n_head, T, d_head)
#       scores: (B, n_head, T, T) = q @ k^T / sqrt(d_head)
#       weights:(B, n_head, T, T) = softmax(scores)
#       ctx:    (B, n_head, T, d_head) = weights @ v
#       merge:  (B, T, n_head*d_head) = (B, T, d_model)
#     """
#     def __init__(self, d_model: int, n_head: int, dropout: float = 0.0, trace_shapes: bool = True):
#         super().__init__()
#         assert d_model % n_head == 0, "d_model must be divisible by n_head"
#         self.n_head = n_head
#         self.d_head = d_model // n_head
#         self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
#         self.proj = nn.Linear(d_model, d_model, bias=False)
#         self.dropout = nn.Dropout(dropout)
#         self.trace_shapes = trace_shapes

#     def forward(self, x: torch.Tensor):  # (B,T,d_model)
#         B, T, C = x.shape
#         qkv = self.qkv(x)                          # (B,T,3*C)
#         qkv = qkv.view(B, T, 3, self.n_head, self.d_head)  # (B,T,3,heads,dim)
#         if self.trace_shapes:
#             print("qkv view:", qkv.shape)
#         q, k, v = qkv.unbind(dim=2)               # each: (B,T,heads,dim)
#         q = q.transpose(1, 2)                      # (B,heads,T,dim)
#         k = k.transpose(1, 2)
#         v = v.transpose(1, 2)
#         if self.trace_shapes:
#             print("q:", q.shape, "k:", k.shape, "v:", v.shape)

#         scale = 1.0 / math.sqrt(self.d_head)
#         attn = torch.matmul(q, k.transpose(-2, -1)) * scale  # (B,heads,T,T)
#         mask = causal_mask(T, device=x.device)
#         attn = attn.masked_fill(mask, float('-inf'))
#         w = F.softmax(attn, dim=-1)
#         w = self.dropout(w)
#         ctx = torch.matmul(w, v)                  # (B,heads,T,dim)
#         if self.trace_shapes:
#             print("weights:", w.shape, "ctx:", ctx.shape)
#         out = ctx.transpose(1, 2).contiguous().view(B, T, C)  # (B,T,d_model)
#         out = self.proj(out)
#         if self.trace_shapes:
#             print("out:", out.shape)
#         return out, w




# class FeedForward(nn.Module):
#     """1.5 FFN with expansion factor `mult`.

#     Dimensions:
#       input:     (B, T, d_model)
#       inner:     (B, T, mult*d_model)
#       output:    (B, T, d_model)

#     `mult*d_model` means the hidden width is `mult` times larger than `d_model`.
#     Typical values: mult=4 for GELU FFN in GPT-style blocks.
#     """
#     def __init__(self, d_model: int, mult: int = 4, dropout: float = 0.0):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(d_model, mult * d_model),
#             nn.GELU(),
#             nn.Linear(mult * d_model, d_model),
#             nn.Dropout(dropout),
#         )

#     def forward(self, x):
#         return self.net(x)


# class TransformerBlock(nn.Module):
#     """1.6 Transformer block = LN → MHA → residual → LN → FFN → residual."""
#     def __init__(self, d_model: int, n_head: int, dropout: float = 0.0):
#         super().__init__()
#         self.ln1 = nn.LayerNorm(d_model)
#         self.attn = MultiHeadSelfAttention(d_model, n_head, dropout)
#         self.ln2 = nn.LayerNorm(d_model)
#         self.ffn = FeedForward(d_model, mult=4, dropout=dropout)

#     def forward(self, x):
#         x = x + self.attn(self.ln1(x))[0]
#         x = x + self.ffn(self.ln2(x))
#         return x


In [None]:
from __future__ import annotations
import torch

class ByteTokenizer:
    """Byte-level tokenizer: each UTF-8 byte of the text is one token id.

    - encode(str) -> 1-D LongTensor holding the byte values
    - decode(Tensor or sequence of ints) -> str
    - vocab_size is always 256 (one id per possible byte value)
    """

    def encode(self, s: str) -> torch.Tensor:
        """Return the UTF-8 bytes of ``s`` as a 1-D int64 tensor."""
        raw = s.encode('utf-8')
        return torch.tensor([*raw], dtype=torch.long)

    def decode(self, ids) -> str:
        """Turn byte ids back into text.

        ``ids`` may be a tensor or anything ``bytes()`` accepts. Byte
        sequences that are not valid UTF-8 are silently dropped.
        """
        values = ids.tolist() if isinstance(ids, torch.Tensor) else ids
        return bytes(values).decode('utf-8', errors='ignore')

    @property
    def vocab_size(self) -> int:
        """Number of distinct token ids."""
        return 256

In [None]:
from __future__ import annotations
from pathlib import Path
import torch

class ByteDataset:
    """Holds the raw bytes of a file and yields (x, y) blocks for LM training.

    - block_size: sequence length (context window)
    - split: fraction of the bytes used for training (the rest is val)

    Attributes:
        train, val: 1-D int64 tensors with the byte values of each split.
        block_size: length of every sampled sequence.
    """

    def __init__(self, path: str, block_size: int = 256, split: float = 0.9):
        raw = Path(path).read_bytes()
        if raw:
            # Build the tensor straight from the byte buffer instead of going
            # through a Python list of ints (one object per byte).
            # bytearray() gives torch a writable buffer; .to(long) copies, so
            # the temporary buffer is not kept alive.
            data = torch.frombuffer(bytearray(raw), dtype=torch.uint8).to(torch.long)
        else:
            # torch.frombuffer rejects empty buffers.
            data = torch.empty(0, dtype=torch.long)
        n = int(len(data) * split)
        self.train = data[:n]
        self.val = data[n:]
        self.block_size = block_size

    def get_batch(self, which: str, batch_size: int, device: torch.device):
        """Sample ``batch_size`` random windows from one split.

        Args:
            which: 'train' selects the training split; any other value
                selects the validation split.
            batch_size: number of windows to draw.
            device: device the returned tensors are moved to.

        Returns:
            (x, y), both of shape (batch_size, block_size); ``y`` is ``x``
            shifted one position to the right in the underlying buffer.

        Raises:
            ValueError: if the split has fewer than ``block_size + 1`` bytes.
        """
        buf = self.train if which == 'train' else self.val
        # A window starting at i needs buf[i : i + block_size + 1], so valid
        # starts are 0 .. len(buf) - block_size - 1 inclusive.
        n_starts = len(buf) - self.block_size
        if n_starts < 1:
            raise ValueError('file too small for given block_size')
        starts = torch.randint(0, n_starts, (batch_size,)).tolist()
        x = torch.stack([buf[i:i + self.block_size] for i in starts])
        y = torch.stack([buf[i + 1:i + 1 + self.block_size] for i in starts])
        return x.to(device), y.to(device)

In [None]:
from __future__ import annotations
import torch

def top_k_top_p_filtering(logits: torch.Tensor, top_k: int | None = None, top_p: float | None = None):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

    Args:
        logits: (..., vocab); typically (B, vocab). The input is not modified.
        top_k: keep only the ``top_k`` largest logits per row (ties with the
            k-th value are kept). ``None``, values <= 0 and values >= vocab
            disable top-k.
        top_p: keep the smallest set of highest-probability tokens whose
            cumulative probability reaches ``top_p``. Applied after top-k.
            ``None`` or values outside (0, 1) disable it.

    Returns:
        A new tensor shaped like ``logits`` with -inf for masked entries.
    """
    V = logits.size(-1)
    filtered = logits.clone()

    if top_k is not None and 0 < top_k < V:
        kth = torch.topk(filtered, top_k, dim=-1).values[..., -1:]
        filtered = filtered.masked_fill(filtered < kth, float('-inf'))

    if top_p is not None and 0 < top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(filtered, descending=True, dim=-1)
        cumsum = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cumsum > top_p
        # Shift right by one so the token that crosses the threshold is kept;
        # otherwise the kept mass would fall short of top_p.
        remove[..., 1:] = remove[..., :-1].clone()
        # keep at least 1 token
        remove[..., 0] = False
        sorted_logits = sorted_logits.masked_fill(remove, float('-inf'))
        # Scatter back to the original vocabulary order
        filtered = torch.full_like(filtered, float('-inf')).scatter(-1, sorted_idx, sorted_logits)

    return filtered

In [None]:
from __future__ import annotations
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---- Blocks (self-contained for isolation) ----
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused QKV projection.

    Args:
        n_embd: model width; must be divisible by ``n_head``.
        n_head: number of attention heads.
        dropout: attention dropout probability, applied only in training mode.
    """

    def __init__(self, n_embd: int, n_head: int, dropout: float = 0.0):
        super().__init__()
        assert n_embd % n_head == 0, "n_embd must be divisible by n_head"
        self.n_head = n_head
        self.d_head = n_embd // n_head
        self.qkv = nn.Linear(n_embd, 3 * n_embd, bias=False)
        self.proj = nn.Linear(n_embd, n_embd, bias=False)
        # Only the probability ``p`` of this module is used; the dropout
        # itself is applied inside scaled_dot_product_attention.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor):  # (B,T,C)
        """Map (B, T, C) -> (B, T, C); each position attends only to itself and earlier positions."""
        B, T, C = x.shape
        qkv = self.qkv(x).view(B, T, 3, self.n_head, self.d_head)
        q, k, v = qkv.unbind(dim=2)          # each (B, T, n_head, d_head)
        q = q.transpose(1, 2)                # (B, n_head, T, d_head)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        # PyTorch SDPA (uses flash when available). It applies the default
        # 1/sqrt(d_head) scaling itself, so no explicit scale is needed.
        drop_p = self.dropout.p if self.training else 0.0
        y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # merge heads
        y = self.proj(y)
        return y

class FeedForward(nn.Module):
    """Position-wise MLP: widen to ``mult * n_embd``, GELU, project back, dropout."""

    def __init__(self, n_embd: int, mult: int = 4, dropout: float = 0.0):
        super().__init__()
        hidden = mult * n_embd
        # Layers are created in the same order they run, and kept inside a
        # Sequential named ``net`` so parameter names stay net.0.* / net.2.*.
        expand = nn.Linear(n_embd, hidden)
        activation = nn.GELU()
        contract = nn.Linear(hidden, n_embd)
        regularize = nn.Dropout(dropout)
        self.net = nn.Sequential(expand, activation, contract, regularize)

    def forward(self, x):
        """Apply the MLP over the last dimension of ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Pre-LayerNorm transformer block.

    Two residual sub-layers: LayerNorm -> causal self-attention, then
    LayerNorm -> feed-forward (expansion factor 4).
    """

    def __init__(self, n_embd: int, n_head: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffn = FeedForward(n_embd, mult=4, dropout=dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.attn(self.ln1(x))
        return attended + self.ffn(self.ln2(attended))

# ---- Tiny GPT ----
class GPT(nn.Module):
    """Small decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` blocks and a final LayerNorm, then projected to vocabulary
    logits.

    Args:
        vocab_size: number of token ids.
        block_size: maximum sequence length the model accepts.
        n_layer: number of transformer blocks.
        n_head: attention heads per block.
        n_embd: embedding / model width.
        dropout: dropout probability for embeddings and blocks.
    """

    def __init__(self, vocab_size: int, block_size: int, n_layer: int = 4, n_head: int = 4, n_embd: int = 256, dropout: float = 0.0):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([Block(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: (B, T) token ids with T <= block_size.
            targets: optional (B, T) token ids to predict.

        Returns:
            (logits, loss): logits is (B, T, vocab_size); loss is a scalar
            tensor, or None when ``targets`` is None.
        """
        B, T = idx.shape
        assert T <= self.block_size, "sequence length exceeds block_size"
        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        x = self.tok_emb(idx) + self.pos_emb(pos)
        x = self.drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int = 200, temperature: float = 1.0,
                top_k: int | None = 50, top_p: float | None = None):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Runs in eval mode and restores the previous train/eval mode before
        returning, so calling this from a training loop does not leave
        dropout switched off.

        Args:
            idx: (B, T) prompt token ids; an empty prompt (T == 0) is
                replaced by a single newline byte (10).
            max_new_tokens: number of tokens to append.
            temperature: logits are divided by this (floored at 1e-6).
            top_k, top_p: forwarded to ``top_k_top_p_filtering``.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            # Guard: if the prompt is empty, start with a newline byte (10)
            if idx.size(1) == 0:
                idx = torch.full((idx.size(0), 1), 10, dtype=torch.long, device=idx.device)
            for _ in range(max_new_tokens):
                # Only the last block_size tokens fit in the context window.
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / max(temperature, 1e-6)
                logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, next_id], dim=1)
            return idx
        finally:
            self.train(was_training)

In [None]:
from __future__ import annotations
import argparse, time, os, sys
import torch
# from tokenizer import ByteTokenizer
# from dataset import ByteDataset
# from model_gpt import GPT

def estimate_loss(model: GPT, ds: ByteDataset, args) -> dict:
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, draws ``args.eval_iters`` random batches of size
    ``args.batch_size`` on ``args.device`` and averages the loss. The model
    is put in eval mode with gradients disabled while measuring, and is
    switched to training mode before returning.

    Returns:
        dict with keys 'train' and 'val' mapping to the mean loss (float).
    """
    def one_batch_loss(split_name):
        xb, yb = ds.get_batch(split_name, args.batch_size, args.device)
        _, batch_loss = model(xb, yb)
        return batch_loss.item()

    model.eval()
    means = {}
    with torch.no_grad():
        for split_name in ('train', 'val'):
            per_batch = [one_batch_loss(split_name) for _ in range(args.eval_iters)]
            means[split_name] = sum(per_batch) / len(per_batch)
    model.train()
    return means

def main(argv=None):
    """Train a byte-level GPT on a text file.

    Parses command-line style options from ``argv`` (defaults to
    ``sys.argv[1:]``), trains for ``--steps`` optimisation steps, and
    periodically evaluates, checkpoints the best validation loss to
    ``<out_dir>/model_best.pt`` and prints a generated sample. A final
    checkpoint is written to ``<out_dir>/model_final.pt``. Every checkpoint
    stores both the weights and the model config.
    """
    if argv is None:
        argv = sys.argv[1:]

    p = argparse.ArgumentParser()
    p.add_argument('--data', type=str, required=True)
    p.add_argument('--out_dir', type=str, default='runs/min-gpt')
    p.add_argument('--block_size', type=int, default=256)
    p.add_argument('--batch_size', type=int, default=32)
    p.add_argument('--n_layer', type=int, default=4)
    p.add_argument('--n_head', type=int, default=4)
    p.add_argument('--n_embd', type=int, default=256)
    p.add_argument('--dropout', type=float, default=0.0)
    p.add_argument('--steps', type=int, default=2000)
    p.add_argument('--lr', type=float, default=3e-4)
    p.add_argument('--weight_decay', type=float, default=0.1)
    p.add_argument('--grad_clip', type=float, default=1.0)
    p.add_argument('--eval_interval', type=int, default=200)
    p.add_argument('--eval_iters', type=int, default=50)
    p.add_argument('--sample_every', type=int, default=200)
    p.add_argument('--sample_tokens', type=int, default=256)
    p.add_argument('--temperature', type=float, default=1.0)
    p.add_argument('--top_k', type=int, default=50)
    p.add_argument('--top_p', type=float, default=None)
    p.add_argument('--cpu', action='store_true')
    p.add_argument('--compile', action='store_true')
    p.add_argument('--amp', action='store_true')
    args = p.parse_args(argv)

    args.device = torch.device('cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')
    # Mixed precision is only enabled on CUDA.
    use_amp = args.amp and args.device.type == 'cuda'

    tok = ByteTokenizer()
    ds = ByteDataset(args.data, block_size=args.block_size)
    config = {
        'vocab_size': tok.vocab_size,
        'block_size': args.block_size,
        'n_layer': args.n_layer,
        'n_head': args.n_head,
        'n_embd': args.n_embd,
        'dropout': args.dropout,
    }
    raw_model = GPT(**config).to(args.device)

    # Keep a handle on the uncompiled module: the state_dict of a compiled
    # wrapper has '_orig_mod.'-prefixed keys that a plain GPT cannot load.
    model = raw_model
    if args.compile and hasattr(torch, 'compile'):
        model = torch.compile(raw_model)

    def save_checkpoint(filename):
        """Write weights + config to <out_dir>/<filename> and return the path."""
        os.makedirs(args.out_dir, exist_ok=True)
        path = f"{args.out_dir}/{filename}"
        torch.save({'model': raw_model.state_dict(), 'config': config}, path)
        return path

    opt = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.95), weight_decay=args.weight_decay)
    # torch.cuda.amp.GradScaler/autocast are deprecated in recent PyTorch;
    # prefer the device-agnostic API when this version provides it.
    if hasattr(torch.amp, 'GradScaler'):
        scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
    else:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val = float('inf')
    t0 = time.time()
    model.train()
    for step in range(1, args.steps + 1):
        xb, yb = ds.get_batch('train', args.batch_size, args.device)
        with torch.autocast(device_type=args.device.type, enabled=use_amp):
            _, loss = model(xb, yb)
        opt.zero_grad(set_to_none=True)
        scaler.scale(loss).backward()
        if args.grad_clip > 0:
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        scaler.step(opt)
        scaler.update()

        if step % 50 == 0:
            print(f"step {step:5d} | loss {loss.item():.4f} | {(time.time()-t0):.1f}s")
            t0 = time.time()

        if step % args.eval_interval == 0:
            losses = estimate_loss(model, ds, args)
            print(f"eval | train {losses['train']:.4f} | val {losses['val']:.4f}")
            if losses['val'] < best_val:
                best_val = losses['val']
                ckpt_path = save_checkpoint('model_best.pt')
                print(f"saved checkpoint: {ckpt_path}")

        if args.sample_every > 0 and step % args.sample_every == 0:
            start = torch.randint(low=0, high=len(ds.train) - args.block_size - 1, size=(1,)).item()
            seed = ds.train[start:start + args.block_size].unsqueeze(0).to(args.device)
            out = raw_model.generate(seed, max_new_tokens=args.sample_tokens, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p)
            # generate() switches the model to eval mode; make sure training
            # resumes with dropout active.
            model.train()
            txt = tok.decode(out[0].cpu())
            print("\n================ SAMPLE ================\n" + txt[-(args.block_size + args.sample_tokens):] + "\n=======================================\n")

    # final save (includes the config so the checkpoint can be rebuilt)
    save_checkpoint('model_final.pt')

if __name__ == '__main__':
    # Notebook-style run: a short training job on tiny_hi.txt with a small
    # 2-layer model. The option string splits into the argv token list.
    main(
        '--data tiny_hi.txt --steps 400 --sample_every 100 --eval_interval 100 '
        '--batch_size 32 --block_size 128 --n_layer 2 --n_head 2 --n_embd 128'.split()
    )


  scaler = torch.cuda.amp.GradScaler(enabled=(args.amp and args.device.type == 'cuda'))
  with torch.cuda.amp.autocast(enabled=(args.amp and args.device.type == 'cuda')):


step    50 | loss 2.2778 | 13.8s
step   100 | loss 1.4588 | 13.3s
eval | train 1.4686 | val 1.4469
saved checkpoint: runs/min-gpt/model_best.pt

ा। कबहुँ न संत करहिं तेहि पाना॥
सुरसरि मिलें सो पतअन स क। ादु। लनंी कडररबाअन सर ु अाेबल ु ोऍ र द मह सय़ रिबपरमितखलससह  आग॥पररमुता ढअ
 ं &

step   150 | loss 1.3503 | 23.6s
step   200 | loss 1.3252 | 13.4s
eval | train 1.3000 | val 1.2839
saved checkpoint: runs/min-gpt/model_best.pt

लकु बधजोगू॥
बाल बिलोकि बहुत मैं बाँचा। अब यहु मरलार न्ु सिभचछो-। बसरनलामिीबरं॥हईप। त ौि 

हनमदल ीिल हं गाोंि ्रबए कनृखल बाि ासाोससससईा।हतइ 

step   250 | loss 1.2376 | 23.5s
step   300 | loss 1.2083 | 13.5s
eval | train 1.2039 | val 1.1900
saved checkpoint: runs/min-gpt/model_best.pt

ृपाला। ईस अंस भव परम कृपाला॥
सुनि सनमानहिं सबहि ्र नाउमंर बमिो भ सा ाकमुर।ुबखवी  छरुं आ॥ क हकँइिक बब महुंस्ं दभिंोनि ह कहर वर मस ोत ईपसभा बनसी ि

step   350 | loss 1.2009 | 23.6s
step   400 | loss 1.1337 | 13.5s
eval | train 1.1271 | val 1.1161
saved checkpoint: runs/min-gpt/model_best

In [None]:
from __future__ import annotations
import argparse, torch, sys


def main(argv=None):
    """Load a checkpoint and print text sampled from it.

    Parses command-line style options from ``argv`` (defaults to
    ``sys.argv[1:]``). The prompt is byte-encoded; an empty prompt is
    replaced by a single newline byte. The decoded prompt plus
    ``--tokens`` generated tokens are printed.
    """
    if argv is None:
        argv = sys.argv[1:]

    p = argparse.ArgumentParser()
    p.add_argument('--ckpt', type=str, required=True)
    p.add_argument('--prompt', type=str, default='')
    p.add_argument('--tokens', type=int, default=200)
    p.add_argument('--temperature', type=float, default=1.0)
    p.add_argument('--top_k', type=int, default=50)
    p.add_argument('--top_p', type=float, default=None)
    p.add_argument('--cpu', action='store_true')
    args = p.parse_args(argv)

    device = torch.device('cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')

    tok = ByteTokenizer()
    prompt_ids = tok.encode(args.prompt).unsqueeze(0).to(device)
    if prompt_ids.numel() == 0:
        # If no prompt provided, seed with newline byte (10)
        prompt_ids = torch.tensor([[10]], dtype=torch.long, device=device)

    # The checkpoint holds only tensors and plain Python values, so restrict
    # unpickling to those instead of allowing arbitrary objects.
    ckpt = torch.load(args.ckpt, map_location=device, weights_only=True)
    config = ckpt.get('config', None)

    if config is None:
        # fallback defaults
        model = GPT(tok.vocab_size, block_size=256)
    else:
        model = GPT(**config)
    model = model.to(device)

    # Checkpoints saved from a torch.compile()d model carry an '_orig_mod.'
    # prefix on every key; strip it so they load into a plain GPT.
    prefix = '_orig_mod.'
    state = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in ckpt['model'].items()
    }
    model.load_state_dict(state)

    model.eval()
    out = model.generate(prompt_ids, max_new_tokens=args.tokens, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p)

    print(tok.decode(out[0].cpu()))

if __name__ == '__main__':
    # Notebook-style run: sample 200 tokens from the best checkpoint,
    # continuing a short Devanagari prompt.
    main(['--ckpt', 'runs/min-gpt/model_best.pt', '--tokens', '200', '--prompt', 'करउ अनुग्रह'])


करउ अनुग्रहू गन भिहोहत
ो म बत नबग ासाेनउ गिोहधषो ो। पनुट पर े समस ्सस सिप्ेरेव बुनिघ कीक 


In [None]:
from __future__ import annotations
import argparse, torch, sys


def main(argv=None):
    """Report the mean validation loss of a checkpoint on a data file.

    Parses command-line style options from ``argv`` (defaults to
    ``sys.argv[1:]``), rebuilds the model from the checkpoint's config
    (or fallback defaults when it has none), and averages the loss over
    ``--iters`` random validation batches.
    """
    if argv is None:
        argv = sys.argv[1:]
    p = argparse.ArgumentParser()
    p.add_argument('--data', type=str, required=True)
    p.add_argument('--ckpt', type=str, required=True)
    p.add_argument('--block_size', type=int, default=256)
    p.add_argument('--batch_size', type=int, default=32)
    p.add_argument('--iters', type=int, default=100)
    p.add_argument('--cpu', action='store_true')
    args = p.parse_args(argv)

    device = torch.device('cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')

    # The checkpoint holds only tensors and plain Python values, so restrict
    # unpickling to those instead of allowing arbitrary objects.
    ckpt = torch.load(args.ckpt, map_location=device, weights_only=True)
    cfg = ckpt.get('config', {
        'vocab_size': 256,
        'block_size': args.block_size,
        'n_layer': 4,
        'n_head': 4,
        'n_embd': 256,
        'dropout': 0.0,
    })

    # The model rejects sequences longer than its own block_size, so never
    # draw evaluation windows longer than that.
    block_size = min(args.block_size, cfg['block_size'])
    if block_size != args.block_size:
        print(f"note: using block_size {block_size} (model context) instead of {args.block_size}")
    ds = ByteDataset(args.data, block_size=block_size)

    model = GPT(**cfg).to(device)
    # Checkpoints saved from a torch.compile()d model carry an '_orig_mod.'
    # prefix on every key; strip it so they load into a plain GPT.
    prefix = '_orig_mod.'
    state = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in ckpt['model'].items()
    }
    model.load_state_dict(state)

    model.eval()
    losses = []
    with torch.no_grad():
        for _ in range(args.iters):
            xb, yb = ds.get_batch('val', args.batch_size, device)
            _, loss = model(xb, yb)
            losses.append(loss.item())
    print(f"val loss: {sum(losses)/len(losses):.4f}")


if __name__ == '__main__':
    # Notebook-style run: evaluate the best checkpoint on 50 validation
    # batches. '--block_size' previously carried a stray leading space
    # (' 128'), which parsed only because int() strips whitespace.
    main([
        '--data', 'tiny_hi.txt',
        '--ckpt', 'runs/min-gpt/model_best.pt',
        '--iters', '50',
        '--block_size', '128',
    ])

val loss: 1.1151
