<a href="https://colab.research.google.com/github/codekikicode/python-coding-folio/blob/main/Project2_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch, time, math, os
# Query CUDA once and reuse the answer for both the report and device choice.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))
device = 'cuda' if has_cuda else 'cpu'
# Let cuDNN autotune its kernels; worthwhile when tensor shapes stay stable.
torch.backends.cudnn.benchmark = True


CUDA available: True
GPU: Tesla T4


Download Tiny Shakespeare dataset

In [None]:
import urllib.request, pathlib
# Fetch the Tiny Shakespeare corpus once and cache it as input.txt.
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
data_path = pathlib.Path("input.txt")
data_path.parent.mkdir(parents=True, exist_ok=True)
if not data_path.exists():
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated input.txt that the exists() check
    # above would accept as the complete dataset on the next run.
    partial_path = data_path.with_name(data_path.name + ".part")
    urllib.request.urlretrieve(DATA_URL, str(partial_path))
    partial_path.replace(data_path)
print("Dataset ready:", os.path.getsize("input.txt"), "bytes")


Dataset ready: 1115394 bytes


Preprocess data | Split Training and Validation Data

In [None]:
# Read the whole corpus; the context manager closes the file handle promptly
# instead of leaving it to the garbage collector.
with open("input.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print("Vocab size:", vocab_size)

# Lookup tables between characters and their integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of string *s*."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids *l*."""
    return ''.join(itos[i] for i in l)


# Encode the full text, then hold out the last 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print("Splits:", len(train_data), len(val_data))


Vocab size: 65
Splits: 1003854 111540


Set Hyperparameters

In [None]:
# Batch geometry used by get_batch
batch_size = 64
block_size = 256

# Optimisation and evaluation schedule
learning_rate = 3e-4
max_iters = 5000
eval_interval = 1000
eval_iters = 25

# Model scale
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Seed the default RNG so that runs are repeatable.
torch.manual_seed(1337)


<torch._C.Generator at 0x7d3198fefdd0>

In [None]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Uses ``train_data`` when *split* is ``'train'`` and ``val_data`` for any
    other value. Each target window is the matching input window shifted one
    position to the right. Both stacked tensors are moved to ``device``.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


Transformer model with Dropout ON

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer so it follows the module across
        # devices without being treated as a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a rank-3 input and return the weighted values.

        The second axis of *x* is the sequence axis; position ``t`` may only
        attend to positions ``<= t``. The output keeps the first two axes of
        *x* and has the value projection's width on the last axis.
        """
        _, T, _ = x.shape
        k, q = self.key(x), self.query(x)
        # Scale by 1/sqrt(d_k), where d_k is the key/head width (k's last
        # axis). Scaling by the embedding width of x instead over-shrinks the
        # scores and flattens the softmax whenever head_size != n_embd.
        wei = (q @ k.transpose(-2, -1)) * (k.shape[-1] ** -0.5)
        # Hide future positions so that softmax gives them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer.

    Args:
        num_heads: how many ``Head`` modules to create.
        head_size: size passed to each ``Head``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        per_head = tuple(head(x) for head in self.heads)
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, ReLU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run *x* through the stacked layers."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual add.

    Layer normalisation is applied to the input of each sub-layer.

    Args:
        n_embd: embedding width; also sets the LayerNorm size.
        n_head: number of attention heads; each gets ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ff = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return *x* after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ff(self.ln2(attended))

class GPTLanguageModel(nn.Module):
    """Decoder-only Transformer language model over integer token ids.

    Sizes are read from the module-level settings ``vocab_size``,
    ``block_size``, ``n_embd``, ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb   = nn.Embedding(block_size, n_embd)
        self.blocks    = nn.Sequential(*(Block(n_embd, n_head) for _ in range(n_layer)))
        self.ln_f      = nn.LayerNorm(n_embd)
        self.lm_head   = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if *targets* is given, the loss.

        Args:
            idx: rank-2 tensor of token ids, shape ``(B, T)``.
            targets: optional tensor of target ids with ``B*T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, logits keep a leading
            ``(B, T)`` shape and loss is ``None``. With targets, logits are
            flattened to ``(B*T, -1)`` and loss is their cross-entropy
            against the flattened targets.
        """
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        hidden = self.token_emb(idx) + self.pos_emb(positions)
        hidden = self.ln_f(self.blocks(hidden))
        logits = self.lm_head(hidden)
        if targets is None:
            return logits, None
        flat_logits = logits.view(B*T, -1)
        flat_targets = targets.view(B*T)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append *max_new_tokens* sampled tokens to *idx* and return the result.

        Each step feeds at most the last ``block_size`` tokens to the model
        and samples one token per row from the softmax of the final
        position's logits.
        """
        for _ in range(max_new_tokens):
            window = idx[:, -block_size:]
            step_logits, _ = self(window)
            last_logits = step_logits[:, -1, :]
            next_probs = F.softmax(last_logits, dim=-1)
            sampled = torch.multinomial(next_probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx


Instantiate | Train model (Timed)

In [None]:
import time, math, torch

# Instantiate
model = GPTLanguageModel().to(device)

# torch.compile is an optional speed-up for the forward/backward passes.
# If it is unavailable, keep the eager model, but report why instead of
# hiding the failure.
try:
    model = torch.compile(model)
except Exception as exc:
    print(f"torch.compile unavailable, running eagerly: {exc}")
else:
    print("Compiled with torch.compile")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Loss scaling only applies to CUDA mixed precision. On CPU the scaler is
# created disabled rather than being handed an invalid device of None.
scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda"))

def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. The result is a dict with the keys
    ``'train'`` and ``'val'``.
    """
    def _mean_loss(split):
        # Average the scalar loss over eval_iters freshly sampled batches.
        total = 0.0
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            total += model(X, Y)[1].item()
        return total / eval_iters

    model.eval()
    with torch.inference_mode():
        out = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return out

stamp_format = "%Y-%m-%d %H:%M:%S"
start = time.time()
print("Training started at:", time.strftime(stamp_format, time.localtime(start)))

# None of these change during training, so work them out once.
on_cuda = device == 'cuda'
autocast_device = "cuda" if on_cuda else "cpu"
use_scaler = on_cuda and scaler is not None
final_iter = max_iters - 1

for it in range(max_iters):
    # Report losses every eval_interval steps and on the last iteration.
    if it % eval_interval == 0 or it == final_iter:
        losses = estimate_loss()
        print(f"step {it}: train {losses['train']:.4f}, val {losses['val']:.4f}")

    xb, yb = get_batch('train')

    # The forward pass runs under autocast only when training on CUDA.
    with torch.amp.autocast(device_type=autocast_device, enabled=on_cuda):
        _, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    if not use_scaler:
        loss.backward()
        optimizer.step()
        continue

    # CUDA path: scale the loss before backward, then step through the scaler.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

end = time.time()
print("Training finished at:", time.strftime(stamp_format, time.localtime(end)))
print(f"Total training time: {end - start:.2f} seconds")


Compiled with torch.compile
Training started at: 2025-08-11 03:01:12
step 0: train 4.2702, val 4.2688
step 1000: train 1.6208, val 1.7927
step 2000: train 1.3509, val 1.5676
step 3000: train 1.2419, val 1.4974
step 4000: train 1.1491, val 1.4702
step 4999: train 1.0798, val 1.4731
Training finished at: 2025-08-11 03:13:11
Total training time: 718.86 seconds


Generate 5000 characters | Report metrics

In [None]:
# Generate 5k characters
model.eval()
with torch.inference_mode():
    # Start from a single token with id 0 and let the model continue from there.
    ctx = torch.zeros((1,1), dtype=torch.long, device=device)
    generated = model.generate(ctx, max_new_tokens=5000)
    out_tokens = generated[0].tolist()
sample = decode(out_tokens)

preview = sample[:1000]
print(preview)               # preview
print("\n--- total length:", len(sample))

# Final loss + perplexity
def final_metrics():
    """Return ``(mean validation loss, perplexity)`` over ``eval_iters`` random batches.

    Perplexity is ``exp`` of the mean loss. The model is left in eval mode.
    """
    model.eval()
    running_total = 0.0
    with torch.inference_mode():
        for _ in range(eval_iters):
            X, Y = get_batch('val')
            batch_loss = model(X, Y)[1]
            running_total += batch_loss.item()
    mean_loss = running_total / eval_iters
    return mean_loss, math.exp(mean_loss)

vloss, ppl = final_metrics()
print(f"Final val loss ~ {vloss:.4f} | perplexity ~ {ppl:.1f}")



Condition!
Welcome, damnned slain, like venom'd; there's
That call ic to Rome, take not the business.

KING LEWIS XI:
What they say 'tis choice of laque
And gian false your breasts blood. A wige, my lords,
Have I to them. Come, lady, Barnardine,
To be me the rest, Lucentio, be I am:
But set another gentlemen; since I have not spoke
At Duke of Angelo seem to his fastime.

ROMEO:
Now Romeo, Marcius,
Richard Romeo, show more than his abund himself:
No money, what is valiant for it be?

JULIET:
O great Ratnand!

ROMEO:
I farewell, go-day.

JULIET:
And Duy ne! why, hhath baggarly bed to hear speak.

ROMEO:
Darry he?

JULIET:
Madam, with the glory pities, report your foot,
drawn by the unwhether queen had sunguish'd up in him:
Here, Margaret, never known in a breath:
'Tis marriage by such of the envious beauty
That which they shall be coal. Harket you know'st! Thou,
What should he had been forget a parlous burden leave,
mightily shores forth be a life, this whether,
No less was straight beg

Final Analysis: The model achieved a final validation loss of approximately 1.4772 and a perplexity of about 4.4, which is a notable improvement over the baseline. Lower loss and perplexity indicate the model is more confident in its next-character predictions. While occasional nonsensical phrases still appear, the output consistently preserves play-like structure (speaker labels, short verse lines) and maintains Shakespearean-style vocabulary, suggesting effective learning.

Increasing embedding size to 384 dimensions with 6 heads × 6 layers and adding dropout = 0.2 improved generalization and reduced overfitting, as reflected in both the cleaner text generation and the drop in validation loss.








