In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [38]:
# Hyperparameters
batch_size = 32                 # Number of sequences processed in one forward/backward pass
block_size = 32               # Context window size (how many previous tokens to consider)
n_embed = 256                  # Size of the token and positional embeddings
n_layer = 6                   # Number of transformer blocks
n_head = 8                    # Number of attention heads per transformer block
dropout = 0.2                 # Dropout rate to reduce overfitting
max_iters = 1000              # Total number of training iterations
eval_iters = 100              # Number of batches used for loss evaluation
eval_interval = 100           # Frequency of evaluation during training
learning_rate = 3e-4          # Learning rate for the optimizer

# Set the device for training (GPU if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set manual seed for reproducibility
torch.manual_seed(1317)

<torch._C.Generator at 0x7fdeeff45250>

In [29]:
!wget https://www.gutenberg.org/cache/epub/75854/pg75854.txt
!wget https://www.gutenberg.org/cache/epub/67709/pg67709.txt
!wget https://www.gutenberg.org/cache/epub/75778/pg75778.txt
!wget https://www.gutenberg.org/cache/epub/75806/pg75806.txt
!wget https://www.gutenberg.org/cache/epub/75832/pg75832.txt
!wget https://www.gutenberg.org/cache/epub/75766/pg75766.txt
!wget https://www.gutenberg.org/cache/epub/75870/pg75870.txt
!wget https://www.gutenberg.org/cache/epub/75948/pg75948.txt

--2025-05-01 04:05:33--  https://www.gutenberg.org/cache/epub/75854/pg75854.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 129993 (127K) [text/plain]
Saving to: ‘pg75854.txt.1’


2025-05-01 04:05:33 (1.36 MB/s) - ‘pg75854.txt.1’ saved [129993/129993]

--2025-05-01 04:05:34--  https://www.gutenberg.org/cache/epub/67709/pg67709.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42466 (41K) [text/plain]
Saving to: ‘pg67709.txt.1’


2025-05-01 04:05:34 (1.25 MB/s) - ‘pg67709.txt.1’ saved [42466/42466]

--2025-05-01 04:05:35--  https://www.gutenberg.org/cache/epub/75778/pg75778.txt
Resolving www.gutenberg.org (w

In [39]:
files = [
    'pg75854.txt',
    'pg67709.txt',
    'pg75778.txt',
    'pg75806.txt',
    'pg75832.txt',
    'pg75766.txt',
    'pg75870.txt',
    'pg75948.txt',
    'pg75854.txt',
    'pg67709.txt',
    'pg75778.txt',
    'pg75806.txt',
    'pg75832.txt',
    'pg75766.txt',
    'pg75870.txt',
    'pg75948.txt'
]


text = ""
for file in files:
    with open(file, 'r', encoding = 'utf-8') as f:
        text += f.read()

In [40]:
import re

text = text.lower()
text = re.sub('[^a-z0-9 ]', '', text)
text = re.sub('/s+', ' ', text)

In [41]:
# Find all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)

# Create a mapping from characters to integer
ctoi = {char:index for index,char in enumerate(chars)}
itoc = {index:char for index,char in enumerate(chars)}
encode = lambda s: [ctoi[char] for char in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itoc[index] for index in l]) # decoder: take a list of integers, output a string

37


In [42]:
# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [43]:
# Total nos of characters in data set

len(text)

5077320

In [18]:
# Function to generate a batch of input-target pairs for training or validation
def get_batch(split):
    data = train_data if split == 'train' else val_data

    # Randomly choose starting indices for each sequence in the batch
    index = torch.randint(len(data) - block_size, (batch_size,))

    # Create input sequences (x) and target sequences (y) shifted by one position
    x = torch.stack([data[i:i+block_size] for i in index])
    y = torch.stack([data[i+1:i+block_size+1] for i in index])

    # Move data to the appropriate device (CPU or GPU)
    x, y = x.to(device), y.to(device)
    return x, y


In [19]:
@torch.no_grad()
# Function to evaluate the model and report the loss
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [20]:
# One head of self-attention (used inside a multi-head attention block)
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()

        # Linear layers to compute key, query, and value vectors without biases
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)

        # Lower triangular matrix for causal masking (prevent attention to future tokens)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # Dropout to regularize attention weights
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape  # B: batch size, T: sequence length, C: embedding size

        # Compute keys and queries
        k = self.key(x)        # (B, T, head_size)
        q = self.query(x)      # (B, T, head_size)

        # Compute scaled dot-product attention scores
        attention = q @ k.transpose(-2, -1) * C**-0.5  # (B, T, T)

        # Apply causal mask to prevent attending to future tokens
        attention = attention.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

        # Normalize attention weights and apply dropout
        attention = F.softmax(attention, dim=-1)
        attention = self.dropout(attention)

        # Compute values and apply attention weights
        v = self.value(x)      # (B, T, head_size)
        out = attention @ v    # (B, T, head_size)

        return out


In [21]:
# Multiple heads of self-attention in parallel
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(n_embed, n_embed)  #Wq, Wk, Wv
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.projection(out))
        return out

In [22]:
class FeedFoward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_head * n_embed),  # 512 x 2048
            nn.ReLU(),
            nn.Linear(n_head * n_embed, n_embed), # 2048 x 512
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [23]:
class Block(nn.Module):   # Block - Encoder Layer
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head    # 512//8 = 64
        self.sa = MultiHeadAttention(n_head, head_size)   # 8, 64
        self.ffwd = FeedFoward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x): 
        x = x + self.sa(self.ln1(x))   # Residual connections
        x = x + self.ffwd(self.ln2(x))
        return x

In [30]:
class MyGPT(nn.Module):
    def __init__(self):
        super().__init__()
        # Embedding layer: maps each token ID to a vector of size n_embed
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)

        # Positional embedding: provides position-specific context to each token
        self.position_embedding_table = nn.Embedding(block_size, n_embed)

        # Stack of transformer blocks (self-attention + feedforward layers)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])

        # Final layer normalization before output
        self.ln_f = nn.LayerNorm(n_embed)

        # Output layer: projects back to vocabulary size for logits
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, index, targets=None):
        B, T = index.shape  # Batch size and sequence length

        # Token and positional embeddings (B,T,C)
        tok_embed = self.token_embedding_table(index)
        pos_embed = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_embed + pos_embed

        # Pass through transformer blocks and final layer norm
        x = self.blocks(x)
        x = self.ln_f(x)

        # Compute logits for next token prediction
        logits = self.lm_head(x)

        # If targets are provided, compute cross-entropy loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        # Generate new tokens iteratively
        for _ in range(max_new_tokens):
            # Use only the last block_size tokens as context
            context_window = index[:, -block_size:]

            # Forward pass to get logits for the next token
            logits, loss = self(context_window)
            logits = logits[:, -1, :]  # Focus on the last time step

            # Convert logits to probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample the next token from the probability distribution
            index_next = torch.multinomial(probs, num_samples=1)

            # Append sampled token to the sequence
            index = torch.cat((index, index_next), dim=1)

        return index

In [44]:
model = MyGPT()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

7.913509 M parameters


In [45]:
# Initialize the AdamW optimizer with model parameters and specified learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for iter in range(max_iters):

    # Evaluate and print training/validation loss at regular intervals
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Get a batch of training data (inputs and targets)
    x_batch, y_batch = get_batch('train')

    # Forward pass: compute predictions and loss
    logits, loss = model(x_batch, y_batch)

    # Backward pass and optimization step
    optimizer.zero_grad(set_to_none=True)  # Clear gradients
    loss.backward()                        # Backpropagate the loss
    optimizer.step()                       # Update model parameters


Step 0: train loss 3.8063, val loss 3.8119
Step 100: train loss 2.4048, val loss 2.4587
Step 200: train loss 2.3225, val loss 2.3889
Step 300: train loss 2.1819, val loss 2.2317
Step 400: train loss 2.0950, val loss 2.1248
Step 500: train loss 2.0342, val loss 2.0658
Step 600: train loss 1.9856, val loss 1.9994
Step 700: train loss 1.9407, val loss 1.9653
Step 800: train loss 1.9042, val loss 1.9285
Step 900: train loss 1.8758, val loss 1.8932
Step 999: train loss 1.8541, val loss 1.8587


In [46]:
# Start generation with a single token (index 0) on the correct device
firstindex = torch.zeros((1,1), dtype=torch.long, device=device)

# Generate 2000 new tokens from the model and decode them back into readable text
print(decode(model.generate(index=firstindex, max_new_tokens=1000)[0].tolist()))

 unde   hist alongrausteng aroned whatles with casneer on the   chantima to sheele of than in spexic when is soecculded                gending to deain shaprvybuch had it as way beeare a hue asy must lestowng rame we arnd do bery are caintle father was napte hemady it markan cen be hough dain was an didove diacking lates and reverycummalandy so ariung dy mower fal deeppaine butchacil is all a but ut a gexisuppentere why he gual conners the clay the un of thebegenatog 52 witherls arere earn i usybeir upply whh evemby with uxher posts coerfable dayter foll by abbean and and heow the wourk for whiling sa myeng hous go and maketh sime and from evening of achish the selatureesh it   a foughtread to more gaintard grending out use and butage theanunnere pmawing a nated in froms a tranginemmothere head  greadelw to that up t2     with by pelughtly grainned e7pongivel thas in therve prijust the fach to nowss plenices the regare at goend ten earchiangs in i campomme afcure sawe hing from rest in