# Model

In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer


# -------------------------------
# Define helper modules
# -------------------------------


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        """
        embed_dim: model embedding dimension.
        num_heads: number of attention heads.
        """
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Projections for query, key, and value
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """
        x: Tensor of shape [batch_size, seq_len, embed_dim]
        """
        batch_size, seq_len, embed_dim = x.size()

        # Compute Q, K, V
        Q = self.q_proj(x)  # (batch_size, seq_len, embed_dim)
        K = self.k_proj(x)
        V = self.v_proj(x)

        # Reshape for multi-head attention: (batch, seq_len, num_heads, head_dim) and then transpose
        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention.
        # Q, K: (batch, num_heads, seq_len, head_dim)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)  # (batch, num_heads, seq_len, seq_len)

        # Create a causal mask so that position i cannot attend to j > i.
        mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        # Expand mask to (batch, num_heads, seq_len, seq_len)
        attn_scores = attn_scores.masked_fill(mask, float('-inf'))

        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_probs, V)  # (batch, num_heads, seq_len, head_dim)

        # Reassemble all heads
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        output = self.out_proj(attn_output)
        return output

class FeedForward(nn.Module):
    """
    Implements a simple two-layer feed-forward network used within Transformer architectures.
    """
    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        """
        embed_dim (int): Dimensionality of the input embeddings.
        ff_dim (int): Dimensionality of the feed-forward hidden layer.
        dropout (float, optional): Dropout probability for regularization. Default is 0.1.
        """
        super().__init__()
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, embed_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerBlock(nn.Module):
    """
    Implements a single Transformer block as used in decoder-only architectures.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        """
        embed_dim (int): Dimensionality of the input embeddings.
        num_heads (int): Number of attention heads in the self-attention mechanism.
        ff_dim (int): Dimensionality of the hidden layer in the feed-forward network.
        dropout (float, optional): Dropout probability for regularization. Default is 0.1.
        """
        super().__init__()
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention with residual connection
        attn_out = self.attention(x)
        x = self.norm1(x + self.dropout(attn_out))
        # Feed-forward network with residual connection
        ff_out = self.ff(x)
        x = self.norm2(x + self.dropout(ff_out))
        return x

# -------------------------------
# Define the full Decoder-only Transformer
# -------------------------------

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_seq_len=512, dropout=0.1):
        """
        vocab_size: Size of the vocabulary.
        embed_dim: Embedding dimension.
        num_heads: Number of attention heads.
        ff_dim: Hidden dimension of feed-forward layer.
        num_layers: Number of Transformer blocks.
        max_seq_len: Maximum sequence length.
        """
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_seq_len, embed_dim)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim, vocab_size)
        self.max_seq_len = max_seq_len

    def forward(self, x):
        """
        x: Tensor of shape [batch_size, seq_len] containing token ids.
        Returns logits of shape [batch_size, seq_len, vocab_size].
        """
        batch_size, seq_len = x.size()
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(x) + self.pos_emb(positions)
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

# Set up Tokenizer, Model, and Device

In [5]:
# -------------------------------
# Set up tokenizer, model, and device
# -------------------------------

# Initialize the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token for padding

# Define hyperparameters
embed_dim = 128
num_heads = 4
ff_dim = 512
num_layers = 2
max_seq_len = 64  # maximum sequence length for each example
batch_size = 4
num_epochs = 4  # you can increase this as needed
lr = 1e-3

# Instantiate the model
model = TransformerDecoder(vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_seq_len)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

## Simple Test with Sample Test

In [None]:
# -------------------------------
# Set up sample data
# -------------------------------

# Create some sample texts
sample_texts = [
    "Hello, my name is ChatGPT.",
    "I love working with Transformers.",
    "Let's build a model from scratch."
]

# Tokenize the texts (padding/truncation as needed)
encodings = tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_len)
input_ids = encodings["input_ids"].to(device)  # shape: [batch, seq_len]

# -------------------------------
# Training loop (demo for one batch)
# -------------------------------

# For language modeling, we predict the next token. We shift the inputs by 1.
def shift_right(x):
    # Remove the first token and use it as input
    return x[:, :-1], x[:, 1:]

inputs, targets = shift_right(input_ids)  # both of shape: [batch, seq_len-1]

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# One training step
model.train()
optimizer.zero_grad()
logits = model(inputs)  # logits: [batch, seq_len-1, vocab_size] (if our input length is seq_len-1)
# Because our model is causal, we can compute loss on next-token predictions.
# If necessary, you could trim logits to match the targets.
if logits.size(1) != targets.size(1):
    logits = logits[:, :targets.size(1), :]

loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
loss.backward()
optimizer.step()

print("Training loss:", loss.item())

# -------------------------------
# Tests & Inference
# -------------------------------

# Test 1: Check output shape
model.eval()
with torch.no_grad():
    test_logits = model(input_ids)
print("Logits shape (should be [batch, seq_len, vocab_size]):", test_logits.shape)

# Test 2: Generate text by sampling one token at a time.
def generate_text(model, tokenizer, prompt, max_new_tokens=20):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated = input_ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # For generation, we pass the full sequence (or the last max_seq_len tokens)
            if generated.size(1) > max_seq_len:
                generated = generated[:, -max_seq_len:]
            logits = model(generated)
            # Take the last token's logits and sample from the distribution
            next_token_logits = logits[:, -1, :]
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    output_text = tokenizer.decode(generated[0])
    return output_text

prompt = "In a world"
generated_text = generate_text(model, tokenizer, prompt)
print("\nGenerated text:")
print(generated_text)

#Dataset

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

## Training OpenWeb Dataset

In [None]:
from torch.utils.data import DataLoader
from datasets import load_dataset


# Load a small fraction of OpenWebText (a dataset similar in spirit to The Pile)
openwebtext_dataset = load_dataset("openwebtext", split="train[:1%]")

def tokenize_function(examples):
    '''
    Tokenize the "text" field with truncation and padding to max_seq_len
    '''
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_seq_len)

tokenized_dataset = openwebtext_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")

# Create a DataLoader for batching
dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True)

Map:   0%|          | 0/80138 [00:00<?, ? examples/s]

## Training and Validation OpenWeb Datasets

In [4]:
from datasets import load_dataset
from torch.utils.data import DataLoader

# -------------------------------
# Load a larger fraction (5%) of OpenWebText dataset
# -------------------------------
dataset_fraction = "train[:5%]"
openwebtext_dataset = load_dataset("openwebtext", split=dataset_fraction)

# -------------------------------
# Split the dataset into training and validation sets (80% train, 20% validation)
# -------------------------------
split_dataset = openwebtext_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

print("Training set size:", len(train_dataset))
print("Validation set size:", len(val_dataset))

# -------------------------------
# Define the tokenization function
# -------------------------------
def tokenize_function(examples):
    """
    Tokenize the "text" field with truncation and padding to max_seq_len.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_seq_len)

# -------------------------------
# Apply tokenization to both training and validation sets in parallel (if desired)
# -------------------------------
# Adjust num_proc (e.g., 4) according to your available CPU cores.
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"], num_proc=4)
val_dataset   = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"], num_proc=4)

# Set the format for PyTorch tensors
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# -------------------------------
# Create DataLoaders for batching
# -------------------------------
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

README.md:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

openwebtext.py:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

The repository for openwebtext contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/openwebtext.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0/21 [00:00<?, ?files/s]

urlsf_subset00.tar:   0%|          | 0.00/633M [00:00<?, ?B/s]

urlsf_subset01.tar:   0%|          | 0.00/629M [00:00<?, ?B/s]

urlsf_subset02.tar:   0%|          | 0.00/629M [00:00<?, ?B/s]

urlsf_subset03.tar:   0%|          | 0.00/628M [00:00<?, ?B/s]

urlsf_subset04.tar:   0%|          | 0.00/627M [00:00<?, ?B/s]

urlsf_subset05.tar:   0%|          | 0.00/630M [00:00<?, ?B/s]

urlsf_subset06.tar:   0%|          | 0.00/626M [00:00<?, ?B/s]

urlsf_subset07.tar:   0%|          | 0.00/625M [00:00<?, ?B/s]

urlsf_subset08.tar:   0%|          | 0.00/625M [00:00<?, ?B/s]

urlsf_subset09.tar:   0%|          | 0.00/626M [00:00<?, ?B/s]

urlsf_subset10.tar:   0%|          | 0.00/625M [00:00<?, ?B/s]

urlsf_subset11.tar:   0%|          | 0.00/625M [00:00<?, ?B/s]

urlsf_subset12.tar:   0%|          | 0.00/624M [00:00<?, ?B/s]

urlsf_subset13.tar:   0%|          | 0.00/629M [00:00<?, ?B/s]

urlsf_subset14.tar:   0%|          | 0.00/627M [00:00<?, ?B/s]

urlsf_subset15.tar:   0%|          | 0.00/621M [00:00<?, ?B/s]

urlsf_subset16.tar:   0%|          | 0.00/619M [00:00<?, ?B/s]

urlsf_subset17.tar:   0%|          | 0.00/619M [00:00<?, ?B/s]

urlsf_subset18.tar:   0%|          | 0.00/618M [00:00<?, ?B/s]

urlsf_subset19.tar:   0%|          | 0.00/619M [00:00<?, ?B/s]

urlsf_subset20.tar:   0%|          | 0.00/377M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

Training set size: 320550
Validation set size: 80138


Map (num_proc=4):   0%|          | 0/320550 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/80138 [00:00<?, ? examples/s]

NameError: name 'batch_size' is not defined

#Training

## Training I: no-validation

In [None]:
# -------------------------------
# Prepare helper function for training targets (shift tokens)
# -------------------------------
def shift_right(x):
    '''
    For language modeling, predict the next token.
    '''
    return x[:, :-1], x[:, 1:]


# -------------------------------
# Training loop using OpenWeb dataset
# -------------------------------
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)  # [batch, seq_len]
        inputs, targets = shift_right(input_ids)
        optimizer.zero_grad()
        logits = model(inputs)
        # Ensure logits and targets have matching sequence lengths
        if logits.size(1) != targets.size(1):
            logits = logits[:, :targets.size(1), :]
        loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} average loss: {avg_loss}")

Epoch 1 average loss: 6.304511387996135
Epoch 2 average loss: 5.704159862881502
Epoch 3 average loss: 5.487709370080688


###Simple Inference Test

In [None]:
# -------------------------------
# Simple Inference Test
# -------------------------------

def generate_text(model, tokenizer, prompt, max_new_tokens=20):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated = input_ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if generated.size(1) > max_seq_len:
                generated = generated[:, -max_seq_len:]
            logits = model(generated)
            next_token_logits = logits[:, -1, :]
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return tokenizer.decode(generated[0])

prompt = "In a world"
print("\nGenerated text:")
print(generate_text(model, tokenizer, prompt,64))


Generated text:
 world, has been a mechanism of Requiot can launch bitcoin. In America, in a dispute view from which he helped prevent Wales Battlefield next month. The US strip month, ESPN, perhaps on.

There are some EUload this year and speculation from the Olympic Games into the International Monetary Fund on Saturday, a significant


##Trainig II: with training and validation loss tracking

In [None]:
# -------------------------------
# Prepare helper function for training targets (shift tokens)
# -------------------------------
def shift_right(x):
    '''
    For language modeling, predict the next token.
    '''
    return x[:, :-1], x[:, 1:]

# -------------------------------
# Training loop with validation and loss tracking
# -------------------------------

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    running_train_loss = 0.0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)  # shape: [batch_size, seq_len]
        inputs, targets = shift_right(input_ids)    # shift tokens for next-token prediction
        optimizer.zero_grad()
        logits = model(inputs)  # forward pass; output shape: [batch_size, seq_len, vocab_size]
        # Ensure logits and targets have matching sequence lengths
        if logits.size(1) != targets.size(1):
            logits = logits[:, :targets.size(1), :]
        loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        running_train_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluate on validation set
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            inputs, targets = shift_right(input_ids)
            logits = model(inputs)
            if logits.size(1) != targets.size(1):
                logits = logits[:, :targets.size(1), :]
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
            running_val_loss += loss.item()
    avg_val_loss = running_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

### Training and Validation Loss Curves

In [None]:
# -------------------------------
# Plot training and validation loss curves
# -------------------------------

plt.figure(figsize=(8, 6))
plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss")
plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs. Validation Loss")
plt.legend()
plt.show()