In [5]:
# Comprehensive Scaling Study of Transformer Architectures
# This script defines, trains, and evaluates three core model types:
# 1. Decoder-only (like GPT) for next-token prediction.
# 2. Encoder-only (like BERT) for sequence classification.
# 3. Encoder-Decoder (like T5/BART) for sequence-to-sequence tasks.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import json
import time
import random
import math
from tqdm.auto import tqdm
import os

# --- Setup ---
try:
    import torch._dynamo
    torch._dynamo.disable()
    print("✓ torch._dynamo disabled to prevent environment-specific errors.")
except (ImportError, AttributeError):
    print("! torch._dynamo not found or disabled. This is normal.")

def get_device():
    if torch.cuda.is_available(): return 'cuda'
    if torch.backends.mps.is_available(): return 'mps'
    return 'cpu'

device = get_device()
print(f"🖥️ Using device: {device.upper()}")

# ============================================================================
# SECTION 1: CORE TRANSFORMER COMPONENTS
# ============================================================================

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (seq_len, batch_size, d_model)
        x = x + self.pe[:x.size(0)]
        return x

# ============================================================================
# SECTION 2: THREE DISTINCT ARCHITECTURES
# ============================================================================

# --- 2.1: Decoder-Only Architecture (e.g., GPT) ---
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_encoder = PositionalEncoding(config.d_model, config.seq_length)
        decoder_layer = nn.TransformerDecoderLayer(d_model=config.d_model, nhead=config.n_head, dim_feedforward=4*config.d_model, batch_first=True, dropout=0.1)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=config.n_layer)
        self.head = nn.Linear(config.d_model, config.vocab_size)

    def forward(self, input_ids, labels=None):
        # input_ids shape: (batch_size, seq_len)
        src = self.embedding(input_ids) * math.sqrt(self.config.d_model)
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1) # Transpose for pos encoding

        # Create a causal mask for the decoder
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(input_ids.size(1)).to(device)

        output = self.transformer_decoder(tgt=src, memory=src, tgt_mask=tgt_mask, memory_mask=tgt_mask)
        logits = self.head(output)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits.view(-1, self.config.vocab_size), labels.view(-1))

        return {'loss': loss, 'logits': logits}

# --- 2.2: Encoder-Only Architecture (e.g., BERT) ---
class EncoderOnlyTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_encoder = PositionalEncoding(config.d_model, config.seq_length)
        encoder_layer = nn.TransformerEncoderLayer(d_model=config.d_model, nhead=config.n_head, dim_feedforward=4*config.d_model, batch_first=True, dropout=0.1)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.n_layer)
        # Classification head
        self.head = nn.Linear(config.d_model, config.num_classes)

    def forward(self, input_ids, labels=None):
        # input_ids shape: (batch_size, seq_len)
        src = self.embedding(input_ids) * math.sqrt(self.config.d_model)
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1)

        output = self.transformer_encoder(src)

        # Use the representation of the first token ([CLS]) for classification
        cls_output = output[:, 0, :]
        logits = self.head(cls_output)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)

        return {'loss': loss, 'logits': logits}

# --- 2.3: Encoder-Decoder Architecture (e.g., T5) ---
class EncoderDecoderTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_encoder = PositionalEncoding(config.d_model, config.seq_length)
        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.n_head,
            num_encoder_layers=config.n_layer,
            num_decoder_layers=config.n_layer,
            dim_feedforward=4*config.d_model,
            batch_first=True,
            dropout=0.1
        )
        self.head = nn.Linear(config.d_model, config.vocab_size)

    def forward(self, src_ids, tgt_ids, labels=None):
        # src_ids, tgt_ids shape: (batch_size, seq_len)
        src = self.embedding(src_ids) * math.sqrt(self.config.d_model)
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1)

        tgt = self.embedding(tgt_ids) * math.sqrt(self.config.d_model)
        tgt = self.pos_encoder(tgt.transpose(0, 1)).transpose(0, 1)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        logits = self.head(output)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits.view(-1, self.config.vocab_size), labels.view(-1))

        return {'loss': loss, 'logits': logits}

# ============================================================================
# SECTION 3: DATASETS & CONFIGURATIONS
# ============================================================================

class ModelConfig:
    def __init__(self, name, arch_type, n_layer, n_head, d_model, vocab_size=5000, seq_length=32, num_classes=10):
        self.name = name
        self.arch_type = arch_type
        self.n_layer = n_layer
        self.n_head = n_head
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.num_classes = num_classes

# --- Define model configurations (p scaling) ---
MODEL_CONFIGS = {
    "decoder_small": ModelConfig("decoder_small", "decoder_only", n_layer=2, n_head=2, d_model=64),
    "decoder_base": ModelConfig("decoder_base", "decoder_only", n_layer=4, n_head=4, d_model=128),
    "encoder_small": ModelConfig("encoder_small", "encoder_only", n_layer=2, n_head=2, d_model=64),
    "encoder_base": ModelConfig("encoder_base", "encoder_only", n_layer=4, n_head=4, d_model=128),
    "encdec_small": ModelConfig("encdec_small", "encoder_decoder", n_layer=2, n_head=2, d_model=64),
    "encdec_base": ModelConfig("encdec_base", "encoder_decoder", n_layer=4, n_head=4, d_model=128),
}

# --- Task-specific Datasets ---
class LMDataset(Dataset): # For Decoder-only
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return {'input_ids': torch.tensor(self.data[idx], dtype=torch.long)}

class ClassificationDataset(Dataset): # For Encoder-only
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.data[idx]['seq'], dtype=torch.long),
            'labels': torch.tensor(self.data[idx]['label'], dtype=torch.long)
        }

class Seq2SeqDataset(Dataset): # For Encoder-Decoder
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        return {
            'src_ids': torch.tensor(self.data[idx]['src'], dtype=torch.long),
            'tgt_ids': torch.tensor(self.data[idx]['tgt'], dtype=torch.long)
        }

# --- Synthetic Data Generation ---
def generate_data(task, n_samples, seq_length=32, vocab_size=5000, num_classes=10):
    data = []
    if task == 'decoder_only':
        for _ in range(n_samples):
            start = random.randint(1, vocab_size // 2)
            step = random.randint(1, 5)
            data.append([(start + j * step) % vocab_size for j in range(seq_length)])
    elif task == 'encoder_only':
        for _ in range(n_samples):
            label = random.randint(0, num_classes - 1)
            # Create sequences where the sum of tokens modulo num_classes equals the label
            seq = [random.randint(1, vocab_size-1) for _ in range(seq_length - 1)]
            checksum = sum(seq) % num_classes
            correction = (label - checksum + num_classes) % num_classes
            seq.append(correction + 1) # Ensure not 0
            random.shuffle(seq)
            data.append({'seq': seq, 'label': label})
    elif task == 'encoder_decoder':
        for _ in range(n_samples):
            seq = [random.randint(1, vocab_size-1) for _ in range(seq_length)]
            # Task: reverse the sequence
            data.append({'src': seq, 'tgt': seq[::-1]})
    return data

# ============================================================================
# SECTION 4: FLEXIBLE TRAINER
# ============================================================================

class UniversalTrainer:
    def __init__(self, model, config):
        self.model = model.to(device)
        self.config = config
        self.use_amp = (device == 'cuda')
        self.scaler = torch.amp.GradScaler(enabled=self.use_amp)

    def train(self, train_loader, val_loader, epochs=2, lr=1e-4):
        optimizer = optim.AdamW(self.model.parameters(), lr=lr)
        history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training", leave=False):
                batch = {k: v.to(device) for k, v in batch.items()}

                with torch.autocast(device_type=device, dtype=torch.float16, enabled=self.use_amp):
                    if self.config.arch_type == 'decoder_only':
                        outputs = self.model(input_ids=batch['input_ids'], labels=batch['input_ids'])
                    elif self.config.arch_type == 'encoder_only':
                        outputs = self.model(input_ids=batch['input_ids'], labels=batch['labels'])
                    else: # encoder_decoder
                        outputs = self.model(src_ids=batch['src_ids'], tgt_ids=batch['tgt_ids'], labels=batch['tgt_ids'])
                    loss = outputs['loss']

                optimizer.zero_grad(set_to_none=True)
                self.scaler.scale(loss).backward()
                self.scaler.step(optimizer)
                self.scaler.update()
                total_loss += loss.item()

            avg_train_loss = total_loss / len(train_loader)
            history['train_loss'].append(avg_train_loss)

            # --- Validation ---
            self.model.eval()
            total_val_loss, total_correct, total_samples = 0, 0, 0
            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    with torch.autocast(device_type=device, dtype=torch.float16, enabled=self.use_amp):
                        if self.config.arch_type == 'decoder_only':
                            outputs = self.model(input_ids=batch['input_ids'], labels=batch['input_ids'])
                        elif self.config.arch_type == 'encoder_only':
                            outputs = self.model(input_ids=batch['input_ids'], labels=batch['labels'])
                            preds = torch.argmax(outputs['logits'], dim=1)
                            total_correct += (preds == batch['labels']).sum().item()
                            total_samples += batch['labels'].size(0)
                        else: # encoder_decoder
                            outputs = self.model(src_ids=batch['src_ids'], tgt_ids=batch['tgt_ids'], labels=batch['tgt_ids'])

                        val_loss = outputs['loss']
                    total_val_loss += val_loss.item()

            avg_val_loss = total_val_loss / len(val_loader)
            history['val_loss'].append(avg_val_loss)

            val_accuracy = (total_correct / total_samples) if total_samples > 0 else 0
            history['val_accuracy'].append(val_accuracy)

            print(f"  Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val Acc={val_accuracy:.4f}")

        return history

# ============================================================================
# SECTION 5: EXPERIMENT EXECUTION
# ============================================================================

def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def run_experiment():
    print("🚀 Starting Comprehensive Transformer Architecture Scaling Study")

    # Define dataset sizes to test (n scaling)
    dataset_sizes = [1000, 5000, 10000]

    all_results = {}

    for config_name, config in MODEL_CONFIGS.items():
        print(f"\n{'='*20} ARCHITECTURE: {config_name.upper()} {'='*20}")

        for n_samples in dataset_sizes:
            print(f"\n--- Dataset Size (n): {n_samples} ---")

            # 1. Create data tailored to the architecture
            train_data = generate_data(config.arch_type, n_samples)
            val_data = generate_data(config.arch_type, n_samples // 5) # 20% validation set

            if config.arch_type == 'decoder_only':
                train_dataset, val_dataset = LMDataset(train_data), LMDataset(val_data)
            elif config.arch_type == 'encoder_only':
                train_dataset, val_dataset = ClassificationDataset(train_data), ClassificationDataset(val_data)
            else:
                train_dataset, val_dataset = Seq2SeqDataset(train_data), Seq2SeqDataset(val_data)

            train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=32)

            # 2. Instantiate the model
            if config.arch_type == 'decoder_only': model = DecoderOnlyTransformer(config)
            elif config.arch_type == 'encoder_only': model = EncoderOnlyTransformer(config)
            else: model = EncoderDecoderTransformer(config)

            num_params = count_params(model)
            print(f"  Model: {config_name}, Parameters (p): {num_params:,}")

            # 3. Train and evaluate
            trainer = UniversalTrainer(model, config)
            start_time = time.time()
            history = trainer.train(train_loader, val_loader)
            end_time = time.time()

            # 4. Store results
            result_key = f"{config_name}_n{n_samples}"
            all_results[result_key] = {
                'config': {k: v for k, v in vars(config).items()},
                'n_params': num_params,
                'dataset_size': n_samples,
                'final_val_loss': history['val_loss'][-1],
                'final_val_accuracy': history['val_accuracy'][-1], # Will be 0 for non-classification tasks
                'training_time_sec': end_time - start_time,
                'full_history': history
            }

            # Clean up memory
            del model, trainer, train_loader, val_loader
            if device == 'cuda': torch.cuda.empty_cache()

    return all_results

# ============================================================================
# SECTION 6: MAIN EXECUTION AND SAVING
# ============================================================================

if __name__ == '__main__':
    # Run the full suite of experiments
    results = run_experiment()

    # Save the results to a JSON file
    output_filename = 'transformer_architecture_scaling_results.json'
    with open(output_filename, 'w') as f:
        json.dump(results, f, indent=4)

    print(f"\n🎉🎉🎉 Experiment Complete! 🎉🎉🎉")
    print(f"📄 All results saved to '{output_filename}'")



✓ torch._dynamo disabled to prevent environment-specific errors.
🖥️ Using device: CUDA
🚀 Starting Comprehensive Transformer Architecture Scaling Study


--- Dataset Size (n): 1000 ---
  Model: decoder_small, Parameters (p): 778,504


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.6648, Val Loss=8.6116, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.6005, Val Loss=8.5472, Val Acc=0.0000

--- Dataset Size (n): 5000 ---
  Model: decoder_small, Parameters (p): 778,504


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.5197, Val Loss=8.2721, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.1140, Val Loss=7.9450, Val Acc=0.0000

--- Dataset Size (n): 10000 ---
  Model: decoder_small, Parameters (p): 778,504


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.3302, Val Loss=7.9349, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=7.8117, Val Loss=7.5762, Val Acc=0.0000


--- Dataset Size (n): 1000 ---
  Model: decoder_base, Parameters (p): 2,343,304


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.6582, Val Loss=8.5992, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.5470, Val Loss=8.4818, Val Acc=0.0000

--- Dataset Size (n): 5000 ---
  Model: decoder_base, Parameters (p): 2,343,304


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.4089, Val Loss=8.0536, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=7.9255, Val Loss=7.7881, Val Acc=0.0000

--- Dataset Size (n): 10000 ---
  Model: decoder_base, Parameters (p): 2,343,304


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.1741, Val Loss=7.7771, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=7.5719, Val Loss=7.1425, Val Acc=0.0000


--- Dataset Size (n): 1000 ---
  Model: encoder_small, Parameters (p): 420,618


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.4704, Val Loss=2.4778, Val Acc=0.1000


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.4056, Val Loss=2.4690, Val Acc=0.1250

--- Dataset Size (n): 5000 ---
  Model: encoder_small, Parameters (p): 420,618


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.4280, Val Loss=2.4264, Val Acc=0.1000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.3595, Val Loss=2.3995, Val Acc=0.1040

--- Dataset Size (n): 10000 ---
  Model: encoder_small, Parameters (p): 420,618


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.4151, Val Loss=2.3832, Val Acc=0.0980


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.3432, Val Loss=2.3494, Val Acc=0.0990


--- Dataset Size (n): 1000 ---
  Model: encoder_base, Parameters (p): 1,434,378


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.3947, Val Loss=2.4701, Val Acc=0.0600


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.2595, Val Loss=2.4120, Val Acc=0.0800

--- Dataset Size (n): 5000 ---
  Model: encoder_base, Parameters (p): 1,434,378


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.3662, Val Loss=2.3268, Val Acc=0.1000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.2994, Val Loss=2.3312, Val Acc=0.0970

--- Dataset Size (n): 10000 ---
  Model: encoder_base, Parameters (p): 1,434,378


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=2.3494, Val Loss=2.3152, Val Acc=0.0885


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=2.3045, Val Loss=2.3218, Val Acc=0.0960


--- Dataset Size (n): 1000 ---
  Model: encdec_small, Parameters (p): 878,728


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.6657, Val Loss=8.6580, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.6007, Val Loss=8.6083, Val Acc=0.0000

--- Dataset Size (n): 5000 ---
  Model: encdec_small, Parameters (p): 878,728


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.5782, Val Loss=8.4301, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.3172, Val Loss=8.1483, Val Acc=0.0000

--- Dataset Size (n): 10000 ---
  Model: encdec_small, Parameters (p): 878,728


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.4542, Val Loss=8.1412, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=7.8924, Val Loss=7.4800, Val Acc=0.0000


--- Dataset Size (n): 1000 ---
  Model: encdec_base, Parameters (p): 3,136,904


Epoch 1 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.6633, Val Loss=8.6409, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/32 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.5807, Val Loss=8.5946, Val Acc=0.0000

--- Dataset Size (n): 5000 ---
  Model: encdec_base, Parameters (p): 3,136,904


Epoch 1 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.5671, Val Loss=8.3421, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/157 [00:00<?, ?it/s]

  Epoch 2: Train Loss=8.1024, Val Loss=7.6803, Val Acc=0.0000

--- Dataset Size (n): 10000 ---
  Model: encdec_base, Parameters (p): 3,136,904


Epoch 1 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 1: Train Loss=8.3579, Val Loss=7.6812, Val Acc=0.0000


Epoch 2 Training:   0%|          | 0/313 [00:00<?, ?it/s]

  Epoch 2: Train Loss=7.1890, Val Loss=6.2158, Val Acc=0.0000

🎉🎉🎉 Experiment Complete! 🎉🎉🎉
📄 All results saved to 'transformer_architecture_scaling_results.json'
