# Image Captioning: Model Comparison Analysis

This notebook compares the baseline and attention-based image captioning models. It walks through the following steps:
- Load trained models
- Compare performance metrics
- Analyze generated captions
- Visualize attention mechanisms

In [None]:
# Setup and imports
import os
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Make the project importable: the root is taken to be the parent of the
# current working directory (presumably the notebook is launched from a
# sub-folder of the repository — verify if imports fail)
project_root = os.path.split(os.getcwd())[0]
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.utils.manager import ConfigManager
from src.utils.constants import ROOT_DIR
from src.utils.io import load_pickle, load_json
from src.preprocessing.vocabulary import Vocabulary
from src.preprocessing.dataset import FlickrDataset
from src.preprocessing.transforms import get_transforms
from src.models.baseline import BaselineCaptionModel
from src.models.attention import AttentionCaptionModel
from src.comparison.evaluator import ModelEvaluator
from src.visualization.captioning import compare_models, plot_training_history
from src.visualization.attention import visualize_attention

# Sanity check for the imports above: show the root the project resolved and
# whether a 'config' entry exists directly beneath it
print(f"Project root: {ROOT_DIR}")
config_dir = os.path.join(ROOT_DIR, 'config')
print(f"Config directory exists: {os.path.exists(config_dir)}")

# Choose the compute device: CUDA when torch reports it available, else CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# One ConfigManager instance is shared by the cells below; it supplies file
# paths, per-model configuration and data parameters
config_manager = ConfigManager()
path_registry = config_manager.paths

# Vocabulary: look up its location, then load it
vocab_path = path_registry['vocab']
print(f"Loading vocabulary from: {vocab_path}")
vocab = Vocabulary.load(vocab_path)
print(f"Vocabulary size: {len(vocab)}")

# Data splits: the pickle holds several splits; this cell keeps the 'test' one
splits_path = path_registry['splits']
splits = load_pickle(splits_path)
test_df = splits['test']
print(f"\nTest set: {len(test_df)} samples")

## Load Trained Models

In [None]:
# Build the baseline captioner from its configured hyper-parameters; the
# vocabulary size comes from the loaded vocabulary, not from the config
baseline_config = config_manager.get_model_config('baseline')
baseline_hparams = {
    name: baseline_config[name]
    for name in ('embed_size', 'hidden_size', 'num_layers', 'dropout')
}
baseline_model = BaselineCaptionModel(vocab_size=len(vocab), **baseline_hparams)
baseline_model = baseline_model.to(device)

# Restore the best checkpoint when one exists. Without it the model keeps its
# freshly initialised weights; later cells re-check this path before using it.
baseline_checkpoint_path = os.path.join(
    config_manager.get_model_dir('baseline'), 'best_model.pth'
)
if not os.path.exists(baseline_checkpoint_path):
    print("⚠ Baseline model not found. Please train it first using scripts/baseline.py")
else:
    print(f"Loading baseline model from: {baseline_checkpoint_path}")
    # map_location lets a checkpoint saved on another device load onto `device`
    checkpoint = torch.load(baseline_checkpoint_path, map_location=device)
    baseline_model.load_state_dict(checkpoint['state_dict'])
    baseline_model.eval()
    print("Baseline model loaded successfully")

In [None]:
# Build the attention captioner from its configured hyper-parameters; the
# vocabulary size comes from the loaded vocabulary, not from the config
attention_config = config_manager.get_model_config('attention')
attention_hparams = {
    name: attention_config[name]
    for name in ('embed_size', 'hidden_size', 'attention_dim', 'num_layers', 'dropout')
}
attention_model = AttentionCaptionModel(vocab_size=len(vocab), **attention_hparams)
attention_model = attention_model.to(device)

# Restore the best checkpoint when one exists. Without it the model keeps its
# freshly initialised weights; later cells re-check this path before using it.
attention_checkpoint_path = os.path.join(
    config_manager.get_model_dir('attention'), 'best_model.pth'
)
if not os.path.exists(attention_checkpoint_path):
    print("⚠ Attention model not found. Please train it first using scripts/attention.py")
else:
    print(f"Loading attention model from: {attention_checkpoint_path}")
    # map_location lets a checkpoint saved on another device load onto `device`
    checkpoint = torch.load(attention_checkpoint_path, map_location=device)
    attention_model.load_state_dict(checkpoint['state_dict'])
    attention_model.eval()
    print("Attention model loaded successfully")

## Load Training Histories

In [None]:
# Training histories are looked up as training_history.json inside each
# model's directory
baseline_history_path = os.path.join(
    config_manager.get_model_dir('baseline'), 'training_history.json'
)
attention_history_path = os.path.join(
    config_manager.get_model_dir('attention'), 'training_history.json'
)

# Both stay None when the file is absent, which lets the plotting cell skip
# that model
baseline_history = attention_history = None

if os.path.exists(baseline_history_path):
    baseline_history = load_json(baseline_history_path)
    print("Loaded baseline training history")

if os.path.exists(attention_history_path):
    attention_history = load_json(attention_history_path)
    print("Loaded attention training history")

In [None]:
# Plot each available training history; a model whose history is None or
# empty is skipped
history_table = (
    ("\nBaseline Model Training History:", baseline_history, "Baseline"),
    ("\nAttention Model Training History:", attention_history, "Attention"),
)
for heading, history, label in history_table:
    if history:
        print(heading)
        plot_training_history(history, model_name=label)

## Compare Model Performance

In [None]:
# Collect saved test metrics per model. `results` maps a display name to that
# model's 'test_bleu' dict, extended in place with a 'params' entry.
baseline_results_path = os.path.join(
    config_manager.get_model_dir('baseline'), 'test_results.json'
)
attention_results_path = os.path.join(
    config_manager.get_model_dir('attention'), 'test_results.json'
)

results = {}

if os.path.exists(baseline_results_path):
    baseline_results = load_json(baseline_results_path)
    results['Baseline'] = baseline_results['test_bleu']
    results['Baseline'].update(params=baseline_results['model_params'])

if os.path.exists(attention_results_path):
    attention_results = load_json(attention_results_path)
    results['Attention'] = attention_results['test_bleu']
    results['Attention'].update(params=attention_results['model_params'])

# Tabulate with one row per model (the dict-of-dicts is column-per-model
# before transposing); nothing is shown when no results file was found
if results:
    comparison_df = pd.DataFrame(results).transpose()
    print("Model Performance Comparison:")
    display(comparison_df)

In [None]:
# Visualize performance comparison: BLEU-1..4 per model on the left axes,
# parameter counts (in millions) on the right axes
if results:
    metrics = ['bleu1', 'bleu2', 'bleu3', 'bleu4']

    # A model or metric absent from `results` is drawn as a zero-height bar
    baseline_entry = results.get('Baseline', {})
    attention_entry = results.get('Attention', {})
    baseline_scores = [baseline_entry.get(m, 0) for m in metrics]
    attention_scores = [attention_entry.get(m, 0) for m in metrics]

    x = np.arange(len(metrics))
    width = 0.35
    half_width = width / 2

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: grouped bars, one pair per BLEU order
    rects1 = ax1.bar(x - half_width, baseline_scores, width,
                     label='Baseline', color='skyblue')
    rects2 = ax1.bar(x + half_width, attention_scores, width,
                     label='Attention', color='lightcoral')

    ax1.set_ylabel('Score (%)')
    ax1.set_xlabel('Metric')
    ax1.set_title('BLEU Score Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels([m.upper() for m in metrics])
    ax1.legend()
    ax1.grid(False)

    # Write each bar's value 3 points above its top edge
    for rect in (*rects1, *rects2):
        bar_top = rect.get_height()
        bar_center = rect.get_x() + rect.get_width() / 2
        ax1.annotate(
            f'{bar_top:.1f}',
            xy=(bar_center, bar_top),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    # Right panel: drawn only when both models report a parameter count;
    # otherwise the second axes is left empty
    if 'params' in baseline_entry and 'params' in attention_entry:
        models = ['Baseline', 'Attention']
        params = [results[name]['params'] / 1e6 for name in models]

        bars = ax2.bar(models, params, color=['skyblue', 'lightcoral'])
        ax2.set_ylabel('Parameters (millions)')
        ax2.set_title('Model Complexity')
        ax2.grid(False)

        # Label every bar with its size; the 0.5 offset is in data units
        for bar, param in zip(bars, params):
            label_x = bar.get_x() + bar.get_width() / 2
            label_y = bar.get_height() + 0.5
            ax2.text(label_x, label_y, f'{param:.1f}M', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## Compare Generated Captions

In [None]:
# Build the test dataset from the data parameters held by the config manager
data_config = config_manager.get_data_params()
images_dir = data_config['dataset']['images_dir']
image_settings = data_config['image']

# get_transforms returns a pair; only the second element is kept here
# (presumably the validation/evaluation transform — confirm in transforms.py)
_, transform_val = get_transforms(
    resize=image_settings['resize_size'],
    crop=image_settings['crop_size'],
)

test_dataset = FlickrDataset(
    data_df=test_df,
    root_dir=images_dir,
    vocab=vocab,
    transform=transform_val,
)

print(f"Test dataset size: {len(test_dataset)}")

In [None]:
# Side-by-side captions on a handful of test images; runs only when a
# checkpoint file exists for both models
both_checkpoints_exist = (
    os.path.exists(baseline_checkpoint_path)
    and os.path.exists(attention_checkpoint_path)
)
if both_checkpoints_exist:
    print("Comparing model outputs on sample images...\n")
    compare_models(
        baseline_model,
        attention_model,
        test_dataset,
        vocab,
        device,
        num_samples=5,
    )

## Analyze Caption Characteristics

In [None]:
# Caption-length analysis; runs only when a checkpoint file exists for both
# models
checkpoints_present = (
    os.path.exists(baseline_checkpoint_path)
    and os.path.exists(attention_checkpoint_path)
)
if checkpoints_present:
    # Display name -> model, handed to the evaluator together with vocab/device
    models = {'Baseline': baseline_model, 'Attention': attention_model}
    evaluator = ModelEvaluator(models, vocab, device)

    # Batching helpers are only needed in this cell, hence the local imports
    from torch.utils.data import DataLoader
    from src.preprocessing.dataset import FlickrCollate

    # Fixed order (no shuffling); the collate function receives the index of
    # the <PAD> token from the vocabulary
    pad_index = vocab.stoi["<PAD>"]
    test_loader = DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=FlickrCollate(pad_idx=pad_index),
    )

    # max_samples=500 is passed through to the evaluator (presumably a cap on
    # how many test items are analysed — confirm in ModelEvaluator)
    print("Analyzing caption lengths...")
    length_stats = evaluator.analyze_caption_lengths(test_loader, max_samples=500)
    print("\nCaption Length Statistics:")
    display(length_stats)

In [None]:
# Visualize caption length distributions; skipped unless the analysis cell
# has defined both `evaluator` and `length_stats` in this namespace
if {'evaluator', 'length_stats'} <= locals().keys():
    plt.figure(figsize=(12, 5))

    # Left panel: mean caption length per row of length_stats, with the
    # standard deviation drawn as error bars
    mean_ax = plt.subplot(1, 2, 1)
    models = length_stats['Model'].tolist()
    mean_lengths = length_stats['Mean Length'].tolist()
    std_lengths = length_stats['Std Length'].tolist()

    bars = mean_ax.bar(
        models,
        mean_lengths,
        yerr=std_lengths,
        capsize=5,
        color=['gray', 'skyblue', 'lightcoral'],
    )
    mean_ax.set_ylabel('Caption Length (words)')
    mean_ax.set_title('Average Caption Lengths')
    mean_ax.grid(False)

    # Write the mean above each bar; the 0.5 offset is in data units
    for bar, mean in zip(bars, mean_lengths):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.5
        mean_ax.text(label_x, label_y, f'{mean:.1f}', ha='center', va='bottom')

    # Right panel: shortest and longest caption per row, as grouped bars
    range_ax = plt.subplot(1, 2, 2)
    min_lengths = length_stats['Min Length'].tolist()
    max_lengths = length_stats['Max Length'].tolist()

    x = np.arange(len(models))
    width = 0.35

    range_ax.bar(x - width / 2, min_lengths, width, label='Min', color='lightblue')
    range_ax.bar(x + width / 2, max_lengths, width, label='Max', color='lightcoral')

    range_ax.set_ylabel('Caption Length (words)')
    range_ax.set_title('Caption Length Range')
    range_ax.set_xticks(x)
    range_ax.set_xticklabels(models)
    range_ax.legend()
    range_ax.grid(False)

    plt.tight_layout()
    plt.show()

## Attention Visualization

In [None]:
# Visualize attention for sample images; runs only when an attention
# checkpoint file exists
if os.path.exists(attention_checkpoint_path):
    print("Visualizing attention mechanism...\n")

    # Imported once up front instead of on every loop iteration
    from src.preprocessing.transforms import denormalize_image

    # Pick up to three distinct test items. Capping at the dataset size keeps
    # np.random.choice(..., replace=False) from raising ValueError when the
    # test set holds fewer than three items.
    num_samples = min(3, len(test_dataset))
    sample_indices = np.random.choice(len(test_dataset), num_samples, replace=False)

    for idx in sample_indices:
        # Index with a plain int rather than a numpy integer; the second
        # element of the dataset item is not needed here
        image, _ = test_dataset[int(idx)]
        # Add a leading batch dimension and move the tensor to the model's device
        image = image.unsqueeze(0).to(device)

        # Generate caption with attention
        caption, attention_weights = attention_model.caption_image_with_attention(image, vocab)

        # Denormalize image for display (drop the batch dimension first)
        img_display = denormalize_image(image[0])

        print(f"\nGenerated caption: {caption}")

        # One attention map per whitespace-separated caption token
        visualize_attention(img_display, caption.split(), attention_weights, show_every=1)

## Summary and Conclusions

In [None]:
# Create summary report: per-model BLEU scores and parameter counts, then the
# attention model's relative change over the baseline for each BLEU order
print("MODEL COMPARISON SUMMARY")
print()

if results:
    print("\nPerformance Metrics:")
    for model in ['Baseline', 'Attention']:
        if model in results:
            print(f"\n{model} Model:")
            print(f"  BLEU-1: {results[model]['bleu1']:.2f}%")
            print(f"  BLEU-2: {results[model]['bleu2']:.2f}%")
            print(f"  BLEU-3: {results[model]['bleu3']:.2f}%")
            print(f"  BLEU-4: {results[model]['bleu4']:.2f}%")
            # The parameter count is treated as optional, matching the
            # performance plot cell, so a results entry without it still
            # gets its BLEU lines printed
            if 'params' in results[model]:
                print(f"  Parameters: {results[model]['params']:,}")

    # Calculate improvements (relative change, in percent of the baseline score)
    if 'Baseline' in results and 'Attention' in results:
        print("\nAttention Model Improvements:")
        for metric in ['bleu1', 'bleu2', 'bleu3', 'bleu4']:
            baseline_score = results['Baseline'][metric]
            attention_score = results['Attention'][metric]
            # A relative change is undefined for a zero baseline; report that
            # instead of raising ZeroDivisionError
            if baseline_score == 0:
                print(f"  {metric.upper()}: n/a (baseline score is 0)")
                continue
            improvement = (attention_score - baseline_score) / baseline_score * 100
            print(f"  {metric.upper()}: {improvement:+.1f}%")

# NOTE: the findings below are fixed text; they are not computed from `results`
print("\nKey Findings:")
print("1. The attention mechanism helps the model focus on relevant image regions")
print("2. Attention model generally produces more accurate and detailed captions")
print("3. Attention weights provide interpretable insights into model behavior")
print("4. Trade-off: Attention model has more parameters and is slower to train")