# Image Caption Generator - Experimentation Notebook

This notebook demonstrates the core functionality of the image caption generator and provides a foundation for experimentation and model improvements.

## Setup and Imports

In [None]:
import time
from io import BytesIO

import matplotlib.pyplot as plt
import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Capture the environment details first, then report them, so the notebook
# output records which PyTorch build produced the results below.
pytorch_version = torch.__version__
has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {pytorch_version}")
print(f"CUDA available: {has_cuda}")

## Load Model and Processor

In [None]:
# The processor and the captioning model are loaded from the same pretrained
# BLIP checkpoint, so the identifier is spelled out only once.
checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(checkpoint)

# eval() returns the model itself, so loading and switching to evaluation
# mode can be chained into a single statement.
model = BlipForConditionalGeneration.from_pretrained(checkpoint).eval()

print("Model loaded successfully!")

## Helper Functions

In [None]:
def load_image_from_url(url, timeout=30):
    """Download an image from a URL and return it as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The downloaded image, converted to RGB mode.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx right here; otherwise the body of the error page would
    # be handed to PIL and surface as a misleading "cannot identify image
    # file" error.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    return image.convert('RGB')

def generate_caption(image, max_length=50, num_beams=5):
    """Generate a text caption for an image with the notebook-level BLIP model.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        image: Image to caption, passed to ``processor``. The callers in this
            notebook pass the RGB PIL image from ``load_image_from_url``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    # Put the inputs on the same device as the model so the function keeps
    # working if the model is later moved to a GPU; on CPU this is a no-op.
    inputs = processor(image, return_tensors="pt").to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=max_length, num_beams=num_beams)

    return processor.decode(output_ids[0], skip_special_tokens=True)

def display_image_with_caption(image, caption):
    """Render an image in a new matplotlib figure, titled with its caption.

    Args:
        image: Image data to draw; passed directly to ``plt.imshow``.
        caption: Caption text, shown in the title after a "Caption: " prefix.
    """
    title_text = f"Caption: {caption}"

    plt.figure(figsize=(10, 8))
    plt.imshow(image)
    plt.title(title_text, fontsize=14, pad=20)
    # Axis ticks and frame carry no information for a photo, so hide them.
    plt.axis('off')
    plt.tight_layout()
    plt.show()

## Test with Sample Images

In [None]:
# Caption each of the sample images below and display the result.
sample_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d9/Collage_of_Nine_Dogs.jpg/1200px-Collage_of_Nine_Dogs.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/2/2f/Culinary_fruits_front_view.jpg/1200px-Culinary_fruits_front_view.jpg"
]

# Number the samples from 1 for display. A failure on one image is reported
# and the loop moves on to the next one instead of aborting the cell.
for sample_number, url in enumerate(sample_urls, start=1):
    try:
        print(f"\n--- Sample Image {sample_number} ---")
        image = load_image_from_url(url)
        caption = generate_caption(image)
        print(f"Generated Caption: {caption}")
        display_image_with_caption(image, caption)
    except Exception as error:
        print(f"Error processing image {sample_number}: {error}")

## Experiment with Different Generation Parameters

In [None]:
# Reuse the first sample image for every configuration so that only the
# generation settings differ between runs.
test_image = load_image_from_url(sample_urls[0])

# Each entry is a set of keyword arguments for generate_caption.
parameters = [
    {"max_length": 30, "num_beams": 3},
    {"max_length": 50, "num_beams": 5},
    {"max_length": 100, "num_beams": 8}
]

print("Comparing different generation parameters:\n")
for config_number, params in enumerate(parameters, start=1):
    caption = generate_caption(test_image, **params)
    header = f"Config {config_number} (max_length={params['max_length']}, num_beams={params['num_beams']}):"
    print(header)
    print(f"  Caption: {caption}\n")

## Model Information and Analysis

In [None]:
# Summarise the loaded model: its class and parameter counts.
print("Model Configuration:")
print(f"Model type: {model.__class__.__name__}")

# Materialise the parameter list once and derive both counts from it.
weights = list(model.parameters())
total_params = sum(w.numel() for w in weights)
trainable_params = sum(w.numel() for w in weights if w.requires_grad)
print(f"Number of parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Summarise the processor: its two components and the tokenizer vocabulary.
print("\nProcessor Configuration:")
tokenizer = processor.tokenizer
print(f"Image processor: {processor.image_processor.__class__.__name__}")
print(f"Tokenizer: {tokenizer.__class__.__name__}")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")

## Performance Benchmarking

In [None]:
# Benchmark inference time: caption the same image several times and report
# the per-run, average, minimum and maximum durations.
# (`time` is imported in the notebook's import cell at the top.)
test_image = load_image_from_url(sample_urls[0])
num_runs = 5

print(f"Benchmarking inference time over {num_runs} runs...\n")

times = []
for i in range(num_runs):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() is wall-clock time and can jump
    # if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    caption = generate_caption(test_image)
    inference_time = time.perf_counter() - start_time

    times.append(inference_time)
    print(f"Run {i+1}: {inference_time:.3f}s - Caption: {caption}")

avg_time = sum(times) / len(times)
print(f"\nAverage inference time: {avg_time:.3f}s")
print(f"Min time: {min(times):.3f}s")
print(f"Max time: {max(times):.3f}s")

## Future Experiments

This notebook provides a foundation for further experimentation:

1. **Fine-tuning**: Train on domain-specific datasets
2. **Evaluation**: Implement BLEU, CIDEr, and other metrics
3. **Comparison**: Test different vision-language models
4. **Optimization**: Explore quantization and acceleration techniques
5. **Analysis**: Study model attention patterns and failure cases