# Image Captioning: Data Preprocessing Pipeline

This notebook runs the preprocessing pipeline:
1. Load and clean caption data
2. Create train/validation/test splits
3. Build vocabulary from the training split
4. Sanity-check dataset and batch loading
5. Save preprocessed data

In [None]:
# Setup and imports
# All third-party imports live in this cell so the notebook survives
# "Restart Kernel -> Run All"; torch must be imported here because the
# seeding at the bottom of this cell uses it.
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from PIL import Image

# Add project root to path (the parent of the current working directory).
# Guarded so that re-running this cell does not append duplicate entries.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.utils.manager import ConfigManager
from src.utils.constants import SEED
from src.utils.io import save_pickle, ensure_dir
from src.preprocessing.vocabulary import Vocabulary, preprocess_caption, analyze_vocab_coverage
from src.preprocessing.dataset import create_data_splits
from src.preprocessing.transforms import get_transforms, denormalize_image

# Set random seed for both NumPy and PyTorch
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Build the configuration manager and pull out the settings used below
config_manager = ConfigManager()
data_config = config_manager.get_data_params()
debug_mode = config_manager.debug

# Report the run mode; the extra details are only shown for debug runs
print(f"Debug mode: {debug_mode}")
if debug_mode:
    print(
        f"Max images in debug mode: {data_config['debug']['max_images']}",
        f"Output directory: {config_manager.paths['processed']}",
        sep="\n",
    )

# Make sure the processed-data directory is available before anything is written
ensure_dir(config_manager.paths['processed'])

## Load and Process Captions

In [None]:
# Read the captions table from the location named in the data configuration
dataset_cfg = data_config['dataset']
captions_file = dataset_cfg['captions_file']
images_dir = dataset_cfg['images_dir']

print(f"Loading captions from: {captions_file}")
captions_df = pd.read_csv(captions_file)
print(f"Loaded {len(captions_df)} captions")

# Debug runs keep only the captions of the first `max_images` distinct images
if debug_mode:
    max_images = data_config['debug']['max_images']
    unique_images = captions_df['image'].unique()[:max_images]
    in_debug_subset = captions_df['image'].isin(unique_images)
    captions_df = captions_df.loc[in_debug_subset].reset_index(drop=True)
    print(f"\nDEBUG MODE: Limited to {len(unique_images)} images, {len(captions_df)} captions")

In [None]:
# Run every raw caption through the project's preprocess_caption helper
print("\nProcessing captions...")
captions_df['processed_caption'] = captions_df['caption'].apply(preprocess_caption)

# Print the first few rows side by side (raw vs. processed) as a quick check
print("\nSample processed captions:")
for _row_label, sample in captions_df.head(3).iterrows():
    print(f"Original: {sample['caption']}")
    print(f"Processed: {sample['processed_caption']}")
    print()

In [None]:
# Caption length analysis: count whitespace-separated tokens per processed caption
captions_df['caption_length'] = captions_df['processed_caption'].apply(
    lambda text: len(text.split())
)

# Histogram of caption lengths, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(captions_df['caption_length'], bins=20, edgecolor='black', alpha=0.7)
ax.set_xlabel('Caption Length (words)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Caption Lengths')
ax.grid(True, alpha=0.3)
plt.show()

print("Caption length statistics:")
print(captions_df['caption_length'].describe())

## Create Data Splits

In [None]:
# Split ratios come from the preprocessing section of the data configuration
preprocessing_cfg = data_config['preprocessing']
train_ratio = preprocessing_cfg['train_split']
val_ratio = preprocessing_cfg['val_split']
test_ratio = preprocessing_cfg['test_split']

print(f"Creating splits: train={train_ratio}, val={val_ratio}, test={test_ratio}")

# Seeded so that the split is reproducible across runs
train_df, val_df, test_df = create_data_splits(
    captions_df,
    train_ratio=train_ratio,
    val_ratio=val_ratio,
    test_ratio=test_ratio,
    random_state=SEED,
)

# Report caption and distinct-image counts for each split
print("\nSplit sizes:")
for split_label, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    n_images = len(split_frame['image'].unique())
    print(f"{split_label}: {len(split_frame)} captions, {n_images} images")

In [None]:
# Verify no overlap between splits: the same image must never appear in two
# splits, otherwise validation/test scores would be contaminated by training data.
train_images = set(train_df['image'].unique())
val_images = set(val_df['image'].unique())
test_images = set(test_df['image'].unique())

# Compute each pairwise intersection once and reuse it for both the report
# and the assertions below.
train_val_overlap = train_images & val_images
train_test_overlap = train_images & test_images
val_test_overlap = val_images & test_images

print("Checking for overlaps between splits:")
print(f"Train-Val overlap: {len(train_val_overlap)} images")
print(f"Train-Test overlap: {len(train_test_overlap)} images")
print(f"Val-Test overlap: {len(val_test_overlap)} images")

assert len(train_val_overlap) == 0, "Train and validation sets overlap!"
assert len(train_test_overlap) == 0, "Train and test sets overlap!"
assert len(val_test_overlap) == 0, "Validation and test sets overlap!"
# The check mark was previously stored as mis-decoded bytes; restored to U+2713.
print("\n✓ No overlaps found!")

## Build Vocabulary

In [None]:
# The vocabulary is fitted on training captions only, so validation and test
# text never influences which words are kept
vocab_threshold = data_config['preprocessing']['vocab_threshold']
print(f"Building vocabulary with frequency threshold: {vocab_threshold}")

vocab = Vocabulary(freq_threshold=vocab_threshold)
training_captions = train_df['processed_caption'].tolist()
vocab.build_vocabulary(training_captions)

# Vocabulary statistics
vocab_size = len(vocab)
print("\nVocabulary statistics:")
print(f"Total unique words seen: {len(vocab.word_frequencies)}")
# NOTE(review): the 4 presumably accounts for the special tokens -- confirm against Vocabulary
print(f"Words in vocabulary: {vocab_size - 4}")
print(f"Total vocabulary size (with special tokens): {vocab_size}")

In [None]:
# Measure vocabulary coverage on each split with the same helper
print("\nVocabulary coverage analysis:")
coverage_by_split = {}
for heading, split_key, split_frame in (
    ("Training", "train", train_df),
    ("Validation", "val", val_df),
    ("Test", "test", test_df),
):
    print(f"\n{heading} set:")
    # Only the first returned value (the coverage figure) is kept
    coverage_by_split[split_key], _ = analyze_vocab_coverage(split_frame, vocab)

# Expose the per-split figures under the names later cells rely on
train_coverage = coverage_by_split["train"]
val_coverage = coverage_by_split["val"]
test_coverage = coverage_by_split["test"]

print("\nSummary:")
print(f"Train coverage: {train_coverage:.2f}%")
print(f"Val coverage: {val_coverage:.2f}%")
print(f"Test coverage: {test_coverage:.2f}%")

In [None]:
# List the top words together with their counts, ranked from 1
top_n = 20
print("\nMost frequent words in vocabulary:")
most_freq = vocab.get_most_frequent_words(top_n)
for rank, (word, count) in enumerate(most_freq[:top_n], start=1):
    print(f"{rank:2d}. '{word}': {count} times")

## Test Dataset Loading

In [None]:
# Smoke-test dataset construction on a handful of training rows
import torch
from src.preprocessing.dataset import FlickrDataset, FlickrCollate

# Image transforms are parameterised by the resize/crop sizes in the config
image_cfg = data_config['image']
transform_train, transform_val = get_transforms(
    resize=image_cfg['resize_size'],
    crop=image_cfg['crop_size'],
)

# Despite its name, this dataset wraps the first 10 rows of the training split
# and uses the validation transform
sample_rows = train_df.head(10)
test_dataset = FlickrDataset(
    data_df=sample_rows,
    root_dir=images_dir,
    vocab=vocab,
    transform=transform_val,
)

print(f"Test dataset size: {len(test_dataset)}")

In [None]:
# Test data loading: pull one batch through the collate function and display it
from torch.utils.data import DataLoader

batch_size = 4

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=FlickrCollate(pad_idx=vocab.stoi["<PAD>"])
)

# Get one batch
images, captions, lengths = next(iter(test_loader))

print("Batch shapes:")
print(f"Images: {images.shape}")
print(f"Captions: {captions.shape}")
print(f"Lengths: {lengths}")

# A very small dataset (e.g. a tiny debug run) can yield fewer than
# `batch_size` samples, so size the grid from the actual batch instead of
# assuming 4 images are present.
n_shown = min(batch_size, images.shape[0])

# squeeze=False keeps `axes` indexable even when only one panel is drawn.
fig, axes = plt.subplots(1, n_shown, figsize=(4 * n_shown, 4), squeeze=False)
axes = axes[0]

# Special tokens that must not appear in the displayed caption. Both spellings
# of the start token are filtered: the other special tokens in this notebook
# are written without inner spaces ("<PAD>", "<EOS>"), so "<SOS>" is the likely
# real token, while "< SOS >" is kept for backward compatibility.
hidden_tokens = {"<PAD>", "<SOS>", "< SOS >"}

# Display batch
for i in range(n_shown):
    # Denormalize and display image
    img = denormalize_image(images[i])
    axes[i].imshow(img)

    # Decode caption: map indices back to tokens, stopping at end-of-sequence
    caption_words = []
    for idx in captions[i].tolist():
        token = vocab.itos[idx]
        if token == "<EOS>":
            break
        if token not in hidden_tokens:
            caption_words.append(token)

    axes[i].set_title(' '.join(caption_words), fontsize=10)
    axes[i].axis('off')

plt.tight_layout()
plt.show()

## Save Preprocessed Data

In [None]:
# Persist the vocabulary at the configured location
vocab_path = config_manager.paths['vocab']
vocab.save(vocab_path)
print(f"Saved vocabulary to: {vocab_path}")

# Persist the three split DataFrames together in one pickle
splits_path = config_manager.paths['splits']
splits = dict(train=train_df, val=val_df, test=test_df)
save_pickle(splits, splits_path)
print(f"Saved data splits to: {splits_path}")

# Assemble the run summary; key order matches the layout of the written JSON
summary = {
    'dataset': data_config['dataset']['name'],
    'debug_mode': debug_mode,
    'vocab_size': len(vocab),
    'vocab_threshold': vocab_threshold,
    **{f'{split_name}_size': len(split_frame) for split_name, split_frame in splits.items()},
    **{
        f'{split_name}_images': len(split_frame['image'].unique())
        for split_name, split_frame in splits.items()
    },
    'train_coverage': train_coverage,
    'val_coverage': val_coverage,
    'test_coverage': test_coverage,
}

# Write the summary next to the other processed artifacts
from src.utils.io import save_json
summary_path = os.path.join(config_manager.paths['processed'], 'preprocessing_summary.json')
save_json(summary, summary_path)
print(f"\nSaved preprocessing summary to: {summary_path}")

In [None]:
# Final report: collect every line first, then emit them in order
report_lines = [
    "\nPREPROCESSING COMPLETE",
    "",
    f"Debug mode: {debug_mode}",
    f"Output directory: {config_manager.paths['processed']}",
    "\nDataset:",
    f"  Vocabulary size: {len(vocab)}",
    f"  Training samples: {len(train_df)}",
    f"  Validation samples: {len(val_df)}",
    f"  Test samples: {len(test_df)}",
    "\nCoverage:",
    f"  Train: {train_coverage:.2f}%",
    f"  Val: {val_coverage:.2f}%",
    f"  Test: {test_coverage:.2f}%",
    "\nFiles saved:",
    f"  - {vocab_path}",
    f"  - {splits_path}",
    f"  - {summary_path}",
]
for report_line in report_lines:
    print(report_line)