# Data Exploration

In this notebook, we explore the MS COCO dataset to understand the images and their corresponding captions. We visualize a few samples from the dataset and analyze the distribution of caption lengths (in words).

In [1]:
# Import necessary libraries
import os
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import json

# Define paths
image_dir = '../data/processed/images/'  # Adjust path as necessary
caption_file = '../data/processed/captions.json'  # Adjust path as necessary

# Load captions
# Decode explicitly as UTF-8: without an encoding argument open() falls back to
# the platform's locale encoding, which can mis-decode or reject non-ASCII
# caption text on some systems.
with open(caption_file, 'r', encoding='utf-8') as f:
    captions = json.load(f)

# Display some sample images and their captions
def display_samples(images, captions, num_samples=5):
    """Show up to ``num_samples`` images side by side, each titled with its caption.

    Args:
        images: Indexable sequence of keys into ``captions``. Each key is also
            joined onto the module-level ``image_dir`` to build the file path
            of the image to open.
        captions: Mapping from image key to the value shown as the subplot title.
        num_samples: Maximum number of images to show. Clamped to
            ``len(images)`` so a short ``images`` sequence no longer raises
            ``IndexError``; non-positive values show an empty figure.
    """
    # Never index past the end of `images`, and never go below zero.
    num_samples = max(0, min(num_samples, len(images)))

    plt.figure(figsize=(15, 10))
    for i in range(num_samples):
        image_name = images[i]
        plt.subplot(1, num_samples, i + 1)
        img_path = os.path.join(image_dir, image_name)
        # Image.open is lazy and keeps the file handle open; the context
        # manager releases it once imshow has read the pixel data.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')
        plt.title(captions[image_name])
    plt.show()

# Get sample images
# Take the first five image keys in the order they appear in the captions mapping
# (fewer if the mapping holds fewer than five entries).
sample_images = list(captions)[:5]
# Pass the actual number of keys selected so that a captions file with fewer
# than five entries does not make display_samples index past the list.
display_samples(sample_images, captions, num_samples=len(sample_images))

# Analyze caption lengths
# Word count per caption: each value of `captions` is split on whitespace,
# so every value is expected to be a string.
caption_lengths = [len(text.split()) for text in captions.values()]

# Histogram of the word counts, drawn through the explicit figure/axes interface.
length_fig, length_ax = plt.subplots(figsize=(10, 5))
length_ax.hist(caption_lengths, bins=30, color='blue', alpha=0.7)
length_ax.set_title('Distribution of Caption Lengths')
length_ax.set_xlabel('Number of Words')
length_ax.set_ylabel('Frequency')
length_ax.grid(axis='y')
plt.show()