# Data Exploration - Driver Drowsiness Dataset

This notebook explores the Driver Drowsiness Dataset (DDD) to understand:
- Dataset structure and organization
- Class distribution
- Image properties and statistics
- Sample visualizations


In [None]:
import os
import sys
from pathlib import Path
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Add project root to path.
# The root is taken to be the parent of the current working directory, i.e. the
# notebook is assumed to run from a direct sub-directory of the project
# (NOTE(review): confirm the launch directory).
project_root = Path().resolve().parent
# Insert only when the root is not already the first entry: re-running this
# cell would otherwise prepend a duplicate entry to sys.path every time.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# Set style (the seaborn palette is applied after the matplotlib style sheet,
# so the palette's colour cycle is the one that stays in effect)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## 1. Dataset Structure


In [None]:
# Locations of the dataset: one root folder with one sub-folder per class.
data_dir = project_root.joinpath("Data")
drowsy_dir = data_dir.joinpath("Drowsy")
non_drowsy_dir = data_dir.joinpath("Non Drowsy")

# Echo the resolved locations, one per line.
print(
    f"Data directory: {data_dir}",
    f"Drowsy directory: {drowsy_dir}",
    f"Non Drowsy directory: {non_drowsy_dir}",
    sep="\n",
)
# Report whether each class folder is actually present on disk.
print(
    f"\nDrowsy directory exists: {drowsy_dir.exists()}",
    f"Non Drowsy directory exists: {non_drowsy_dir.exists()}",
    sep="\n",
)


## 2. Count Images in Each Class


In [None]:
# Count images in each class.
# glob() does not promise any particular ordering, so the results are sorted to
# keep the lists (and anything later indexed from them) stable between runs.
drowsy_images = sorted(drowsy_dir.glob("*.png"))
non_drowsy_images = sorted(non_drowsy_dir.glob("*.png"))

print(f"Number of Drowsy images: {len(drowsy_images)}")
print(f"Number of Non Drowsy images: {len(non_drowsy_images)}")
print(f"Total images: {len(drowsy_images) + len(non_drowsy_images)}")

# Visualize class distribution
fig, ax = plt.subplots(figsize=(8, 6))
classes = ['Drowsy', 'Non Drowsy']
counts = [len(drowsy_images), len(non_drowsy_images)]
colors = ['red', 'green']

bars = ax.bar(classes, counts, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Number of Images', fontsize=12)
ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
# Leave 10% headroom for the count labels; fall back to a baseline of 1 so the
# axis limits are never the degenerate (0, 0) when no images were found.
ax.set_ylim(0, max(max(counts), 1) * 1.1)

# Add count labels on bars
for bar, count in zip(bars, counts):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count:,}', ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.show()

# The ratio is undefined when the Drowsy folder is empty or missing; report
# that explicitly instead of raising ZeroDivisionError.
if drowsy_images:
    print(f"\nClass balance ratio (Non Drowsy / Drowsy): {len(non_drowsy_images) / len(drowsy_images):.2f}")
else:
    print("\nClass balance ratio (Non Drowsy / Drowsy): undefined (no Drowsy images found)")


## 3. Image Properties Analysis


In [None]:
# Sample images to analyze properties
size_rng = np.random.default_rng(42)  # fixed seed: the same sample on every run


def pick_sample(paths, k):
    """Return `k` distinct entries of `paths`, chosen with `size_rng`.

    Returns an empty list when `k` is 0 so an empty class never reaches the
    random generator.
    """
    if k == 0:
        return []
    chosen = size_rng.choice(len(paths), size=k, replace=False)
    return [paths[i] for i in chosen]


def read_image_sizes(paths):
    """Return the (width, height) tuple of every image file in `paths`.

    Each file is opened in a `with` block so its handle is closed as soon as
    the size has been read, instead of staying open until garbage collection.
    """
    sizes = []
    for path in paths:
        with Image.open(path) as img:
            sizes.append(img.size)
    return sizes


# Same number of images from each class, capped at 100 per class.
sample_size = min(100, len(drowsy_images), len(non_drowsy_images))
sample_drowsy = pick_sample(drowsy_images, sample_size)
sample_non_drowsy = pick_sample(non_drowsy_images, sample_size)

print("Analyzing image properties...")
sizes_drowsy = read_image_sizes(sample_drowsy)
sizes_non_drowsy = read_image_sizes(sample_non_drowsy)

# Analyze sizes
drowsy_widths = [s[0] for s in sizes_drowsy]
drowsy_heights = [s[1] for s in sizes_drowsy]
non_drowsy_widths = [s[0] for s in sizes_non_drowsy]
non_drowsy_heights = [s[1] for s in sizes_non_drowsy]

if sample_size > 0:
    print(f"\nDrowsy images - Width: {np.mean(drowsy_widths):.1f} ± {np.std(drowsy_widths):.1f}")
    print(f"Drowsy images - Height: {np.mean(drowsy_heights):.1f} ± {np.std(drowsy_heights):.1f}")
    print(f"\nNon Drowsy images - Width: {np.mean(non_drowsy_widths):.1f} ± {np.std(non_drowsy_widths):.1f}")
    print(f"Non Drowsy images - Height: {np.mean(non_drowsy_heights):.1f} ± {np.std(non_drowsy_heights):.1f}")
else:
    # Mean/std of an empty list would only produce NaN plus a RuntimeWarning.
    print("\nNo images available to sample; skipping size statistics.")

# Check if all images are 227x227 as specified.
# An empty sample must not count as a pass (all() of nothing is True).
all_sizes = sizes_drowsy + sizes_non_drowsy
all_227x227 = bool(all_sizes) and all(s == (227, 227) for s in all_sizes)
print(f"\nAll sampled images are 227x227: {all_227x227}")


## 4. Sample Visualizations


In [None]:
# Display sample images from each class: one row per class, five images per row.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
fig.suptitle('Sample Images from Dataset', fontsize=16, fontweight='bold')

grid_rng = np.random.default_rng(42)  # fixed seed: the same grid on every run

# (row of axes, candidate image paths, panel title, title colour)
class_rows = [
    (axes[0], drowsy_images, 'Drowsy', 'red'),
    (axes[1], non_drowsy_images, 'Non Drowsy (Alert)', 'green'),
]

for row_axes, image_paths, label, color in class_rows:
    # Hide the axis frame on every panel, including ones left empty below.
    for ax in row_axes:
        ax.axis('off')

    # Draw without replacement so a row never repeats an image, and never ask
    # for more images than the class has (an empty class leaves the row blank).
    n_show = min(len(row_axes), len(image_paths))
    if n_show == 0:
        continue
    chosen = grid_rng.choice(len(image_paths), size=n_show, replace=False)

    for ax, idx in zip(row_axes, chosen):
        # imshow() is called while the file is still open; the handle is closed
        # on leaving the `with` block.
        with Image.open(image_paths[idx]) as img:
            ax.imshow(img)
        ax.set_title(label, fontsize=10, color=color, fontweight='bold')

plt.tight_layout()
plt.show()


## 5. Summary

This dataset contains:
- **Drowsy class**: Face images showing drowsy drivers
- **Non Drowsy class**: Face images showing alert drivers
- **Image size**: 227 × 227 pixels (RGB)
- **Total images**: more than 41,790 (the exact per-class counts are printed in Section 2)
- **Format**: PNG files

The dataset is ready for training a binary classification CNN model.
