In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import sys

# Add the parent of the current working directory to the import path so the
# `src` package imported below can be resolved (assumes the notebook is
# launched from a directory one level below the project root - TODO confirm).
sys.path.append('..')
# NOTE(review): wildcard import hides where names come from. The cells below
# use CLASS_NAMES and RAW_DATA_DIR, which presumably are defined in
# src.config - confirm and consider importing them explicitly.
from src.config import *

In [None]:
# Dataset overview: count the .jpg files under RAW_DATA_DIR/<class> for every
# entry in CLASS_NAMES and summarise the result as a (Class, Count) table.
print("="*70)
print("ASL Alphabet Dataset - Exploratory Data Analysis")
print("="*70)

# Count images per class. Classes without a directory are still left out of
# the table, but they are now reported instead of being dropped silently,
# because a missing class skews both the total and the per-class average.
class_counts = {}
missing_classes = []
for class_name in CLASS_NAMES:
    class_dir = RAW_DATA_DIR / class_name
    if not class_dir.exists():
        missing_classes.append(class_name)
        continue
    # Count lazily rather than materialising a throwaway list of every path.
    class_counts[class_name] = sum(1 for _ in class_dir.glob('*.jpg'))

if missing_classes:
    print(f"\nWARNING: no directory under {RAW_DATA_DIR} for "
          f"{len(missing_classes)} class(es): {', '.join(map(str, missing_classes))}")

# Create DataFrame
df = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
print("\nDataset Summary:")
print(df)
if df.empty:
    # Avoid printing a meaningless "nan" average when nothing was found.
    print("\nNo class directories found - check RAW_DATA_DIR.")
else:
    print(f"\nTotal Images: {df['Count'].sum():,}")
    print(f"Average per class: {df['Count'].mean():.0f}")

In [None]:
# Bar chart of the per-class image counts held in `df` (columns 'Class' and
# 'Count'), saved as a PNG and then shown inline.

# savefig does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(df['Class'], df['Count'], color='steelblue')
ax.set_xlabel('ASL Class', fontsize=12)
ax.set_ylabel('Number of Images', fontsize=12)
ax.set_title('ASL Alphabet Dataset - Class Distribution', fontsize=14, fontweight='bold')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig(results_dir / 'class_distribution.png', dpi=300)
plt.show()

# Cell 4: Sample Images Visualization
# Shows one example .jpg per class on a 7-column grid and saves the figure.

# savefig does not create missing directories, so make sure the output folder
# exists before writing the PNG.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# Size the grid from the number of classes instead of hard-coding 4 rows:
# a fixed 4x7 grid only has 28 axes, so a 29th class would index past the end.
n_cols = 7
n_rows = max(1, (len(CLASS_NAMES) + n_cols - 1) // n_cols)
# 3 inches per row reproduces the original (20, 12) size for a 4-row grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows), squeeze=False)
axes = axes.ravel()

# Hide every axis up front so grid cells that end up without an image are
# blank instead of showing an empty plot frame.
for ax in axes:
    ax.axis('off')

for idx, class_name in enumerate(CLASS_NAMES):
    class_dir = RAW_DATA_DIR / class_name
    if not class_dir.exists():
        continue
    # Sorted so the same file is picked on every run (glob order is not
    # guaranteed); an empty directory is skipped rather than raising IndexError.
    image_paths = sorted(class_dir.glob('*.jpg'))
    if not image_paths:
        continue
    # Context manager closes the file handle once the image has been drawn.
    with Image.open(image_paths[0]) as img:
        axes[idx].imshow(img)
    axes[idx].set_title(f'Class: {class_name}', fontsize=12, fontweight='bold')

fig.tight_layout()
fig.savefig(results_dir / 'sample_images.png', dpi=300)
plt.show()

In [None]:
# Pixel statistics computed from one sample image of each of the first five
# classes in CLASS_NAMES.
print("\n" + "="*70)
print("Image Statistics")
print("="*70)

sample_imgs = []
for class_name in CLASS_NAMES[:5]:  # Sample from first 5 classes
    class_dir = RAW_DATA_DIR / class_name
    # Same guard as the other cells: a missing or empty class directory is
    # skipped with a message instead of raising IndexError on [0].
    image_paths = sorted(class_dir.glob('*.jpg')) if class_dir.exists() else []
    if not image_paths:
        print(f"Skipping {class_name}: no .jpg images found in {class_dir}")
        continue
    # Copy the pixels into an array, then let the context manager close the file.
    with Image.open(image_paths[0]) as img:
        sample_imgs.append(np.array(img))

if sample_imgs:
    # Flatten before aggregating so samples with different sizes or channel
    # counts do not break the statistics; for equally shaped images this gives
    # the same values as aggregating over the stacked array.
    sample_pixels = np.concatenate([img.ravel() for img in sample_imgs])
    print(f"Sample shape: {sample_imgs[0].shape}")
    print(f"Mean pixel value: {sample_pixels.mean():.2f}")
    print(f"Std pixel value: {sample_pixels.std():.2f}")
    print(f"Min pixel value: {sample_pixels.min()}")
    print(f"Max pixel value: {sample_pixels.max()}")
    # Stack into a single array, as before, when every sample shares a shape;
    # mixed shapes cannot be stacked, so the list is kept in that case.
    if len({img.shape for img in sample_imgs}) == 1:
        sample_imgs = np.array(sample_imgs)
else:
    print("No sample images found - skipping pixel statistics.")
