# Beverage Detection - Data Exploration

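This notebook explores the beverage-detection training dataset: per-split image and label counts, class distribution, sample images with their bounding boxes, and image dimensions.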

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from PIL import Image

# Project locations, anchored two levels above the current working directory.
PROJECT_ROOT = Path("../..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
CONFIG_PATH = PROJECT_ROOT.joinpath("training", "configs", "beverages.yaml")

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")

## Load Dataset Configuration

In [None]:
# Load the dataset config and report the class count and class names.
if CONFIG_PATH.exists():
    with open(CONFIG_PATH, encoding="utf-8") as f:
        # An empty YAML document parses to None; fall back to an empty
        # mapping so the lookups below do not fail.
        config = yaml.safe_load(f) or {}
    # `names` may be an id -> name mapping or a plain list of names.
    _names = config.get("names") or {}
    _name_list = list(_names.values()) if isinstance(_names, dict) else list(_names)
    print(f"Classes: {config.get('nc', 'unknown')}")
    print(f"Class names: {_name_list}")
else:
    # `config` stays undefined here; later cells check for that.
    print("Config not found - run prepare_dataset.py first")

## Dataset Statistics

In [None]:
def count_images_and_labels(split_dir: Path) -> tuple[int, int]:
    """Return ``(n_images, n_labels)`` for one dataset split.

    Every entry under ``split_dir / "images"`` is counted as an image, and
    every ``*.txt`` entry under ``split_dir / "labels"`` as a label. A
    subdirectory that does not exist contributes 0.
    """

    def _count(directory: Path, pattern: str) -> int:
        # Count glob matches, treating a missing directory as empty.
        if not directory.exists():
            return 0
        return sum(1 for _ in directory.glob(pattern))

    image_total = _count(split_dir / "images", "*")
    label_total = _count(split_dir / "labels", "*.txt")
    return image_total, label_total

dataset_dir = DATA_DIR / "dataset"

# Report image/label counts per split, or point at the preparation script
# when the dataset directory is missing.
if not dataset_dir.exists():
    print("Dataset not prepared - run prepare_dataset.py first")
else:
    for split in ("train", "val", "test"):
        n_img, n_lbl = count_images_and_labels(dataset_dir / split)
        print(f"{split}: {n_img} images, {n_lbl} labels")

## Class Distribution

In [None]:
def get_class_distribution(labels_dir: Path) -> dict[int, int]:
    """Tally annotation lines per class id across ``*.txt`` label files.

    The first whitespace-separated token of every non-blank line is parsed
    as the integer class id; the remaining tokens are ignored.
    """
    tally: dict[int, int] = {}

    for path in labels_dir.glob("*.txt"):
        with open(path) as handle:
            for raw in handle:
                # split() without arguments drops surrounding whitespace,
                # so a blank line yields an empty list.
                fields = raw.split()
                if not fields:
                    continue
                cid = int(fields[0])
                tally[cid] = tally.get(cid, 0) + 1

    return tally

# Plot how many annotated instances each class has in the training split.
if dataset_dir.exists():
    train_labels = dataset_dir / "train" / "labels"
    if train_labels.exists():
        distribution = get_class_distribution(train_labels)

        # Plot
        if distribution:
            # `config` is only defined when the config cell found the YAML
            # file; treat a missing or empty config as "no names known".
            class_names = (globals().get("config") or {}).get("names") or {}
            # Names may be a plain list instead of an id -> name mapping;
            # index it so the `.get` lookup below works for both forms.
            if not isinstance(class_names, dict):
                class_names = dict(enumerate(class_names))

            class_ids = sorted(distribution)
            labels = [class_names.get(k, f"class_{k}") for k in class_ids]
            counts = [distribution[k] for k in class_ids]

            plt.figure(figsize=(12, 6))
            plt.bar(labels, counts)
            plt.xticks(rotation=45, ha="right")
            plt.xlabel("Class")
            plt.ylabel("Count")
            plt.title("Class Distribution (Training Set)")
            plt.tight_layout()
            plt.show()
        else:
            print("No annotations found")

## Sample Images

In [None]:
def visualize_sample(image_path: Path, label_path: Path, class_names: dict) -> None:
    """Show an image with its YOLO bounding boxes drawn in red.

    Args:
        image_path: Image file to display.
        label_path: Label file whose lines are read as
            ``class_id cx cy w h``; the four box values are multiplied by
            the image width/height to get pixel coordinates. If the file
            does not exist, the image is shown without boxes.
        class_names: Mapping from class id to display name. Ids missing
            from it are labelled ``class_<id>``.
    """
    # Copy the pixels into an array inside the context manager so the
    # underlying file handle is released before plotting.
    with Image.open(image_path) as img:
        img_array = np.array(img)
    h, w = img_array.shape[:2]

    _, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(img_array)

    if label_path.exists():
        with open(label_path) as f:
            for line in f:
                parts = line.strip().split()
                # A usable line needs a class id plus four box values;
                # shorter lines are skipped, extra fields are ignored.
                if len(parts) >= 5:
                    class_id = int(parts[0])
                    cx, cy, bw, bh = map(float, parts[1:5])

                    # Convert YOLO format (box centre + size) to the
                    # top-left corner and size in pixels.
                    x1 = (cx - bw / 2) * w
                    y1 = (cy - bh / 2) * h
                    box_w = bw * w
                    box_h = bh * h

                    rect = plt.Rectangle(
                        (x1, y1), box_w, box_h,
                        fill=False, edgecolor="red", linewidth=2
                    )
                    ax.add_patch(rect)

                    # Place the class name just above the box's top-left corner.
                    label = class_names.get(class_id, f"class_{class_id}")
                    ax.text(x1, y1 - 5, label, color="red", fontsize=10)

    ax.set_title(image_path.name)
    ax.axis("off")
    plt.tight_layout()
    plt.show()

# Visualize a few samples
if dataset_dir.exists():
    images_dir = dataset_dir / "train" / "images"
    labels_dir = dataset_dir / "train" / "labels"

    if images_dir.exists():
        # Sort before slicing so the same three samples are shown on every
        # run; glob does not guarantee any particular order.
        image_files = sorted(images_dir.glob("*"))[:3]

        # `config` is only defined when the config cell found the YAML
        # file; treat a missing or empty config as "no names known".
        class_names = (globals().get("config") or {}).get("names") or {}
        # Names may be a plain list instead of an id -> name mapping;
        # visualize_sample looks names up with `.get`, so index the list.
        if not isinstance(class_names, dict):
            class_names = dict(enumerate(class_names))

        for img_path in image_files:
            # The label file shares the image's stem and lives in labels/.
            label_path = labels_dir / (img_path.stem + ".txt")
            visualize_sample(img_path, label_path, class_names)

## Image Dimensions Analysis

In [None]:
def analyze_image_dimensions(images_dir: Path) -> pd.DataFrame:
    """Collect the dimensions of every readable image in a directory.

    Args:
        images_dir: Directory whose entries are each opened as an image.

    Returns:
        A DataFrame with one row per file that could be opened and the
        columns ``file`` (file name), ``width`` and ``height``. The columns
        are present even when no file could be read, so selecting them on
        an empty result does not raise ``KeyError``.
    """
    columns = ["file", "width", "height"]
    data = []

    for img_path in images_dir.glob("*"):
        try:
            with Image.open(img_path) as img:
                w, h = img.size
        except Exception:
            # Deliberate best effort: anything that cannot be opened as an
            # image (stray files, subdirectories, corrupt images) is left
            # out of the statistics instead of aborting the scan.
            continue
        data.append({"file": img_path.name, "width": w, "height": h})

    return pd.DataFrame(data, columns=columns)

# Summarise and plot the width/height distribution of the training images.
if dataset_dir.exists():
    images_dir = dataset_dir / "train" / "images"

    if images_dir.exists():
        df = analyze_image_dimensions(images_dir)

        if not df.empty:
            print("Image dimensions statistics:")
            print(df[["width", "height"]].describe())

            # One histogram per dimension, side by side.
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            panels = (
                (axes[0], "width", "Width", "Width Distribution"),
                (axes[1], "height", "Height", "Height Distribution"),
            )
            for ax, column, x_label, title in panels:
                ax.hist(df[column], bins=20)
                ax.set_xlabel(x_label)
                ax.set_ylabel("Count")
                ax.set_title(title)

            plt.tight_layout()
            plt.show()