# 01 - Data Exploration: PianoVAM Dataset

This notebook explores the PianoVAM dataset for piano fingering detection.

**Contents:**
- Load dataset from HuggingFace
- Visualize sample videos and frames
- Parse and visualize MIDI data
- Explore hand skeleton annotations
- Understand data quality and characteristics


In [None]:
# Install dependencies (run once in Colab)
# !pip install -q datasets huggingface_hub opencv-python mido matplotlib seaborn tqdm

import os
import sys

# Setup for Colab
if 'google.colab' in str(get_ipython()):
    if not os.path.exists('computer-vision'):
        !git clone https://github.com/esnylmz/computer-vision.git
    os.chdir('computer-vision')
    !pip install -q -e .
else:
    # Local development
    sys.path.insert(0, '..')

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from tqdm.notebook import tqdm

print("Setup complete!")


## 1. Load PianoVAM Dataset

Load the dataset from HuggingFace and explore its structure.


In [None]:
# Import project modules
from src.data.dataset import PianoVAMDataset
from src.data.midi_utils import MidiProcessor
from src.data.video_utils import VideoProcessor
from src.utils.config import load_config

# Load configuration
# The config path is relative to the project root. In Colab the setup cell
# changes into the repo root, so the path resolves as-is. For local runs the
# setup cell only adds '..' to sys.path, which suggests the notebook is
# started from a subdirectory -- NOTE(review): confirm the repo layout. Fall
# back to the parent directory only when the file is missing here and present
# there; otherwise pass the original path through unchanged.
CONFIG_RELPATH = 'configs/default.yaml'
config_path = CONFIG_RELPATH
if not os.path.exists(CONFIG_RELPATH):
    parent_candidate = os.path.join('..', CONFIG_RELPATH)
    if os.path.exists(parent_candidate):
        config_path = parent_candidate

config = load_config(config_path)
print(f"Project: {config.project_name} v{config.version}")

# Load the dataset
# TIP: Use streaming=True (default) for faster loading and to avoid timeout errors.
# Use streaming=False only when you need to index into the dataset by position.
# NOTE: The validation split is named 'valid' in the dataset; according to the
# original notebook notes, the loader also accepts 'validation' and maps it.

# Number of samples to pull per split. Kept small for quick exploration so we
# do not download all 107 samples; raise it (e.g. 100) to explore more.
MAX_SAMPLES = 20


def _load_split(split, label):
    """Load one split in streaming mode, capped at MAX_SAMPLES, and report it.

    Args:
        split: Split name passed to PianoVAMDataset ('train', 'valid', 'test').
        label: Human-readable split name used in the progress message.

    Returns:
        The PianoVAMDataset instance for the requested split.
    """
    dataset = PianoVAMDataset(split=split, streaming=True, max_samples=MAX_SAMPLES)
    print(f"{label} dataset loaded (streaming mode - first {MAX_SAMPLES} samples)")
    return dataset


print("\nLoading PianoVAM dataset with streaming mode...")
print("(This avoids timeout issues by not resolving all 432 files upfront)")
print(f"Limiting to {MAX_SAMPLES} samples for quick exploration...")

# Streaming mode is used for reliability (recommended for exploration).
train_dataset = _load_split('train', 'Train')
val_dataset = _load_split('valid', 'Validation')
test_dataset = _load_split('test', 'Test')

print("\nTo load fully for indexing (may take a few minutes):")
print("  dataset = PianoVAMDataset(split='train', streaming=False, timeout=120, max_retries=5)")
print("\nTo load more samples for exploration:")
print("  dataset = PianoVAMDataset(split='train', streaming=True, max_samples=100)")


In [None]:
# Grab the first record. Streaming datasets are consumed by iteration rather
# than positional indexing, hence next(iter(...)).
sample = next(iter(train_dataset))

print(f"Sample ID: {sample.id}")

# Plain metadata fields, printed as "<Label>: <value>" in a fixed order.
meta = sample.metadata
for label, key in (
    ("Composer", "composer"),
    ("Piece", "piece"),
    ("Performer", "performer"),
    ("Skill Level", "skill_level"),
):
    print(f"{label}: {meta[key]}")

# Duration is formatted to one decimal place and suffixed with 's'.
print(f"Duration: {meta['duration']:.1f}s")
print(f"\nKeyboard Corners: {meta['keyboard_corners']}")

# Paths are cut to their first 80 characters to keep the output readable.
# Attributes are looked up one at a time so each is read just before printing.
print("\nPaths:")
for label, attr in (
    ("Video", "video_path"),
    ("MIDI", "midi_path"),
    ("Skeleton", "skeleton_path"),
):
    print(f"  {label}: {getattr(sample, attr)[:80]}...")
