# Basketball Shot Analysis - Data Synchronization

This notebook synchronizes evaluation data from Supabase, downloads missing videos, and prepares the ground truth dataset for model evaluation.

## Setup

1. Ensure your Supabase credentials (`SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY`) are set in your `.env` file
2. Install required packages: `pip install -r ../requirements.txt`
3. Run the cells in order, from top to bottom, to sync your evaluation dataset


In [None]:
import sys
import os
from pathlib import Path

# Add src directory to path
sys.path.append(str(Path('../src').resolve()))

import pandas as pd
from dotenv import load_dotenv
from data_manager import EvaluationDataManager

# Read settings from the .env file (adjust the relative path as needed).
load_dotenv('../../.env')

# Supabase connection settings come from the environment.
# The service-role key is used for full access; the bucket name falls back
# to 'clips' when SUPABASE_STORAGE_BUCKET is not set.
SUPABASE_URL = os.environ.get('SUPABASE_URL')
SUPABASE_SERVICE_ROLE_KEY = os.environ.get('SUPABASE_SERVICE_ROLE_KEY')
SUPABASE_BUCKET = os.environ.get('SUPABASE_STORAGE_BUCKET', 'clips')

if SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY:
    print("✅ Supabase credentials loaded")
    print(f"URL: {SUPABASE_URL}")
    print(f"Bucket: {SUPABASE_BUCKET}")
else:
    # Only reports the problem; nothing is raised here.
    print("⚠️  Missing Supabase credentials!")
    print("Please ensure SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are set in your .env file")


In [None]:
# Create the evaluation data manager from the Supabase settings,
# pointing it at the ../data directory.
data_manager = EvaluationDataManager(
    data_dir='../data',
    supabase_key=SUPABASE_SERVICE_ROLE_KEY,
    supabase_url=SUPABASE_URL,
)

# Confirm construction and show the directories the manager exposes.
print("✅ Data manager initialized")
print(f"📁 Data directory: {data_manager.data_dir}")
print(f"🎥 Videos directory: {data_manager.videos_dir}")


In [None]:
# Pull the evaluation dataset from Supabase through the data manager.
print("🔄 Fetching evaluation dataset from Supabase...")
df = data_manager.fetch_evaluation_dataset()

# Short summary: number of rows and the column names.
print("\n📊 Dataset Overview:")
clip_count = len(df)
print(f"Total clips: {clip_count}")
column_names = list(df.columns)
print(f"Columns: {column_names}")

# Keep the preview as the cell's final expression so the notebook renders it.
df.head()


In [None]:
# Analyze ground truth distribution
print("📈 Ground Truth Analysis:")
print("\nShot Type Distribution:")
print(df['ground_truth_shot_type'].value_counts())

print("\nResult Distribution:")
print(df['ground_truth_result'].value_counts())

print("\nUser Corrections Analysis:")


def _correction_mask(ground_truth, original):
    """Return a boolean Series marking rows whose label was changed.

    A row counts as corrected when the two values differ. Rows where BOTH
    values are missing are excluded: pandas evaluates missing != missing as
    True, which would otherwise report unlabeled clips as corrections.
    """
    both_missing = ground_truth.isna() & original.isna()
    return (ground_truth != original) & ~both_missing


def _percent(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is 0."""
    return count / total * 100 if total else 0.0


# Compute each mask once and reuse it for counts, filtering and the samples,
# so the summary numbers and the per-clip details can never disagree.
shot_type_changed = _correction_mask(df['ground_truth_shot_type'], df['original_shot_type'])
result_changed = _correction_mask(df['ground_truth_result'], df['original_result'])

shot_type_corrections = shot_type_changed.sum()
result_corrections = result_changed.sum()
total_clips = len(df)

print(
    f"Shot type corrections: {shot_type_corrections}/{total_clips} "
    f"({_percent(shot_type_corrections, total_clips):.1f}%)"
)
print(
    f"Result corrections: {result_corrections}/{total_clips} "
    f"({_percent(result_corrections, total_clips):.1f}%)"
)

# Show examples of corrections
any_correction = shot_type_changed | result_changed
corrections_df = df[any_correction]

if len(corrections_df) > 0:
    print("\n🔍 Sample Corrections:")
    # Walk the first three corrected rows together with their mask values
    # (positionally, so a non-unique index cannot cause a wrong lookup).
    sample_rows = corrections_df.head(3).iterrows()
    sample_shot_flags = shot_type_changed[any_correction].head(3)
    sample_result_flags = result_changed[any_correction].head(3)
    for (_, row), shot_flag, result_flag in zip(sample_rows, sample_shot_flags, sample_result_flags):
        # str() keeps the 8-character prefix working for non-string ids.
        print(f"Clip {str(row['clip_id'])[:8]}:")
        if shot_flag:
            print(f"  Shot type: {row['original_shot_type']} → {row['ground_truth_shot_type']}")
        if result_flag:
            print(f"  Result: {row['original_result']} → {row['ground_truth_result']}")
        print()


In [None]:
# Download missing videos
print("⬇️  Downloading missing videos...")
downloaded_clips = data_manager.download_missing_videos(df, SUPABASE_BUCKET)

# Check video availability on disk before reporting the outcome, so the
# "all present" message is only printed when it has actually been verified.
# Clips are looked up as <videos_dir>/<clip_id>.mp4.
video_paths = [data_manager.videos_dir / f"{clip_id}.mp4" for clip_id in df['clip_id']]
videos_available = sum(1 for path in video_paths if path.exists())
total_clips = len(df)
videos_missing = total_clips - videos_available

if downloaded_clips:
    print(f"✅ Downloaded {len(downloaded_clips)} new videos")
elif videos_missing == 0:
    print("✅ All videos already present locally")
else:
    # Nothing was downloaded, yet some expected files are absent.
    print(f"⚠️  No new videos downloaded, but {videos_missing} are still missing locally")

# Guard the percentage against an empty dataset (would be a ZeroDivisionError).
availability_pct = videos_available / total_clips * 100 if total_clips else 0.0

print("\n📹 Video Status:")
print(f"Available locally: {videos_available}/{total_clips} ({availability_pct:.1f}%)")


In [None]:
# Hand the dataset to the data manager for saving as ground truth.
print("💾 Saving ground truth dataset...")
data_manager.save_ground_truth(df)

# Print the manager's statistics; dict-valued entries get an indented sub-list.
stats = data_manager.get_dataset_stats()
print("\n📊 Final Dataset Statistics:")
for stat_name, stat_value in stats.items():
    if not isinstance(stat_value, dict):
        print(f"{stat_name}: {stat_value}")
        continue
    print(f"{stat_name}:")
    for detail_name, detail_value in stat_value.items():
        print(f"  {detail_name}: {detail_value}")

# Closing summary with the output locations reported by the manager.
print("\n✅ Data synchronization complete!")
print(f"📁 Ground truth saved to: {data_manager.ground_truth_file}")
print(f"🎥 Videos stored in: {data_manager.videos_dir}")
print("\n🚀 Ready for model evaluation!")
