# EDA Insights Summary

This notebook synthesizes findings from all analyses and provides recommendations for the multi-factor scoring system.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from utils import categorize_federation_testing_status

print("Loading all analysis results...")

# Directory holding the parquet tables read by this cell.
_PROCESSED_DIR = Path("../data/processed")


def _load_metrics(label, filename):
    """Read one saved metrics table, or return None if it cannot be loaded.

    Loading is deliberately best-effort: a missing or unreadable table must
    not stop the notebook, because the summary cell skips any factor whose
    table is None.

    Args:
        label: Human-readable factor name used in the status message.
        filename: Parquet file name inside ``_PROCESSED_DIR``.

    Returns:
        The loaded DataFrame, or None when reading failed.
    """
    try:
        frame = pd.read_parquet(_PROCESSED_DIR / filename)
    except Exception as exc:
        # `Exception` (not a bare `except:`) keeps the best-effort behaviour
        # while letting KeyboardInterrupt/SystemExit propagate. The real cause
        # is reported because a failure is not always a missing file.
        print(f"✗ {label} metrics not loaded ({type(exc).__name__}: {exc})")
        return None
    print(f"✓ {label} metrics loaded: {len(frame):,} rows")
    return frame


# Load saved metrics
experience_metrics = _load_metrics("Experience", "experience_metrics.parquet")
age_metrics = _load_metrics("Age", "age_metrics.parquet")
meet_quality = _load_metrics("Meet quality", "meet_quality_metrics.parquet")
consistency_metrics = _load_metrics("Consistency", "consistency_metrics.parquet")

# Only claim success when every table actually loaded.
_tables = (experience_metrics, age_metrics, meet_quality, consistency_metrics)
_loaded_count = sum(table is not None for table in _tables)
if _loaded_count == len(_tables):
    print("\nAll metrics loaded successfully!")
else:
    print(f"\nLoaded {_loaded_count} of {len(_tables)} metrics tables.")


## Key Findings by Factor

### 1. Experience Factor
### 2. Age Factor  
### 3. Competition Quality Factor
### 4. Consistency Factor


In [None]:
# Generate summary statistics for each factor
TESTING_STATUSES = ('Drug Tested', 'Untested')
STATUS_COLUMN = 'FederationTestingStatus'
# Score columns tried, in order of preference, for the experience correlation.
SCORE_COLUMNS = ('Dots', 'Wilks', 'TotalKg')


def _summarize_by_status(frame, summarize):
    """Apply ``summarize`` to the rows of each testing status that has data.

    Args:
        frame: DataFrame containing a ``FederationTestingStatus`` column.
        summarize: Callable taking the per-status subset and returning a dict.

    Returns:
        Dict mapping each status with at least one row to its summary.
    """
    summaries = {}
    for status in TESTING_STATUSES:
        subset = frame[frame[STATUS_COLUMN] == status]
        if not subset.empty:
            summaries[status] = summarize(subset)
    return summaries


def _experience_summary(subset):
    """Summarise career length, meet count and the years-vs-score correlation."""
    score_col = next((c for c in SCORE_COLUMNS if c in subset.columns), None)
    if score_col is None:
        # No score column available: report a missing value rather than
        # passing a scalar to Series.corr, which raises.
        correlation = np.nan
    else:
        correlation = subset['YearsCompeting'].corr(subset[score_col])
    return {
        'avg_years': subset['YearsCompeting'].mean(),
        'avg_meets': subset['TotalMeets'].mean(),
        'correlation': correlation,
    }


def _age_summary(frame):
    """Summarise the age distribution and its correlation with the last column."""
    ages = frame['Age']
    # NOTE(review): the last column is assumed to hold the score - confirm
    # against the notebook that writes age_metrics.parquet.
    score = frame.iloc[:, -1]
    # Correlating Age with itself or with a non-numeric column is meaningless
    # (or raises), so report NaN in those cases.
    comparable = score.name != 'Age' and pd.api.types.is_numeric_dtype(score)
    return {
        'avg_age': ages.mean(),
        'age_range': (ages.min(), ages.max()),
        'correlation': ages.corr(score) if comparable else np.nan,
    }


def _quality_summary(subset):
    """Summarise meet size and strength; the 90th percentile marks 'elite'."""
    return {
        'avg_competitors': subset['CompetitorCount'].mean(),
        'avg_strength': subset['AvgScore'].mean(),
        'elite_threshold': subset['AvgScore'].quantile(0.90),
    }


def _consistency_summary(subset):
    """Summarise the CV and peak-to-average ratio columns."""
    return {
        'avg_cv': subset['CV'].mean(),
        'median_cv': subset['CV'].median(),
        'avg_peak_ratio': subset['PeakToAvgRatio'].mean(),
    }


insights = {
    'experience': {},
    'age': {},
    'quality': {},
    'consistency': {}
}

# Load the full dataset at most once, and only if a table lacks testing status.
_needs_status = [
    table for table in (experience_metrics, consistency_metrics)
    if table is not None and STATUS_COLUMN not in table.columns
]
if _needs_status:
    df_full = pd.read_parquet("../data/processed/full_dataset.parquet")
    df_full = categorize_federation_testing_status(df_full)
    # drop_duplicates keeps the first row per Name, so a lifter who appears
    # under both statuses is assigned whichever row comes first.
    # NOTE(review): confirm that a single status per lifter is intended.
    status_by_name = df_full[['Name', STATUS_COLUMN]].drop_duplicates('Name')

if experience_metrics is not None:
    # Add testing status if not present
    if STATUS_COLUMN not in experience_metrics.columns:
        experience_metrics = experience_metrics.merge(
            status_by_name, on='Name', how='left'
        )
    insights['experience'] = _summarize_by_status(
        experience_metrics, _experience_summary
    )

if age_metrics is not None:
    insights['age'] = _age_summary(age_metrics)

if meet_quality is not None:
    if STATUS_COLUMN in meet_quality.columns:
        insights['quality'] = _summarize_by_status(meet_quality, _quality_summary)
    else:
        # This table has no 'Name' merge path like the other two, so skip the
        # factor instead of raising KeyError.
        print(f"✗ Meet quality metrics have no '{STATUS_COLUMN}' column; "
              "skipping quality insights")

if consistency_metrics is not None:
    # Add testing status if not present
    if STATUS_COLUMN not in consistency_metrics.columns:
        consistency_metrics = consistency_metrics.merge(
            status_by_name, on='Name', how='left'
        )
    insights['consistency'] = _summarize_by_status(
        consistency_metrics, _consistency_summary
    )

print("=== Summary Insights ===")
for factor, data in insights.items():
    print(f"\n{factor.upper()}:")
    for key, value in data.items():
        print(f"  {key}: {value}")
