# Projection Accuracy Analysis

Compare projections to actual results. Which systems are most accurate? Which stats are hardest to predict?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from fantasy_baseball_manager.db.connection import create_connection
from fantasy_baseball_manager.repos.batting_stats_repo import SqliteBattingStatsRepo
from fantasy_baseball_manager.repos.pitching_stats_repo import SqlitePitchingStatsRepo
from fantasy_baseball_manager.repos.projection_repo import SqliteProjectionRepo
from fantasy_baseball_manager.services.projection_evaluator import ProjectionEvaluator

# Open the project database; the relative path is resolved against the
# current working directory.
conn = create_connection("../data/fbm.db")

# All three repositories share the single connection opened above.
batting_repo = SqliteBattingStatsRepo(conn)
pitching_repo = SqlitePitchingStatsRepo(conn)
projection_repo = SqliteProjectionRepo(conn)

# The evaluator is handed the projection repo plus the batting and pitching
# stats repos, in that order.
evaluator = ProjectionEvaluator(
    projection_repo,
    batting_repo,
    pitching_repo,
)

## Single System Evaluation

Evaluate one projection system's accuracy across all stats.

In [None]:
# Season and projection system/version to evaluate.
SEASON = 2024
SYSTEM = "steamer"  # change to match your data
VERSION = "pre"  # change to match your data

result = evaluator.evaluate(SYSTEM, VERSION, SEASON)

# One row per stat. The columns are named explicitly so the frame keeps its
# schema even when `result.metrics` is empty; without them, sorting an empty
# frame by "rmse" raises KeyError instead of displaying an empty table.
metrics_df = pd.DataFrame(
    [
        {"stat": stat, "rmse": m.rmse, "mae": m.mae, "correlation": m.correlation, "n": m.n}
        for stat, m in result.metrics.items()
    ],
    columns=["stat", "rmse", "mae", "correlation", "n"],
).sort_values("rmse", ascending=False)  # largest RMSE first
metrics_df

## Cross-System Comparison

Compare accuracy metrics across multiple projection systems side-by-side.

In [None]:
# List the systems to compare as (system, version) tuples
SYSTEMS_TO_COMPARE = [
    ("steamer", "pre"),
    ("zips", "pre"),
    # Add more systems as available in your data
]
KEY_STATS = ["hr", "avg", "era", "whip", "war"]

comparison = evaluator.compare(SYSTEMS_TO_COMPARE, SEASON, stats=KEY_STATS)

# Flatten the comparison into one row per (system, stat) pair.
rows = [
    {"system": sys_metrics.system, "stat": stat, "rmse": m.rmse, "mae": m.mae, "correlation": m.correlation}
    for sys_metrics in comparison.systems
    for stat, m in sys_metrics.metrics.items()
]

# Explicit columns keep the schema when there are no rows, so the pivot below
# yields an empty table instead of raising KeyError on a column-less frame.
comp_df = pd.DataFrame(rows, columns=["system", "stat", "rmse", "mae", "correlation"])

# Wide view for display: stats down the rows, one RMSE column per system.
comp_df.pivot(index="stat", columns="system", values="rmse")

## Visualize Accuracy

A grouped bar chart comparing RMSE across systems for the key stats.

In [None]:
# Nothing is drawn when the comparison produced no rows.
if not comp_df.empty:
    # Reshape to stats-by-system so each stat gets one bar per system.
    pivot = comp_df.pivot(index="stat", columns="system", values="rmse")

    fig, ax = plt.subplots(figsize=(10, 5))
    pivot.plot.bar(ax=ax)

    ax.set_title(f"Projection RMSE by System — {SEASON}")
    ax.set_ylabel("RMSE")
    ax.legend(title="System")

    # Tilt the stat labels on the x axis.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

## Correlation Scatter Plots

For a given system and batting stat, scatter projected (y-axis) against actual (x-axis) values, with a dashed diagonal reference line marking where the projection equals the actual result.

In [None]:
SCATTER_STAT = "hr"  # must be a column that exists in both `projection` and `batting_stats`

# The stat name is a column identifier, so it is interpolated into the SQL
# text; the system, version and season values are bound as `?` parameters.
# Only batter projections joined to 'fangraphs' batting rows are selected,
# and rows where either value is NULL are dropped.
scatter_query = f"""
SELECT pr.{SCATTER_STAT} AS projected,
       bs.{SCATTER_STAT} AS actual
  FROM projection pr
  JOIN batting_stats bs ON bs.player_id = pr.player_id AND bs.season = pr.season
 WHERE pr.system = ?
   AND pr.version = ?
   AND pr.season = ?
   AND pr.player_type = 'batter'
   AND bs.source = 'fangraphs'
   AND pr.{SCATTER_STAT} IS NOT NULL
   AND bs.{SCATTER_STAT} IS NOT NULL
"""

scatter_df = pd.read_sql(scatter_query, conn, params=[SYSTEM, VERSION, SEASON])

# Nothing is drawn when the query matched no rows.
if not scatter_df.empty:
    actual = scatter_df["actual"]
    projected = scatter_df["projected"]
    stat_label = SCATTER_STAT.upper()

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(actual, projected, alpha=0.5, s=20)

    # Dashed y = x reference line spanning the combined range of both series.
    low = min(actual.min(), projected.min())
    high = max(actual.max(), projected.max())
    ax.plot([low, high], [low, high], "--", color="gray", linewidth=1)

    ax.set_xlabel(f"Actual {stat_label}")
    ax.set_ylabel(f"Projected {stat_label}")
    ax.set_title(f"{SYSTEM} {stat_label} — Projected vs Actual ({SEASON})")
    fig.tight_layout()
    plt.show()