# In-Season Talent-Delta Analysis

Compare statcast-gbm true-talent estimates to actual stats to surface regression candidates (sell high) and buy-low targets.

In [None]:
from dataclasses import asdict

import matplotlib.pyplot as plt
import pandas as pd

from fantasy_baseball_manager.db.connection import create_connection
from fantasy_baseball_manager.repos.batting_stats_repo import SqliteBattingStatsRepo
from fantasy_baseball_manager.repos.pitching_stats_repo import SqlitePitchingStatsRepo
from fantasy_baseball_manager.repos.player_repo import SqlitePlayerRepo
from fantasy_baseball_manager.repos.projection_repo import SqliteProjectionRepo
from fantasy_baseball_manager.services.performance_report import PerformanceReportService

# --- Analysis parameters --------------------------------------------------
# These are passed to report_service.compute_deltas in the next cell.
SYSTEM = "statcast-gbm"
VERSION = "latest"
SEASON = 2025
PLAYER_TYPE = "batter"
MIN_PA = 200  # forwarded as the min_pa keyword argument

# --- Data access ----------------------------------------------------------
# A single connection is shared by all four repositories.
conn = create_connection("../data/fbm.db")
projection_repo = SqliteProjectionRepo(conn)
player_repo = SqlitePlayerRepo(conn)
batting_repo = SqliteBattingStatsRepo(conn)
pitching_repo = SqlitePitchingStatsRepo(conn)

# Positional order: projections, players, batting stats, pitching stats.
report_service = PerformanceReportService(
    projection_repo,
    player_repo,
    batting_repo,
    pitching_repo,
)

In [None]:
# Ask the report service for the deltas matching the parameters set above.
deltas = report_service.compute_deltas(
    SYSTEM,
    VERSION,
    SEASON,
    PLAYER_TYPE,
    min_pa=MIN_PA,
)

# Flatten each delta dataclass into a dict so pandas builds one row per delta.
records = list(map(asdict, deltas))
df = pd.DataFrame(records)

print(f"{len(df)} player-stat deltas computed")
# Last expression of the cell: the notebook renders the first rows.
df.head()

## Regression Candidates

Players whose actual stats are running above their true-talent estimate — sell-high targets.

In [None]:
TOP_N = 10  # rows shown per stat table (also used by the buy-low cell)

# Keep only the rows with a strictly positive performance_delta.
is_positive = df["performance_delta"] > 0
regression = df[is_positive].copy()

table_columns = ["player_name", "stat_name", "actual", "expected", "delta", "percentile"]
number_formats = {
    "actual": "{:.3f}",
    "expected": "{:.3f}",
    "delta": "{:+.3f}",  # always show the sign
    "percentile": "{:.0f}",
}

# One styled table per stat, largest performance_delta first.
for stat, group in regression.groupby("stat_name"):
    ranked = group.sort_values("performance_delta", ascending=False)
    top = ranked.head(TOP_N)
    styled = top[table_columns].style.set_caption(f"{stat} — Regression Candidates")
    display(styled.format(number_formats))

## Buy-Low Targets

Players whose actual stats are running below their true-talent estimate — buy-low opportunities.

In [None]:
# Keep only the rows with a strictly negative performance_delta.
is_negative = df["performance_delta"] < 0
buylow = df[is_negative].copy()

buylow_columns = ["player_name", "stat_name", "actual", "expected", "delta", "percentile"]
buylow_formats = {
    "actual": "{:.3f}",
    "expected": "{:.3f}",
    "delta": "{:+.3f}",  # always show the sign
    "percentile": "{:.0f}",
}

# One styled table per stat; ascending sort puts the most negative
# performance_delta at the top.
for stat, group in buylow.groupby("stat_name"):
    ranked = group.sort_values("performance_delta", ascending=True)
    top = ranked.head(TOP_N)
    styled = top[buylow_columns].style.set_caption(f"{stat} — Buy-Low Targets")
    display(styled.format(buylow_formats))

## Delta Distribution

Histogram of the `delta` values for each stat, with a dashed red line at zero — are they roughly normally distributed or skewed?

In [None]:
# Draw one histogram panel per stat, laid out side by side in a single row.
if df.empty:
    # With no rows there is nothing to plot: a column-less frame would raise
    # KeyError on "stat_name", and plt.subplots rejects a zero-column grid.
    print("No deltas to plot")
else:
    stats = df["stat_name"].unique()
    n_stats = len(stats)
    # squeeze=False keeps `axes` two-dimensional even when there is only one
    # stat, so axes[0] is always an iterable row of Axes.
    fig, axes = plt.subplots(1, n_stats, figsize=(5 * n_stats, 4), squeeze=False)
    for ax, stat in zip(axes[0], stats):
        subset = df[df["stat_name"] == stat]
        ax.hist(subset["delta"], bins=20, edgecolor="black", alpha=0.7)
        # Mark zero so the balance of positive vs negative deltas is visible.
        ax.axvline(0, color="red", linestyle="--", linewidth=1)
        ax.set_title(f"{stat} delta distribution")
        ax.set_xlabel("Delta (actual - talent)")
        ax.set_ylabel("Count")
    plt.tight_layout()
    plt.show()

## Actual vs Talent Scatter

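Scatter plot of actual (x-axis) against the talent estimate (y-axis) for a chosen stat, with a dashed diagonal reference line. Points below the diagonal have an actual value above the estimate; points above it have an actual value below the estimate.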

In [None]:
SCATTER_STAT = "avg"  # change to any stat of interest

# Rows for the chosen stat only.
scatter = df[df["stat_name"] == SCATTER_STAT]

if scatter.empty:
    print(f"No data for stat '{SCATTER_STAT}'")
else:
    actual_values = scatter["actual"]
    talent_values = scatter["expected"]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(actual_values, talent_values, alpha=0.5, s=20)

    # The diagonal spans the combined range of both columns, so it runs
    # corner to corner across the plotted data.
    lower = min(actual_values.min(), talent_values.min())
    upper = max(actual_values.max(), talent_values.max())
    lims = [lower, upper]
    ax.plot(lims, lims, "--", color="gray", linewidth=1)

    ax.set(
        xlabel=f"Actual {SCATTER_STAT}",
        ylabel=f"Talent {SCATTER_STAT}",
        title=f"{SYSTEM} {SCATTER_STAT} — Actual vs True Talent ({SEASON})",
    )
    plt.tight_layout()
    plt.show()