# Stat Distributions & Exploration

Histograms, scatter plots, and summary statistics across players and projection systems.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from fantasy_baseball_manager.db.connection import create_connection

# Season passed as the bind parameter of the projection query.
SEASON = 2025

# Database handle handed to pandas when the projections are loaded.
# NOTE(review): the path is relative — presumably resolved against the
# kernel's working directory; confirm against create_connection.
conn = create_connection("../data/fbm.db")

In [None]:
# Fetch every projection row for SEASON, with the player's first and last
# name joined in as a single "player" column.
proj_query = """
SELECT p.name_first || ' ' || p.name_last AS player,
       pr.system,
       pr.player_type,
       pr.pa,
       pr.hr,
       pr.avg,
       pr.obp,
       pr.slg,
       pr.war,
       pr.era,
       pr.whip,
       pr.ip,
       pr.so
  FROM projection pr
  JOIN player p ON p.id = pr.player_id
 WHERE pr.season = ?
"""
df = pd.read_sql(sql=proj_query, con=conn, params=[SEASON])

# Split by player type. Each half is copied so that column assignments on
# ``batters`` / ``pitchers`` never write through to ``df``.
is_batter = df["player_type"].eq("batter")
is_pitcher = df["player_type"].eq("pitcher")
batters = df.loc[is_batter].copy()
pitchers = df.loc[is_pitcher].copy()

# Quick sanity line: row counts per type and number of distinct systems.
print(f"{len(batters)} batter rows, {len(pitchers)} pitcher rows across {df['system'].nunique()} systems")

## Summary Statistics

Key batting and pitching stats grouped by system.

In [None]:
# Batter columns included in the per-system summary.
batting_stats = ["pa", "hr", "avg", "obp", "slg", "war"]

# Force each column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
for col in batting_stats:
    batters[col] = pd.to_numeric(batters[col], errors="coerce")

# Trailing expression so the notebook renders it: describe() statistics per
# system, transposed so that systems become the columns.
batters.groupby("system")[batting_stats].describe().transpose()

In [None]:
# Pitcher columns included in the per-system summary.
pitching_stats = ["era", "whip", "ip", "so", "war"]

# Force each column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
for col in pitching_stats:
    pitchers[col] = pd.to_numeric(pitchers[col], errors="coerce")

# Trailing expression so the notebook renders it: describe() statistics per
# system, transposed so that systems become the columns.
pitchers.groupby("system")[pitching_stats].describe().transpose()

## Histograms

Distributions of HR, AVG, and WAR for batters, and of ERA for pitchers, within a single projection system.

In [None]:
# System to chart: whichever one the first batter row carries, falling back
# to the label "steamer" when no batter rows were loaded.
# NOTE(review): the projection query has no ORDER BY, so "first row" is not
# guaranteed to be the same system on every run.
if batters.empty:
    HIST_SYSTEM = "steamer"
else:
    HIST_SYSTEM = batters["system"].iloc[0]

# Rows of each player type that belong to the chosen system.
sys_batters = batters[batters["system"] == HIST_SYSTEM]
sys_pitchers = pitchers[pitchers["system"] == HIST_SYSTEM]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# One (values, target axes, title) entry per panel of the 2x2 grid.
hist_panels = [
    (sys_batters["hr"], axes[0, 0], f"{HIST_SYSTEM} — HR Distribution"),
    (sys_batters["avg"], axes[0, 1], f"{HIST_SYSTEM} — AVG Distribution"),
    (sys_pitchers["era"], axes[1, 0], f"{HIST_SYSTEM} — ERA Distribution"),
    (sys_batters["war"], axes[1, 1], f"{HIST_SYSTEM} — WAR Distribution (Batters)"),
]
for values, panel_ax, panel_title in hist_panels:
    # Missing projections are dropped before binning.
    values.dropna().plot.hist(bins=30, ax=panel_ax, title=panel_title)

plt.tight_layout()
plt.show()

## Scatter Plots

Relationships between stats: HR vs PA and ERA vs IP as scatter plots, plus overlaid histograms comparing batter WAR with pitcher WAR.

In [None]:
# Three panels for the system selected in the histogram cell
# (``sys_batters`` / ``sys_pitchers``).
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# HR vs PA
axes[0].scatter(sys_batters["pa"], sys_batters["hr"], alpha=0.4, s=15)
axes[0].set_xlabel("PA")
axes[0].set_ylabel("HR")
axes[0].set_title("HR vs PA")

# ERA vs IP
axes[1].scatter(sys_pitchers["ip"], sys_pitchers["era"], alpha=0.4, s=15)
axes[1].set_xlabel("IP")
axes[1].set_ylabel("ERA")
axes[1].set_title("ERA vs IP")

# Batter WAR vs pitcher WAR: overlaid, semi-transparent histograms.
batter_war = sys_batters["war"].dropna()
pitcher_war = sys_pitchers["war"].dropna()

# Bin both groups over one common range so the 30 bins share identical
# edges. With bins computed separately per group the bin widths differ and
# the bar heights cannot be compared. Falls back to matplotlib's default
# range when neither group has any WAR values.
non_empty_war = [s for s in (batter_war, pitcher_war) if not s.empty]
if non_empty_war:
    war_range = (
        float(min(s.min() for s in non_empty_war)),
        float(max(s.max() for s in non_empty_war)),
    )
else:
    war_range = None

axes[2].hist(batter_war, bins=30, range=war_range, alpha=0.6, label="Batters")
axes[2].hist(pitcher_war, bins=30, range=war_range, alpha=0.6, label="Pitchers")
axes[2].set_xlabel("WAR")
axes[2].set_ylabel("Count")
axes[2].set_title("WAR Distribution")
axes[2].legend()

plt.tight_layout()
plt.show()

## System Comparison

Overlay distributions from different projection systems.

In [None]:
# Overlay each projection system's batter HR and WAR distributions.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, target axes, title) for the two panels.
comparison_panels = [
    ("hr", axes[0], "HR Distribution by System"),
    ("war", axes[1], "WAR Distribution by System"),
]
systems = batters["system"].unique()

for stat, ax, title in comparison_panels:
    # One range across all systems so every overlay uses the same 30 bin
    # edges; per-system bins would have different widths and make the bar
    # heights incomparable. Falls back to matplotlib's default range when
    # the column has no values at all.
    all_values = batters[stat].dropna()
    if all_values.empty:
        shared_range = None
    else:
        shared_range = (float(all_values.min()), float(all_values.max()))

    for system in systems:
        sys_values = batters.loc[batters["system"] == system, stat].dropna()
        ax.hist(sys_values, bins=30, range=shared_range, alpha=0.4, label=system)

    ax.set_ylabel("Frequency")
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

## Top & Bottom Players

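Top-15 leaderboards within a single system: batters ranked by projected WAR and by HR, pitchers ranked by projected WAR and by lowest ERA (minimum 100 IP).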

In [None]:
# Leaderboards use the same system that the histogram cell selected.
RANK_SYSTEM = HIST_SYSTEM

# Independent copies of that system's batter and pitcher rows.
batter_in_system = batters["system"] == RANK_SYSTEM
pitcher_in_system = pitchers["system"] == RANK_SYSTEM
rank_batters = batters.loc[batter_in_system].copy()
rank_pitchers = pitchers.loc[pitcher_in_system].copy()

# Batters: 15 highest projected WAR.
print(f"--- Top 15 Batters by WAR ({RANK_SYSTEM}) ---")
war_leaders = rank_batters.nlargest(15, "war")
display(war_leaders[["player", "war", "hr", "avg", "obp", "slg"]])

# Batters: 15 highest projected HR.
print(f"\n--- Top 15 Batters by HR ({RANK_SYSTEM}) ---")
hr_leaders = rank_batters.nlargest(15, "hr")
display(hr_leaders[["player", "hr", "war", "pa"]])

# Pitchers: 15 highest projected WAR.
print(f"\n--- Top 15 Pitchers by WAR ({RANK_SYSTEM}) ---")
pitcher_war_leaders = rank_pitchers.nlargest(15, "war")
display(pitcher_war_leaders[["player", "war", "era", "whip", "ip"]])

# Pitchers: 15 lowest projected ERA among those with at least 100 IP.
print(f"\n--- Top 15 Pitchers by ERA (min 100 IP, {RANK_SYSTEM}) ---")
qualified = rank_pitchers.loc[rank_pitchers["ip"].ge(100)]
era_leaders = qualified.nsmallest(15, "era")
display(era_leaders[["player", "era", "whip", "war", "ip"]])