# Feature Engineering Playground

Explore the feature DSL — build feature sets from projections and stats, materialize datasets, inspect the results.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from fantasy_baseball_manager.db.connection import create_connection
from fantasy_baseball_manager.features import FeatureSet, SpineFilter, batting, delta, player, projection
from fantasy_baseball_manager.features.assembler import SqliteDatasetAssembler
from fantasy_baseball_manager.features.sql import generate_sql

# Connect to the project database and wrap the connection in the assembler
# that the cells below use to materialize and read feature sets.
DB_PATH = "../data/fbm.db"  # relative path; presumably resolved against the notebook's working directory
conn = create_connection(DB_PATH)
assembler = SqliteDatasetAssembler(conn)

## Build a Simple Feature Set

Define features using the fluent builder API and materialize them into a dataset.

In [None]:
# Lagged batting columns; the alias suffix mirrors the lag passed to .lag().
lagged_batting = (
    batting.col("hr").lag(1).alias("hr_1"),
    batting.col("hr").lag(2).alias("hr_2"),
    batting.col("avg").lag(1).alias("avg_1"),
    batting.col("obp").lag(1).alias("obp_1"),
    batting.col("war").lag(1).alias("war_1"),
)

simple_features = FeatureSet(
    name="simple_batting",
    features=(*lagged_batting, player.age()),
    seasons=(2023, 2024),
    spine_filter=SpineFilter(min_pa=200),
)

# Materialize the feature set, then load the rows it produced into pandas.
handle = assembler.materialize(simple_features)
rows = assembler.read(handle)
df = pd.DataFrame(rows)

n_rows, n_cols = df.shape
print(f"{n_rows} rows, {n_cols} columns")
df.head(10)

## Projection Features

Pull projection data as features. The `.system()` call is required for projection sources.

In [None]:
PROJ_SYSTEM = "steamer"  # change to match your data

# Projection columns from the chosen system, aliased proj_<stat>.
projected_stats = tuple(
    projection.col(stat).system(PROJ_SYSTEM).alias(f"proj_{stat}")
    for stat in ("hr", "avg", "war")
)
# Lag-1 batting columns to compare against, aliased actual_<stat>_prev.
previous_actuals = tuple(
    batting.col(stat).lag(1).alias(f"actual_{stat}_prev")
    for stat in ("hr", "war")
)

proj_features = FeatureSet(
    name="projection_features",
    features=(*projected_stats, *previous_actuals, player.age()),
    seasons=(2024,),
    spine_filter=SpineFilter(min_pa=200),
)

# Materialize and pull the rows into a DataFrame for inspection.
handle = assembler.materialize(proj_features)
proj_rows = assembler.read(handle)
proj_df = pd.DataFrame(proj_rows)
proj_df.head(10)

## Delta Features

Compute differences between projected and actual stats.

In [None]:
# The two operands are built once so the same feature objects can be listed
# as columns and passed to delta(). lag(0) presumably selects the spine
# season itself rather than a prior one — confirm against the DSL docs.
projected_hr = projection.col("hr").system(PROJ_SYSTEM).alias("proj_hr")
actual_hr = batting.col("hr").lag(0).alias("actual_hr")

delta_features = FeatureSet(
    name="delta_demo",
    features=(
        projected_hr,
        actual_hr,
        delta("hr_diff", projected_hr, actual_hr),
        player.age(),
    ),
    seasons=(2024,),
    spine_filter=SpineFilter(min_pa=200),
)

handle = assembler.materialize(delta_features)
delta_df = pd.DataFrame(assembler.read(handle))

# A DataFrame built from zero rows has no columns, so indexing "hr_diff"
# would raise KeyError. Guard on emptiness, as the visualization cells do
# for rate_df, and report the situation instead of crashing.
if delta_df.empty:
    print("No rows materialized for delta_demo; check PROJ_SYSTEM, seasons and min_pa.")
else:
    hr_diff = delta_df["hr_diff"]
    print(f"Mean HR diff (proj - actual): {hr_diff.mean():.2f}")
    print(f"Std HR diff: {hr_diff.std():.2f}")
delta_df.head(10)

## Rate Features & Rolling Means

Use `.per()` for rate computation and `.rolling_mean()` for multi-year averages.

In [None]:
# Lag-1 per-PA rates, aliased <stat>_rate_1. Each builder chain is created
# from scratch so no intermediate builder object is shared between features.
hr_rate, bb_rate, so_rate = (
    batting.col(stat).lag(1).per("pa").alias(f"{stat}_rate_1")
    for stat in ("hr", "bb", "so")
)
hr_rolling = batting.col("hr").lag(1).rolling_mean(3).alias("hr_3yr_avg")
war_prev = batting.col("war").lag(1).alias("war_1")

rate_features = FeatureSet(
    name="rate_demo",
    # Order is kept as hr rate, hr rolling mean, bb rate, so rate, war, age.
    features=(hr_rate, hr_rolling, bb_rate, so_rate, war_prev, player.age()),
    seasons=(2023, 2024),
    spine_filter=SpineFilter(min_pa=300),
)

# Materialize and load into pandas; rate_df feeds the plotting cells below.
handle = assembler.materialize(rate_features)
rate_rows = assembler.read(handle)
rate_df = pd.DataFrame(rate_rows)
rate_df.head(10)

## Inspect SQL

Use `generate_sql()` to see the underlying SQL that the feature system generates.

In [None]:
# generate_sql() yields a (sql, params) pair for the feature set; print the
# statement first, then a blank line, then the parameters.
sql, params = generate_sql(rate_features)
print(sql)
print()
print("Parameters:", params)

## Visualize Features

Scatter plots and correlations between features in the materialized dataset.

In [None]:
if not rate_df.empty:
    # One entry per panel, left to right:
    # (x column, y column, x-axis label, y-axis label, title).
    panels = (
        ("hr_rate_1", "war_1", "HR/PA (prev season)", "WAR (prev season)", "HR Rate vs WAR"),
        ("bb_rate_1", "so_rate_1", "BB/PA", "SO/PA", "Walk Rate vs Strikeout Rate"),
        (
            "hr_3yr_avg",
            "hr_rate_1",
            "HR 3-Year Avg",
            "HR/PA (prev season)",
            "3-Year HR Avg vs Single Season HR Rate",
        ),
    )

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    for panel_ax, (x_col, y_col, x_label, y_label, title) in zip(axes, panels):
        # Small, semi-transparent markers keep dense regions readable.
        panel_ax.scatter(rate_df[x_col], rate_df[y_col], alpha=0.4, s=15)
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel(y_label)
        panel_ax.set_title(title)

    plt.tight_layout()
    plt.show()

In [None]:
# Heatmap of pairwise correlations between the numeric feature columns.
if not rate_df.empty:
    # player_id and season are spine columns, not features, so they are
    # left out of the correlation.
    spine_cols = ("player_id", "season")
    numeric_cols = rate_df.select_dtypes(include="number").columns.tolist()
    feature_cols = [col for col in numeric_cols if col not in spine_cols]
    corr = rate_df[feature_cols].corr()
    tick_positions = range(len(feature_cols))

    fig, ax = plt.subplots(figsize=(8, 6))
    # Fix the colour scale to [-1, 1] so zero correlation sits mid-scale.
    im = ax.imshow(corr, cmap="RdBu_r", vmin=-1, vmax=1)
    ax.set_title("Feature Correlation Matrix")

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(feature_cols, rotation=45, ha="right")
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(feature_cols)

    fig.colorbar(im, ax=ax, label="Correlation")
    fig.tight_layout()
    plt.show()