In [17]:
from __future__ import annotations
import pandas as pd
import numpy as np
from ngboost import NGBRegressor
from ngboost.distns import Normal
from ngboost.scores import MLE
from scipy.stats import norm

# ---------- Core training/backtest ----------
def train_backtest(
    input_csv: str = "../data/data_files/data_advanced_features.csv",
    output_csv: str = "../data/data_files/backtest_bets.csv",
    feature_cols: list[str] | None = None,
    target_col: str = "score_diff",
    # rolling-window training knobs
    train_months_start: int = 3,
    min_train_games: int = 300,
    max_back_months: int = 12,
    # test range (by season label)
    test_season_start: int = 2019,
    test_season_end: int = 2021,
    # betting knobs
    edge_margin: float = 0.10,
    base_stake: float = 100.0,
    random_state: int = 42,
    n_estimators: int = 500,
) -> pd.DataFrame:
    """Run a rolling monthly NGBoost backtest of moneyline bets and save per-game results.

    For every calendar month that contains games from the requested seasons,
    an ``NGBRegressor`` (Normal distribution) is fit on the games played in the
    months immediately before it, the month's games are scored, and a bet is
    simulated whenever the expected value exceeds ``edge_margin``.

    The input CSV must provide ``date``, ``spread``, ``score_diff``,
    ``teamA_moneyLine``, ``teamB_moneyLine``, ``target_col`` and every feature
    column. A ``season`` column is derived from ``date`` when absent.

    Args:
        input_csv: Path of the feature CSV to read.
        output_csv: Path the per-game backtest table is written to.
        feature_cols: Model input columns; ``None`` selects the built-in list.
        target_col: Column the model is trained to predict.
        train_months_start: Initial length of the training look-back, in months.
        min_train_games: The look-back grows one month at a time until it holds
            at least this many games (or ``max_back_months`` is reached).
        max_back_months: Upper bound for the look-back expansion, in months.
        test_season_start: First season label (inclusive) to test on.
        test_season_end: Last season label (inclusive) to test on.
        edge_margin: A side is bet when its EV (in $100 units) exceeds this.
        base_stake: Dollars staked per bet.
        random_state: Seed forwarded to ``NGBRegressor``.
        n_estimators: Boosting iterations forwarded to ``NGBRegressor``.

    Returns:
        One row per tested game, sorted by date, with predictions, win
        probabilities, EVs, bet flags, the simulated profit and ``month_start``.

    Raises:
        ValueError: If no rows fall in the requested seasons, or if no month
            had enough training data to produce results.
    """
    if feature_cols is None:
        feature_cols = [
            'fg_pct_diff', 'ft_pct_diff', 'fg3_pct_diff', 'ast_diff', 'reb_diff',
            'moneyline_diff', 'tiredness_diff', 'fg_pct_vs_opp_diff',
            'ft_pct_vs_opp_diff', 'fg3_pct_vs_opp_diff', 'ast_vs_opp_diff',
            'reb_vs_opp_diff', 'pts_vs_opp_diff'
        ]
    feature_cols = list(feature_cols)

    # Stake in $100 units: moneylines quote payouts per $100, so EV and winnings
    # are scaled by this factor to stay consistent with the -base_stake loss.
    stake_units = base_stake / 100.0

    # Load & prep
    df = pd.read_csv(input_csv, parse_dates=["date"]).sort_values("date")
    df = df.dropna(subset=feature_cols + [target_col, "spread"])
    if "season" not in df.columns:
        df["season"] = df["date"].apply(season_from_date)

    # Test window bounds
    in_test_range = df["season"].between(test_season_start, test_season_end)
    if not in_test_range.any():
        raise ValueError("No rows in requested test season range.")
    test_dates = df.loc[in_test_range, "date"]
    cursor = pd.Timestamp(test_dates.min().to_period("M").start_time)
    last_month_start = pd.Timestamp(test_dates.max().to_period("M").start_time)

    # Keep the frame light, but retain the look-back history that precedes the
    # first test month; dropping it would leave the earliest test months with
    # nothing to train on.
    history_start = cursor - pd.DateOffset(months=max(train_months_start, max_back_months))
    is_history = (df["date"] >= history_start) & (df["date"] < cursor)
    df = df[in_test_range | is_history]
    test_mask_all = df["season"].between(test_season_start, test_season_end)

    def bet_profit(row) -> float:
        # Win pays the moneyline profit scaled to the stake; loss costs the stake; no bet = 0.
        if row["bet_team"] == "A":
            moneyline, won = row["teamA_moneyLine"], row["score_diff"] > 0
        elif row["bet_team"] == "B":
            moneyline, won = row["teamB_moneyLine"], row["score_diff"] < 0
        else:
            return 0.0
        return stake_profit_from_moneyline(moneyline) * stake_units if won else -base_stake

    keep_cols = [
        "date", "season", "score_diff", "spread",
        "teamA_moneyLine", "teamB_moneyLine",
        "pred_score_diff", "pred_std",
        "A_win_prob", "B_win_prob",
        "bet_A_EV", "bet_B_EV", "bet_A", "bet_B", "bet_team",
        "profit"
    ]
    combined_tests = []

    # Rolling monthly backtest
    while cursor <= last_month_start:
        next_start = cursor + pd.offsets.MonthBegin(1)

        # Test set = current month within seasons
        test_mask = (df["date"] >= cursor) & (df["date"] < next_start) & test_mask_all
        test_df = df.loc[test_mask].copy().sort_values("date")
        if test_df.empty:
            cursor = next_start
            continue

        # Training set = previous ~N months, expanded while it holds too few games
        months_back = train_months_start
        train_df = _training_window(df, cursor, months_back)
        while (len(train_df) < min_train_games) and (months_back < max_back_months):
            months_back += 1
            train_df = _training_window(df, cursor, months_back)

        # Guardrail: skip the month rather than fit on a handful of games
        if len(train_df) < max(20, min_train_games // 6):
            cursor = next_start
            continue

        # Fit NGBoost
        ngb = NGBRegressor(Dist=Normal, Score=MLE, verbose=False,
                           n_estimators=n_estimators, random_state=random_state)
        ngb.fit(train_df[feature_cols], train_df[target_col])

        pred_dist = ngb.pred_dist(test_df[feature_cols])
        test_df["pred_score_diff"] = pred_dist.loc
        test_df["pred_std"] = pred_dist.scale

        # Win probabilities: B wins when the predicted difference is below 0
        test_df["B_win_prob"] = norm.cdf(0, loc=test_df["pred_score_diff"], scale=test_df["pred_std"])
        test_df["A_win_prob"] = 1 - test_df["B_win_prob"]

        # Expected values in $100 units: stake * (decimal_odds * p_win - 1)
        for side in ("A", "B"):
            decimal_odds = _moneyline_to_decimal_odds(test_df[f"team{side}_moneyLine"])
            test_df[f"bet_{side}_EV"] = stake_units * (decimal_odds * test_df[f"{side}_win_prob"] - 1.0)

        # A missing moneyline gives a NaN EV, which never clears the threshold.
        test_df["bet_A"] = test_df["bet_A_EV"] > edge_margin
        test_df["bet_B"] = test_df["bet_B_EV"] > edge_margin
        test_df["bet_team"] = np.where(test_df["bet_A"], "A",
                                np.where(test_df["bet_B"], "B", "None"))

        test_df["profit"] = test_df.apply(bet_profit, axis=1)

        # Keep only columns useful for analysis
        test_df = test_df[keep_cols].copy()
        test_df["month_start"] = cursor
        combined_tests.append(test_df)

        cursor = next_start

    if not combined_tests:
        raise ValueError("No test windows produced results in the requested season range.")

    out_df = pd.concat(combined_tests, ignore_index=True).sort_values("date")
    out_df.to_csv(output_csv, index=False)
    return out_df


def _training_window(df: pd.DataFrame, end: pd.Timestamp, months: int) -> pd.DataFrame:
    """Return the date-sorted rows of ``df`` dated within ``months`` months before ``end``."""
    start = end - pd.DateOffset(months=months)
    in_window = (df["date"] >= start) & (df["date"] < end)
    return df.loc[in_window].copy().sort_values("date")


def _moneyline_to_decimal_odds(moneyline: pd.Series) -> pd.Series:
    """Convert American moneylines to decimal odds element-wise (0 -> 1.0, missing -> NaN)."""
    ml = pd.to_numeric(moneyline, errors="coerce").to_numpy(dtype=float)
    # np.select evaluates every branch, so silence the divide warning for ml == 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        odds = np.select(
            [ml > 0, ml < 0, ml == 0],
            [1.0 + ml / 100.0, 1.0 + 100.0 / np.abs(ml), np.ones_like(ml)],
            default=np.nan,
        )
    return pd.Series(odds, index=moneyline.index)

# ---------- Helpers ----------
def season_from_date(d: pd.Timestamp) -> int:
    """Return the season label for a game date.

    Dates in October-December are labelled with their own calendar year;
    dates in January-September are labelled with the previous year.
    """
    before_october = d.month < 10
    return d.year - int(before_october)

def odds_indicator(moneyline: float) -> float:
    """Convert an American moneyline to decimal odds (total return per unit staked).

    Args:
        moneyline: American odds, e.g. ``+150`` or ``-200``.

    Returns:
        ``1 + moneyline/100`` for positive lines and ``1 + 100/|moneyline|``
        for negative lines. Any other value (0, or NaN, which fails both
        comparisons) yields ``1.0``, i.e. a bet with no profit.
    """
    if moneyline > 0:
        return 1.0 + moneyline / 100.0
    if moneyline < 0:
        # Use the magnitude: dividing by the negative line itself would give
        # odds below 1.0 (a guaranteed loss) for every favourite.
        return 1.0 + 100.0 / abs(moneyline)
    return 1.0


# Descriptive alias, so the conversion resolves under either name on a fresh kernel.
to_decimal_odds = odds_indicator

def stake_profit_from_moneyline(ml: float) -> float:
    """Profit for a $100 stake if bet wins (loss is -100 if loses).

    Args:
        ml: American moneyline, e.g. ``+150`` or ``-200``.

    Returns:
        ``ml`` for positive lines, ``10000/|ml|`` for negative lines, and
        ``0.0`` for a line of exactly 0.
    """
    if ml == 0:
        # A zero line carries no payout (odds_indicator maps it to decimal
        # odds 1.0); return 0 instead of dividing by zero.
        return 0.0
    return float(ml) if ml > 0 else 10000.0 / abs(ml)

In [26]:
from __future__ import annotations
import pandas as pd
import numpy as np
from ngboost import NGBRegressor
from ngboost.distns import Normal
from ngboost.scores import MLE
from scipy.stats import norm

# ---------- Core training/backtest ----------
def train_backtest(
    input_csv: str = "../data/data_files/data_advanced_features.csv",
    output_csv: str = "../data/data_files/backtest_bets.csv",
    feature_cols: list[str] | None = None,
    target_col: str = "score_diff",
    # rolling-window training knobs
    train_months_start: int = 3,
    min_train_games: int = 300,
    max_back_months: int = 12,
    # test range (by season label)
    test_season_start: int = 2019,
    test_season_end: int = 2021,
    # betting knobs
    edge_margin: float = 0.10,
    base_stake: float = 100.0,
    random_state: int = 42,
    n_estimators: int = 500,
) -> pd.DataFrame:
    """Run a rolling monthly NGBoost backtest of moneyline bets and save per-game results.

    For every calendar month that contains games from the requested seasons,
    an ``NGBRegressor`` (Normal distribution) is fit on the games played in the
    months immediately before it, the month's games are scored, and a bet is
    simulated whenever the adjusted expected value exceeds ``edge_margin``.

    The input CSV must provide ``date``, ``spread``, ``score_diff``,
    ``teamA_moneyLine``, ``teamB_moneyLine``, ``target_col`` and every feature
    column. A ``season`` column is derived from ``date`` when absent.

    Args:
        input_csv: Path of the feature CSV to read.
        output_csv: Path the per-game backtest table is written to.
        feature_cols: Model input columns; ``None`` selects the built-in list.
        target_col: Column the model is trained to predict.
        train_months_start: Initial length of the training look-back, in months.
        min_train_games: The look-back grows one month at a time until it holds
            at least this many games (or ``max_back_months`` is reached).
        max_back_months: Upper bound for the look-back expansion, in months.
        test_season_start: First season label (inclusive) to test on.
        test_season_end: Last season label (inclusive) to test on.
        edge_margin: A side is bet when its adjusted EV (in $100 units) exceeds this.
        base_stake: Dollars staked per bet.
        random_state: Seed forwarded to ``NGBRegressor``.
        n_estimators: Boosting iterations forwarded to ``NGBRegressor``.

    Returns:
        One row per tested game, sorted by date, with predictions, win
        probabilities, adjusted EVs, bet flags, the simulated profit and
        ``month_start``.

    Raises:
        ValueError: If no rows fall in the requested seasons, or if no month
            had enough training data to produce results.
    """
    if feature_cols is None:
        feature_cols = [
            'fg_pct_diff', 'ft_pct_diff', 'fg3_pct_diff', 'ast_diff', 'reb_diff',
            'moneyline_diff', 'tiredness_diff', 'fg_pct_vs_opp_diff',
            'ft_pct_vs_opp_diff', 'fg3_pct_vs_opp_diff', 'ast_vs_opp_diff',
            'reb_vs_opp_diff', 'pts_vs_opp_diff'
        ]
    feature_cols = list(feature_cols)

    # Stake in $100 units: moneylines quote payouts per $100, so EV and winnings
    # are scaled by this factor to stay consistent with the -base_stake loss.
    stake_units = base_stake / 100.0

    # Load & prep
    df = pd.read_csv(input_csv, parse_dates=["date"]).sort_values("date")
    df = df.dropna(subset=feature_cols + [target_col, "spread"])
    if "season" not in df.columns:
        df["season"] = df["date"].apply(season_from_date)

    # Test window bounds
    in_test_range = df["season"].between(test_season_start, test_season_end)
    if not in_test_range.any():
        raise ValueError("No rows in requested test season range.")
    test_dates = df.loc[in_test_range, "date"]
    cursor = pd.Timestamp(test_dates.min().to_period("M").start_time)
    last_month_start = pd.Timestamp(test_dates.max().to_period("M").start_time)

    # Keep the frame light, but retain the look-back history that precedes the
    # first test month; dropping it would leave the earliest test months with
    # nothing to train on.
    history_start = cursor - pd.DateOffset(months=max(train_months_start, max_back_months))
    is_history = (df["date"] >= history_start) & (df["date"] < cursor)
    df = df[in_test_range | is_history]
    test_mask_all = df["season"].between(test_season_start, test_season_end)

    def bet_profit(row) -> float:
        # Win pays the moneyline profit scaled to the stake; loss costs the stake; no bet = 0.
        if row["bet_team"] == "A":
            moneyline, won = row["teamA_moneyLine"], row["score_diff"] > 0
        elif row["bet_team"] == "B":
            moneyline, won = row["teamB_moneyLine"], row["score_diff"] < 0
        else:
            return 0.0
        return stake_profit_from_moneyline(moneyline) * stake_units if won else -base_stake

    keep_cols = [
        "date", "season", "score_diff", "spread",
        "teamA_moneyLine", "teamB_moneyLine",
        "pred_score_diff", "pred_std",
        "A_win_prob", "B_win_prob",
        "bet_A_adj_EV", "bet_B_adj_EV", "bet_A", "bet_B", "bet_team",
        "profit"
    ]
    combined_tests = []

    # Rolling monthly backtest
    while cursor <= last_month_start:
        next_start = cursor + pd.offsets.MonthBegin(1)

        # Test set = current month within seasons
        test_mask = (df["date"] >= cursor) & (df["date"] < next_start) & test_mask_all
        test_df = df.loc[test_mask].copy().sort_values("date")
        if test_df.empty:
            cursor = next_start
            continue

        # Training set = previous ~N months, expanded while it holds too few games
        months_back = train_months_start
        train_df = _training_window(df, cursor, months_back)
        while (len(train_df) < min_train_games) and (months_back < max_back_months):
            months_back += 1
            train_df = _training_window(df, cursor, months_back)

        # Guardrail: skip the month rather than fit on a handful of games
        if len(train_df) < max(20, min_train_games // 6):
            cursor = next_start
            continue

        # Fit NGBoost
        ngb = NGBRegressor(Dist=Normal, Score=MLE, verbose=False,
                           n_estimators=n_estimators, random_state=random_state)
        ngb.fit(train_df[feature_cols], train_df[target_col])

        pred_dist = ngb.pred_dist(test_df[feature_cols])
        test_df["pred_score_diff"] = pred_dist.loc
        test_df["pred_std"] = pred_dist.scale

        # Win probabilities: B wins when the predicted difference is below 0
        test_df["B_win_prob"] = norm.cdf(0, loc=test_df["pred_score_diff"], scale=test_df["pred_std"])
        test_df["A_win_prob"] = 1 - test_df["B_win_prob"]

        # Adjusted expected values in $100 units: stake * (decimal_odds * p_win - 1)
        # Note: adj_EV = probability-weighted payoff net of the stake.
        for side in ("A", "B"):
            decimal_odds = _moneyline_to_decimal_odds(test_df[f"team{side}_moneyLine"])
            test_df[f"bet_{side}_adj_EV"] = stake_units * (decimal_odds * test_df[f"{side}_win_prob"] - 1.0)

        # A missing moneyline gives a NaN EV, which never clears the threshold.
        test_df["bet_A"] = test_df["bet_A_adj_EV"] > edge_margin
        test_df["bet_B"] = test_df["bet_B_adj_EV"] > edge_margin
        test_df["bet_team"] = np.where(test_df["bet_A"], "A",
                                np.where(test_df["bet_B"], "B", "None"))

        test_df["profit"] = test_df.apply(bet_profit, axis=1)

        # Keep only columns useful for analysis
        test_df = test_df[keep_cols].copy()
        test_df["month_start"] = cursor
        combined_tests.append(test_df)

        cursor = next_start

    if not combined_tests:
        raise ValueError("No test windows produced results in the requested season range.")

    out_df = pd.concat(combined_tests, ignore_index=True).sort_values("date")
    out_df.to_csv(output_csv, index=False)
    return out_df


def _training_window(df: pd.DataFrame, end: pd.Timestamp, months: int) -> pd.DataFrame:
    """Return the date-sorted rows of ``df`` dated within ``months`` months before ``end``."""
    start = end - pd.DateOffset(months=months)
    in_window = (df["date"] >= start) & (df["date"] < end)
    return df.loc[in_window].copy().sort_values("date")


def _moneyline_to_decimal_odds(moneyline: pd.Series) -> pd.Series:
    """Convert American moneylines to decimal odds element-wise (0 -> 1.0, missing -> NaN)."""
    ml = pd.to_numeric(moneyline, errors="coerce").to_numpy(dtype=float)
    # np.select evaluates every branch, so silence the divide warning for ml == 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        odds = np.select(
            [ml > 0, ml < 0, ml == 0],
            [1.0 + ml / 100.0, 1.0 + 100.0 / np.abs(ml), np.ones_like(ml)],
            default=np.nan,
        )
    return pd.Series(odds, index=moneyline.index)

# ---------- Helpers ----------
def season_from_date(d: pd.Timestamp) -> int:
    """Return the season label for a game date.

    Dates in October-December are labelled with their own calendar year;
    dates in January-September are labelled with the previous year.
    """
    before_october = d.month < 10
    return d.year - int(before_october)

def odds_indicator(moneyline: float) -> float:
    """Convert an American moneyline to decimal odds (total return per unit staked).

    Args:
        moneyline: American odds, e.g. ``+150`` or ``-200``.

    Returns:
        ``1 + moneyline/100`` for positive lines and ``1 + 100/|moneyline|``
        for negative lines. Any other value (0, or NaN, which fails both
        comparisons) yields ``1.0``, i.e. a bet with no profit.
    """
    if moneyline > 0:
        return 1.0 + moneyline / 100.0
    if moneyline < 0:
        # Use the magnitude: dividing by the negative line itself would give
        # odds below 1.0 (a guaranteed loss) for every favourite.
        return 1.0 + 100.0 / abs(moneyline)
    return 1.0


# Descriptive alias, so the conversion resolves under either name on a fresh kernel.
to_decimal_odds = odds_indicator

def stake_profit_from_moneyline(ml: float) -> float:
    """Profit for a $100 stake if bet wins (loss is -100 if loses).

    Args:
        ml: American moneyline, e.g. ``+150`` or ``-200``.

    Returns:
        ``ml`` for positive lines, ``10000/|ml|`` for negative lines, and
        ``0.0`` for a line of exactly 0.
    """
    if ml == 0:
        # A zero line carries no payout (odds_indicator maps it to decimal
        # odds 1.0); return 0 instead of dividing by zero.
        return 0.0
    return float(ml) if ml > 0 else 10000.0 / abs(ml)

In [27]:
# Backtest configuration: every value below is forwarded to train_backtest().
INPUT_CSV   = "../data/data_files/data_advanced_features.csv"   # feature table read by the backtest
OUTPUT_CSV  = "../data/data_files/backtest_bets.csv"            # per-game results written by the backtest

# Rolling training window: start with TRAIN_MONTHS_START months of history and
# widen one month at a time, up to MAX_BACK_MONTHS, until MIN_TRAIN_GAMES games are available.
TRAIN_MONTHS_START = 3
MIN_TRAIN_GAMES    = 300
MAX_BACK_MONTHS    = 12

# Seasons (by season label) whose games are tested
TEST_SEASON_START  = 2019   # inclusive
TEST_SEASON_END    = 2021   # inclusive

EDGE_MARGIN   = 0.10        # EV threshold (in $100 stake units); a side is bet only when its EV exceeds it
BASE_STAKE    = 100.0       # dollars lost on a losing bet
RANDOM_STATE  = 42          # seed passed to NGBRegressor
N_ESTIMATORS  = 500         # n_estimators passed to NGBRegressor

In [28]:
# Run the rolling training/backtest with the configuration constants.
# train_backtest() writes the per-game table to OUTPUT_CSV itself, so the
# returned DataFrame is deliberately discarded here (bound to `_`).
_ = train_backtest(
    input_csv=INPUT_CSV,
    output_csv=OUTPUT_CSV,
    train_months_start=TRAIN_MONTHS_START,
    min_train_games=MIN_TRAIN_GAMES,
    max_back_months=MAX_BACK_MONTHS,
    test_season_start=TEST_SEASON_START,
    test_season_end=TEST_SEASON_END,
    edge_margin=EDGE_MARGIN,
    base_stake=BASE_STAKE,
    random_state=RANDOM_STATE,
    n_estimators=N_ESTIMATORS,
)

# Confirm where the results were written.
print(f"Backtest saved to: {OUTPUT_CSV}")

Backtest saved to: ../data/data_files/backtest_bets.csv
