# In [None]:  (notebook cell marker left over from the export)
def To_decimal_odds(odds_A):
    """Convert American (moneyline) odds to decimal odds.

    Decimal odds are the total return per unit staked, stake included:

    * positive moneyline ``+X``: a 100 stake wins ``X``  -> ``1 + X / 100``
    * negative moneyline ``-X``: an ``X`` stake wins 100 -> ``1 + 100 / X``

    Args:
        odds_A: American odds as a number (e.g. ``150`` or ``-200``).

    Returns:
        The decimal odds as a float. ``nan`` is returned when the input is
        ``0`` or NaN, neither of which is a valid moneyline, so that an
        expected-value computation built on the result propagates NaN
        instead of failing on ``None``.
    """
    if odds_A > 0:
        return 1 + odds_A / 100
    if odds_A < 0:
        # Use the magnitude: dividing by the signed value would subtract from
        # 1 and yield "odds" below even money for every favourite.
        return 1 + 100 / abs(odds_A)
    # Reached for 0 and for NaN (both comparisons above are False).
    return float('nan')
    
from scipy.stats import norm
import pandas as pd
import numpy as np
from ngboost import NGBRegressor
from ngboost.distns import Normal
from ngboost.scores import MLE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Read the merged feature table and put the games in chronological order.
# (Alternative input file used previously: "merged_data_features.csv".)
df_model = (
    pd.read_csv("merged_data_features_with_tiredness.csv", parse_dates=["date"])
      .sort_values("date")
)

# Model inputs (all "*_diff" columns) and the regression target.
feature_cols = [
    'fg_pct_diff',
    'ft_pct_diff',
    'fg3_pct_diff',
    'ast_diff',
    'reb_diff',
    'moneyline_diff',
    'tiredness_diff',
    'fg_pct_vs_opp_diff',
    'ft_pct_vs_opp_diff',
    'fg3_pct_vs_opp_diff',
    'ast_vs_opp_diff',
    'reb_vs_opp_diff',
    'pts_vs_opp_diff',
]
target_col = 'score_diff'

# Discard any game missing a feature, the target, or the spread.
df_model = df_model.dropna(subset=[*feature_cols, target_col, 'spread'])

# ==== Rolling monthly test; training = previous ~3 months of games ====
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt

# --- knobs you can tweak ---
TRAIN_MONTHS_START = 3      # target months of history for training
MIN_TRAIN_GAMES    = 300    # if fewer games than this, expand history
MAX_BACK_MONTHS    = 12     # cap for expansion to avoid going too far back
TEST_SEASON_START  = 2019   # inclusive
TEST_SEASON_END    = 2021   # inclusive

# --- prep ---
df_all = df_model.copy()
df_all['date'] = pd.to_datetime(df_all['date'])

# Ensure integer 'season' exists
if 'season' not in df_all.columns:
    def season_int(dt):
        """Label a date by the calendar year its season started in (Oct-Dec -> same year)."""
        return dt.year if dt.month >= 10 else dt.year - 1
    df_all['season'] = df_all['date'].apply(season_int)

# Containers (clear if re-running)
all_preds, all_actuals, all_dates = [], [], []
all_profits, all_bet_teams = [], []
all_bet_A, all_bet_B, all_spreads = [], [], []
all_pred_std = []

# Per-month bet-level rows. Created once, before the loop, so every test
# month is retained (re-creating it per iteration would keep only the last).
analysis_rows = []
store_cols = [
    'date', 'score_diff', 'teamA_moneyLine', 'teamB_moneyLine',
    'bet_team', 'bet_A_EV', 'bet_B_EV', 'profit',
    'pred_score_diff', 'pred_std'
]

# Reuse values already defined in the session (e.g. an earlier notebook cell),
# otherwise fall back to the defaults.
edge_margin = globals().get('edge_margin', 0.1)   # minimum EV required to place a bet
base = globals().get('base', 1)                   # stake subtracted in the EV formula


def _win_payout(moneyline):
    """Return the profit of a winning 100-unit stake at an American moneyline."""
    # +X pays X on 100 staked; -X pays 100 per X staked, i.e. 10000 / X on 100.
    return float(moneyline) if moneyline > 0 else 10000 / abs(moneyline)


def bet_profit(row):
    """Return the realised profit of one test row for a flat 100-unit stake.

    Reads ``bet_team`` ('A', 'B' or 'None'), ``score_diff`` and the two
    moneyline columns. A bet on A wins when ``score_diff > 0``, a bet on B
    wins when ``score_diff < 0``; a losing bet costs 100; no bet returns 0.
    """
    if row['bet_team'] == 'A':
        return _win_payout(row['teamA_moneyLine']) if row['score_diff'] > 0 else -100
    if row['bet_team'] == 'B':
        return _win_payout(row['teamB_moneyLine']) if row['score_diff'] < 0 else -100
    return 0


# Test range bounds (by season)
test_mask_all = df_all['season'].between(TEST_SEASON_START, TEST_SEASON_END)
test_range_min = df_all.loc[test_mask_all, 'date'].min()
test_range_max = df_all.loc[test_mask_all, 'date'].max()

# Month cursor from first test month start to last test month start
if pd.isna(test_range_min):
    # No game falls in the requested seasons. NaT bounds make the loop
    # condition below False, so no window is evaluated.
    cursor = last_month_start = pd.NaT
else:
    cursor = pd.Timestamp(test_range_min.to_period('M').start_time)
    last_month_start = pd.Timestamp(test_range_max.to_period('M').start_time)

combined_tests = []   # for plotting across all months
month_summaries = []  # optional per-month metrics

while cursor <= last_month_start:
    next_start = cursor + pd.offsets.MonthBegin(1)

    # Test = games in this month AND in the target seasons
    test_mask = (df_all['date'] >= cursor) & (df_all['date'] < next_start) & test_mask_all
    test_df = df_all.loc[test_mask].copy().sort_values('date')

    if test_df.empty:
        cursor = next_start
        continue

    # Training = previous ~3 months of games (expand if off-season yields too few).
    # The window always ends at the month start, so no test game leaks into training.
    months_back = TRAIN_MONTHS_START
    train_start = cursor - pd.DateOffset(months=months_back)
    train_end   = cursor
    train_mask  = (df_all['date'] >= train_start) & (df_all['date'] < train_end)
    train_df    = df_all.loc[train_mask].copy().sort_values('date')

    while (len(train_df) < MIN_TRAIN_GAMES) and (months_back < MAX_BACK_MONTHS):
        months_back += 1
        train_start = cursor - pd.DateOffset(months=months_back)
        train_mask  = (df_all['date'] >= train_start) & (df_all['date'] < train_end)
        train_df    = df_all.loc[train_mask].copy().sort_values('date')

    # If still too small, skip this month
    if len(train_df) < max(20, MIN_TRAIN_GAMES // 6):  # tiny guard
        print(f"[Skip] {cursor.date()} train too small: {len(train_df)} games (expanded {months_back} months)")
        cursor = next_start
        continue

    # ---- Fit & predict (same NGBoost logic) ----
    ngb = NGBRegressor(Dist=Normal, Score=MLE, verbose=False,
                       n_estimators=500, random_state=42)
    ngb.fit(train_df[feature_cols], train_df[target_col])

    pred_dist = ngb.pred_dist(test_df[feature_cols])
    test_df['pred_score_diff'] = pred_dist.loc
    test_df['pred_std'] = pred_dist.scale

    # Win probabilities from the predicted Normal: P(score_diff > 0) for A,
    # P(score_diff < 0) for B.
    test_df['A_win_prob'] = 1 - norm.cdf(0, loc=test_df['pred_score_diff'], scale=test_df['pred_std'])
    test_df['B_win_prob'] =     norm.cdf(0, loc=test_df['pred_score_diff'], scale=test_df['pred_std'])

    # Expected value per unit staked: decimal odds * win probability - stake.
    test_df['bet_A_EV'] = test_df.apply(lambda r: To_decimal_odds(r['teamA_moneyLine']) * r['A_win_prob'] - base, axis=1)
    test_df['bet_B_EV'] = test_df.apply(lambda r: To_decimal_odds(r['teamB_moneyLine']) * r['B_win_prob'] - base, axis=1)
    test_df['bet_A'] = test_df['bet_A_EV'] > edge_margin
    test_df['bet_B'] = test_df['bet_B_EV'] > edge_margin
    # When both sides clear the margin, side A takes precedence.
    test_df['bet_team'] = np.where(test_df['bet_A'], 'A', np.where(test_df['bet_B'], 'B', 'None'))

    test_df['profit'] = test_df.apply(bet_profit, axis=1)

    # Keep the bet-level columns of this month for later analysis.
    analysis_rows.append(test_df[store_cols].copy())

    # --- store for downstream compatibility ---
    all_preds.append(test_df['pred_score_diff'])
    all_pred_std.append(test_df['pred_std'])
    all_actuals.append(test_df['score_diff'])
    all_dates.append(test_df['date'])
    all_profits.append(test_df['profit'])
    all_bet_teams.append(test_df['bet_team'])
    all_bet_A.append(test_df['bet_A'])
    all_bet_B.append(test_df['bet_B'])
    all_spreads.append(test_df['spread'])

    # For plotting and reporting
    test_df['month'] = cursor  # label by month start
    combined_tests.append(test_df[['date','profit','month','season']].copy())

    bets_made = (test_df['bet_team'] != 'None').sum()
    total_profit = test_df['profit'].sum()
    roi = total_profit / (bets_made * 100) if bets_made else 0.0
    month_summaries.append({
        'month_start': cursor, 'n_test_games': len(test_df),
        'train_games': len(train_df), 'train_months_used': months_back,
        'bets_made': bets_made, 'profit_$': total_profit, 'ROI_per_bet': roi
    })

    cursor = next_start

# After the while loop finishes.
# pd.concat raises on an empty list, so fall back to empty frames with the
# expected columns when no month produced results.
if combined_tests:
    all_tests_df = pd.concat(combined_tests, ignore_index=True)
else:
    all_tests_df = pd.DataFrame(columns=['date', 'profit', 'month', 'season'])

# Bet-level table across all test months (includes 'bet_team' and the EVs).
if analysis_rows:
    all_analysis_df = pd.concat(analysis_rows, ignore_index=True)
else:
    all_analysis_df = pd.DataFrame(columns=store_cols)

print(all_tests_df.shape)
print(all_tests_df.head())

import pandas as pd
import numpy as np

# --- Helper: season labeling (NBA) ---
def season_year(dt: pd.Timestamp) -> int:
    """Return the season label of *dt*, named after the year the season ends in.

    October-December dates are assigned to the following calendar year;
    every other month keeps its own year.

    NOTE(review): ``season_int`` in the backtest section labels a season by
    the year it starts in, so the two helpers differ by one for the same date.
    """
    if dt.month < 10:
        return dt.year
    return dt.year + 1

def compute_season_roi_and_sharpe(df: pd.DataFrame, seasons=(2019, 2020, 2021)):
    """Compute per-season ROI and an annualized daily Sharpe ratio of placed bets.

    Every bet is treated as a flat 100-unit stake, so a return is
    ``profit / 100`` per bet.

    Args:
        df: Frame with at least ``date`` and ``profit`` columns. If a
            ``season`` column is present it is used as-is; otherwise it is
            derived from ``date`` with ``season_year``. If a ``bet_team``
            column is present, rows whose value is the string ``"None"`` are
            treated as "no bet"; otherwise rows with ``profit == 0`` are.
            The input frame is not modified.
        seasons: Season labels to include. Defaults to 2019, 2020 and 2021.

    Returns:
        A tuple ``(season_stats, avg_roi, sharpe)`` where ``season_stats`` is
        a frame indexed by season with ``total_profit``, ``n_bets`` and
        ``ROI`` columns, ``avg_roi`` is the unweighted mean of the per-season
        ROI values, and ``sharpe`` is ``mean / std`` of the daily returns
        scaled by ``sqrt(365)`` (``nan`` when the daily std is zero).

    Raises:
        ValueError: If no bet remains after filtering, or if fewer than two
            betting days are available for the Sharpe ratio.
    """
    dfx = df.copy()
    dfx["date"] = pd.to_datetime(dfx["date"])

    # Ensure season column
    if "season" not in dfx.columns:
        dfx["season"] = dfx["date"].apply(season_year)

    # Detect bets: prefer the explicit 'bet_team' label, otherwise infer a bet
    # from a non-zero profit.
    if "bet_team" in dfx.columns:
        bet_mask = dfx["bet_team"].astype(str) != "None"
    else:
        bet_mask = dfx["profit"] != 0

    season_list = list(seasons)
    # Take an explicit copy of the filtered rows so later work never operates
    # on a slice of another frame (avoids pandas' SettingWithCopyWarning).
    dfx = dfx[bet_mask & dfx["season"].isin(season_list)].copy()
    if dfx.empty:
        season_label = (
            f"{min(season_list)}–{max(season_list)}" if season_list else "(none)"
        )
        raise ValueError(f"No bets found in seasons {season_label}.")

    # ---------- ROI per season ----------
    season_stats = (
        dfx.groupby("season")
           .agg(total_profit=("profit", "sum"),
                n_bets=("profit", "count"))
           .assign(ROI=lambda g: g["total_profit"] / (100 * g["n_bets"]))
           .sort_index()
    )
    avg_roi_across_seasons = season_stats["ROI"].mean()

    # ---------- Sharpe over the selected seasons (daily, annualized) ----------
    # Daily return = sum(profit) / (100 * number_of_bets_that_day); only days
    # with at least one bet appear.
    daily = (
        dfx.groupby(dfx["date"].dt.date)
           .agg(profit_sum=("profit", "sum"),
                n_bets=("profit", "count"))
           .assign(daily_ret=lambda g: g["profit_sum"] / (100 * g["n_bets"]))
    )
    daily_ret = daily["daily_ret"].dropna()
    if len(daily_ret) < 2:
        raise ValueError("Not enough daily observations to compute Sharpe.")

    mu = daily_ret.mean()
    sigma = daily_ret.std(ddof=1)
    # A zero spread makes the ratio undefined; report NaN instead of dividing.
    sharpe_annualized = np.nan if sigma == 0 else (mu / sigma) * np.sqrt(365)

    return season_stats, float(avg_roi_across_seasons), float(sharpe_annualized)

# ---- Season-level report ----
# Feed in whichever results frame was built, e.g.:
#   season_stats, avg_roi, sharpe = compute_season_roi_and_sharpe(all_analysis_df)
# Here the combined per-month test frame is used.
season_stats, avg_roi, sharpe = compute_season_roi_and_sharpe(all_tests_df)

print(season_stats)            # ROI per season (2019, 2020, 2021)
print("Average ROI:", avg_roi) # simple average across seasons
print("Sharpe (annualized):", sharpe)

# ==== Cumulative profit over the whole test horizon (2019–2022) ====
if not combined_tests:
    print("No test windows produced results in the requested season range.")
else:
    # Stitch the monthly test frames together in date order and accumulate
    # the running profit.
    test_all = pd.concat(combined_tests, ignore_index=True)
    test_all = test_all.sort_values('date').reset_index(drop=True)
    test_all['cum_profit'] = test_all['profit'].cumsum()

    plt.figure(figsize=(10,6))
    plt.plot(test_all['date'], test_all['cum_profit'])
    plt.axhline(0, linewidth=1,color = 'k')   # break-even reference line
    plt.xlabel('Date')
    plt.ylabel('Cumulative Profit')
    plt.title('Cumulative Profit vs Time')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Optional: one bar per test month showing ROI per bet.
    month_df = pd.DataFrame(month_summaries).sort_values('month_start')
    if not month_df.empty:
        month_labels = month_df['month_start'].dt.strftime('%Y-%m')

        plt.figure(figsize=(10,4))
        plt.bar(month_labels, month_df['ROI_per_bet'])
        plt.xticks(rotation=60, ha='right')
        plt.ylabel('ROI per Bet')
        plt.xlabel('Month')
        plt.title('Monthly ROI (test windows)')
        plt.grid(True, axis='y')
        plt.tight_layout()
        plt.show()

    # To inspect the per-month summary table:
    # display(month_df)
