<a href="https://colab.research.google.com/github/cpsanzone/nba-player-projection-backtester/blob/main/NBA_Projection_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import io
import numpy as np
import os

# --- 1. Define the player_configs list ---
# Every player and team has one CSV per season-ending year, so the file lists
# are generated from compact tables instead of being spelled out by hand.
_SEASON_END_YEARS = (2023, 2024, 2025)
_SEASON_LABELS = ("2022-23", "2023-24", "2024-25")

# (display name, CSV file stem, team abbreviation). A team of None marks a
# player without a fixed team (handles trades).
_PLAYER_TABLE = [
    ("LeBron James", "lebron", "LAL"),
    ("Nikola Jokic", "jokic", "DEN"),
    ("Luka Doncic", "luka", None),
    ("Giannis Antetokounmpo", "giannis", "MIL"),
    ("Shai Gilgeous-Alexander", "sga", "OKC"),
    ("Austin Reaves", "reaves", "LAL"),
    ("Tyrese Maxey", "maxey", "PHI"),
    ("Donovan Mitchell", "mitchell", "CLE"),
    ("Devin Booker", "booker", "PHO"),
    ("Lauri Markkanen", "lauri", "UTA"),
    ("Jalen Brunson", "brunson", "NYK"),
    ("Jaylen Brown", "brown", "BOS"),
    ("Cade Cunningham", "cade", "DET"),
]

player_configs = [
    {
        "name": display_name,
        "player_files": [f"/{stem}_{year}.csv" for year in _SEASON_END_YEARS],
        "team_abbr": team,
    }
    for display_name, stem, team in _PLAYER_TABLE
]

# --- 2. Define the team_odds_map dictionary ---
# Team abbreviation -> stem of that team's odds CSV files.
_ODDS_FILE_STEMS = {
    "LAL": "odds_data",
    "DEN": "nuggets_odds",
    "DAL": "mavs_odds",
    "MIL": "bucks_odds",
    "OKC": "thunder_odds",
    "PHI": "76ers_odds",
    "CLE": "cavs_odds",
    "PHO": "suns_odds",
    "UTA": "jazz_odds",
    "NYK": "knicks_odds",
    "BOS": "celtics_odds",
    "DET": "pistons_odds",
}

team_odds_map = {
    team: {
        "files": [f"/{stem}_{year}.csv" for year in _SEASON_END_YEARS],
        "seasons": list(_SEASON_LABELS),
    }
    for team, stem in _ODDS_FILE_STEMS.items()
}

# Define directories
player_gamelogs_dir = "/player_gamelogs/"
odds_dir = "/"

# --- 3. Implement create_dummy_player_file ---
def create_dummy_player_file(player_config, directory):
    """Write one synthetic 20-game log CSV per file listed for a player.

    Args:
        player_config: Mapping with ``"player_files"`` (paths whose base name
            ends in ``_<year>.csv``) and ``"team_abbr"`` (a team code, or
            ``None`` to pick a random key of ``team_odds_map`` per file).
        directory: Folder the CSVs are written into; created if missing.

    Each CSV has the columns Date, Tm, PTS, AST, TRB, FG3M, GS and MP. Dates
    are 20 consecutive days starting on October 1 of the year before the year
    in the file name. Files whose year token is not an integer are skipped
    with a printed message.
    """
    n_games = 20
    team_abbr_config = player_config["team_abbr"]

    # Create the target folder here so the function also works on its own,
    # the same way create_dummy_odds_file does.
    os.makedirs(directory, exist_ok=True)

    for file_path_template in player_config["player_files"]:
        filename = os.path.basename(file_path_template)
        year_str = filename.split('_')[-1].split('.')[0]
        try:
            file_year = int(year_str)
        except ValueError:
            print(f"Skipping {file_path_template} due to invalid year format: {year_str}")
            continue
        season_start_year = file_year - 1  # Season YYYY is (YYYY-1)-YYYY

        # Determine the team for this specific game log
        if team_abbr_config is None:
            # No fixed team: randomly assign one of the odds map teams
            teams_for_random = [tc for tc in team_odds_map if tc is not None]
            if teams_for_random:
                actual_team_for_log = np.random.choice(teams_for_random)
            else:
                actual_team_for_log = "UNK"  # Fallback
        else:
            actual_team_for_log = team_abbr_config

        data = {
            'Date': pd.date_range(f'{season_start_year}-10-01', periods=n_games, freq='D'),
            'Tm': [actual_team_for_log] * n_games,
            'PTS': np.random.randint(10, 40, n_games),
            'AST': np.random.randint(2, 15, n_games),
            'TRB': np.random.randint(3, 18, n_games),
            'FG3M': np.random.randint(0, 7, n_games),
            # randint's upper bound is exclusive: (0, 1) could only ever
            # produce 0, so use (0, 2) to get a real 0/1 flag.
            'GS': np.random.randint(0, 2, n_games),
            'MP': np.random.randint(20, 40, n_games),
        }
        df = pd.DataFrame(data)

        filepath = os.path.join(directory, filename)
        df.to_csv(filepath, index=False)


# --- 4. Implement create_dummy_odds_file ---
def create_dummy_odds_file(team_abbr, season_start_year, expected_filename, directory):
    """Write a synthetic 20-game odds CSV for one team-season.

    Args:
        team_abbr: Team code. Not written to the file; kept so existing
            callers keep working.
        season_start_year: Calendar year in which the season starts; dates
            are 20 consecutive days from October 1 of that year.
        expected_filename: Path whose base name is used as the CSV file name.
        directory: Folder the CSV is written into; created if missing.

    The CSV has the columns Date, 'O/U' (``"O/U <220-239>.0"``) and 'ATS'
    (``"ATS <-10..9>.0"``).
    """
    n_games = 20
    # Draw one value per game. Multiplying a one-element list by 20 would
    # repeat a single draw, giving every game in the file the same line.
    totals = np.random.randint(220, 240, n_games)
    spreads = np.random.randint(-10, 10, n_games)
    data = {
        'Date': pd.date_range(f'{season_start_year}-10-01', periods=n_games, freq='D'),
        'O/U': [f"O/U {total}.0" for total in totals],
        'ATS': [f"ATS {spread}.0" for spread in spreads],
    }
    df = pd.DataFrame(data)
    os.makedirs(directory, exist_ok=True)

    filepath = os.path.join(directory, os.path.basename(expected_filename))
    df.to_csv(filepath, index=False)


# --- 5. Create directories and clear existing dummy files ---
os.makedirs(player_gamelogs_dir, exist_ok=True)
for f in os.listdir(player_gamelogs_dir):
    stale_path = os.path.join(player_gamelogs_dir, f)
    # os.remove() only deletes files and raises on a directory, so any
    # sub-directory found here is skipped instead of aborting the run.
    if os.path.isfile(stale_path):
        os.remove(stale_path)
print(f"‚úÖ Cleared existing dummy player files in {player_gamelogs_dir}.")

# Clear existing dummy odds files
# Note: This clears files from the root directory which might be sensitive,
# so only the file names listed in team_odds_map are removed.
# For a real project, use a dedicated sub-directory for dummy odds.
for team_abbr, config in team_odds_map.items():
    for f in config["files"]:
        filepath = os.path.join(odds_dir, os.path.basename(f))
        # isfile (rather than exists) keeps os.remove() from being called on
        # a directory that happens to share the name.
        if os.path.isfile(filepath):
            os.remove(filepath)
print(f"‚úÖ Cleared existing dummy odds files in {odds_dir}.")


# --- 6. Generate all player dummy game log CSVs ---
# One call per player; each call writes every file listed for that player.
print("--- Generating Player Dummy Files ---")
for player_cfg in player_configs:
    create_dummy_player_file(player_cfg, player_gamelogs_dir)
print("‚úÖ All player dummy game log CSVs generated.")


# --- 7. Generate all team dummy odds CSVs ---
# The n-th odds file of a team is paired with the n-th season label; the
# season's starting year is the part of the label before the dash.
print("\n--- Generating Team Odds Dummy Files ---")
for abbr, odds_cfg in team_odds_map.items():
    season_labels = odds_cfg["seasons"]
    for idx, odds_file in enumerate(odds_cfg["files"]):
        start_year = int(season_labels[idx].partition('-')[0])
        create_dummy_odds_file(abbr, start_year, odds_file, odds_dir)
print("‚úÖ All team dummy odds CSVs generated.")


# --- 8. Load, rename, and concatenate player data into df_player_master_comprehensive ---
# Every readable game-log CSV becomes one frame tagged with the player's name
# and with 'Tm' renamed to 'Team_Abbr'; missing or unreadable files only
# produce a warning.
print("\n--- Loading All Player Stats from Dummy Files ---")
all_player_dfs_comprehensive = []
for config in player_configs:
    player_name = config["name"]
    for file_path_template in config["player_files"]:
        filepath = os.path.join(player_gamelogs_dir, os.path.basename(file_path_template))
        if os.path.exists(filepath):
            try:
                df_player = (
                    pd.read_csv(filepath)
                    .assign(Player=player_name)  # Assign the correct player name
                    .rename(columns={'Tm': 'Team_Abbr'})  # Ensure 'Team_Abbr' column
                )
                all_player_dfs_comprehensive.append(df_player)
            except Exception as e:
                print(f"‚ö†Ô∏è Warning: Could not load player file {filepath}. Error: {e}")
        else:
            print(f"‚ö†Ô∏è Warning: Dummy player file {filepath} not found during loading. Skipping.")

if not all_player_dfs_comprehensive:
    print("‚ö†Ô∏è No player game log files found or could not be loaded for df_player_master_comprehensive.")
    df_player_master_comprehensive = pd.DataFrame()  # Ensure it's defined
else:
    df_player_master_comprehensive = pd.concat(all_player_dfs_comprehensive, ignore_index=True)
    print("‚úÖ All player dummy data loaded and concatenated into df_player_master_comprehensive.")


# --- 9. Clean df_player_master_comprehensive ---
print("\n--- Cleaning and Type Converting df_player_master_comprehensive ---")
# Parse 'Date' (unparseable values become NaT) and drop any time-of-day part.
parsed_dates = pd.to_datetime(df_player_master_comprehensive['Date'], errors='coerce')
df_player_master_comprehensive['Date'] = parsed_dates.dt.normalize()

# Coerce the stat columns to numbers; values that cannot be parsed become NaN.
stat_columns_to_numeric = ['PTS', 'AST', 'TRB', 'FG3M']
for col in stat_columns_to_numeric:
    if col not in df_player_master_comprehensive.columns:
        print(f"‚ö†Ô∏è Warning: Column '{col}' not found in df_player_master_comprehensive. Skipping conversion.")
        continue
    df_player_master_comprehensive[col] = pd.to_numeric(df_player_master_comprehensive[col], errors='coerce')

# Remove rows missing any essential value; only columns that exist are checked.
essential_columns_for_dropna = ['Date', 'PTS', 'Team_Abbr', 'AST', 'TRB', 'FG3M']
existing_essential_columns = [
    name for name in essential_columns_for_dropna
    if name in df_player_master_comprehensive.columns
]
initial_rows_comp = df_player_master_comprehensive.shape[0]
df_player_master_comprehensive = df_player_master_comprehensive.dropna(subset=existing_essential_columns)
rows_dropped_comp = initial_rows_comp - df_player_master_comprehensive.shape[0]
print(f"‚úÖ df_player_master_comprehensive cleaned. Dropped {rows_dropped_comp} rows with missing essential values.")
print(f"Final df_player_master_comprehensive has {len(df_player_master_comprehensive)} rows after cleaning.")


# --- 10. Add 'SEASON', 'PRA', and 'Performance_Score' to df_player_master_comprehensive ---
print("\n--- Engineering Features for df_player_master_comprehensive ---")
def get_season_str(date_obj):
    """Return the season label (e.g. "2022-23") for a date, or None if missing.

    Months October-December belong to the season that starts in that calendar
    year; January-September belong to the season that started the year before.
    The second part of the label is the ending year modulo 100, zero-padded.
    """
    if pd.isna(date_obj):
        return None
    start_year = date_obj.year if date_obj.month >= 10 else date_obj.year - 1
    return f"{start_year}-{(start_year + 1) % 100:02d}"

# Label each game with its season, order games per player chronologically,
# then add the two composite stats: PRA (points + rebounds + assists) and
# Performance_Score (PRA's components plus made threes).
df_player_master_comprehensive = (
    df_player_master_comprehensive
    .assign(SEASON=lambda games: games['Date'].apply(get_season_str))
    .sort_values(by=['Player', 'Date'])
    .assign(
        PRA=lambda games: games['PTS'] + games['TRB'] + games['AST'],
        Performance_Score=lambda games: games['PTS'] + games['AST'] + games['TRB'] + games['FG3M'],
    )
)

# Rows without a season label or composite stat cannot be used downstream.
df_player_master_comprehensive = df_player_master_comprehensive.dropna(subset=['SEASON', 'PRA', 'Performance_Score'])
print("‚úÖ 'SEASON', 'PRA', and 'Performance_Score' added to df_player_master_comprehensive.")
print(f"Final df_player_master_comprehensive has {len(df_player_master_comprehensive)} rows after feature engineering.")


# --- 11. Group by and calculate average performance scores ---
# One row per (season, team, player) holding the mean of each stat.
print("\n--- Calculating Average Performance Scores per Player/Team/Season ---")
player_season_team_avg_stats = (
    df_player_master_comprehensive
    .groupby(['SEASON', 'Team_Abbr', 'Player'])
    .agg(
        Avg_Performance_Score=pd.NamedAgg(column='Performance_Score', aggfunc='mean'),
        Avg_PTS=pd.NamedAgg(column='PTS', aggfunc='mean'),
        Avg_AST=pd.NamedAgg(column='AST', aggfunc='mean'),
        Avg_TRB=pd.NamedAgg(column='TRB', aggfunc='mean'),
        Avg_FG3M=pd.NamedAgg(column='FG3M', aggfunc='mean'),
    )
    .reset_index()
)
print("‚úÖ Average performance scores computed.")


# --- 12. Select top 1 or 2 players per team per season ---
# Rank players inside each (season, team) by average score, best first, and
# keep at most the first two rows of every group.
print("\n--- Selecting Top Players per Team per Season ---")
player_season_team_avg_stats_sorted = player_season_team_avg_stats.sort_values(
    by=['SEASON', 'Team_Abbr', 'Avg_Performance_Score'],
    ascending=[True, True, False],
)
team_season_groups = player_season_team_avg_stats_sorted.groupby(['SEASON', 'Team_Abbr'])
top_players_per_team_season = team_season_groups.head(2)
print("‚úÖ Top 1-2 players identified per team per season.")


# --- 13. Filter df_player_master_comprehensive to create df_player_master ---
# Keep only games whose (season, player) pair was selected above. The team is
# not part of the key, so all of a selected player's games in that season stay.
print("\n--- Creating df_player_master with Selected Players ---")
top_player_identifiers = top_players_per_team_season.loc[:, ['SEASON', 'Player']].drop_duplicates()
df_player_master = df_player_master_comprehensive.merge(
    top_player_identifiers,
    on=['SEASON', 'Player'],
    how='inner',
)
print("‚úÖ df_player_master created, containing only games from dynamically selected top players.")
print(f"Final df_player_master has {len(df_player_master)} rows.")


# --- 14. Load, rename, and concatenate odds data into df_odds_master ---
# Each readable odds CSV is tagged with its team code and gets a normalized
# 'Date' column; missing or unreadable files only produce a warning.
all_odds_dfs = []
print("\n--- Loading All Team Odds from Dummy Files ---")
for team_abbr, config in team_odds_map.items():
    # Only the file names are needed here (the season labels are not used
    # while loading), so iterate the files directly.
    for f in config["files"]:
        filepath = os.path.join(odds_dir, os.path.basename(f))
        if not os.path.exists(filepath):
            print(f"‚ö†Ô∏è Warning: Dummy odds file {filepath} not found during loading. Skipping.")
            continue
        try:
            df_season = pd.read_csv(filepath)
            df_season['Team_Abbr'] = team_abbr
            # Normalize date to remove time component for consistent merging
            df_season['Date'] = pd.to_datetime(df_season['Date'], errors='coerce').dt.normalize()
            all_odds_dfs.append(df_season)
        except Exception as e:
            # Best-effort loader: report the problem and move on to the next file.
            print(f"‚ö†Ô∏è Warning: Could not load odds file {filepath}. Error: {e}")

if all_odds_dfs:
    # ignore_index=True gives the combined frame a unique 0..n-1 index instead
    # of repeating each file's own row labels (matches the player concat).
    df_odds_master = pd.concat(all_odds_dfs, ignore_index=True)
    print("‚úÖ All team dummy odds loaded and concatenated into df_odds_master.")
else:
    print("‚ö†Ô∏è No odds dataframes loaded or could not be loaded for df_odds_master.")
    df_odds_master = pd.DataFrame()  # Ensure it's defined


# --- 15. Process df_odds_master into df_odds_clean ---
print("\n--- Processing df_odds_master into df_odds_clean ---")
df_odds_master = df_odds_master.rename(columns={'O/U': 'GAME_TOTAL_RAW', 'ATS': 'GAME_SPREAD_RAW'})

# The raw cells look like "<label> <number>": keep the last space-separated
# token and convert it to a number (NaN when that fails).
for raw_col, numeric_col in (('GAME_TOTAL_RAW', 'GAME_TOTAL'), ('GAME_SPREAD_RAW', 'GAME_SPREAD')):
    last_token = df_odds_master[raw_col].str.split(' ').str[-1]
    df_odds_master[numeric_col] = pd.to_numeric(last_token, errors='coerce')
# Only the size of the spread is kept, not its sign.
df_odds_master['GAME_SPREAD'] = df_odds_master['GAME_SPREAD'].abs()

clean_columns = ['Date', 'GAME_SPREAD', 'GAME_TOTAL', 'Team_Abbr']
df_odds_clean = df_odds_master.dropna(subset=clean_columns)
df_odds_clean = df_odds_clean[clean_columns].copy().drop_duplicates()
print("‚úÖ df_odds_clean created and cleaned.")
print(f"Final df_odds_clean has {len(df_odds_clean)} rows.")

print("\n--- Block 1: Simulation Setup & Data Generation (Hardcoded Configurations) Completed --- ")

‚úÖ Cleared existing dummy player files in /player_gamelogs/.
‚úÖ Cleared existing dummy odds files in /.
--- Generating Player Dummy Files ---
‚úÖ All player dummy game log CSVs generated.

--- Generating Team Odds Dummy Files ---
‚úÖ All team dummy odds CSVs generated.

--- Loading All Player Stats from Dummy Files ---
‚úÖ All player dummy data loaded and concatenated into df_player_master_comprehensive.

--- Cleaning and Type Converting df_player_master_comprehensive ---
‚úÖ df_player_master_comprehensive cleaned. Dropped 0 rows with missing essential values.
Final df_player_master_comprehensive has 780 rows after cleaning.

--- Engineering Features for df_player_master_comprehensive ---
‚úÖ 'SEASON', 'PRA', and 'Performance_Score' added to df_player_master_comprehensive.
Final df_player_master_comprehensive has 780 rows after feature engineering.

--- Calculating Average Performance Scores per Player/Team/Season ---
‚úÖ Average performance scores computed.

--- Selecting Top Player

In [2]:
import pandas as pd
import numpy as np

# 1. Collector for one result dict per tested strategy
all_player_results = []

# 2. Grid search parameters (same values as original STEP 2)
spread_values_to_test = [*range(3, 11), 100]             # 3..10, then 100
total_values_to_test = [0, 220, 225, *range(226, 243)]   # 0, 220, 225, then 226..242
rolling_windows_to_test = [5, 10, 15]
pts_adjust_to_test = [-(2.0 + 0.5 * step) for step in range(9)]   # -2.0 .. -6.0 in 0.5 steps
ast_adjust_to_test = [-0.5 * step for step in range(1, 7)]        # -0.5 .. -3.0 in 0.5 steps
trb_adjust_to_test = [-0.5 * step for step in range(1, 7)]        # -0.5 .. -3.0 in 0.5 steps
pra_adjust_to_test = [-float(step) for step in range(2, 8)]       # -2.0 .. -7.0 in 1.0 steps
tpm_adjust_to_test = [-0.5 * step for step in range(1, 7)]        # for FG3M

print("--- Starting Forecasting Engine ---")

# 3. Merge df_player_master and df_odds_clean into df_merged
# Both frames must already exist and be non-empty; stop with a clear error
# otherwise.
for required_frame in ('df_player_master', 'df_odds_clean'):
    if required_frame not in locals() or locals()[required_frame].empty:
        raise ValueError(f"{required_frame} is empty or not defined. Please ensure Block 1 is executed correctly.")

# Inner join: only games with both a stat line and an odds line are kept.
df_merged = df_player_master.merge(df_odds_clean, on=['Date', 'Team_Abbr'], how='inner')
print(f"‚úÖ Master stats and odds merged. Found {len(df_merged)} total matching games.")

# 4. Define a function `get_season_str`
def get_season_str(date_obj):
    """Map a date to its season label such as "2023-24"; None for a missing date.

    A date in October or later counts toward the season starting that year,
    any earlier month toward the season that started the previous year. The
    label's suffix is the ending year's last two digits, zero-padded.
    """
    if pd.isna(date_obj):
        return None
    starts_this_year = date_obj.month >= 10
    first_year = date_obj.year - (0 if starts_this_year else 1)
    return f"{first_year}-{(first_year + 1) % 100:02d}"

# 5. Label every merged game with its season
df_merged['SEASON'] = df_merged['Date'].apply(get_season_str)

# 6. Order games chronologically within each player
df_merged = df_merged.sort_values(by=['Player', 'Date'])

# 7. Make sure the stat columns are numeric; a missing column is filled
#    with zeros so the later steps do not fail.
stat_cols_numeric_check = ['PTS', 'TRB', 'AST', 'FG3M']
for col in stat_cols_numeric_check:
    if col not in df_merged.columns:
        df_merged[col] = 0.0
        print(f"‚ö†Ô∏è Warning: Column '{col}' not found in df_merged, created with zeros.")
        continue
    df_merged[col] = pd.to_numeric(df_merged[col], errors='coerce')

# 8. PRA = points + rebounds + assists
df_merged['PRA'] = df_merged['PTS'].add(df_merged['TRB']).add(df_merged['AST'])

# Remove rows where coercion left a NaN in any column the grid search needs
df_merged = df_merged.dropna(subset=[*stat_cols_numeric_check, 'PRA', 'SEASON'])
print("‚úÖ Features engineered and data cleaned for grid search.")

# 9. Iterate through each `window` in `rolling_windows_to_test`
print(f"\n--- Running Full Grid Search for {len(df_merged)} games across {df_merged['Player'].nunique()} players ---")

# Adjustment grid per stat. The dict order fixes the order in which results
# are appended for each filter combination: PTS, AST, TRB, PRA, FG3M.
stat_adjustments = {
    'PTS': pts_adjust_to_test,
    'AST': ast_adjust_to_test,
    'TRB': trb_adjust_to_test,
    'PRA': pra_adjust_to_test,
    'FG3M': tpm_adjust_to_test,
}
avg_columns = [f'AVG_{stat}' for stat in stat_adjustments]

for window in rolling_windows_to_test:
    print(f"Testing {window}-game window...")
    # a. Rolling mean of the previous `window` games for every stat.
    #    shift(1) excludes the current game from its own average. Shift and
    #    rolling both run inside transform(), i.e. per (player, season) group,
    #    so a window can never mix games from two players or two seasons,
    #    whatever the row order of df_merged is.
    for stat in stat_adjustments:
        df_merged[f'AVG_{stat}'] = df_merged.groupby(['Player', 'SEASON'])[stat].transform(
            lambda games, w=window: games.shift(1).rolling(w, min_periods=w).mean()
        )

    # b. Only games with a full window of history can be tested
    df_testable = df_merged.dropna(subset=avg_columns)

    # c. Iterate through each `max_spread` and `min_total`
    for max_spread in spread_values_to_test:
        for min_total in total_values_to_test:
            # i. Games whose spread is small enough and total is high enough
            in_scope = (df_testable['GAME_SPREAD'] <= max_spread) & (df_testable['GAME_TOTAL'] >= min_total)
            df_filtered = df_testable[in_scope]

            # ii. Nothing to bet on for this filter combination
            total_games = len(df_filtered)
            if total_games == 0:
                continue

            # iii. A bet wins when the actual stat is strictly above
            #      (rolling average + adjustment).
            for stat, adjustments in stat_adjustments.items():
                actual = df_filtered[stat]
                rolling_avg = df_filtered[f'AVG_{stat}']
                for adj in adjustments:
                    bet_line = rolling_avg + adj
                    wins = (actual > bet_line).sum()
                    all_player_results.append({'stat': stat, 'window': window, 'adj': adj, 'spread': max_spread, 'total': min_total, 'wins': wins, 'bets': total_games})

print("\n‚úÖ‚úÖ‚úÖ All players processed. Grid search completed. ‚úÖ‚úÖ‚úÖ")

--- Starting Forecasting Engine ---
‚úÖ Master stats and odds merged. Found 780 total matching games.
‚úÖ Features engineered and data cleaned for grid search.

--- Running Full Grid Search for 780 games across 13 players ---
Testing 5-game window...
Testing 10-game window...
Testing 15-game window...

‚úÖ‚úÖ‚úÖ All players processed. Grid search completed. ‚úÖ‚úÖ‚úÖ


In [3]:
import pandas as pd
import numpy as np

print("--- Aggregating Results & Identifying Best Strategies ---")

# 1. Turn the collected result dicts into df_results. With no results only an
#    empty df_results is defined and the rest of the aggregation is skipped.
if all_player_results:
    df_results = pd.DataFrame(all_player_results)

    # 2. Total wins and bets per strategy (stat, window, adj, spread, total)
    strategy_keys = ['stat', 'window', 'adj', 'spread', 'total']
    df_agg = (
        df_results
        .groupby(strategy_keys)
        .agg(total_wins=('wins', 'sum'), total_bets=('bets', 'sum'))
        .reset_index()
    )
    print("‚úÖ Results aggregated by strategy parameters.")

    # 3. Win rate in percent
    df_agg['win_rate'] = df_agg['total_wins'].div(df_agg['total_bets']).mul(100)
    print("‚úÖ Win rates calculated.")

    # 4. 'Universe size' per (stat, window): the largest bet count among the
    #    rows with spread == 100 and total == 0.
    is_universe_row = df_agg['spread'].eq(100) & df_agg['total'].eq(0)
    df_universe = (
        df_agg[is_universe_row]
        .groupby(['stat', 'window'])['total_bets']
        .max()
        .reset_index()
        .rename(columns={'total_bets': 'universe_size'})
    )
    print("‚úÖ Universe sizes calculated.")

    # 5. Attach the universe size to every strategy
    df_agg = df_agg.merge(df_universe, on=['stat', 'window'])
    print("‚úÖ Opportunity universe merged.")

    # 6. Share of the universe each strategy bets on, in percent
    df_agg['opportunity_pct'] = df_agg['total_bets'].div(df_agg['universe_size']).mul(100)
    print("‚úÖ Opportunity percentages calculated.")

    # 7. Reliability thresholds
    min_win_rate = 75.0
    min_opportunity_pct = 15.0

    # 8. Keep strategies that meet both thresholds
    is_reliable = df_agg['win_rate'].ge(min_win_rate) & df_agg['opportunity_pct'].ge(min_opportunity_pct)
    df_agg_reliable = df_agg[is_reliable].copy()
    print(f"‚úÖ Filtered for reliable strategies (Min {min_win_rate}% Win Rate, Min {min_opportunity_pct}% Opportunity Rate).")

    # 9. Highest win rate first
    df_agg_reliable = df_agg_reliable.sort_values(by='win_rate', ascending=False)
    print("‚úÖ Reliable strategies sorted by win rate.")

    # 10. Stats reported below, in this order
    stats_to_compare = ['PTS', 'AST', 'TRB', 'PRA', 'FG3M']

    print("\n--- BEST RELIABLE STRATEGIES (Win Rate vs. Opportunity) ---")
    print(f"(Based on {df_merged['Player'].nunique()} players, Min {min_win_rate}% Win Rate, Min {min_opportunity_pct}% Opportunity Rate)\n")

    # 11. For every stat, report the first (highest win rate) reliable strategy
    for stat_type in stats_to_compare:
        df_stat = df_agg_reliable[df_agg_reliable['stat'] == stat_type]

        if not df_stat.empty:
            best_strategy = df_stat.iloc[0]  # top row after the sort above

            print(f"üèÜ **Best for {stat_type}:**")
            print(f"   Bet **{best_strategy['stat']}** using **{best_strategy['window']}-game avg {best_strategy['adj']}**")
            print(f"   when: **Spread <= {best_strategy['spread']}** & **Total >= {best_strategy['total']}**")
            print(f"   Win Rate: **{best_strategy['win_rate']:.2f}%** ({best_strategy['total_wins']} wins in {best_strategy['total_bets']} games)")
            print(f"   (This strategy applies to **{best_strategy['opportunity_pct']:.1f}%** of all bettable games)\n")
        else:
            print(f"No reliable strategy found for **{stat_type}**.\n")
else:
    print("No results found in all_player_results. Exiting aggregation.")
    # Keep df_results defined, even if empty, for subsequent steps.
    df_results = pd.DataFrame()


--- Aggregating Results & Identifying Best Strategies ---
‚úÖ Results aggregated by strategy parameters.
‚úÖ Win rates calculated.
‚úÖ Universe sizes calculated.
‚úÖ Opportunity universe merged.
‚úÖ Opportunity percentages calculated.
‚úÖ Filtered for reliable strategies (Min 75.0% Win Rate, Min 15.0% Opportunity Rate).
‚úÖ Reliable strategies sorted by win rate.

--- BEST RELIABLE STRATEGIES (Win Rate vs. Opportunity) ---
(Based on 13 players, Min 75.0% Win Rate, Min 15.0% Opportunity Rate)

No reliable strategy found for **PTS**.

üèÜ **Best for AST:**
   Bet **AST** using **15-game avg -3.0**
   when: **Spread <= 4** & **Total >= 233**
   Win Rate: **83.64%** (46 wins in 55 games)
   (This strategy applies to **28.2%** of all bettable games)

No reliable strategy found for **TRB**.

üèÜ **Best for PRA:**
   Bet **PRA** using **15-game avg -6.0**
   when: **Spread <= 8** & **Total >= 237**
   Win Rate: **76.67%** (23 wins in 30 games)
   (This strategy applies to **15.4%** of all b

In [5]:
# --- BLOCK 4: PARLAY STRATEGY VALIDATOR ---
# Verifying the mathematical edge of combining high-confidence adjusted lines.

def validate_parlay_edge(leg_win_rate, num_legs, market_odds_american=100):
    """
    Print a parlay's modelled win probability next to the market-implied one.

    User Strategy: Combine 3-4 adjusted lines to reach ~+100 (Even Money) odds.

    Args:
        leg_win_rate: Win rate of a single leg, in percent (e.g. 85.0).
        num_legs: Number of legs in the parlay.
        market_odds_american: American odds offered on the parlay. Defaults
            to +100 (even money, i.e. a 50% implied win chance).

    Raises:
        ValueError: If ``market_odds_american`` is strictly between -100 and
            +100, which is not a valid American price.

    Note:
        The parlay probability is ``leg_win_rate`` raised to ``num_legs``,
        which is the product rule for independent legs that all share the
        same win rate. Correlated legs are not modelled here.
    """
    if -100 < market_odds_american < 100:
        raise ValueError(f"Invalid American odds: {market_odds_american}")

    print(f"\n--- PARLAY EDGE CALCULATOR ({num_legs}-Leg Strategy) ---")

    # 1. Inputs
    # Implied probability from American odds:
    #   positive odds +X -> 100 / (X + 100); negative odds -X -> X / (X + 100)
    if market_odds_american > 0:
        market_implied_prob = 100.0 * 100 / (market_odds_american + 100)
    else:
        stake = -market_odds_american
        market_implied_prob = 100.0 * stake / (stake + 100)

    # 2. Math
    # The 'True' probability is the leg_win_rate to the power of num_legs
    true_parlay_prob = (leg_win_rate / 100) ** num_legs * 100

    # 3. Output
    print(f"Stats per Leg:")
    print(f"  ‚Ä¢ Individual Leg Win Rate: {leg_win_rate:.2f}%")
    print(f"  ‚Ä¢ Number of Legs: {num_legs}")

    print(f"\nMathematical Reality:")
    print(f"  ‚Ä¢ Market Implied Win % ({market_odds_american:+g} Odds): {market_implied_prob:.1f}%")
    print(f"  ‚Ä¢ Your Model's Win %:               {true_parlay_prob:.1f}%")

    # 4. Verdict
    edge = true_parlay_prob - market_implied_prob
    if edge > 0:
        print(f"\n‚úÖ EDGE FOUND: +{edge:.1f}% Advantage vs The House")
        print("   Conclusion: This parlay hits significantly more often than the odds imply.")
    else:
        print(f"\n‚ùå NO EDGE: -{abs(edge):.1f}% Disadvantage")

# --- EXECUTE VALIDATION ---
# Use the win rate in the first row of df_agg_reliable when that frame exists
# and has rows (presumably the best rate, if the frame is still sorted by
# win_rate descending -- confirm against the aggregation cell); otherwise
# fall back to a single 3-leg check at an 85% baseline.
has_reliable_strategy = 'df_agg_reliable' in locals() and not df_agg_reliable.empty
if not has_reliable_strategy:
    print("‚ö†Ô∏è No reliable strategies found in Block 3. Using baseline.")
    validate_parlay_edge(leg_win_rate=85.0, num_legs=3)
else:
    best_leg_win_rate = df_agg_reliable.iloc[0]['win_rate']

    # Test a 3-leg parlay, then a 4-leg parlay
    for leg_count in (3, 4):
        validate_parlay_edge(leg_win_rate=best_leg_win_rate, num_legs=leg_count)


--- PARLAY EDGE CALCULATOR (3-Leg Strategy) ---
Stats per Leg:
  ‚Ä¢ Individual Leg Win Rate: 96.36%
  ‚Ä¢ Number of Legs: 3

Mathematical Reality:
  ‚Ä¢ Market Implied Win % (+100 Odds): 50.0%
  ‚Ä¢ Your Model's Win %:               89.5%

‚úÖ EDGE FOUND: +39.5% Advantage vs The House
   Conclusion: This parlay hits significantly more often than the odds imply.

--- PARLAY EDGE CALCULATOR (4-Leg Strategy) ---
Stats per Leg:
  ‚Ä¢ Individual Leg Win Rate: 96.36%
  ‚Ä¢ Number of Legs: 4

Mathematical Reality:
  ‚Ä¢ Market Implied Win % (+100 Odds): 50.0%
  ‚Ä¢ Your Model's Win %:               86.2%

‚úÖ EDGE FOUND: +36.2% Advantage vs The House
   Conclusion: This parlay hits significantly more often than the odds imply.
