# NBA Advanced Stats & Four Factors EDA Pipeline

## Objective
Ingest game data, calculate rolling advanced metrics, and visualize key predictors for win margins/spreads. This notebook focuses on NBA-specific metrics like Variance, Energy, and Schedule Spots.

## Metrics of Interest
1. **Net Rating (NetRtg)**: Point differential per 100 possessions.
2. **Four Factors**: eFG%, TOV%, OREB%, FT Rate.
3. **Schedule Spots**: SEGABABA (Second Game of Back-to-Back), 3-in-4, Altitude Advantage.


In [8]:
# 1. Environment Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from requests.exceptions import RequestException
from nba_api.stats.endpoints import leaguegamefinder, boxscoreadvancedv2, boxscorefourfactorsv2, boxscoretraditionalv2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

# Configuration
pd.set_option('display.max_columns', None)
sns.set_theme(style="whitegrid")
%matplotlib inline

# NBA.com blocks scripts without browser-like headers; set once and reuse everywhere.
NBA_API_HEADERS = {
    'Host': 'stats.nba.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.nba.com/',
    'Origin': 'https://www.nba.com',
    'Connection': 'keep-alive',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true'
}
REQUEST_TIMEOUT = 100  # generous timeout to avoid hanging on slow responses

# Check nba_api version
try:
    import nba_api
    print(f"nba_api version: {nba_api.__version__}")
except:
    print("Could not determine nba_api version")


Could not determine nba_api version


## 2. Data Ingestion

We will fetch the game schedule and then loop through games to get advanced stats and four factors. **Updated to fetch 10 seasons.**

In [9]:
# Fetch Game Schedule for multiple seasons

def fetch_schedule(season='2024-25', season_type='Regular Season', retries=3, retry_delay=2.0):
    """Fetch a season schedule with headers/timeout and simple retries for timeouts/blocks."""
    last_err = None
    for attempt in range(1, retries + 1):
        try:
            gamefinder = leaguegamefinder.LeagueGameFinder(
                season_nullable=season,
                league_id_nullable='00',
                season_type_nullable=season_type,
                headers=NBA_API_HEADERS,
                timeout=REQUEST_TIMEOUT
            )
            games = gamefinder.get_data_frames()[0]
            return games
        except Exception as e:
            last_err = e
            print(f"Attempt {attempt}/{retries} failed for {season}: {e}")
            time.sleep(retry_delay * attempt)
    raise last_err

def fetch_all_seasons_schedule(start_year=2015, end_year=2025):
    all_games = []
    seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(start_year, end_year + 1)]
    error_log = []
    
    print(f"Fetching schedule for seasons: {seasons}")
    
    for season in seasons:
        print(f"Fetching {season}...")
        try:
            games = fetch_schedule(season=season)
            games['SEASON_ID'] = season  # Ensure we track the season
            all_games.append(games)
            time.sleep(1.5)  # Respect rate limits and avoid blocks
        except Exception as e:
            msg = str(e)
            print(f"Error fetching {season}: {msg}")
            error_log.append((season, msg))
            # Small pause before next season to avoid cascading timeouts
            time.sleep(2.5)
            continue
            
    if not all_games:
        raise RuntimeError("No schedules fetched; check network/API access and headers.")
    
    if error_log:
        print(f"Completed with {len(error_log)} season fetch errors: {error_log}")
    
    return pd.concat(all_games, ignore_index=True)

schedule_df = fetch_all_seasons_schedule()
print(f"Fetched {len(schedule_df)} games across {len(schedule_df['SEASON_ID'].unique())} seasons.")
schedule_df.head()


Fetching schedule for seasons: ['2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24', '2024-25', '2025-26']
Fetching 2015-16...
Fetching 2016-17...
Fetching 2017-18...
Fetching 2018-19...
Fetching 2019-20...
Fetching 2020-21...
Fetching 2021-22...
Fetching 2022-23...
Fetching 2023-24...
Fetching 2024-25...
Fetching 2025-26...
Fetched 24524 games across 11 seasons.


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,2015-16,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,56,86,0.651,13,28,0.464,19,23,0.826,5,38,43,41,14,8,13,20,35.0
1,2015-16,1610612759,SAS,San Antonio Spurs,21501223,2016-04-13,SAS @ DAL,W,240,96,35,74,0.473,6,23,0.261,20,26,0.769,8,34,42,24,8,3,14,19,5.0
2,2015-16,1610612742,DAL,Dallas Mavericks,21501223,2016-04-13,DAL vs. SAS,L,240,91,31,77,0.403,12,39,0.308,17,21,0.81,11,30,41,20,8,2,12,23,-5.0
3,2015-16,1610612744,GSW,Golden State Warriors,21501227,2016-04-13,GSW vs. MEM,W,240,125,46,87,0.529,20,47,0.426,13,16,0.813,12,39,51,35,7,7,17,14,21.0
4,2015-16,1610612758,SAC,Sacramento Kings,21501224,2016-04-13,SAC @ HOU,L,238,81,32,96,0.333,11,37,0.297,6,13,0.462,15,34,49,21,11,4,17,17,-35.0


In [10]:
# DIAGNOSTIC: Inspect raw API response to understand the issue
print("=" * 60)
print("NBA API DIAGNOSTICS")
print("=" * 60)

# Check if schedule_df exists
if 'schedule_df' not in globals() or schedule_df is None or len(schedule_df) == 0:
    print("ERROR: schedule_df not found or empty. Please run the schedule fetch cell first.")
else:
    # Get finished games from schedule_df
    finished_games_check = schedule_df[schedule_df['WL'].notna()]['GAME_ID'].unique()
    
    if len(finished_games_check) > 0:
    # Get the most recent game IDs (likely from later seasons)
        recent_games = schedule_df[schedule_df['WL'].notna()].sort_values('GAME_DATE', ascending=False)
    if len(recent_games) > 0:
        test_game_id = str(recent_games.iloc[0]['GAME_ID'])
        print(f"\n1. Testing with recent game ID: {test_game_id}")
        print(f"   Game Date: {recent_games.iloc[0]['GAME_DATE']}")
        print(f"   Matchup: {recent_games.iloc[0]['MATCHUP']}")
        
        # Test Advanced Stats API
        print(f"\n2. Testing BoxScoreAdvancedV2...")
        adv_test = None
        try:
            adv_test = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=test_game_id, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            
            # Check API endpoint and parameters
            print("   Checking API endpoint...")
            if hasattr(adv_test, 'endpoint'):
                print(f"   Endpoint: {adv_test.endpoint}")
            if hasattr(adv_test, 'parameters'):
                print(f"   Parameters: {adv_test.parameters}")
            if hasattr(adv_test, 'url'):
                print(f"   URL: {adv_test.url}")
            
            # CRITICAL: Inspect raw response BEFORE calling get_data_frames()
            print("   Inspecting raw API response (BEFORE get_data_frames)...")
            if hasattr(adv_test, 'get_dict'):
                try:
                    response_dict = adv_test.get_dict()
                    print(f"   ✓ Successfully got response dictionary")
                    print(f"   Response top-level keys: {list(response_dict.keys())}")
                    
                    # Check for resultSets (plural) vs resultSet (singular)
                    if 'resultSets' in response_dict:
                        print(f"   ✓ Found 'resultSets' key (plural)")
                        print(f"   Number of result sets: {len(response_dict['resultSets'])}")
                        for i, rs in enumerate(response_dict['resultSets']):
                            print(f"   ResultSet {i}: keys = {list(rs.keys()) if isinstance(rs, dict) else 'N/A'}")
                            if isinstance(rs, dict) and 'name' in rs:
                                print(f"     Name: {rs['name']}")
                            if isinstance(rs, dict) and 'rowSet' in rs:
                                print(f"     Rows: {len(rs['rowSet'])}")
                    elif 'resultSet' in response_dict:
                        print(f"   ⚠ Found 'resultSet' key (singular, not plural)")
                        print(f"   This might be the issue - API expects 'resultSets'")
                    else:
                        print(f"   ✗ Neither 'resultSets' nor 'resultSet' found")
                        print(f"   Full response keys: {list(response_dict.keys())}")
                        # Print a sample of the response
                        import json
                        print(f"   Response sample: {json.dumps({k: str(v)[:100] for k, v in list(response_dict.items())[:3]}, indent=2)}")
                except Exception as dict_err:
                    print(f"   ✗ Error getting dict: {dict_err}")
            
            # Now try to get dataframes (this is where the error occurs)
            print("\n   Attempting to get dataframes (this may fail)...")
            adv_test_frames = adv_test.get_data_frames()
            print(f"   ✓ Success! Got {len(adv_test_frames)} dataframes")
            if len(adv_test_frames) > 0:
                print(f"   First dataframe shape: {adv_test_frames[0].shape}")
                print(f"   Columns: {list(adv_test_frames[0].columns)[:10]}...")
                
        except KeyError as ke:
            print(f"   ✗ KeyError caught: {ke}")
            print(f"   Error message: {str(ke)}")
            if adv_test is not None:
                # Try to inspect the response even after error
                print("   Attempting to inspect response after error...")
                try:
                    if hasattr(adv_test, 'get_dict'):
                        response_dict = adv_test.get_dict()
                        print(f"   Response keys: {list(response_dict.keys())}")
                        # Check what's actually in the response
                        if 'resultSets' in response_dict:
                            print(f"   ✓ 'resultSets' exists in response!")
                        elif 'resultSet' in response_dict:
                            print(f"   ⚠ 'resultSet' exists (singular) but library expects 'resultSets'")
                        else:
                            print(f"   ✗ No resultSets/resultSet found")
                            print(f"   Available keys: {list(response_dict.keys())}")
                except Exception as inspect_err:
                    print(f"   Could not inspect: {inspect_err}")
            
            # Check if this is a known issue with the library
            import traceback
            tb = traceback.format_exc()
            if 'resultSet' in tb:
                print(f"\n   DIAGNOSIS: The error occurs when the library tries to access 'resultSet'")
                print(f"   This suggests the API response structure doesn't match what the library expects")
        except Exception as e:
            print(f"   ✗ Error: {type(e).__name__}: {e}")
            import traceback
            print(f"   Traceback (first 500 chars):\n{traceback.format_exc()[:500]}")
        
        # Test Four Factors API
        print(f"\n3. Testing BoxScoreFourFactorsV2...")
        try:
            ff_test = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id=test_game_id, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            
            # Inspect raw response
            print("   Inspecting raw API response...")
            if hasattr(ff_test, 'get_dict'):
                response_dict = ff_test.get_dict()
                print(f"   Response keys: {list(response_dict.keys())}")
                
                if 'resultSets' in response_dict:
                    print(f"   ✓ Found 'resultSets' key")
                    print(f"   Number of result sets: {len(response_dict['resultSets'])}")
                else:
                    print(f"   ✗ 'resultSets' key NOT found")
                    print(f"   Full response structure: {response_dict}")
            
            # Try to get dataframes
            print("   Attempting to get dataframes...")
            ff_test_frames = ff_test.get_data_frames()
            print(f"   ✓ Success! Got {len(ff_test_frames)} dataframes")
            if len(ff_test_frames) > 0:
                print(f"   First dataframe shape: {ff_test_frames[0].shape}")
                print(f"   Columns: {list(ff_test_frames[0].columns)[:10]}...")
                
        except KeyError as ke:
            print(f"   ✗ KeyError: {ke}")
            if hasattr(ff_test, 'get_dict'):
                response_dict = ff_test.get_dict()
                print(f"   Response structure: {list(response_dict.keys())}")
        except Exception as e:
            print(f"   ✗ Error: {type(e).__name__}: {e}")
        
        # Try alternative: Check if game exists using traditional boxscore (simpler endpoint)
        print(f"\n4. Verifying game exists with traditional boxscore...")
        try:
            trad_test = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=test_game_id, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            trad_frames = trad_test.get_data_frames()
            print(f"   ✓ Traditional boxscore works! Got {len(trad_frames)} dataframes")
            if len(trad_frames) > 0:
                print(f"   First dataframe shape: {trad_frames[0].shape}")
                print(f"   This confirms the game ID is valid")
        except Exception as e:
            print(f"   ✗ Traditional boxscore also failed: {e}")
            print(f"   This suggests the game ID might be invalid or game doesn't exist")
        
        # Try alternative: Check if game_id needs different format
        print(f"\n5. Testing alternative game ID formats...")
        # Try without leading zeros (if it's numeric)
        try:
            game_id_int = int(test_game_id)
            print(f"   Trying as integer: {game_id_int}")
            adv_test2 = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id_int, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            adv_test2_frames = adv_test2.get_data_frames()
            print(f"   ✓ Integer format works! Got {len(adv_test2_frames)} dataframes")
        except Exception as e:
            print(f"   ✗ Integer format failed: {e}")
        
        print("\n" + "=" * 60)
    else:
        print("No finished games found to test with")


NBA API DIAGNOSTICS

1. Testing with recent game ID: 0022500075
   Game Date: 2025-11-28
   Matchup: PHX @ OKC

2. Testing BoxScoreAdvancedV2...
   ✗ KeyError caught: 'resultSet'
   Error message: 'resultSet'

   DIAGNOSIS: The error occurs when the library tries to access 'resultSet'
   This suggests the API response structure doesn't match what the library expects

3. Testing BoxScoreFourFactorsV2...
   ✗ KeyError: 'resultSet'


NameError: name 'ff_test' is not defined

In [None]:
# Filter for finished games only to avoid errors
finished_games = schedule_df[schedule_df['WL'].notna()]['GAME_ID'].unique()
print(f"Processing {len(finished_games)} finished games...")

In [None]:
def fetch_game_details(game_ids, schedule_df=None, max_games=50, start_from_recent=True):
    advanced_list = []
    four_factors_list = []
    
    # Limit for testing - remove [:max_games] for full run
    # WARNING: Fetching 10 seasons of data will take HOURS.
    # Recommended: Run in batches or parallelize if possible (but API limits prevent parallel).
    
    # Optionally sort by date (newest first) to test with games more likely to have stats
    if start_from_recent and schedule_df is not None:
        # Create a mapping of game_id to date for sorting
        game_dates = schedule_df.set_index('GAME_ID')['GAME_DATE'].to_dict()
        game_ids_sorted = sorted(game_ids, key=lambda x: game_dates.get(x, ''), reverse=True)
        game_ids_to_fetch = game_ids_sorted[:max_games]
        print(f"Starting fetch for {max_games} games (newest first)...")
    else:
        game_ids_to_fetch = list(game_ids)[:max_games]
        print(f"Starting fetch for {max_games} games...")
    
    total_games = len(game_ids_to_fetch)
    success_count = 0
    error_count = 0
    
    for i, game_id in enumerate(game_ids_to_fetch):
        if i % 10 == 0:
            print(f"Fetching game {i+1}/{total_games}... (Success: {success_count}, Errors: {error_count})")
        
        try:
            # Ensure game_id is a string (API expects string format like "0021501226")
            game_id_str = str(game_id).strip()
            if not game_id_str or len(game_id_str) < 10:
                raise ValueError(f"Invalid game ID format: {game_id_str}")
            
            # 1. Advanced Stats
            adv = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id_str, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            # Check response before calling get_data_frames()
            try:
                adv_frames = adv.get_data_frames()
            except (KeyError, AttributeError) as api_err:
                # Check if it's a resultSet error or similar API structure issue
                if 'resultSet' in str(api_err) or 'result' in str(api_err).lower():
                    # Try to inspect the response object
                    if hasattr(adv, 'get_dict'):
                        response_dict = adv.get_dict()
                        if 'resultSets' not in response_dict or len(response_dict.get('resultSets', [])) == 0:
                            raise ValueError(f"API returned empty resultSets for game {game_id_str}")
                    raise ValueError(f"API response structure issue for game {game_id_str}: {api_err}")
                raise
            
            # Check which dataframe has the team-level data (usually index 1, but verify)
            if len(adv_frames) > 1:
                adv_df = adv_frames[1]  # Team-level stats
            elif len(adv_frames) > 0:
                adv_df = adv_frames[0]  # Fallback to first dataframe
            else:
                raise ValueError("No dataframes returned from advanced stats API")
            
            # Verify dataframe is not empty and has required columns
            if len(adv_df) > 0 and 'GAME_ID' in adv_df.columns and 'TEAM_ID' in adv_df.columns:
                advanced_list.append(adv_df)
            else:
                raise ValueError(f"Invalid advanced stats dataframe for game {game_id_str}")
            
            # 2. Four Factors
            ff = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id=game_id_str, headers=NBA_API_HEADERS, timeout=REQUEST_TIMEOUT)
            # Check response before calling get_data_frames()
            try:
                ff_frames = ff.get_data_frames()
            except (KeyError, AttributeError) as api_err:
                if 'resultSet' in str(api_err) or 'result' in str(api_err).lower():
                    if hasattr(ff, 'get_dict'):
                        response_dict = ff.get_dict()
                        if 'resultSets' not in response_dict or len(response_dict.get('resultSets', [])) == 0:
                            raise ValueError(f"API returned empty resultSets for game {game_id_str}")
                    raise ValueError(f"API response structure issue for game {game_id_str}: {api_err}")
                raise
            
            # Check which dataframe has the team-level data
            if len(ff_frames) > 1:
                ff_df = ff_frames[1]  # Team-level stats
            elif len(ff_frames) > 0:
                ff_df = ff_frames[0]  # Fallback to first dataframe
            else:
                raise ValueError("No dataframes returned from four factors API")
            
            # Verify dataframe is not empty and has required columns
            if len(ff_df) > 0 and 'GAME_ID' in ff_df.columns and 'TEAM_ID' in ff_df.columns:
                four_factors_list.append(ff_df)
            else:
                raise ValueError(f"Invalid four factors dataframe for game {game_id_str}")
            
            success_count += 1
            
            # Rate Limiting
            time.sleep(0.6)
            
        except (KeyError, ValueError, AttributeError) as e:
            # Handle API structure errors
            error_count += 1
            if error_count <= 5:
                error_msg = str(e)
                game_id_display = str(game_ids_to_fetch[i]) if i < len(game_ids_to_fetch) else "unknown"
                if 'resultSet' in error_msg:
                    print(f"Error fetching {game_id_display}: API response structure issue - game may not exist or stats unavailable")
                else:
                    print(f"Error fetching {game_id_display}: {error_msg}")
            elif error_count == 6:
                print(f"... (suppressing further error messages)")
            time.sleep(1.0)
            continue
        except Exception as e:
            error_count += 1
            if error_count <= 5:  # Only print first 5 errors to avoid spam
                game_id_display = str(game_ids_to_fetch[i]) if i < len(game_ids_to_fetch) else "unknown"
                print(f"Error fetching {game_id_display}: {e}")
            elif error_count == 6:
                print(f"... (suppressing further error messages)")
            time.sleep(1.0) 
    
    print(f"\nFetch complete: {success_count} successful, {error_count} errors")
    
    if not advanced_list:
        print("WARNING: No data was successfully fetched. Returning empty dataframes.")
        return pd.DataFrame(), pd.DataFrame()
        
    return pd.concat(advanced_list, ignore_index=True), pd.concat(four_factors_list, ignore_index=True)

advanced_stats, four_factors = fetch_game_details(finished_games, schedule_df=schedule_df, max_games=50, start_from_recent=True)
print(f"\nAdvanced stats shape: {advanced_stats.shape}")
print(f"Four factors shape: {four_factors.shape}")
if len(advanced_stats) > 0:
    print(f"\nAdvanced stats columns: {list(advanced_stats.columns)}")
if len(four_factors) > 0:
    print(f"Four factors columns: {list(four_factors.columns)}")
print("\nData Fetch Complete")

## 3. Feature Engineering
Now we merge the datasets and calculate rolling averages. **Crucially, we must group by SEASON_ID to prevent stats leaking across seasons.**

In [None]:
def process_data(advanced_df, four_factors_df, schedule_df):
    # Check if dataframes are empty
    if len(advanced_df) == 0:
        raise ValueError("advanced_df is empty. Check API calls.")
    if len(four_factors_df) == 0:
        raise ValueError("four_factors_df is empty. Check API calls.")
    
    # Check required columns exist
    required_adv_cols = ['GAME_ID', 'TEAM_ID']
    required_ff_cols = ['GAME_ID', 'TEAM_ID', 'EFG_PCT', 'TM_TOV_PCT', 'OREB_PCT', 'OPP_EFG_PCT', 'OPP_TOV_PCT', 'OPP_OREB_PCT']
    
    missing_adv = [col for col in required_adv_cols if col not in advanced_df.columns]
    if missing_adv:
        raise ValueError(f"Missing columns in advanced_df: {missing_adv}. Available columns: {list(advanced_df.columns)}")
    
    missing_ff = [col for col in required_ff_cols if col not in four_factors_df.columns]
    if missing_ff:
        print(f"Warning: Missing columns in four_factors_df: {missing_ff}")
        print(f"Available columns: {list(four_factors_df.columns)}")
        # Try to use available columns
        available_ff_cols = [col for col in required_ff_cols if col in four_factors_df.columns]
        available_ff_cols = ['GAME_ID', 'TEAM_ID'] + available_ff_cols[2:]  # Ensure GAME_ID and TEAM_ID are first
        four_factors_subset = four_factors_df[available_ff_cols]
    else:
        four_factors_subset = four_factors_df[required_ff_cols]
    
    # Merge Advanced + Four Factors
    merged_stats = pd.merge(
        advanced_df, 
        four_factors_subset, 
        on=['GAME_ID', 'TEAM_ID'], 
        how='inner'
    )
    
    if len(merged_stats) == 0:
        raise ValueError("Merge resulted in empty dataframe. Check GAME_ID and TEAM_ID matching.")
    
    # Merge with Schedule to get Dates and Matchups
    merged_stats['GAME_ID'] = merged_stats['GAME_ID'].astype(str)
    schedule_df['GAME_ID'] = schedule_df['GAME_ID'].astype(str)
    
    full_df = pd.merge(
        merged_stats, 
        schedule_df[['GAME_ID', 'TEAM_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS', 'SEASON_ID']], 
        on=['GAME_ID', 'TEAM_ID'], 
        how='inner'
    )
    
    if len(full_df) == 0:
        raise ValueError("Final merge resulted in empty dataframe. Check GAME_ID and TEAM_ID matching with schedule.")
    
    # Date Handling
    full_df['GAME_DATE'] = pd.to_datetime(full_df['GAME_DATE'])
    full_df = full_df.sort_values(['TEAM_ID', 'GAME_DATE'])
    
    print(f"Processed {len(full_df)} game records")
    return full_df

full_df = process_data(advanced_stats, four_factors, schedule_df)

In [None]:
def create_rolling_features(df):
    df = df.copy()
    
    metrics = ['OFF_RATING', 'DEF_RATING', 'NET_RATING', 'PACE', 'PIE', 'EFG_PCT', 'TM_TOV_PCT', 'OREB_PCT']
    
    # 1. Create Lag Features (Shift 1)
    # Group by SEASON_ID as well to avoid cross-season leakage
    for col in metrics:
        df[f'PREV_{col}'] = df.groupby(['TEAM_ID', 'SEASON_ID'])[col].shift(1)
        
    # 2. Rolling Averages (5 and 10 games)
    for col in metrics:
        # Rolling 5
        df[f'ROLLING_5_{col}'] = df.groupby(['TEAM_ID', 'SEASON_ID'])[f'PREV_{col}'].transform(lambda x: x.rolling(5).mean())
        # Rolling 10
        df[f'ROLLING_10_{col}'] = df.groupby(['TEAM_ID', 'SEASON_ID'])[f'PREV_{col}'].transform(lambda x: x.rolling(10).mean())
        
    return df

rolling_df = create_rolling_features(full_df)

In [None]:
def create_differentials(df):
    opp_df = df.copy()
    
    game_merged = pd.merge(df, opp_df, on='GAME_ID', suffixes=('', '_OPP'))
    game_merged = game_merged[game_merged['TEAM_ID'] != game_merged['TEAM_ID_OPP']]
    
    metrics = ['NET_RATING', 'EFG_PCT', 'TM_TOV_PCT', 'OREB_PCT']
    windows = ['ROLLING_5', 'ROLLING_10']
    
    for window in windows:
        for metric in metrics:
            col_name = f'{window}_{metric}'
            game_merged[f'{col_name}_DIFF'] = game_merged[col_name] - game_merged[f'{col_name}_OPP']
            
    return game_merged

final_df = create_differentials(rolling_df)

## 4. Modeling & Backtesting
Train on historical seasons (e.g., < 2025-26) and test on the current season (2025-26).

In [None]:
def train_and_backtest(df):
    # Drop NaNs
    df_model = df.dropna(subset=['ROLLING_5_NET_RATING_DIFF', 'PLUS_MINUS'])
    
    # Define Features and Target
    features = [
        'ROLLING_5_NET_RATING_DIFF', 'ROLLING_10_NET_RATING_DIFF',
        'ROLLING_5_EFG_PCT_DIFF', 'ROLLING_10_EFG_PCT_DIFF',
        'ROLLING_5_TM_TOV_PCT_DIFF', 'ROLLING_10_TM_TOV_PCT_DIFF',
        'ROLLING_5_OREB_PCT_DIFF', 'ROLLING_10_OREB_PCT_DIFF'
    ]
    target_reg = 'PLUS_MINUS' # Regression target
    target_clf = 'WL' # Classification target (W/L)
    
    # Convert WL to binary (W=1, L=0)
    df_model['WL_BINARY'] = df_model['WL'].apply(lambda x: 1 if x == 'W' else 0)
    
    # Split Train (Past) vs Test (Current Season)
    # Assuming '2025-26' is the current season ID format
    current_season = '2025-26'
    train_df = df_model[df_model['SEASON_ID'] != current_season]
    test_df = df_model[df_model['SEASON_ID'] == current_season]
    
    print(f"Training on {len(train_df)} games (Seasons < {current_season})")
    print(f"Testing on {len(test_df)} games (Season == {current_season})")
    
    if len(train_df) == 0 or len(test_df) == 0:
        print("Insufficient data for split. Check SEASON_ID values.")
        return
    
    X_train = train_df[features]
    y_train_reg = train_df[target_reg]
    y_train_clf = train_df['WL_BINARY']
    
    X_test = test_df[features]
    y_test_reg = test_df[target_reg]
    y_test_clf = test_df['WL_BINARY']
    
    # 1. Gradient Boosting Regressor (Predict Margin)
    gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbr.fit(X_train, y_train_reg)
    preds_reg = gbr.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test_reg, preds_reg))
    print(f"Regressor RMSE: {rmse:.2f}")
    
    # 2. Gradient Boosting Classifier (Predict Win/Loss)
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbc.fit(X_train, y_train_clf)
    preds_clf = gbc.predict(X_test)
    accuracy = accuracy_score(y_test_clf, preds_clf)
    print(f"Classifier Accuracy: {accuracy:.2%}")
    print(classification_report(y_test_clf, preds_clf))
    
    # Feature Importance
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': gbc.feature_importances_})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance (Gradient Boosting Classifier)')
    plt.show()
    
    return gbc, gbr

model_clf, model_reg = train_and_backtest(final_df)

## 5. Export & Storage
Save the processed dataset for modeling.

In [None]:
def export_data(df, filename='final_nba_modeling_data.csv'):
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

export_data(final_df)