# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [2]:
test = pd.read_csv('data/nba_boxscores_2020-21.csv')

# Prepare Data

In [3]:
# Load data
seasons = ['2024-25', '2023-24', '2022-23', '2021-22', '2020-21']
data_frames = []

for season in seasons:
    df = pd.read_csv(f'data/nba_boxscores_{season}.csv')
    data_frames.append(df)

data = pd.concat(data_frames, ignore_index=True)

In [4]:
# Sort data based on player and game date
data['gameDate'] = pd.to_datetime(data['gameDate'])

data = data.sort_values('gameDate').reset_index(drop=True)

In [5]:
# Add seconds column
def parse_minutes_to_seconds(time_str):
    if pd.isna(time_str):
        return None
    try:
        # Split by colon
        minutes_part, seconds_part = str(time_str).split(':')
        # Convert minute to float, in case it's like "36.000000"
        minutes = float(minutes_part)
        seconds = int(seconds_part)
        total_seconds = int(minutes * 60 + seconds)
        return total_seconds
    except:
        return None  # if formatting fails
    
data['seconds'] = data['minutes'].apply(parse_minutes_to_seconds)


In [6]:
# Drop rows where player didn't play
data = data[data['seconds'] > 0]

# Feature Engineering

## Player Rolling

In [7]:
def player_rolling(stat, num_games):
    data[f'{stat}_rolling'] = (
        data.groupby('playerSlug')[stat]
            .transform(lambda x: x.shift().rolling(window=num_games).mean())
    )

In [8]:
player_stats = [
    'seconds',
    'fieldGoalsMade',
    'fieldGoalsAttempted',
    'fieldGoalsPercentage',
    'threePointersMade',
    'threePointersAttempted',
    'threePointersPercentage',
    'freeThrowsMade',
    'freeThrowsAttempted',
    'freeThrowsPercentage',
    'reboundsOffensive',
    'reboundsDefensive',
    'reboundsTotal',
    'assists',
    'steals',
    'blocks',
    'turnovers',
    'foulsPersonal',
    'points',
    'plusMinusPoints',
    'estimatedOffensiveRating',
    'offensiveRating',
    'estimatedDefensiveRating',
    'defensiveRating',
    'estimatedNetRating',
    'netRating',
    'assistPercentage',
    'assistToTurnover',
    'assistRatio',
    'offensiveReboundPercentage',
    'defensiveReboundPercentage',
    'reboundPercentage',
    'turnoverRatio',
    'effectiveFieldGoalPercentage',
    'trueShootingPercentage',
    'usagePercentage',
    'estimatedUsagePercentage',
    'estimatedPace',
    'pace',
    'pacePer40',
    'possessions',
    'PIE'
]

for player_stat in player_stats:
    player_rolling(player_stat, 10)

# Team Rolling

In [None]:
def opp_team_rolling(stat, num_games):
    games = data.drop_duplicates(subset=['gameId', 'teamId'])
    games = games[['gameId', 'teamId', stat]]

    games[f'{stat}_rolling'] = (
        games.groupby('teamId')[stat]
            .transform(lambda x: x.shift().rolling(window=num_games).mean())
    )

    merged_df = pd.merge(
        data,
        games,
        left_on=['gameId', 'opp_teamId'],
        right_on=['gameId', 'teamId'],
        how='left'
    )

    merged_df = merged_df.drop(columns=['teamId_y', f'{stat}_y'])

    merged_df = merged_df.rename(columns={
        'teamId_x': 'teamId',
        f'{stat}_x': f'{stat}',
        f'{stat}_rolling': f'opp_{stat}_rolling'
    })

    return merged_df

In [10]:
opp_stats = [
    'team_fieldGoalsMade',
    'team_fieldGoalsAttempted',
    'team_fieldGoalsPercentage',
    'team_threePointersMade',
    'team_threePointersAttempted',
    'team_threePointersPercentage',
    'team_freeThrowsMade',
    'team_freeThrowsAttempted',
    'team_freeThrowsPercentage',
    'team_reboundsOffensive',
    'team_reboundsDefensive',
    'team_reboundsTotal',
    'team_assists',
    'team_steals',
    'team_blocks',
    'team_turnovers',
    'team_foulsPersonal',
    'team_points',
    'team_plusMinusPoints',
    'team_estimatedOffensiveRating',
    'team_offensiveRating',
    'team_estimatedDefensiveRating',
    'team_defensiveRating',
    'team_estimatedNetRating',
    'team_netRating',
    'team_assistPercentage',
    'team_assistToTurnover',
    'team_assistRatio',
    'team_offensiveReboundPercentage',
    'team_defensiveReboundPercentage',
    'team_reboundPercentage',
    'team_estimatedTeamTurnoverPercentage',
    'team_turnoverRatio',
    'team_effectiveFieldGoalPercentage',
    'team_trueShootingPercentage',
    'team_usagePercentage',
    'team_estimatedUsagePercentage',
    'team_estimatedPace',
    'team_pace',
    'team_pacePer40',
    'team_possessions',
    'team_PIE',
]

In [11]:
for opp_stat in opp_stats:
    data = opp_team_rolling(opp_stat, 10)

# Model

In [12]:
combined = player_stats + opp_stats
rolling_features = [f"{feature}_rolling" for feature in combined]

In [13]:
# Select feature to predict
target = "points"

# Select features to train on
features = [
    "seconds_rolling",
    "fieldGoalsAttempted_rolling",
    "freeThrowsAttempted_rolling",
    "points_rolling",
    "offensiveRating_rolling",
    "turnoverRatio_rolling",
    "trueShootingPercentage_rolling",
    "usagePercentage_rolling",
    "possessions_rolling",
    "opp_defensiveRating_rolling",
]

# Drop rows with missing data (or impute if you prefer)
data = data.dropna(subset=features + [target])

# Split into features (X) and target (y)
X = data[features]
y = data[target]

# Split train/test by time order to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

# Train the XGBoost Regressor
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"Test RMSE: {rmse:.2f} points")
print(f"Test MAE: {mae:.2f} points")

# Evaluate baseline (10-game average)
y_baseline_pred = X_test["points_rolling"]
baseline_mse = mean_squared_error(y_test, y_baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)
baseline_mae = mean_absolute_error(y_test, y_baseline_pred)
print(f"10-game Average Baseline RMSE: {baseline_rmse:.2f} points")
print(f"10-game Average Baseline MAE: {baseline_mae:.2f} points")

# Improvement
rmse_improvement = baseline_rmse - rmse
mae_improvement = baseline_mae - mae
print(f"RMSE Improvement over baseline: {rmse_improvement:.2f} points")
print(f"MAE Improvement over baseline: {mae_improvement:.2f} points")

# Plot Feature Importance
plot_importance(model, max_num_features=10)
plt.title("Top 10 Feature Importances")
plt.show()


KeyError: ['opp_defensiveRating_rolling']