# Imports

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

# Loading Data

In [5]:
season_2024_25 = pd.read_csv('data/nba_boxscores_2024-25.csv')

In [None]:
season_2023_24 = pd.read_csv('data/nba_boxscores_2023-24.csv')

In [None]:
season_2022_23 = pd.read_csv('data/nba_boxscores_2022-23.csv')

In [None]:
data = pd.concat([season_2024_25, season_2023_24, season_2022_23], ignore_index=True)

# Feature Engineering

## Rolling 5-Game Averages

In [None]:
# Convert GAME_DATE to datetimes so the sort below is chronological.
data['GAME_DATE'] = pd.to_datetime(data['GAME_DATE'])

# Order rows by player, then by date, and renumber the index from 0.
data = data.sort_values(by=['PLAYER_NAME', 'GAME_DATE'], ignore_index=True)

In [None]:
def parse_minutes_to_seconds(time_str):
    """Convert a "minutes:seconds" value to a whole number of seconds.

    The minutes part may carry a fractional component (for example
    "36.000000:25"), so it is parsed as a float; the seconds part must be an
    integer literal.

    Args:
        time_str: The value to parse. It is converted with ``str()`` before
            being split on ":".

    Returns:
        The total number of seconds as an ``int``, or ``None`` when the value
        is missing (``pd.isna``) or is not in the "M:S" form.
    """
    if pd.isna(time_str):
        return None
    try:
        minutes_part, seconds_part = str(time_str).split(':')
        # float() first: the minutes part can look like "36.000000".
        return int(float(minutes_part) * 60 + int(seconds_part))
    except (ValueError, OverflowError):
        # ValueError: wrong number of ":" parts or non-numeric text ("nan" too).
        # OverflowError: an infinite minutes value cannot be turned into an int.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None
    
# Derive a SEC column by parsing every MIN value into total seconds.
minutes_column = data['MIN']
data['SEC'] = minutes_column.apply(parse_minutes_to_seconds)


In [None]:
def rolling_5(stat):
    """Add a ``<stat>_rolling_5`` column to the module-level ``data`` frame.

    Rows are grouped on ``PLAYER_NAME``. Within each group the ``stat`` values
    are shifted down by one row and then averaged over a 5-row window, so a
    row's own value never contributes to its average. Rows whose window is
    not yet full come out as NaN. Rows are used in the frame's current order.

    Args:
        stat: Name of the column in ``data`` to average.
    """
    def prior_five_mean(series):
        # shift() drops the current row from the window before averaging.
        return series.shift().rolling(window=5).mean()

    per_player = data.groupby('PLAYER_NAME')[stat]
    data[f'{stat}_rolling_5'] = per_player.transform(prior_five_mean)

In [None]:
# Add a rolling-5 column for each stat, in the same order as before.
for stat_name in ('PTS', 'FGM', 'FG_PCT', 'SEC', 'FGA', 'FG3M', 'FT_PCT', 'AST'):
    rolling_5(stat_name)

## Opponent

# Model

In [None]:
# Keep only the rows belonging to the one player being modelled.
player_data = data.loc[data['PLAYER_NAME'] == 'Austin Reaves']

In [None]:
# Column the model learns to predict
target = "PTS"

# Input columns: the rolling-5 averages built earlier
features = [
    "PTS_rolling_5",
    "FGM_rolling_5",
    "FG_PCT_rolling_5",
    "SEC_rolling_5",
    "FGA_rolling_5",
    "FG3M_rolling_5",
    "FT_PCT_rolling_5",
    "AST_rolling_5"
]

# Discard any row that lacks an input value or the target
player_data = player_data.dropna(subset=[*features, target])

# Separate the inputs (X) from the value being predicted (y)
X, y = player_data[features], player_data[target]

# shuffle=False keeps the existing row order, so the test set is the tail
# of the rows rather than a random sample
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

# Configure and fit the XGBoost regressor
hyperparameters = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model = XGBRegressor(**hyperparameters)
model.fit(X_train, y_train)

# Score the held-out rows; RMSE is in the same units as the target
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Test RMSE: {rmse:.2f} points")

# Chart the features the fitted model ranks highest
plot_importance(model, max_num_features=10)
plt.title("Top 10 Feature Importances")
plt.show()