In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [2]:
#load some data
base_url = "https://raw.githubusercontent.com/NocturneBear/NBA-Data-2010-2024/main/"
files = [
    "regular_season_box_scores_2010_2024_part_1.csv",
    "regular_season_box_scores_2010_2024_part_2.csv",
    "regular_season_box_scores_2010_2024_part_3.csv",
    "play_off_box_scores_2010_2024.csv"
]
dfs = [pd.read_csv(base_url + f) for f in files]


playoffs = dfs[3]
rs1 = dfs[0]
rs2 = dfs[1]
rs3 = dfs[2]




In [32]:
rs1

Unnamed: 0,season_year,game_date,gameId,matchup,teamId,teamCity,teamName,teamTricode,teamSlug,personId,...,reboundsOffensive,reboundsDefensive,reboundsTotal,assists,steals,blocks,turnovers,foulsPersonal,points,plusMinusPoints
0,2010-11,2010-11-10,21000112,NJN @ CLE,1610612751,New Jersey,Nets,NJN,nets,693,...,0,0,0,0,0,0,0,0,0,-4
1,2010-11,2010-11-17,21000165,NJN @ UTA,1610612751,New Jersey,Nets,NJN,nets,693,...,0,0,0,0,0,0,0,0,0,0
2,2010-11,2010-11-27,21000237,NJN @ PHI,1610612751,New Jersey,Nets,NJN,nets,693,...,0,0,0,0,0,0,0,0,0,0
3,2010-11,2010-12-12,21000351,NJN vs. LAL,1610612751,New Jersey,Nets,NJN,nets,693,...,0,0,0,0,0,0,0,0,0,0
4,2010-11,2010-10-29,21000020,NJN vs. SAC,1610612751,New Jersey,Nets,NJN,nets,693,...,0,1,1,1,0,0,1,1,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141488,2010-11,2010-12-17,21000389,HOU vs. MEM,1610612745,Houston,Rockets,HOU,rockets,2449,...,3,6,9,2,0,1,6,2,23,39
141489,2011-12,2012-03-02,21100534,HOU vs. DEN,1610612745,Houston,Rockets,HOU,rockets,2449,...,1,3,4,1,0,0,2,1,13,-4
141490,2010-11,2010-11-17,21000164,HOU @ OKC,1610612745,Houston,Rockets,HOU,rockets,2449,...,2,6,8,0,0,0,3,3,26,-8
141491,2011-12,2012-02-08,21100383,HOU @ POR,1610612745,Houston,Rockets,HOU,rockets,2449,...,1,1,2,2,0,1,4,3,12,-5


In [18]:

regular = pd.concat([rs1, rs2, rs3], ignore_index=True)
regular['is_playoff'] = 0
playoffs['is_playoff'] = 1

df = pd.concat([regular, playoffs], ignore_index=True)

#format
df.drop(columns=['jerseyNum', 'comment', 'position', 'matchup'], inplace=True)
df['game_date'] = pd.to_datetime(df['game_date'], errors='coerce')

def convert_minutes(min_str):
    try:
        if pd.isna(min_str): return 0
        parts = min_str.split(':')
        return int(parts[0]) + int(parts[1]) / 60 if len(parts) == 2 else 0
    except: return 0

df['minutes_played'] = df['minutes'].apply(convert_minutes)
df.drop(columns=['minutes'], inplace=True)
df['season'] = df['season_year'].str[:4].astype(int)

#data aggs
agg = df.groupby(['personId', 'personName', 'season', 'is_playoff']).agg({
    'gameId': 'count',
    'minutes_played': 'sum',
    'points': 'sum',
    'assists': 'sum',
    'reboundsTotal': 'sum',
    'fieldGoalsMade': 'sum',
    'fieldGoalsAttempted': 'sum',
    'threePointersMade': 'sum',
    'freeThrowsMade': 'sum',
    'turnovers': 'sum',
    'plusMinusPoints': 'sum'
}).rename(columns={'gameId': 'games_played'}).reset_index()


pivoted = agg.pivot_table(index=['personId', 'personName', 'season'], columns='is_playoff',
                          values=['points', 'minutes_played', 'assists', 'reboundsTotal'], aggfunc='sum')
pivoted.columns = ['_'.join(map(str, col)) for col in pivoted.columns]
pivoted = pivoted.dropna()

#per min stats
pivoted['ppm_regular'] = pivoted['points_0'] / pivoted['minutes_played_0']
pivoted['ppm_playoff'] = pivoted['points_1'] / pivoted['minutes_played_1']
pivoted['residual_ppm'] = pivoted['ppm_playoff'] - pivoted['ppm_regular']


#resids
residual_series = pivoted.sort_values(['season', 'personId'])['residual_ppm'].reset_index(drop=True).dropna()

In [34]:
agg

Unnamed: 0,personId,personName,season,is_playoff,games_played,minutes_played,points,assists,reboundsTotal,fieldGoalsMade,fieldGoalsAttempted,threePointersMade,freeThrowsMade,turnovers,plusMinusPoints
0,255,Grant Hill,2010,0,80,2408.900000,1056,200,338,399,825,45,213,134,49
1,255,Grant Hill,2011,0,65,1377.716667,499,107,171,201,451,14,83,65,109
2,255,Grant Hill,2012,0,46,436.683333,93,26,49,38,98,3,14,25,1
3,255,Grant Hill,2012,1,6,20.133333,4,2,4,2,4,0,0,1,0
4,406,Shaquille O'Neal,2010,0,53,751.600000,341,26,178,134,201,0,73,56,193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10402,1641936,Miles Norris,2023,0,1,0.000000,0,0,0,0,0,0,0,0,0
10403,1641945,Jaylin Galloway,2023,0,3,0.000000,0,0,0,0,0,0,0,0,0
10404,1641970,Maozinha Pereira,2023,0,9,121.633333,48,2,37,18,35,5,7,5,9
10405,1641998,Trey Jemison,2023,0,30,573.983333,171,27,134,75,136,0,21,34,-110


In [21]:
#adf tests
adf_stat, adf_p = adfuller(residual_series)[:2]
print(f"\nADF Test:\nStat = {adf_stat:.4f}, p = {adf_p:.4f}")

#stationary :0


ADF Test:
Stat = -53.5442, p = 0.0000


In [26]:




#diff
series_diff = residual_series.diff().dropna() if adf_p > 0.05 else residual_series

# armima pqd
best_aic = np.inf
best_order = None
for p in range(4):
    for q in range(4):
        try:
            model = ARIMA(residual_series, order=(p, 1, q)).fit()
            if model.aic < best_aic:
                best_aic, best_order, best_model = model.aic, (p, 1, q), model
        except:
            continue

print(f"\nBest ARIMA order: {best_order} with AIC: {best_aic:.2f}")




Best ARIMA order: (0, 1, 1) with AIC: -1600.38


In [30]:
#garch lol
returns = series_diff.pct_change().replace([np.inf, -np.inf], np.nan).dropna() * 100
garch = arch_model(returns, vol='GARCH', p=1, q=1)
garch_fitted = garch.fit(disp='off')
print("\nGARCH Summary:\n", garch_fitted.summary())




GARCH Summary:
                      Constant Mean - GARCH Model Results                      
Dep. Variable:           residual_ppm   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:               -28255.1
Distribution:                  Normal   AIC:                           56518.2
Method:            Maximum Likelihood   BIC:                           56542.1
                                        No. Observations:                 2963
Date:                Mon, Mar 31 2025   Df Residuals:                     2962
Time:                        02:39:05   Df Model:                            1
                                 Mean Model                                
                 coef    std err          t      P>|t|     95.0% Conf. Int.
---------------------------------------------------------------------------
mu           -58.9071     61.641     -0.956 

In [31]:
#lasso garch x (prof loves this)
X = pivoted[['assists_0', 'reboundsTotal_0']].reset_index(drop=True).loc[series_diff.index]
y = returns.loc[X.index]
lasso = LassoCV(cv=5).fit(X, y)
selected = np.where(lasso.coef_ != 0)[0]
selected_features = X.columns[selected]
print(f"\nSelected features by Lasso: {selected_features.tolist()}")

exog = X.iloc[:, selected]
garchx = arch_model(y, x=exog, vol='GARCH', p=1, q=1)
garchx_fitted = garchx.fit(disp='off')
print("\nGARCH-X Summary:\n", garchx_fitted.summary())

# r2
r2_arima = r2_score(residual_series[1:], best_model.predict(start=1, end=len(residual_series)-1))
print(f"\nARIMA R2: {r2_arima:.4f}")

# arima fit
plt.figure(figsize=(12, 6))
plt.plot(residual_series, label='Observed residual PPM')
plt.plot(best_model.predict(start=1, end=len(residual_series)-1), label='ARIMA Predicted')
plt.title("ARIMA Fit on Residual PPM (Playoff - Regular)")
plt.legend()
plt.tight_layout()
plt.show()

KeyError: '[0, 757] not in index'