# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Configuration

In [2]:
# Season labels to load, newest first: '2024-25' back to '2015-16'.
seasons = [f"{start}-{(start + 1) % 100:02d}" for start in range(2024, 2014, -1)]

In [3]:
# Name of the column being predicted; also used to build the "<target>_rolling" column name.
target = "points"

# Prepare Data

In [4]:
# Load data: read each season's processed CSV, then stack them into one frame
season_paths = [f"data/{season}/processed_{season}.csv" for season in seasons]
data_frames = [pd.read_csv(path) for path in season_paths]

# ignore_index=True gives the combined frame a fresh 0..n-1 index
data = pd.concat(data_frames, ignore_index=True)

In [5]:
# Order all rows chronologically by game date (rows are NOT grouped by player)
data['gameDate'] = pd.to_datetime(data['gameDate'])

# ignore_index=True renumbers the rows 0..n-1 after sorting
data = data.sort_values(by='gameDate', ignore_index=True)

In [6]:
# Add seconds column
def parse_minutes_to_seconds(time_str):
    """Convert a "minutes:seconds" value to a whole number of seconds.

    Parameters
    ----------
    time_str : str or None
        Value such as "36:12". The minutes part may be float-formatted
        (e.g. "36.000000:12"). Non-string values are converted with str().

    Returns
    -------
    int or None
        Total seconds (truncated to an int), or None when the value is
        missing (per ``pd.isna``) or is not in "minutes:seconds" form.
    """
    if pd.isna(time_str):
        return None
    try:
        # Exactly one colon is expected; anything else fails the unpacking.
        minutes_part, seconds_part = str(time_str).split(':')
        # Minutes go through float() because they can look like "36.000000".
        minutes = float(minutes_part)
        seconds = int(seconds_part)
        return int(minutes * 60 + seconds)
    except (ValueError, OverflowError):
        # Malformed or non-finite value: treat it as missing. Only parsing
        # errors are caught, so KeyboardInterrupt/SystemExit and genuine
        # bugs are no longer silently swallowed.
        return None
    
# Parse every value of the 'minutes' column into whole seconds (None where unparseable)
data['seconds'] = data['minutes'].map(parse_minutes_to_seconds)

# Add Rolling Averages

In [7]:
def player_rolling(stat, num_games, df=None):
    """Add a per-player rolling mean of `stat` over the previous `num_games` rows.

    The result is written in place to a new column named "<stat>_rolling".
    Each value is shifted by one row within the player's group, so the
    current row's own `stat` is never part of its rolling mean.

    Parameters
    ----------
    stat : str
        Name of the column to average.
    num_games : int
        Rolling window size. ``min_periods`` is left at its default (the
        window size), so a row gets NaN until the player has `num_games`
        earlier rows.
    df : pandas.DataFrame, optional
        Frame to modify; must contain 'playerSlug' and `stat`. When omitted,
        the notebook-level ``data_copy`` frame is used, matching the original
        two-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        The same frame that was modified.

    Notes
    -----
    The frame is not sorted here. Rows must already be in chronological
    order within each player for "previous rows" to mean "previous games".
    """
    if df is None:
        # Backward-compatible fallback to the notebook's working frame.
        df = data_copy

    df[f"{stat}_rolling"] = (
        df.groupby('playerSlug')[stat]
            .transform(lambda x: x.shift().rolling(window=num_games).mean())
    )
    return df

# Baseline

In [8]:
# Track the best-scoring window seen so far.
best_window = None
best_rmse = best_mae = float('inf')

# Column that player_rolling() writes for the target stat.
rolling_col = f"{target}_rolling"

# NOTE(review): dropna runs before the split, so every window is scored on a
# different set of rows, and the winning window is chosen on the test slice
# itself. Treat the "best" numbers as optimistic.
for i in range(1, 82):
    # Start from the full data each pass: the previous pass dropped rows.
    data_copy = data.copy()
    player_rolling(target, i)

    # Keep only rows that have both a rolling mean and a target value.
    data_copy = data_copy.dropna(subset=[rolling_col, target])

    X, y = data_copy[[rolling_col]], data_copy[target]
    # shuffle=False keeps row order, so the test set is the trailing slice.
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    # Baseline "model": the prediction is simply the rolling mean itself.
    y_baseline_pred = X_test[rolling_col]
    baseline_mse = mean_squared_error(y_test, y_baseline_pred)
    baseline_rmse = np.sqrt(baseline_mse)
    baseline_mae = mean_absolute_error(y_test, y_baseline_pred)

    print(f"Window {i}: RMSE = {baseline_rmse:.2f}, MAE = {baseline_mae:.2f}")

    # Strict '<' keeps the smallest window when RMSE ties.
    if baseline_rmse < best_rmse:
        best_window, best_rmse, best_mae = i, baseline_rmse, baseline_mae

print(
    "\nBest Result:",
    f"Best Window Size: {best_window}",
    f"Best RMSE: {best_rmse:.2f}",
    f"Best MAE: {best_mae:.2f}",
    sep="\n",
)


Window 1: RMSE = 7.61, MAE = 5.22
Window 2: RMSE = 6.74, MAE = 4.71
Window 3: RMSE = 6.46, MAE = 4.56
Window 4: RMSE = 6.33, MAE = 4.49
Window 5: RMSE = 6.25, MAE = 4.45
Window 6: RMSE = 6.20, MAE = 4.43
Window 7: RMSE = 6.17, MAE = 4.42
Window 8: RMSE = 6.16, MAE = 4.43
Window 9: RMSE = 6.15, MAE = 4.43
Window 10: RMSE = 6.14, MAE = 4.44
Window 11: RMSE = 6.14, MAE = 4.44
Window 12: RMSE = 6.14, MAE = 4.45
Window 13: RMSE = 6.14, MAE = 4.46
Window 14: RMSE = 6.14, MAE = 4.46
Window 15: RMSE = 6.14, MAE = 4.48
Window 16: RMSE = 6.15, MAE = 4.49
Window 17: RMSE = 6.16, MAE = 4.50
Window 18: RMSE = 6.17, MAE = 4.51
Window 19: RMSE = 6.18, MAE = 4.52
Window 20: RMSE = 6.19, MAE = 4.53
Window 21: RMSE = 6.20, MAE = 4.54
Window 22: RMSE = 6.20, MAE = 4.55
Window 23: RMSE = 6.21, MAE = 4.56
Window 24: RMSE = 6.22, MAE = 4.57
Window 25: RMSE = 6.22, MAE = 4.58
Window 26: RMSE = 6.23, MAE = 4.59
Window 27: RMSE = 6.24, MAE = 4.60
Window 28: RMSE = 6.25, MAE = 4.61
Window 29: RMSE = 6.26, MAE =