In [None]:
import pandas as pd
import numpy as np
from modules import constants
from modules.functions import convert_box_score_dataframe_to_regression_format
from sklearn.linear_model import Ridge
from modules.objects import RegressionHub

# Invert constants.TEAM_NAME_ID_DICT so values become keys; used below to look
# up a team name from the numeric id embedded in the regression column names.
team_id_name_dict = {
    team_id: team_name
    for team_name, team_id in constants.TEAM_NAME_ID_DICT.items()
}

Read in the full data, filtering on D1 games only.

In [None]:
# Load the box scores and reshape them with the project helper.
full_box_df = pd.read_parquet('parquet_files/box_scores_sports_reference_2024.gzip')
reg_df = convert_box_score_dataframe_to_regression_format(full_box_df)

# Regression targets: one Ridge model is fit per column of y_df.
target_columns = ['ortg', 'drtg', 'pace']
y_df = reg_df.loc[:, target_columns].copy()

# Design matrix: everything in reg_df except the targets and the other
# columns that are not used as regressors.
non_feature_columns = ['ortg', 'drtg', 'pace', 'to_pct', 'opp_to_pct', 'game_date']
games_df = reg_df.drop(columns=non_feature_columns).copy()

In [None]:
# Fit one Ridge regression per target column and collect, for each team (plus
# the "home" indicator), its coefficient, the model intercept and their sum.
parameter_dict = {}
master_df = pd.DataFrame()

for column in y_df.columns.tolist():

    # Fit the model for this target and keep the fitted estimator.
    reg = Ridge(alpha=2, fit_intercept=True)
    reg.fit(X=games_df, y=y_df[column].to_numpy())
    parameter_dict[f"{column}_regression"] = reg

    # The displayed intercept is rounded; the adjusted value below uses the
    # unrounded intercept.
    intercept = round(reg.intercept_, 2)

    # One row per regressor, built in a single pass.
    reg_results = {
        "variable": list(games_df.columns),
        f'coef_{column}': list(reg.coef_),
        f'intercept_{column}': [intercept] * len(reg.coef_),
        f'adj_{column}': [coefficient + reg.intercept_ for coefficient in reg.coef_],
    }
    output_df = pd.DataFrame(reg_results)

    # Keep only the "TM..." regressors and the "home" indicator.
    keep_rows = output_df["variable"].str.startswith("TM") | output_df["variable"].eq("home")
    output_df = output_df.loc[keep_rows].copy()

    # The id is the second "_"-separated token of the column name; "home"
    # gets the sentinel id -1, which has no entry in team_id_name_dict and so
    # falls through to the "HOME_COURT_ADVANTAGE" label.
    output_df['tm_id'] = (
        output_df["variable"]
        .map(lambda name: -1 if name == 'home' else name.split("_")[1])
        .astype(int)
    )
    output_df['team_name'] = output_df["tm_id"].map(
        lambda team_id: team_id_name_dict.get(team_id, "HOME_COURT_ADVANTAGE")
    )
    output_df = output_df[['team_name', f'intercept_{column}', f'coef_{column}', f'adj_{column}']]

    # First target seeds the table; later targets are joined on team name.
    master_df = (
        output_df.copy()
        if master_df.empty
        else master_df.merge(output_df, on='team_name')
    )


# Net rating is adjusted offence minus adjusted defence.
master_df['adj_nrtg'] = master_df['adj_ortg'] - master_df['adj_drtg']
parameter_dict['regression_dict'] = master_df.set_index('team_name').to_dict(orient='index')
master_df.sort_values('adj_nrtg', ascending=False)

In [None]:
# Bundle the fitted models and the per-team table into the project's
# RegressionHub, then show the three model intercepts as a quick sanity check.
hub = RegressionHub(**parameter_dict)
intercepts = [
    getattr(hub, f"{metric}_regression").intercept_
    for metric in ("ortg", "drtg", "pace")
]
print(*intercepts)

In [None]:
# Attach model predictions and observed values to games_df for the plots below.
#
# The models were fit on every column games_df had at fit time, so the feature
# matrix is games_df minus the columns this cell appends. Deriving it this way
# (instead of slicing a hard-coded number of leading columns) keeps the cell
# correct when the number of regressors changes, and makes it safe to re-run
# after the result columns already exist.
result_columns = [
    'predicted_ortg', 'actual_ortg',
    'predicted_drtg', 'actual_drtg',
    'predicted_pace', 'actual_pace',
    'actual_nrtg', 'predicted_nrtg',
]
feature_df = games_df.drop(columns=result_columns, errors='ignore')

games_df['predicted_ortg'] = hub.ortg_regression.predict(feature_df)
games_df['actual_ortg'] = y_df.ortg
games_df['predicted_drtg'] = hub.drtg_regression.predict(feature_df)
games_df['actual_drtg'] = y_df.drtg
games_df['predicted_pace'] = hub.pace_regression.predict(feature_df)
games_df['actual_pace'] = y_df.pace

# Net rating = offensive rating - defensive rating, for both observed and predicted.
games_df['actual_nrtg'] = games_df.actual_ortg - games_df.actual_drtg
games_df['predicted_nrtg'] = games_df.predicted_ortg - games_df.predicted_drtg

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of predicted vs. observed offensive rating with a fitted line in red.
fig, ax = plt.subplots(figsize=(16, 8))
sns.regplot(
    x=games_df.predicted_ortg,
    y=games_df.actual_ortg,
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title("Offensive rating - prediction vs observed")
ax.set_xlabel("Predicted Offensive Rating")
ax.set_ylabel("Actual offensive rating")
plt.show()

In [None]:
# Distribution of the absolute net-rating prediction error per game row.
# Computed column-wise; a row-wise DataFrame.apply would materialise every
# (wide) row as a Series just to subtract two values.
plt.figure(figsize=(16,8))
abs_nrtg_error = (games_df.predicted_nrtg - games_df.actual_nrtg).abs()
sns.histplot(abs_nrtg_error)
plt.title("Net rating - absolute prediction error")
plt.xlabel("|Predicted net rating - actual net rating|")
plt.ylabel("Count");