In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score

# Show up to 50 columns when a frame is displayed; wider frames get the
# middle columns elided.
pd.options.display.max_columns = 50

In [2]:
# Load the processed MoneyPuck feature table.
# The first CSV column has no header and holds a running counter (0, 1, 2, ...
# in the preview), so without index_col it loads as a spurious "Unnamed: 0"
# data column. It is presumably the row index that was written out with the
# file; read it back as the index instead.
df = pd.read_csv(
    "../data/processed/moneypuck_features_final.csv",
    index_col=0,
)
df.head()

Unnamed: 0,team,season,name,gameId,playerTeam,opposingTeam,home_or_away,gameDate,position,situation,xGoalsPercentage,corsiPercentage,fenwickPercentage,iceTime,xOnGoalFor,xGoalsFor,xReboundsFor,xFreezeFor,xPlayStoppedFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,flurryAdjustedxGoalsFor,scoreVenueAdjustedxGoalsFor,flurryScoreVenueAdjustedxGoalsFor,shotsOnGoalFor,...,highDangerShotsAgainst,lowDangerxGoalsAgainst,mediumDangerxGoalsAgainst,highDangerxGoalsAgainst,lowDangerGoalsAgainst,mediumDangerGoalsAgainst,highDangerGoalsAgainst,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,playoffGame,days_rest,back_to_back,rolling_goals_5,rolling_xg_5,fatigue_penalty,fatigue_adj_goals,fatigue_adj_xg
0,ANA,2017,ANA,2017020011,ANA,ARI,HOME,2017-10-05,Team Level,all,0.4752,0.5271,0.57,3600.0,40.89,3.632,2.791,9.51,1.307,22.583,17.177,3.533,3.461,3.364,41.0,...,5.0,0.8,1.087,2.123,1.0,1.0,2.0,64.86,43.0,45.608,10.0,0.518,1.251,1.251,3.278,3.43,3.386,0,3.0,0,,,1.0,,
1,ANA,2017,ANA,2017020029,ANA,PHI,HOME,2017-10-07,Team Level,all,0.3855,0.4579,0.4625,3644.0,25.656,1.617,1.63,5.922,0.859,14.976,11.997,1.608,1.548,1.54,23.0,...,4.0,0.843,0.475,1.26,2.0,1.0,0.0,59.958,43.0,44.544,2.0,0.481,0.533,0.533,2.526,2.609,2.436,0,1.0,0,,,1.0,,
2,ANA,2017,ANA,2017020038,ANA,CGY,HOME,2017-10-09,Team Level,all,0.5044,0.5493,0.5688,3600.0,41.493,3.616,3.125,9.367,1.565,25.928,18.4,3.499,3.511,3.396,43.0,...,4.0,0.805,0.885,1.863,0.0,1.0,1.0,67.842,47.0,49.699,6.0,0.485,0.0,0.0,4.038,4.212,3.999,0,1.0,0,,,1.0,,
3,ANA,2017,ANA,2017020049,ANA,NYI,HOME,2017-10-11,Team Level,all,0.5105,0.4403,0.47,3600.0,32.193,2.64,2.238,8.067,1.109,18.42,14.527,2.584,2.621,2.566,30.0,...,1.0,1.33,0.958,0.243,1.0,0.0,1.0,74.427,53.0,52.362,9.0,0.575,0.469,0.469,2.638,2.678,2.645,0,1.0,0,,,1.0,,
4,ANA,2017,ANA,2017020060,ANA,COL,AWAY,2017-10-13,Team Level,all,0.1913,0.3197,0.3372,3600.0,18.38,0.947,1.141,4.28,0.651,11.013,10.968,0.943,0.964,0.96,18.0,...,3.0,0.994,1.703,1.305,2.0,0.0,1.0,83.03,57.0,57.309,1.0,0.663,0.31,0.31,4.356,4.329,4.26,0,1.0,0,,,1.0,,


In [3]:
# Predictor columns used by both models below. The rolling_* and
# fatigue_adj_* columns are NaN in the first rows of the preview, so the
# models need an imputation step in front of them.
features = [
    "days_rest", "back_to_back",
    "rolling_goals_5", "rolling_xg_5",
    "fatigue_adj_goals", "fatigue_adj_xg",
]

# Regression target (goals scored) and the matching design matrix.
y = df.loc[:, "goalsFor"]
X = df.loc[:, features]

In [4]:
# 80/20 hold-out for the goalsFor regression; the fixed seed keeps the
# partition reproducible across runs.
# NOTE(review): this is a shuffled split, not a cut by gameDate — confirm a
# non-chronological hold-out is what is wanted for game-level rows.
regression_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = regression_split

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Regression model: replace missing feature values with the per-column mean
# learned at fit time, then fit an ordinary linear regression.
reg_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("model", LinearRegression()),
    ]
)

In [7]:
# Fit the imputer and the linear model on the training split. fit() returns
# the pipeline itself, which is what the cell displays.
reg_pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('imputer', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Score the regression on the held-out rows.
y_pred = reg_pipeline.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the MSE, so the error is in goals
r2 = r2_score(y_true=y_test, y_pred=y_pred)

(rmse, r2)

(np.float64(1.7330926815477532), 0.010785205183687752)

In [10]:
# Binary label for the classifier: 1 when the team scored strictly more than
# its opponent, otherwise 0 (rows with equal goalsFor and goalsAgainst get 0).
# This adds the "win" column to df in place.
df["win"] = df["goalsFor"].gt(df["goalsAgainst"]).astype(int)

In [11]:
# Same feature matrix, same 80/20 seeded split, but the target is now the
# win label. These assignments replace the regression split that was held in
# the same four names.
classification_split = train_test_split(
    X, df["win"], test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = classification_split

In [13]:
# Win classifier: mean-impute missing feature values, then logistic
# regression with the iteration cap set to 1000.
clf_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

In [14]:
# Train the win classifier, then evaluate it on the held-out rows.
clf_pipeline.fit(X=X_train, y=y_train)

# Hard class predictions, plus the second predict_proba column as the score
# for ROC AUC (the labels are 0/1, so that column belongs to win == 1).
win_pred = clf_pipeline.predict(X_test)
win_proba = clf_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_true=y_test, y_pred=win_pred)
auc = roc_auc_score(y_true=y_test, y_score=win_proba)

(accuracy, auc)

(0.553575508339045, 0.5553529369855665)