---
title: "STAT468 Final Project"
author: "Devak Shah"
format: html
toc: true
number-sections: true
jupyter: python3
---

In [3]:
import TopDownHockey_Scraper.TopDownHockey_NHL_Scraper as tdhnhlscrape
import TopDownHockey_Scraper.TopDownHockey_EliteProspects_Scraper as tdhepscrape
from nhlpy import NHLClient

Welcome to the TopDownHockey NHL Scraper, built by Patrick Bacon.
If you enjoy the scraper and would like to support my work, or you have any comments, questions, or concerns, feel free to follow me on Twitter @TopDownHockey or reach out to me via email at patrick.s.bacon@gmail.com. Have fun!
Welcome to the TopDownHockey EliteProspects Scraper, built by Patrick Bacon.
This scraper is built strictly for personal use. For commercial or professional use, please look into the EliteProspects API.
If you enjoy the scraper and would like to support my work, feel free to follow me on Twitter @TopDownHockey. Have fun!


In [4]:
import pandas as pd
from datetime import date
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LogisticRegression

In [5]:
# Toggle: set to True to re-run the scraping cells (which also rewrite `filename`);
# leave False to skip them and load the result of a prior scrape from `filename`.
scrape = False
# Excel workbook that the scraping cells write and the cleaning cell reads.
filename = "regression_input.xlsx"

In [6]:
# Get OHL player data: season stats joined with player bio (dob, height, weight, shoots)

if scrape == True:
    #Can't include all years because of backend data issues in some years
    #years = ["2004-2005", "2006-2007", "2007-2008", "2008-2009", "2009-2010", "2010-2011", "2011-2012", "2012-2013", "2013-2014", 
    #         "2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019"]

    years = ["2016-2017", "2013-2014"]

    # Collect one frame per season and concatenate once after the loop.
    # Growing a DataFrame with pd.concat on every iteration re-copies all
    # earlier rows, and concatenating onto an empty frame is deprecated.
    season_frames = []

    for year in years:
        # NOTE(review): only the first 10 rows returned by the scraper are kept -
        # confirm this cap is intended and not a leftover from debugging.
        df = tdhepscrape.get_skaters(("ohl"), (year))[0:10]

        # GET PLAYER INFO (requested before filtering, so it covers every row in df)
        info = tdhepscrape.get_player_information(df)

        # GET RID OF D-MEN: drop rows whose parenthesised position text contains "D".
        # The pattern has no capture group, which avoids pandas' "pattern has match
        # groups" warning; .copy() detaches the filtered frame so the column
        # assignments below do not raise SettingWithCopyWarning.
        df = df[~df['player'].str.contains(r'\([^)]*D[^)]*\)', regex=True)].copy()

        # GET RID OF PLAYER POSITIONS FROM NAMES
        df['player'] = df['player'].str.replace(r'\s*\([^)]*\)', '', regex=True)

        # ADD YEAR TO DF
        df.insert(0, "year", year)

        # JOIN PLAYER BIO WITH STATS (inner join: players without bio info are dropped)
        year_output = pd.merge(df[["year", "player", "gp", "g", "a", "tp"]], info[["player", "dob", "height", "weight", "shoots"]], on='player', how='inner')

        # ADD CURRENT YEAR PROSPECTS TO THE LIST OF SEASON FRAMES
        season_frames.append(year_output)

    # Same result as the incremental concat: rows in season order, original indices kept.
    aggregated_output = pd.concat(season_frames) if season_frames else pd.DataFrame()



In [7]:
# The scraped counting stats are object columns, with "-" standing in for a
# missing value. Convert each one to int, treating "-" as 0.
if scrape == True:
    for stat_col in ("gp", "g", "a", "tp"):
        aggregated_output[stat_col] = aggregated_output[stat_col].replace("-", 0).astype(int)

In [8]:
# Derive the draft year that corresponds to each season: everything after the
# first five characters of the season label (i.e. 2024-2025 has a draft year of 2025).
# Note this isn't necessarily the player's own draft year - it is just the draft
# year that matches the row's season.

if scrape == True:
    season_end_year = aggregated_output["year"].str[5:]
    aggregated_output["draft_year"] = season_end_year.astype(int)

In [9]:
# Getting all NHL players drafted from 2005 - 2020
if scrape == True:
    years = list(range(2005, 2021))

    # One frame per draft year, concatenated once after the loop instead of
    # re-copying the accumulated frame on every iteration.
    yearly_picks = []

    for year in years:
        # match="Round" keeps only tables whose text contains "Round"
        df_list = pd.read_html(f"https://www.hockey-reference.com/draft/NHL_{year}_entry.html", match="Round")

        players_drafted = df_list[0]

        # Let's get rid of the top header level that isn't really used
        players_drafted.columns = players_drafted.columns.get_level_values(1)

        # Keep only the player name, tagged with the year they were drafted
        players_drafted = (
            players_drafted[["Player"]]
            .rename(columns={"Player": "player"})
            .assign(player_draft_year=year)
            [["player_draft_year", "player"]]
        )
        yearly_picks.append(players_drafted)

    draftyears = pd.concat(yearly_picks)

In [10]:
# Getting games played for all NHL players
if scrape == True:
    pages = list(range(1, 80))

    # One frame per results page, concatenated once after the loop instead of
    # re-copying the accumulated frame on every iteration.
    page_frames = []

    for page in pages:
        df_list = pd.read_html(f"https://www.eliteprospects.com/league/nhl/stats/all-time?page={page}")
        # NOTE(review): index 2 is presumably the stats table on the page - confirm
        page_stats = df_list[2]
        page_frames.append(page_stats[["Player", "GP"]])

    nhl_gp = pd.concat(page_frames).rename(columns={"Player": "player", "GP": "nhl_gp"})

    #GET RID OF PLAYER POSITIONS FROM NAMES
    nhl_gp['player'] = nhl_gp['player'].str.replace(r'\s*\([^)]*\)', '', regex=True)

    # Replacing NA and "-" values with 0
    nhl_gp["nhl_gp"] = nhl_gp["nhl_gp"].fillna(0).replace("-", 0)

In [11]:
# Join NHL games played and draft year onto the OHL player data, then save it
if scrape == True:
    # Upper-case every player name so the joins below are not case sensitive
    for names_frame in (aggregated_output, draftyears, nhl_gp):
        names_frame['player'] = names_frame['player'].str.upper()

    # Inner join on (player, draft year): keeps only the rows where the season's
    # draft year equals the year the player was drafted. This removes
    # a) undrafted players and b) drafted players' non-draft-year seasons.
    drafted_seasons = pd.merge(
        aggregated_output,
        draftyears,
        left_on=['player', 'draft_year'],
        right_on=['player', 'player_draft_year'],
        how='inner',
    )

    # Both draft year columns are equal after the join - keep only one of them
    kept_columns = ["year", "player", "gp", "g", "a", "tp", "dob", "height", "weight", "shoots", 'draft_year']
    drafted_seasons = drafted_seasons[kept_columns]

    # Left join the NHL games played; a player with no match is assumed to have 0.
    # NOTE(review): the key is the player name alone, so two NHL players who share
    # a name would each match and duplicate the row - confirm this is acceptable.
    df = pd.merge(drafted_seasons, nhl_gp, on='player', how='left')
    df["nhl_gp"] = df["nhl_gp"].fillna(0).astype(int)

    df.to_excel(filename, index=False)

In [12]:
# SOME DATA CLEANING
# Reads the saved scrape and derives the model inputs: weight_kg, age_days,
# per-game rates and the 200-game indicator.

df = pd.read_excel(filename)

# Changing the weight to a numerical variate in kg.
# Take the first run of digits instead of slicing up to the first space:
# str.find returns -1 when there is no space, which silently chopped the last
# character off the value.
df["weight_kg"] = df["weight"].astype(str).str.extract(r"(\d+)", expand=False).astype(int)

# Getting the age of the player at the time of draft, in days
# (for simplicity, we will assume the draft to be on June 30 for all years)
draft_date = pd.to_datetime(df["draft_year"].astype(str) + '-06-30')
df["age_days"] = (draft_date - pd.to_datetime(df["dob"])).dt.days

# Can get rid of the columns that were only needed to derive the ones above
df = df.drop(["dob", "weight", "draft_year"], axis = 1)

# Adding columns for goals/g and points/g
df["gpg"] = df["g"] / df["gp"]
df["ppg"] = df["tp"] / df["gp"]

# Indicator for whether the player has played at least 200 NHL games.
# The threshold is inclusive (>= 200) even though the column label reads "> 200";
# the label is left alone because other cells look the column up by this name.
# The column stays boolean: the earlier `.astype(int)` call discarded its result,
# so downstream cells and the pinned data already work with True/False.
df["Pr[GP > 200]"] = df["nhl_gp"] >= 200

# Renaming height column to height_cm for clarity
df = df.rename(columns = {"height": "height_cm"})
df

Unnamed: 0,year,player,gp,g,a,tp,height_cm,shoots,nhl_gp,weight_kg,age_days,gpg,ppg,Pr[GP > 200]
0,2006-2007,PATRICK KANE,58,62,83,145,178,L,1302,80,6797,1.068966,2.500000,True
1,2006-2007,SAM GAGNER,53,35,83,118,180,R,1043,89,6533,0.660377,2.226415,True
2,2006-2007,BRETT MACLEAN,68,47,53,100,188,R,18,89,6762,0.691176,1.470588,False
3,2006-2007,STEFAN LEGEIN,64,43,32,75,178,R,0,77,6792,0.671875,1.171875,False
4,2006-2007,ZACK TORQUATO,65,30,39,69,183,R,0,88,6596,0.461538,1.061538,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,2019-2020,DECLAN MCDONNELL,63,21,21,42,178,R,0,86,6700,0.333333,0.666667,False
284,2019-2020,ANTONIO STRANGES,61,19,21,40,180,L,0,84,6720,0.311475,0.655738,False
285,2019-2020,TANNER DICKINSON,64,9,31,40,183,L,0,80,6692,0.140625,0.625000,False
286,2019-2020,MARTIN CHROMIAK,28,11,22,33,183,R,0,86,6524,0.392857,1.178571,False


In [13]:
# Fit the logistic regression on the training seasons.
# X: predictors (with an intercept column), y: binary response
train_seasons = ["2004-2005", "2006-2007", "2007-2008", "2008-2009", "2009-2010", 
                 "2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019"]
predictor_cols = ['g', 'a', 'tp', 'height_cm', 'weight_kg', 'age_days', 'gpg', 'ppg']

df_regress = df[df["year"].isin(train_seasons)]
X = sm.add_constant(df_regress[predictor_cols])  # adds intercept
y = df_regress['Pr[GP > 200]']

# NOTE(review): in the rows displayed above tp equals g + a, so g, a and tp look
# linearly dependent. The recorded fit reports `converged: False` with nan
# standard errors, which is consistent with that. Dropping one of the three
# would also require changing the test cell and re-pinning the model.
model = sm.Logit(y, X)
result = model.fit()

print(result.summary())

         Current function value: 0.488150
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:           Pr[GP > 200]   No. Observations:                  167
Model:                          Logit   Df Residuals:                      158
Method:                           MLE   Df Model:                            8
Date:                Mon, 04 Aug 2025   Pseudo R-squ.:                  0.2244
Time:                        20:14:31   Log-Likelihood:                -81.521
converged:                      False   LL-Null:                       -105.11
Covariance Type:            nonrobust   LLR p-value:                 1.424e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7392     10.224     -0.072      0.942     -20.778      19.299
g             -0.0991        nan        nan        nan         na



In [14]:
# Testing model out on the seasons that were held out of the fit.
# .copy() makes df_test its own frame; adding a column to a filtered slice of
# df raises SettingWithCopyWarning and may not behave as intended.
df_test = df[df["year"].isin(["2010-2011", "2011-2012", "2012-2013", "2013-2014"])].copy()
X = df_test[['g', 'a', 'tp', 'height_cm', 'weight_kg', 'age_days', 'gpg', 'ppg']]
# has_constant='add' forces the intercept column even if a predictor happens to be constant
X = sm.add_constant(X, has_constant='add')  # adds intercept

# If regression yields a probability greater than or equal to 0.5, we will say it predicts the player will become a full-time NHL player.
# Despite its name, pred_prob therefore holds the boolean prediction, not the probability.
df_test["pred_prob"] = result.predict(X) >= 0.5
df_test

Unnamed: 0,year,player,gp,g,a,tp,height_cm,shoots,nhl_gp,weight_kg,age_days,gpg,ppg,Pr[GP > 200],pred_prob
72,2010-2011,RYAN STROME,65,33,73,106,185,R,864,87,6563,0.507692,1.630769,True,True
73,2010-2011,SHANE PRINCE,59,25,63,88,181,L,128,88,6800,0.423729,1.491525,False,True
74,2010-2011,STEFAN NOESEN,68,34,43,77,185,R,444,93,6712,0.500000,1.132353,True,False
75,2010-2011,ANDY ANDREOFF,66,33,42,75,185,L,188,95,7349,0.500000,1.136364,False,False
76,2010-2011,MARK SCHEIFELE,66,22,53,75,190,R,879,94,6681,0.333333,1.136364,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,2013-2014,CRISTIANO DIGIACINTO,50,17,11,28,183,L,0,88,6746,0.340000,0.560000,False,False
169,2013-2014,JAKE EVANS,57,11,7,18,183,L,350,89,7072,0.192982,0.315789,True,False
170,2013-2014,JADEN LINDO,40,9,9,18,188,R,0,97,6745,0.225000,0.450000,False,False
171,2013-2014,CHRISTIAN DVORAK,33,6,8,14,185,L,534,91,6723,0.181818,0.424242,True,False


In [15]:
# Confusion-matrix counts and summary rates for the test-season predictions.
actual_yes = df_test["Pr[GP > 200]"] == True
actual_no = df_test["Pr[GP > 200]"] == False
predicted_yes = df_test["pred_prob"] == True
predicted_no = df_test["pred_prob"] == False

# Count rows where both conditions hold
TP = int((actual_yes & predicted_yes).sum())
TN = int((actual_no & predicted_no).sum())
FP = int((actual_no & predicted_yes).sum())
FN = int((actual_yes & predicted_no).sum())

print(f"True Positive: {TP}")
print(f"True Negative: {TN}")
print(f"False Positive: {FP}")
print(f"False Negative: {FN}")

# Each rate falls back to 0 when its denominator is empty.
total_cases = TP + TN + FP + FN

# Share of all predictions that were right
if total_cases > 0:
    success_rate = (TP + TN) / total_cases
else:
    success_rate = 0
print(f"Success Rate: {success_rate:.2%}")

# Share of predicted NHLers who reached the threshold
if (TP + FP) > 0:
    positive_success_rate = TP / (TP + FP)
else:
    positive_success_rate = 0
print(f"Positive Success Rate: {positive_success_rate:.2%}")

# Share of players who reached the threshold that the model predicted
if (TP + FN) > 0:
    nhler_success_rate = TP / (TP + FN)
else:
    nhler_success_rate = 0
print(f"NHLer Success Rate: {nhler_success_rate:.2%}")

True Positive: 10
True Negative: 59
False Positive: 4
False Negative: 28
Success Rate: 68.32%
Positive Success Rate: 71.43%
NHLer Success Rate: 26.32%


In [16]:
from vetiver.handlers.base import BaseHandler
import statsmodels.api as sm

#Need to create a custom handler for statsmodels Logit models, as the default handler cannot handle this type of model
class StatsmodelsLogitHandler(BaseHandler):
    """Vetiver handler wrapping a fitted statsmodels Logit result.

    The wrapped model is expected to have been fitted on predictors with an
    intercept column added by ``sm.add_constant``; ``handler_predict`` adds
    that column to incoming data before predicting.
    """

    def __init__(self, model, prototype_data):
        super().__init__(model, prototype_data)

    @staticmethod
    def model_type():
        """Return the identifier string used for this kind of model."""
        return "statsmodels_logit"

    # Package name recorded as this handler's dependency.
    pip_name = "statsmodels"

    def handler_predict(self, input_data, check_prototype: bool):
        """
        Make predictions using a fitted statsmodels Logit model.

        Parameters
        ----------
        input_data:
            New data (e.g., from API)
        check_prototype: bool
            Whether to check data shape (accepted for interface compatibility;
            not used by this handler)

        Returns
        -------
        Prediction array from model.predict
        """
        # has_constant='add' always appends an intercept column, even when one is
        # already present. Input that already carries a `const` column (for
        # example because the prototype was built from a matrix that included
        # it) would end up with two intercepts and no longer line up with the
        # fitted parameters, so drop any incoming `const` first.
        if "const" in getattr(input_data, "columns", ()):
            input_data = input_data.drop(columns="const")

        # Add constant to match model spec
        input_data_const = sm.add_constant(input_data, has_constant='add')
        prediction = self.model.predict(input_data_const)
        return prediction

In [17]:
from pins import board_s3
from vetiver import vetiver_pin_write
from vetiver import VetiverModel

# Store the model in an S3 bucket:

# pins board backed by the project's S3 bucket
board = board_s3("devakshah-stat468-models", allow_pickle_read=True)

# Wrap the fitted Logit result in the custom handler.
# NOTE(review): X here is the matrix from the test cell and already contains the
# `const` column added by sm.add_constant, so the prototype includes `const`
# as an input field - confirm that is what API clients are expected to send.
custom_handler = StatsmodelsLogitHandler(result, prototype_data=X)
# The handler instance is passed both as the model and as the `handler` keyword.
vetiver_model = VetiverModel(custom_handler, model_name="my_logit_model", description="Logistic regression", handler=custom_handler)

# Writes the model to the board as pin "my_logit_model"
vetiver_pin_write(board, vetiver_model)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
('The hash of pin "my_logit_model" has not changed. Your pin will not be stored.',)


In [20]:
# Storing other data used by the app / report in the S3 bucket:
# the board is re-created here, so this cell does not depend on the model-pinning cell
board = board_s3("devakshah-stat468-models", allow_pickle_read=True)
# Pins the cleaned regression input frame (df) as a CSV named "regression_input_data"
board.pin_write(df, name="regression_input_data", type="csv")

Writing pin:
Name: 'regression_input_data'
Version: 20250804T201549Z-c3a37


Meta(title='regression_input_data: a pinned 288 x 14 DataFrame', description=None, created='20250804T201549Z', pin_hash='c3a374df234b2935', file='regression_input_data.csv', file_size=26991, type='csv', api_version=1, version=Version(created=datetime.datetime(2025, 8, 4, 20, 15, 49, 442041), hash='c3a374df234b2935'), tags=None, name='regression_input_data', user={}, local={})