---
title: "STAT468 Final Project"
author: "Devak Shah"
format: html
toc: true
number-sections: true
jupyter: python3
---

In [1]:
import TopDownHockey_Scraper.TopDownHockey_NHL_Scraper as tdhnhlscrape
import TopDownHockey_Scraper.TopDownHockey_EliteProspects_Scraper as tdhepscrape
from nhlpy import NHLClient

In [2]:
import pandas as pd
from datetime import date
import statsmodels.api as sm
import numpy as np

In [3]:
#Get OHL Player Data (stats, build, etc)

#Can't include all years because of backend data issues in some years
#years = ["2004-2005", "2006-2007", "2007-2008", "2008-2009", "2009-2010", "2010-2011", "2011-2012", "2012-2013", "2013-2014", 
#         "2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019"]

years = ["2016-2017", "2013-2014"]

aggregated_output = pd.DataFrame()

for year in years:
    df = tdhepscrape.get_skaters(("ohl"), (year))[0:10]

    #GET PLAYER INFO
    info = tdhepscrape.get_player_information(df)

    #GET RID OF D-MEN
    df = df[~df['player'].str.contains(r'\(([^)]*D[^)]*)\)', regex=True)]

    #GET RID OF PLAYER POSITIONS FROM NAMES
    df['player'] = df['player'].str.replace(r'\s*\([^)]*\)', '', regex=True)

    #ADD YEAR TO DF
    df.insert(0, "year", year)

    #JOIN PLAYER BIO WITH STATS
    year_output = pd.merge(df[["year", "player", "gp", "g", "a", "tp"]], info[["player", "dob", "height", "weight", "shoots"]], on='player', how='inner')

    #ADD CURRENT YEAR PROSPECTS TO AGGREGATED DF
    aggregated_output = pd.concat([aggregated_output, year_output])


Your scrape request is skater data from the following league:
ohl
In the following season:
2016-2017
Beginning scrape of ohl skater data from 2016-2017.


Successfully scraped all ohl skater data from 2016-2017.
Scraping ohl data is complete. You scraped skater data from 2016-2017.
Completed scraping skater data from the following league:
ohl
Over the following season:
2016-2017
Beginning scrape for 10 players.


Ryan Moore scraped! That's 1 down! Only 9 left to go!


Cliff Pu scraped! That's 2 down! Only 8 left to go!


Taylor Raddysh scraped! That's 3 down! Only 7 left to go!


Adam Mascherin scraped! That's 4 down! Only 6 left to go!


Nick Suzuki scraped! That's 5 down! Only 5 left to go!


Petrus Palmu scraped! That's 6 down! Only 4 left to go!


Kole Sherwood scraped! That's 7 down! Only 3 left to go!


Alex DeBrincat scraped! That's 8 down! Only 2 left to go!


Jordan Kyrou scraped! That's 9 down! Only 1 left to go!


Kevin Hancock scraped! That's 10 down! Only 0 left to go!
Your scrape is complete! You've obtained player information for 10 players!
Your scrape request is skater data from the following league:
ohl
In the following season:
2013-2014
Beginning scrape of ohl skater data from 2013-2014.


Successfully scraped all ohl skater data from 2013-2014.
Scraping ohl data is complete. You scraped skater data from 2013-2014.
Completed scraping skater data from the following league:
ohl
Over the following season:
2013-2014
Beginning scrape for 10 players.


Connor McDavid scraped! That's 1 down! Only 9 left to go!


Michael Dal Colle scraped! That's 2 down! Only 8 left to go!


Dane Fox scraped! That's 3 down! Only 7 left to go!


Sam Bennett scraped! That's 4 down! Only 6 left to go!


Max Domi scraped! That's 5 down! Only 5 left to go!


Connor Brown scraped! That's 6 down! Only 4 left to go!


Andreas Athanasiou scraped! That's 7 down! Only 3 left to go!


Sergei Tolchinsky scraped! That's 8 down! Only 2 left to go!


Scott Kosmachuk scraped! That's 9 down! Only 1 left to go!


Nikolai Goldobin scraped! That's 10 down! Only 0 left to go!
Your scrape is complete! You've obtained player information for 10 players!


In [4]:
#Changing types of all columns, they are objects by default
aggregated_output["gp"] = aggregated_output["gp"].replace("-", 0)
aggregated_output["gp"] = aggregated_output["gp"].astype(int)
aggregated_output["g"] = aggregated_output["g"].replace("-", 0)
aggregated_output["g"] = aggregated_output["g"].astype(int)
aggregated_output["a"] = aggregated_output["a"].replace("-", 0)
aggregated_output["a"] = aggregated_output["a"].astype(int)
aggregated_output["tp"] = aggregated_output["tp"].replace("-", 0)
aggregated_output["tp"] = aggregated_output["tp"].astype(int)

In [5]:
#Get corresponding draft year for every season (oe. 2024-2025 would have a draft year of 2025)
# note this isn't necessarily the players' draft year - it's just the draft year for the corresponding row's season

aggregated_output["draft_year"] = aggregated_output["year"].str[5:]
aggregated_output["draft_year"] = aggregated_output["draft_year"].astype(int)
aggregated_output

Unnamed: 0,year,player,gp,g,a,tp,dob,height,weight,shoots,draft_year
0,2016-2017,Alex DeBrincat,63,65,62,127,1997-12-18,173,82 kg / 181 lbs,R,2017
1,2016-2017,Taylor Raddysh,58,42,67,109,1998-02-18,191,90 kg / 198 lbs,R,2017
2,2016-2017,Adam Mascherin,65,35,65,100,1998-06-06,178,93 kg / 205 lbs,L,2017
3,2016-2017,Petrus Palmu,62,40,58,98,1997-07-16,168,78 kg / 172 lbs,L,2017
4,2016-2017,Nick Suzuki,65,45,51,96,1999-08-10,180,94 kg / 207 lbs,R,2017
5,2016-2017,Jordan Kyrou,66,30,64,94,1998-05-05,185,89 kg / 196 lbs,R,2017
6,2016-2017,Ryan Moore,68,39,51,90,1997-04-09,175,76 kg / 168 lbs,L,2017
7,2016-2017,Cliff Pu,63,35,51,86,1998-06-03,188,84 kg / 185 lbs,R,2017
8,2016-2017,Kole Sherwood,60,33,52,85,1997-01-22,185,96 kg / 212 lbs,R,2017
9,2016-2017,Kevin Hancock,68,30,55,85,1998-03-02,180,82 kg / 181 lbs,L,2017


In [6]:
#Getting all NHL players drafted form 2005 - 2020

years = list(range(2005, 2021))

draftyears = pd.DataFrame()

for year in years:
    df_list = pd.read_html(f"https://www.hockey-reference.com/draft/NHL_{year}_entry.html", match="Round")

    players_drafted = df_list[0]

    players_drafted

    #Let's get rid of the top header that isnt really used
    players_drafted.columns = players_drafted.columns.get_level_values(1)

    players_drafted["draft_year"] = year
    players_drafted = players_drafted[["draft_year", "Player"]]
    players_drafted =players_drafted.rename(columns={"draft_year": "player_draft_year", "Player": "player"})
    draftyears = pd.concat([draftyears, players_drafted])

draftyears        

Unnamed: 0,player_draft_year,player
0,2005,Sidney Crosby
1,2005,Bobby Ryan
2,2005,Jack Johnson
3,2005,Benoit Pouliot
4,2005,Carey Price
...,...,...
223,2020,Ryan Tverberg
224,2020,Henrik Tikkanen
225,2020,Maxim Marushev
226,2020,Jakub Konecny


In [7]:
#Getting games played for all NHL players

nhl_gp = pd.DataFrame()
pages = list(range(1, 80))

for page in pages:
    df_list = pd.read_html(f"https://www.eliteprospects.com/league/nhl/stats/all-time?page={page}")
    page_stats = df_list[2]
    page_stats = page_stats[["Player", "GP"]]
    nhl_gp = pd.concat([nhl_gp, page_stats])

nhl_gp = nhl_gp.rename(columns={"Player": "player", "GP": "nhl_gp"})

#GET RID OF PLAYER POSITIONS FROM NAMES
nhl_gp['player'] = nhl_gp['player'].str.replace(r'\s*\([^)]*\)', '', regex=True)

# Replacing NA and "-" values with 0
nhl_gp["nhl_gp"] = nhl_gp["nhl_gp"].fillna(0)
nhl_gp["nhl_gp"] = nhl_gp["nhl_gp"].replace("-", 0)

nhl_gp

Unnamed: 0,player,nhl_gp
0,Wayne Gretzky,1487.0
1,Jaromír Jágr,1733.0
2,Mark Messier,1756.0
3,Gordie Howe,1767.0
4,Ron Francis,1731.0
...,...,...
20,Isaac Howard,0
21,,0
22,Daniil But,0
23,Arseni Gritsyuk,0


In [8]:
#Join GP, draft year onto OHL player data

#MAKE ALL PLAYER NAMES UPPERCASE (TO MAKE JOINING TABLES NON CASE SENSITIVE)
aggregated_output['player'] = aggregated_output['player'].str.upper()
draftyears['player'] = draftyears['player'].str.upper()
nhl_gp['player'] = nhl_gp['player'].str.upper()


#Filter ohl stats for only drafted players' draft year stats - 
#This will get rid of a) undrafted players, and b) drafted players non-draft year stats
df = pd.merge(aggregated_output, draftyears, left_on=['player', 'draft_year'], right_on=['player', 'player_draft_year'], how='inner')

#Can get rid of one of the draft year columns - don't need both
df = df[["year", "player", "gp", "g", "a", "tp", "dob",	"height", "weight",	"shoots", 'draft_year']]

#Join players' games played - if player gp not found, assume it to be 0.
df = pd.merge(df, nhl_gp, left_on=['player'], right_on=['player'], how='left')
df["nhl_gp"] = df["nhl_gp"].fillna(0)
df["nhl_gp"] = df["nhl_gp"].astype(int)

filename = "regression_input2.xlsx"
df.to_excel(filename, index=False)

In [9]:
#SOME DATA CLEANING

df = pd.read_excel(filename)

#Changing the weight to a numerical variate in kg
df["weight_kg"] = df["weight"].apply(lambda x: x[:x.find(" ")])
df["weight_kg"] = df["weight_kg"].astype(int)
df.dtypes

#Getting the age of the player at the time of draft (for simplicity, we will assume draft to be on June 30 for all years)

df["draft_date"] = df["draft_year"].astype(str) + '-06-30'
df["draft_date"] = pd.to_datetime(df["draft_date"])
df["dob"] = pd.to_datetime(df["dob"])
df["age_days"] = (df["draft_date"] - df["dob"])
df["age_days"] = df["age_days"].dt.days

#Can get rid of intermediate columns
df = df.drop(["dob", "weight", "draft_year", "draft_date"], axis = 1)

#Adding columns for goals/g and points/g
df["gpg"] = df["g"] / df["gp"]
df["ppg"] = df["tp"] / df["gp"]

#Create indicator variable to measure if the player has played at least 200 nhl games
df["Pr[GP > 200]"] = df["nhl_gp"] >= 200
df["Pr[GP > 200]"].astype(int)

#Renaming height column to height_cm for clarity
df = df.rename(columns = {"height": "height_cm"})
df

Unnamed: 0,year,player,gp,g,a,tp,height_cm,shoots,nhl_gp,weight_kg,age_days,gpg,ppg,Pr[GP > 200]
0,2016-2017,PETRUS PALMU,62,40,58,98,168,L,0,78,7289,0.645161,1.580645,False
1,2016-2017,NICK SUZUKI,65,45,51,96,180,R,455,94,6534,0.692308,1.476923,True
2,2013-2014,MICHAEL DAL COLLE,67,39,56,95,191,L,112,88,6584,0.58209,1.41791,False


In [10]:
# X: predictors, y: binary response
df_regress = df[df["year"].isin(["2004-2005", "2006-2007", "2007-2008", "2008-2009", "2009-2010", 
                                 "2014-2015", "2015-2016", "2016-2017", "2017-2018", "2018-2019"])]
X = df_regress[['g', 'a', 'tp', 'height_cm', 'weight_kg', 'age_days', 'gpg', 'ppg']]
X = sm.add_constant(X)  # adds intercept
y = df_regress['Pr[GP > 200]']

model = sm.Logit(y, X)
result = model.fit()

print(result.summary())

         Current function value: 0.000475
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:           Pr[GP > 200]   No. Observations:                    2
Model:                          Logit   Df Residuals:                        0
Method:                           MLE   Df Model:                            1
Date:                Fri, 01 Aug 2025   Pseudo R-squ.:                  0.9993
Time:                        20:41:14   Log-Likelihood:            -0.00095050
converged:                      False   LL-Null:                       -1.3863
Covariance Type:            nonrobust   LLR p-value:                   0.09600
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0009   1.16e+09   8.19e-13      1.000   -2.27e+09    2.27e+09
g              0.0834        nan        nan        nan         na



In [11]:
#Testing model out
df_test = df[df["year"].isin(["2010-2011", "2011-2012", "2012-2013", "2013-2014"])]
X = df_test[['g', 'a', 'tp', 'height_cm', 'weight_kg', 'age_days', 'gpg', 'ppg']]
X = sm.add_constant(X, has_constant='add')  # adds intercept

#If regression yields a probability greater than or equal to 0.5, we will say it predicts the player will become a full-time NHL player
df_test["pred_prob"] = result.predict(X) >= 0.5
df_test

Unnamed: 0,year,player,gp,g,a,tp,height_cm,shoots,nhl_gp,weight_kg,age_days,gpg,ppg,Pr[GP > 200],pred_prob
2,2013-2014,MICHAEL DAL COLLE,67,39,56,95,191,L,112,88,6584,0.58209,1.41791,False,True


In [12]:
TP = df_test[(df_test["Pr[GP > 200]"] == True) & (df_test["pred_prob"] == True)].shape[0]
print(f"True Positive: {TP}")

TN = df_test[(df_test["Pr[GP > 200]"] == False) & (df_test["pred_prob"] == False)].shape[0]
print(f"True Negative: {TN}")

FP = df_test[(df_test["Pr[GP > 200]"] == False) & (df_test["pred_prob"] == True)].shape[0]
print(f"False Positive: {FP}")

FN = df_test[(df_test["Pr[GP > 200]"] == True) & (df_test["pred_prob"] == False)].shape[0]
print(f"False Negative: {FN}")

success_rate = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
print(f"Success Rate: {success_rate:.2%}")

positive_success_rate = TP / (TP + FP) if (TP + FP) > 0 else 0
print(f"Positive Success Rate: {positive_success_rate:.2%}")

nhler_success_rate = TP / (TP + FN) if (TP + FN) > 0 else 0
print(f"NHLer Success Rate: {nhler_success_rate:.2%}")

True Positive: 0
True Negative: 0
False Positive: 1
False Negative: 0
Success Rate: 0.00%
Positive Success Rate: 0.00%
NHLer Success Rate: 0.00%
