In [2]:
# Imports: data handling, plotting, and scikit-learn modelling/metrics utilities
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,RocCurveDisplay

In [3]:
# Read the HOF table from disk, then preview its first five rows.
hof_df = pd.read_csv('data/HOF.CSV')
hof_df.head(n=5)

Unnamed: 0,year,hofID,name,category
0,1945,bakerho01h,Hobey Baker,Player
1,1945,gardich01h,Charlie Gardiner,Player
2,1945,gerared01h,Eddie Gerard,Player
3,1945,mcgeefr01h,Frank McGee,Player
4,1945,morenho01h,Howie Morenz,Player


In [4]:
# Print column names, non-null counts and dtypes for the HOF table.
hof_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      366 non-null    int64 
 1   hofID     366 non-null    object
 2   name      366 non-null    object
 3   category  366 non-null    object
dtypes: int64(1), object(3)
memory usage: 11.6+ KB


In [6]:
# Read the master table from disk, then preview its first five rows.
masterdf = pd.read_csv('data/Master.CSV')
masterdf.head(n=5)

Unnamed: 0,playerID,coachID,hofID,firstName,lastName,nameNote,nameGiven,nameNick,height,weight,...,birthDay,birthCountry,birthState,birthCity,deathYear,deathMon,deathDay,deathCountry,deathState,deathCity
0,aaltoan01,,,Antti,Aalto,,Antti,,73.0,210.0,...,4.0,Finland,,Lappeenranta,,,,,,
1,abbeybr01,,,Bruce,Abbey,,Bruce,,73.0,185.0,...,18.0,Canada,ON,Toronto,,,,,,
2,abbotge01,,,George,Abbott,,George Henry,Preacher,67.0,153.0,...,3.0,Canada,ON,Synenham,,,,,,
3,abbotre01,,,Reg,Abbott,,Reginald Stewart,,71.0,164.0,...,4.0,Canada,MB,Winnipeg,,,,,,
4,abdelju01,,,Justin,Abdelkader,,,,73.0,195.0,...,25.0,USA,MI,Muskegon,,,,,,


In [7]:
# Print column names, non-null counts and dtypes for the master table.
masterdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7761 entries, 0 to 7760
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   playerID      7520 non-null   object 
 1   coachID       395 non-null    object 
 2   hofID         366 non-null    object 
 3   firstName     7748 non-null   object 
 4   lastName      7761 non-null   object 
 5   nameNote      18 non-null     object 
 6   nameGiven     5985 non-null   object 
 7   nameNick      1306 non-null   object 
 8   height        7334 non-null   float64
 9   weight        7336 non-null   float64
 10  shootCatch    7048 non-null   object 
 11  legendsID     6577 non-null   object 
 12  ihdbID        7125 non-null   float64
 13  hrefID        7457 non-null   object 
 14  firstNHL      6851 non-null   float64
 15  lastNHL       6851 non-null   float64
 16  firstWHA      903 non-null    float64
 17  lastWHA       903 non-null    float64
 18  pos           7447 non-null 

In [12]:
# Inspect the lastNHL column on its own (all rows, single column).
masterdf.loc[:, 'lastNHL']

0       2000.0
1          NaN
2       1943.0
3       1952.0
4       2011.0
         ...  
7756    2011.0
7757    1985.0
7758       NaN
7759    1943.0
7760    2007.0
Name: lastNHL, Length: 7761, dtype: float64

In [15]:
# Read the scoring table from disk, then preview its last five rows.
scoringdf = pd.read_csv('data/scoring.CSV')
scoringdf.tail(n=5)

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PostA,PostPts,PostPIM,Post+/-,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG
45962,zyuzian01,2002,2,MIN,NHL,D,66.0,4.0,12.0,16.0,...,1.0,1.0,14.0,-3.0,0.0,0.0,0.0,0.0,0.0,30.0
45963,zyuzian01,2003,1,MIN,NHL,D,65.0,8.0,13.0,21.0,...,,,,,,,,,,
45964,zyuzian01,2005,1,MIN,NHL,D,57.0,7.0,11.0,18.0,...,,,,,,,,,,
45965,zyuzian01,2006,1,CAL,NHL,D,49.0,1.0,5.0,6.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0
45966,zyuzian01,2007,1,CHI,NHL,D,32.0,2.0,3.0,5.0,...,,,,,,,,,,


In [14]:
# Print column names, non-null counts and dtypes for the scoring table.
scoringdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45967 entries, 0 to 45966
Data columns (total 31 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   playerID  45967 non-null  object 
 1   year      45967 non-null  int64  
 2   stint     45967 non-null  int64  
 3   tmID      45967 non-null  object 
 4   lgID      45967 non-null  object 
 5   pos       45385 non-null  object 
 6   GP        45699 non-null  float64
 7   G         45699 non-null  float64
 8   A         45699 non-null  float64
 9   Pts       45699 non-null  float64
 10  PIM       45699 non-null  float64
 11  +/-       36265 non-null  float64
 12  PPG       37748 non-null  float64
 13  PPA       23040 non-null  float64
 14  SHG       37744 non-null  float64
 15  SHA       23214 non-null  float64
 16  GWG       36567 non-null  float64
 17  GTG       28106 non-null  float64
 18  SOG       36364 non-null  float64
 19  PostGP    19153 non-null  float64
 20  PostG     19153 non-null  fl

In [18]:
# Read the shootout scoring table from disk, then preview its first five rows.
shootoutdf = pd.read_csv('data/ScoringShootout.CSV')
shootoutdf.head(n=5)

Unnamed: 0,playerID,year,stint,tmID,S,G,GDG
0,adamske01,2006,1,PHO,1,0,0
1,afanadm01,2005,1,TBL,1,0,0
2,afanadm01,2006,1,TBL,2,1,1
3,afinoma01,2005,1,BUF,5,3,2
4,afinoma01,2006,1,BUF,6,2,1


In [19]:
# Print column names, non-null counts and dtypes for the shootout table.
shootoutdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   playerID  2072 non-null   object
 1   year      2072 non-null   int64 
 2   stint     2072 non-null   int64 
 3   tmID      2072 non-null   object
 4   S         2072 non-null   int64 
 5   G         2072 non-null   int64 
 6   GDG       2072 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 113.4+ KB


In [20]:
# Read the player awards table from disk, then preview its first five rows.
awardsdf = pd.read_csv('data/AwardsPlayers.csv')
awardsdf.head(n=5)

Unnamed: 0,playerID,award,year,lgID,note,pos
0,malonjo01,Art Ross,1917,NHL,,
1,cleghod01,Art Ross,1918,NHL,,
2,malonjo01,Art Ross,1919,NHL,,
3,lalonne01,Art Ross,1920,NHL,,
4,broadpu01,Art Ross,1921,NHL,,


In [22]:
# Remove the 'note' and 'pos' columns from the awards table.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because the columns are already gone.
awardsdf = awardsdf.drop(columns=['note', 'pos'], errors='ignore')

# Confirm the remaining columns, their non-null counts and dtypes.
awardsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2091 entries, 0 to 2090
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   playerID  2091 non-null   object
 1   award     2091 non-null   object
 2   year      2091 non-null   int64 
 3   lgID      2091 non-null   object
dtypes: int64(1), object(3)
memory usage: 65.5+ KB


In [25]:
# Read the teams table from disk, then preview its last five rows.
teamsdf = pd.read_csv('data/Teams.csv')
teamsdf.tail(n=5)

Unnamed: 0,year,lgID,tmID,franchID,confID,divID,rank,playoff,G,W,...,GA,name,PIM,BenchMinor,PPG,PPC,SHA,PKG,PKC,SHF
1514,2011,NHL,TBL,TBL,EC,SE,3,,82,38,...,281,Tampa Bay Lightning,865.0,16.0,41.0,269.0,12.0,59.0,284.0,2.0
1515,2011,NHL,TOR,TOR,EC,NE,4,,82,35,...,264,Toronto Maple Leafs,824.0,16.0,49.0,267.0,6.0,55.0,242.0,5.0
1516,2011,NHL,VAN,VAN,WC,NW,1,CQF,82,51,...,198,Vancouver Canucks,1049.0,10.0,57.0,288.0,4.0,40.0,286.0,7.0
1517,2011,NHL,WAS,WAS,EC,SE,2,CSF,82,42,...,230,Washington Capitals,767.0,16.0,41.0,245.0,10.0,49.0,266.0,3.0
1518,2011,NHL,WPG,WPG,EC,SE,4,,82,37,...,246,Winnipeg Jets,905.0,4.0,45.0,251.0,8.0,58.0,292.0,3.0


In [24]:
# Print column names, non-null counts and dtypes for the teams table.
teamsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1519 entries, 0 to 1518
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        1519 non-null   int64  
 1   lgID        1519 non-null   object 
 2   tmID        1519 non-null   object 
 3   franchID    1519 non-null   object 
 4   confID      902 non-null    object 
 5   divID       1171 non-null   object 
 6   rank        1519 non-null   int64  
 7   playoff     936 non-null    object 
 8   G           1519 non-null   int64  
 9   W           1519 non-null   int64  
 10  L           1519 non-null   int64  
 11  T           1309 non-null   float64
 12  OTL         358 non-null    float64
 13  Pts         1519 non-null   int64  
 14  SoW         210 non-null    float64
 15  SoL         210 non-null    float64
 16  GF          1519 non-null   int64  
 17  GA          1519 non-null   int64  
 18  name        1519 non-null   object 
 19  PIM         1409 non-null  

In [33]:
# Read the TeamsPost table from disk.
teamspostdf = pd.read_csv('data/TeamsPost.csv')

# Preview five rows from the most recent year (sorted newest first).
(
    teamspostdf
    .sort_values('year', ascending=False)
    .head(n=5)
)

Unnamed: 0,year,lgID,tmID,G,W,L,T,GF,GA,PIM,BenchMinor,PPG,PPC,SHA,PKG,PKC,SHF
926,2011,NHL,WAS,14,7,7,0,29,30,122.0,2.0,7.0,39.0,0.0,6.0,48.0,0.0
918,2011,NHL,NYR,20,10,10,0,43,41,218.0,2.0,13.0,73.0,0.0,11.0,69.0,0.0
911,2011,NHL,BOS,7,3,4,0,15,16,52.0,0.0,2.0,23.0,0.0,3.0,19.0,0.0
912,2011,NHL,CHI,6,2,4,0,12,17,89.0,0.0,1.0,19.0,0.0,4.0,19.0,0.0
913,2011,NHL,DET,5,1,4,0,9,13,55.0,0.0,4.0,23.0,0.0,2.0,22.0,0.0


In [32]:
# Read the TeamsSC table from disk.
teamsSCdf = pd.read_csv('data/TeamsSC.csv')

# Preview five rows from the most recent years (sorted newest first).
(
    teamsSCdf
    .sort_values('year', ascending=False)
    .head(n=5)
)

Unnamed: 0,year,lgID,tmID,G,W,L,T,GF,GA,PIM
29,1925,WCHL,VIC,4,1,3,0,3,10,
28,1925,NHL,MTM,4,3,1,0,10,3,50.0
27,1924,WCHL,VIC,4,3,1,0,16,8,
26,1924,NHL,MTL,4,1,3,0,8,16,49.0
25,1923,WCHL,CAT,2,0,2,0,1,9,


In [30]:
# Print column names, non-null counts and dtypes for the TeamsSC table.
teamsSCdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    30 non-null     int64  
 1   lgID    30 non-null     object 
 2   tmID    30 non-null     object 
 3   G       30 non-null     int64  
 4   W       30 non-null     int64  
 5   L       30 non-null     int64  
 6   T       30 non-null     int64  
 7   GF      30 non-null     int64  
 8   GA      30 non-null     int64  
 9   PIM     8 non-null      float64
dtypes: float64(1), int64(7), object(2)
memory usage: 2.5+ KB


In [36]:
# Read the SeriesPost table from disk.
seriespostdf = pd.read_csv('data/SeriesPost.csv')

# Preview five rows from the most recent year (sorted newest first).
(
    seriespostdf
    .sort_values('year', ascending=False)
    .head(n=5)
)

Unnamed: 0,year,round,series,tmIDWinner,lgIDWinner,tmIDLoser,lgIDLoser,W,L,T,GoalsWinner,GoalsLoser,note
831,2011,SCF,O,LAK,NHL,NJD,NHL,4,2,0,16,8,
824,2011,CQF,H,NAS,NHL,DET,NHL,4,1,0,13,9,
817,2011,CQF,A,NYR,NHL,OTT,NHL,4,3,0,14,13,
818,2011,CQF,B,WAS,NHL,BOS,NHL,4,3,0,16,15,
819,2011,CQF,C,NJD,NHL,FLO,NHL,4,3,0,18,17,


In [37]:
# Print column names, non-null counts and dtypes for the SeriesPost table.
seriespostdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 832 entries, 0 to 831
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         832 non-null    int64 
 1   round        832 non-null    object
 2   series       739 non-null    object
 3   tmIDWinner   832 non-null    object
 4   lgIDWinner   832 non-null    object
 5   tmIDLoser    832 non-null    object
 6   lgIDLoser    832 non-null    object
 7   W            832 non-null    int64 
 8   L            832 non-null    int64 
 9   T            832 non-null    int64 
 10  GoalsWinner  832 non-null    int64 
 11  GoalsLoser   832 non-null    int64 
 12  note         59 non-null     object
dtypes: int64(6), object(7)
memory usage: 84.6+ KB


In [40]:
# Build scchampdf from the series table: keep only rows whose round is 'SCF'
# (presumably the Stanley Cup Final -- confirm against the data dictionary)
# and leave out the 'note' and 'series' columns. drop() returns a new frame,
# so seriespostdf itself is not modified.
scchampdf = (
    seriespostdf
    .loc[seriespostdf['round'].eq('SCF')]
    .drop(columns=['note', 'series'])
)

In [43]:
# Order the rows by year, newest first, rebinding the name to the sorted frame.
scchampdf = scchampdf.sort_values(by='year', ascending=False)

In [44]:
# Preview the first five rows of the sorted, column-trimmed frame.
scchampdf.head(n=5)

Unnamed: 0,year,round,tmIDWinner,lgIDWinner,tmIDLoser,lgIDLoser,W,L,T,GoalsWinner,GoalsLoser
831,2011,SCF,LAK,NHL,NJD,NHL,4,2,0,16,8
816,2010,SCF,BOS,NHL,VAN,NHL,4,3,0,23,8
801,2009,SCF,CHI,NHL,PHI,NHL,4,2,0,25,22
786,2008,SCF,PIT,NHL,DET,NHL,4,3,0,14,17
771,2007,SCF,DET,NHL,PIT,NHL,4,2,0,17,10


In [46]:
# Count how many rows each winning league (lgIDWinner) accounts for.
scchampdf.loc[:, 'lgIDWinner'].value_counts()

lgIDWinner
NHL     91
PCHA     5
NHA      2
WCHL     1
Name: count, dtype: int64

In [48]:
# Restrict scchampdf to rows whose winning league is 'NHL'; .copy() gives an
# independent frame rather than a selection of the previous one.
scchampdf = scchampdf.loc[scchampdf['lgIDWinner'].eq('NHL')].copy()

# Confirm the row count, columns and dtypes after filtering.
scchampdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 831 to 10
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         91 non-null     int64 
 1   round        91 non-null     object
 2   tmIDWinner   91 non-null     object
 3   lgIDWinner   91 non-null     object
 4   tmIDLoser    91 non-null     object
 5   lgIDLoser    91 non-null     object
 6   W            91 non-null     int64 
 7   L            91 non-null     int64 
 8   T            91 non-null     int64 
 9   GoalsWinner  91 non-null     int64 
 10  GoalsLoser   91 non-null     int64 
dtypes: int64(6), object(5)
memory usage: 8.5+ KB
