<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Packages and Data

In [1]:
# Load packages

from math import exp
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, get_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE, SelectFromModel, SequentialFeatureSelector

In [2]:
# Read in cleaned player data
model_df = pd.read_csv("https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv", index_col=0)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4977 entries, 0 to 4976
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          4977 non-null   object 
 1   Eligible        4977 non-null   int64  
 2   Position        4977 non-null   object 
 3   Hall_of_Fame    4977 non-null   int64  
 4   MVP             4977 non-null   int64  
 5   Finals_MVP      4977 non-null   int64  
 6   NBA_Champ       4977 non-null   int64  
 7   All_NBA         4977 non-null   int64  
 8   All_Defensive   4977 non-null   int64  
 9   Def_POY         4977 non-null   int64  
 10  All_Star        4977 non-null   int64  
 11  Scoring_Champ   4977 non-null   int64  
 12  TRB_Champ       4977 non-null   int64  
 13  AST_Champ       4977 non-null   int64  
 14  STL_Champ       4977 non-null   int64  
 15  BLK_Champ       4977 non-null   int64  
 16  All_ABA         4977 non-null   int64  
 17  ABA_Champ       4977 non-null   i

### Data Cleaning

In [3]:
# Replace all instances of -999 with NA (consider doing this in scraper to eliminate a step)
model_df = model_df.replace(-999, np.nan)

In [4]:
# Reduce number of possible positions to guard, forward, center
model_df.loc[model_df['Position'] == 'Center/Forward', 'Position'] = 'Center'
model_df.loc[model_df['Position'].isin(['PointGuard', 'ShootingGuard', 'Guard/Forward']), 'Position'] = 'Guard'
model_df.loc[model_df['Position'].isin(['SmallForward', 'PowerForward', 'Forward/Guard', 'Forward/Center']), 'Position'] = 'Forward'

In [5]:
# Combine individual ABA and NBA accolades
model_df['All_League'] = model_df['All_NBA'] + model_df['All_ABA']
model_df['Champ'] = model_df['NBA_Champ'] + model_df['ABA_Champ']

In [6]:
hofers = model_df['Hall_of_Fame'].value_counts()[1]

In [7]:
# Count number of NAs by column:
for col in model_df.columns:
  na_df = model_df[model_df[col].isna()]
  if len(na_df) > 0:
    try:
      na_rows = na_df['Hall_of_Fame'].value_counts()[1]
    except KeyError:
      na_rows = 0
    print(f"{col}:\t{len(model_df[model_df[col].isna()])} nulls \t{na_rows}/{hofers} HOFers are null")

3P_per_game:	1118 nulls 	53/140 HOFers are null
3PA_per_game:	1118 nulls 	53/140 HOFers are null
2P_per_game:	1118 nulls 	53/140 HOFers are null
2PA_per_game:	1118 nulls 	53/140 HOFers are null
ORB_per_game:	949 nulls 	39/140 HOFers are null
DRB_per_game:	949 nulls 	39/140 HOFers are null
TRB_per_game:	288 nulls 	1/140 HOFers are null
STL_per_game:	1180 nulls 	41/140 HOFers are null
BLK_per_game:	1180 nulls 	41/140 HOFers are null
GS_totals:	1689 nulls 	66/140 HOFers are null
FG%_totals:	34 nulls 	0/140 HOFers are null
3P_totals:	1118 nulls 	53/140 HOFers are null
3PA_totals:	1118 nulls 	53/140 HOFers are null
3P%_totals:	1627 nulls 	54/140 HOFers are null
2P_totals:	1118 nulls 	53/140 HOFers are null
2PA_totals:	1118 nulls 	53/140 HOFers are null
2P%_totals:	1162 nulls 	53/140 HOFers are null
eFG%_totals:	1146 nulls 	53/140 HOFers are null
FT%_totals:	241 nulls 	0/140 HOFers are null
ORB_totals:	949 nulls 	39/140 HOFers are null
DRB_totals:	949 nulls 	39/140 HOFers are null
TRB_totals

In [8]:
# Columns with -999s
for col in model_df.columns:
  if len(model_df[model_df[col] == -999]) > 0:
    print(col, "-", len(model_df[model_df[col] == -999]))

#### **What to do with NA values**
<br/>

Columns to drop:
`GS_totals`, `Trp_Dbl_totals`, `ORB_per_game`, `DRB_per_game`, `ORB_totals`, `DRB_totals`, `3P%_totals`, `2P%_totals`, `eFG%_totals`, `OWS_advanced`, `DWS_advanced`, `WS/48_advanced`, `OBPM_advanced`, `DBPM_advanced`
<br/>
<br/>

Columns to consider dropping: `3P_per_game`, `3PA_per_game`, `3P_totals`, `3PA_totals`, `FG%_totals` (these players never took a shot), `FT%_totals` (these players never took a FT)
<br/>
<br/>

Columns to impute from FGM, FGA, etc.:
`2P_per_game`, `2PA_per_game`, `2P_totals`, `2PA_totals`
<br/>
<br/>

Columns to fill with league average:
`PER_advanced`, `VORP_advanced` (consider some more advanced PER/VORP)
<br/>
<br/>

Columns to make 0:
`WS_advanced`, `BPM_advanced` (consider some more advanced imputation for BPM)
<br/>
<br/>

Columns to make 0 or fill with mean (undecided):,
        "`TS%_advanced` (these players never took a shot or free throw)
<br/>
<br/>

Columns to fill with mean by position:,
`PTS_per_game`, `TRB_per_game`, `AST_per_game`, `STL_per_game`, `BLK_per_game`, `TRB_totals`, `AST_totals`, `STL_totals`, `BLK_totals`

In [9]:
model_df.columns

Index(['Player', 'Eligible', 'Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP',
       'NBA_Champ', 'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'All_ABA', 'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game',
       '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',
       'FT_per_game', 'FTA_per_game', 'ORB_per_game', 'DRB_per_game',
       'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',
       'PTS_per_game', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals',
       'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals',
       '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals',
       'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals',
       'PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced',
       'WS_advanced', 'WS/48_advanced', 'OBPM_advanced

In [11]:
# Fill NAs accordingly
def fillNulls(model_df):
  cols_to_zero = ['WS_advanced', 'OWS_advanced', 'DWS_advanced', 'BPM_advanced',
                  '3P_per_game', '3PA_per_game', '3P_totals', '3PA_totals', 'FG%_totals', 'FT%_totals', 'TS%_advanced']
  model_df[cols_to_zero] = model_df[cols_to_zero].fillna(0) # fill cols with 0
  
  cols_to_avg = ['PER_advanced', 'VORP_advanced', '3P%_totals', '2P%_totals', 'eFG%_totals']
  model_df[cols_to_avg] = model_df[cols_to_avg].fillna(model_df[cols_to_avg].mean()) # fill cols with avg
  
  cols_to_position_avg = ['TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game']
  model_df[cols_to_position_avg] = model_df.groupby("Position")[cols_to_position_avg].transform(lambda x: x.fillna(x.mean())) # fills cols with avg by position

  cols_to_scale_avg = ['TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals']
  for col_total, col_avg in zip(cols_to_scale_avg, cols_to_position_avg):
    model_df[col_total] = model_df[col_avg] * model_df['G_totals']
  
  cols_to_fill = ['2P_per_game', '2PA_per_game', '2P_totals', '2PA_totals']
  cols_to_fill_with = ['FG_per_game', 'FGA_per_game', 'FG_totals', 'FGA_totals']
  model_df[cols_to_fill] = model_df[cols_to_fill].fillna(model_df[cols_to_fill_with]) # fill 2P shooting columns with FG columns

  return model_df
fillNulls(model_df)

Unnamed: 0,Player,Eligible,Position,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,...,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced,All_League,Champ
0,Alaa Abdelnaby,1,Forward,0,0,0,0,0,0,0,...,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.500000,0,0
1,Zaid Abdul-Aziz,1,Center,0,0,0,0,0,0,0,...,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.700000,0,0
2,Kareem Abdul-Jabbar,1,Center,1,6,2,6,15,11,0,...,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.700000,15,6
3,Mahmoud Abdul-Rauf,1,Guard,0,0,0,0,0,0,0,...,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.500000,0,0
4,Tariq Abdul-Wahad,1,Guard,0,0,0,0,0,0,0,...,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.200000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4972,Jim Zoet,1,Center,0,0,0,0,0,0,0,...,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.100000,0,0
4973,Bill Zopf,1,Guard,0,0,0,0,0,0,0,...,-0.5,0.4,-0.1,-0.011,,,0.0,3.434634,0,0
4974,Ivica Zubac,0,Center,0,0,0,0,0,0,0,...,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.500000,0,0
4975,Matt Zunic,1,Guard,0,0,0,0,0,0,0,...,0.2,1.8,2.0,,,,0.0,3.434634,0,0


In [12]:
# Columns to exclude

#NBA and ABA were combined for All League, Championship
#Games played not a stat
#Name, eligibility, position not relevant
#Attempts does not mean anything- can just pass in makes and percentage
#FT, FG encoded in points

exclude_cols = ['Player', 'Eligible', 'Position', 'NBA_Champ', 'All_NBA', 'All_ABA', 'ABA_Champ', 'G_totals', 
                '3PA_totals', 'FTA_totals', 'FGA_totals', 
                'FTA_per_game', 'FGA_per_game', '3PA_per_game',
                'FT_per_game', 'FT_totals', 'FG_per_game', 'FG_totals']

# Count number of NAs by column:
for col in model_df.columns:
  na_df = model_df[model_df[col].isna()]
  if len(na_df) > 0:
    try:
      na_rows = na_df['Hall_of_Fame'].value_counts()[1]
    except KeyError:
      na_rows = 0
    print(f"{col}:\t{len(model_df[model_df[col].isna()])} nulls \t{na_rows}/{hofers} HOFers are null")
    exclude_cols.append(col)

2P_per_game:	1118 nulls 	53/140 HOFers are null
2PA_per_game:	1118 nulls 	53/140 HOFers are null
ORB_per_game:	949 nulls 	39/140 HOFers are null
DRB_per_game:	949 nulls 	39/140 HOFers are null
GS_totals:	1689 nulls 	66/140 HOFers are null
2P_totals:	1118 nulls 	53/140 HOFers are null
2PA_totals:	1118 nulls 	53/140 HOFers are null
ORB_totals:	949 nulls 	39/140 HOFers are null
DRB_totals:	949 nulls 	39/140 HOFers are null
Trp_Dbl_totals:	4526 nulls 	53/140 HOFers are null
WS/48_advanced:	344 nulls 	1/140 HOFers are null
OBPM_advanced:	1185 nulls 	41/140 HOFers are null
DBPM_advanced:	1185 nulls 	41/140 HOFers are null


In [13]:
exclude_cols

['Player',
 'Eligible',
 'Position',
 'NBA_Champ',
 'All_NBA',
 'All_ABA',
 'ABA_Champ',
 'G_totals',
 '3PA_totals',
 'FTA_totals',
 'FGA_totals',
 'FTA_per_game',
 'FGA_per_game',
 '3PA_per_game',
 'FT_per_game',
 'FT_totals',
 'FG_per_game',
 'FG_totals',
 '2P_per_game',
 '2PA_per_game',
 'ORB_per_game',
 'DRB_per_game',
 'GS_totals',
 '2P_totals',
 '2PA_totals',
 'ORB_totals',
 'DRB_totals',
 'Trp_Dbl_totals',
 'WS/48_advanced',
 'OBPM_advanced',
 'DBPM_advanced']

### Feature Selection

In [14]:
# All remaining columns
model_df.columns

Index(['Player', 'Eligible', 'Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP',
       'NBA_Champ', 'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'All_ABA', 'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game',
       '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',
       'FT_per_game', 'FTA_per_game', 'ORB_per_game', 'DRB_per_game',
       'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',
       'PTS_per_game', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals',
       'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals',
       '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals',
       'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals',
       'PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced',
       'WS_advanced', 'WS/48_advanced', 'OBPM_advanced

In [15]:
#model_df['ChampChamp'] = model_df['Champ'] * model_df['Champ']

In [16]:
# Split into two dataframes: one where the players are eligible for the Hall of Fame, one where they are not

all_cols_eligible_df = (model_df[model_df['Eligible'] == 1]).loc[:, ~model_df.columns.isin(exclude_cols)]
#all_cols_noneligible_df = (model_df[model_df['Eligible'] == 0]).loc[:, ~model_df.columns.isin(exclude_cols)]

In [17]:
all_cols_eligible_df

Unnamed: 0,Hall_of_Fame,MVP,Finals_MVP,All_Defensive,Def_POY,All_Star,Scoring_Champ,TRB_Champ,AST_Champ,STL_Champ,...,PTS_totals,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,BPM_advanced,VORP_advanced,All_League,Champ
0,0,0,0,0,0,0,0,0,0,0,...,1465,13.000000,0.532,0.7,4.1,4.8,-3.8,-1.500000,0,0
1,0,0,0,0,0,0,0,0,0,0,...,4557,15.100000,0.479,5.9,11.6,17.5,0.4,2.700000,0,0
2,1,6,2,11,0,19,2,1,0,0,...,38387,24.600000,0.592,178.9,94.5,273.4,5.7,85.700000,15,6
3,0,0,0,0,0,0,0,0,0,0,...,8553,15.400000,0.506,16.7,8.4,25.2,-0.8,4.500000,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1830,11.400000,0.467,-0.6,4.1,3.5,-3.0,-1.200000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4970,0,0,0,0,0,0,0,0,0,0,...,458,6.100000,0.474,-1.5,1.4,0.0,-5.2,-1.400000,0,0
4972,0,0,0,0,0,0,0,0,0,0,...,2,-0.800000,0.200,-0.1,0.0,-0.1,-5.4,-0.100000,0,0
4973,0,0,0,0,0,0,0,0,0,0,...,118,9.600000,0.391,-0.5,0.4,-0.1,0.0,3.434634,0,0
4975,0,0,0,0,0,0,0,0,0,0,...,273,11.212562,0.368,0.2,1.8,2.0,0.0,3.434634,0,0


In [18]:
# Scale features and split eligible dataset into dependent and independent variables

sc0 = StandardScaler()

X_all = pd.DataFrame(sc0.fit_transform(all_cols_eligible_df.iloc[:,1:]), index=all_cols_eligible_df.index, columns=all_cols_eligible_df.columns[1:])
y_all = all_cols_eligible_df.iloc[:,0]

In [19]:
X_all.columns

Index(['MVP', 'Finals_MVP', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'ROY', '3P_per_game', 'TRB_per_game', 'AST_per_game', 'STL_per_game',
       'BLK_per_game', 'PTS_per_game', 'FG%_totals', '3P_totals', '3P%_totals',
       '2P%_totals', 'eFG%_totals', 'FT%_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'PER_advanced',
       'TS%_advanced', 'OWS_advanced', 'DWS_advanced', 'WS_advanced',
       'BPM_advanced', 'VORP_advanced', 'All_League', 'Champ'],
      dtype='object')

#### Experimenting with Various Feature Selectors

In [20]:
#Select top k features based on mutual info regression
kbest_selector = SelectKBest(mutual_info_regression, k = 10)
kbest_selector.fit(X_all, y_all)
kbest_cols = list(X_all.columns[kbest_selector.get_support()])

kbest_cols

['All_Star',
 'PTS_per_game',
 'TRB_totals',
 'STL_totals',
 'PTS_totals',
 'PER_advanced',
 'OWS_advanced',
 'DWS_advanced',
 'WS_advanced',
 'All_League']

In [21]:
rfe_selector = RFE(estimator=LogisticRegression(max_iter=120),n_features_to_select = 10, step = 1)
rfe_selector.fit(X_all, y_all)
rfe_cols = list(X_all.columns[rfe_selector.get_support()])

rfe_cols

['MVP',
 'All_Star',
 'Scoring_Champ',
 'TRB_per_game',
 'AST_per_game',
 'FG%_totals',
 'TRB_totals',
 'PER_advanced',
 'WS_advanced',
 'Champ']

In [22]:
sfm_selector = SelectFromModel(estimator=LogisticRegression())
sfm_selector.fit(X_all, y_all)
sfm_cols = list(X_all.columns[sfm_selector.get_support()])

sfm_cols

['MVP',
 'All_Star',
 'Scoring_Champ',
 'TRB_per_game',
 'BLK_per_game',
 'PTS_per_game',
 'FG%_totals',
 'TRB_totals',
 'BLK_totals',
 'PER_advanced',
 'OWS_advanced',
 'WS_advanced',
 'BPM_advanced',
 'Champ']

In [23]:
list(set(sfm_cols) & set(rfe_cols) & set(kbest_cols))

['PER_advanced', 'WS_advanced', 'TRB_totals', 'All_Star']

The SFS technique takes awhile

In [24]:
#sfs_selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select = 10, cv =5, direction ='backward')
#sfs_selector.fit(X_all, y_all)
#X_all.columns[sfs_selector.get_support()]

#### Selecting Our Columns

In [25]:
# 'Finals_MVP', 'BLK_totals', 'VORP_advanced', 'STL_totals', 'TS%_advanced', 'All_Defensive', 'AST_Champ', 'TRB_totals', 'AST_totals'

# 'BPM_advanced', 'WS_advanced',
model_cols = ['Player', 'Eligible', 'Hall_of_Fame', 'Def_POY', 'All_Star', 'All_League', 'ChampChamp', # accolades
       'PTS_per_game', 'WS_advanced', 'TRB_totals', 'AST_totals', 'PER_advanced', 'PTS_totals'] # stats

eligible_df = (model_df[model_df['Eligible'] == 1]).loc[:, model_df.columns.isin(model_cols)]
noneligible_df = (model_df[model_df['Eligible'] == 0]).loc[:, model_df.columns.isin(model_cols)]

In [26]:
# Dropping players that are unpredictable (reasoning explained below)
extraneous_players = ['Maurice Stokes', 'Bill Bradley', 'Toni Kukoč',
       'Calvin Murphy', 'Vlade Divac', 'Buddy Jeannette',
       'Dražen Petrović', 'Al Cervi', 'Arvydas Sabonis',
       'Šarūnas Marčiulionis', 'Dino Radja', 'Chuck Cooper',
       'Bob Houbregs']

eligible_df = eligible_df[~eligible_df['Player'].isin(extraneous_players)]

### Train/Test Split

In [27]:
# Split training set into X and y
X_eligible = eligible_df.iloc[:, 3:].values
y_eligible = eligible_df.iloc[:, 2].values

In [28]:
# Train-test split dividing HOF eligible players into a training set and a validation set
X_training, X_validation, y_train, y_val = train_test_split(eligible_df, y_eligible, test_size = 0.25, random_state = 0)

In [29]:
X_train = X_training.iloc[:,3:].values
X_val = X_validation.iloc[:,3:].values

In [30]:
X_test = noneligible_df.iloc[:, 3:]
y_test = noneligible_df.iloc[:, 2]

### Feature Scaling

In [31]:
# Scale features
sc1 = StandardScaler()
X_eligible = sc1.fit_transform(X_eligible)

# Scale whole matrix of features to prevent information leakage
# Scale for training set and validation set
sc2 = StandardScaler()
X_train = sc2.fit_transform(X_train)
X_val = sc2.transform(X_val)

### Model Selection

In [32]:
# Define classifiers / regressors

CLASSIFIERS = [
    LogisticRegression(),
    XGBClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    GaussianNB(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    SVC(kernel = 'rbf')
  ]

In [33]:
# Define accuracy metrics

METRICS = [
    'f1',
    'accuracy',
    'precision',
    'recall',
    'average_precision'
]

Accuracy- Percentage of samples correctly classified

Precision- Of players model identified as HOF players, how many were truly HOF

Recall- Of true HOF players, how many did model identify as HOF

F1- Harmonic mean of precision and recall

In [34]:
# Function that fits, predicts, and evaluates on different model types
def get_metrics(classifier, X, y_true):
  print(classifier)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X)
  #clf_val_preds = classifier.predict(X_val)

  output_metrics = []

  for metric in METRICS:
    score = get_scorer(metric)._score_func(y_true, y_pred)
    output_metrics.append(score)

  return output_metrics


In [35]:
training_metrics = []
val_metrics = []

for classifier in CLASSIFIERS:
  clf_train_metrics = get_metrics(classifier, X_train, y_train)
  val_train_metrics = get_metrics(classifier, X_val, y_val)
  
  training_metrics.append([classifier] + clf_train_metrics)
  val_metrics.append([classifier] + val_train_metrics)

LogisticRegression()
LogisticRegression()
XGBClassifier()
XGBClassifier()
DecisionTreeClassifier()
DecisionTreeClassifier()
RandomForestClassifier()
RandomForestClassifier()
SVC()
SVC()
GaussianNB()
GaussianNB()
KNeighborsClassifier()
KNeighborsClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
SVC()
SVC()


In [36]:
train_metrics_df = pd.DataFrame(data = training_metrics, columns=['Classifier'] + METRICS)
val_metrics_df = pd.DataFrame(data = val_metrics, columns=['Classifier'] + METRICS)

In [37]:
# Output modeling metrics
train_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
2,DecisionTreeClassifier(),1.0,1.0,1.0,1.0,1.0
3,"(DecisionTreeClassifier(max_features='auto', r...",1.0,1.0,1.0,1.0,1.0
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.994764,0.999673,1.0,0.989583,0.98991
1,XGBClassifier(),0.989583,0.999346,0.989583,0.989583,0.979602
4,SVC(),0.916667,0.994771,0.916667,0.916667,0.842892
8,SVC(),0.916667,0.994771,0.916667,0.916667,0.842892
0,LogisticRegression(),0.899471,0.993791,0.913978,0.885417,0.812847
6,KNeighborsClassifier(),0.874317,0.992484,0.91954,0.833333,0.771512
5,GaussianNB(),0.607595,0.959477,0.436364,1.0,0.436364


In [38]:
val_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
1,XGBClassifier(),0.792453,0.989216,0.954545,0.677419,0.656431
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.763636,0.987255,0.875,0.677419,0.602546
3,"(DecisionTreeClassifier(max_features='auto', r...",0.745098,0.987255,0.95,0.612903,0.594023
4,SVC(),0.740741,0.986275,0.869565,0.645161,0.571794
8,SVC(),0.740741,0.986275,0.869565,0.645161,0.571794
6,KNeighborsClassifier(),0.730769,0.986275,0.904762,0.612903,0.566296
0,LogisticRegression(),0.705882,0.985294,0.9,0.580645,0.535326
2,DecisionTreeClassifier(),0.679245,0.983333,0.818182,0.580645,0.487818
5,GaussianNB(),0.610526,0.963725,0.453125,0.935484,0.425852


### Hyperparameter Optimization

In [39]:
logistic_classifier = LogisticRegression(random_state= 0)

In [40]:
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)

In [41]:
# Define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logistic_classifier, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(X_eligible, y_eligible)

In [42]:
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.860527 using {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

0.844287 (0.073548) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.844287 (0.073548) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.842411 (0.074895) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.842411 (0.074895) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.842411 (0.074895) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.843972 (0.075292) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.839254 (0.076499) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.839254 (0.076499) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.852231 (0.069669) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.845170 (0.073543) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.845170 (0.073543) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.859270 (0.065139) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.804256 (0.080468) w

### Modeling

In [43]:
#from sklearn.preprocessing import PolynomialFeatures
#poly_reg = PolynomialFeatures(degree = 2)
#X_poly = poly_reg.fit_transform(X_train)

In [44]:
# Fit a classifier with parameters found above
classifier = LogisticRegression(random_state = 1, **grid_result.best_params_)
classifier.fit(X_train, y_train)

LogisticRegression(C=0.01, random_state=1, solver='liblinear')

### Predictions

In [45]:
# Predict both class and probability for the training set 
y_train_pred_probs = classifier.predict_proba(X_train)[:, 1]
y_train_pred = classifier.predict(X_train)

In [46]:
# Predict both class and probability for the test set 
y_val_pred_probs = classifier.predict_proba(X_val)[:, 1]
y_val_pred = classifier.predict(X_val)

In [47]:
#eligible_df['pred'] = y_train_pred_probs
X_training['pred'] = y_train_pred_probs
X_validation['pred'] = y_val_pred_probs

#### Borderline Correct Positive HOF Predictions

In [48]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)[:5]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
2203,Dennis Johnson,1,1,0,5,14.1,4290.0,5500.0,15535,14.6,82.6,2,0.533947
4937,George Yardley,1,1,0,6,19.2,4200.8,802.4,9063,20.6,58.5,2,0.533947
4492,Wes Unseld,1,1,0,5,10.8,13776.0,3837.6,10624,16.0,110.1,1,0.538701
2840,Bob McAdoo,1,1,0,5,22.1,8008.8,1959.6,18787,20.7,89.1,2,0.539099
2910,Dick McGuire,1,1,0,7,8.0,3099.6,4206.6,5921,15.6,50.9,1,0.54095


In [49]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)[:5]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
3530,Jim Pollard,1,1,0,4,13.2,3416.4,1401.6,5762,15.6,34.9,4,0.521464
4376,David Thompson,1,1,0,5,22.7,2427.2,1953.6,13422,19.9,63.3,3,0.542723
4606,Bobby Wanzer,1,1,0,5,12.2,2556.0,1817.6,6924,17.3,63.9,3,0.549109
4401,Nate Thurmond,1,1,0,7,15.0,14460.0,2602.8,14437,16.5,78.0,0,0.561155
289,Zelmo Beaty,1,1,0,5,17.1,9690.1,1333.5,15207,18.7,106.0,2,0.568804


#### Under predictions

##### Expected Under Predictions:

<br/>

#### **Old players:**

<br/>


Bob Houbregs- A few seasons with mediocre stats, great college player but no NBA accolades and no other league. No clue why he is in. From 50s

<br/>

Chuck Cooper- 6.7 PPG and no accolades. Instrumental to NBA as broke the color barrier

<br/>

Al Cervi- 4 years of recorded stats with Syracuse (49-53). Dominated NBL with scoring champion, championship, 4 all league

<br/>

Buddy Jeannette	- From 40s, 3 seasons in BAA, but played in late 30s, early 40s with no stats for NBL- seeral first team/titles

<br/>
<br/>


#### **European players:**

<br/>

Dino Radja - Croatian player, only 4 years in NBA (with very good stats) but made it for Europe play

<br/>

Šarūnas Marčiulionis - 8 solid years in NBA, 8 years before in USSR league, made it for foreign/olympic play

<br/>

Arvydas Sabonis	- Many great years in Lithuania/Spain unaccounted for, solid 7 years in NBA in 30s


<br/>

Dražen Petrović	- 5 solid years in NBA, played in Yugoslavia/Spain


<br/>

Vlade Divac	- Many years in NBA with solid stats, no accolades. Did well in Yugoslav league, won Europa POY in 7 years before NBA

<br/>

Toni Kukoc- Very solid NBA career and picked up some championships. Played in Italy/Yugoslavia before (9 years). Won Europa POY, Euroscar POY, many Euroleage Championships

<br/>
<br/>


#### **Other:**

Bill Bradley- Not great individual stats, exceptional college player and 2 time champ but not above average. 70s so not that old

<br/>

Calvin Murphy- Very good pro career with not many accolades. Three time all american in college

<br/>

Maurice Stokes - From 50s, 3 years in NBA putting up 17-17 type numbers. Injured and paralyzed after three years with 3 all NBA. Good college player too

<br/>



##### **Legitimately wrong:**

Guy Rodgers- Long NBA career, 4x all star, 2x ast champ

Ralph Sampson- 6 solid years in NBA (of his 9), 4x all star, legend in college

Bill Walton - MVP, Legend in college, 2x all star, 2x all league. Can be fixed by adding MVP variable but may hurt with other players

Walt Bellamy- 4x all star, tons of points (43) and rebounds (12) over career

Wes Unseld- Solid accolades, consistent 15-15 guy earlier in career

Jamaal Wilkes- Solid contributor to many championships, also won All D which is currently not on here

Players to look into:

 'Walt Bellamy',
       'Maurice Cheeks', 'Earl Monroe', 'Arnie Risen', 'Tom Gola',
       'Frank Ramsey', 'Wes Unseld', 'Andy Phillip', 'Bob Dandridge',
       'David Thompson'

In [50]:
#eligible_df[(eligible_df['pred'] < 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] < 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
2029,Lou Hudson,1,1,0,6,20.2,3916.0,2403.0,17940,17.4,81.0,1,0.499854
3497,Andy Phillip,1,1,0,5,9.1,3084.4,3785.4,6384,14.1,60.5,2,0.484417
489,Carl Braun,1,1,0,5,13.5,2679.2,2915.6,10625,15.8,64.3,2,0.480955
308,Walt Bellamy,1,1,0,4,20.1,14289.1,2503.2,20941,19.9,130.0,0,0.409421
1452,Joe Fulks,1,1,0,2,16.4,2591.7,586.8,8003,10.9,29.2,4,0.363027
3068,Earl Monroe,1,1,0,4,18.8,2778.0,3611.4,17454,17.2,77.4,1,0.356331
2673,Clyde Lovellette,1,1,0,4,17.0,6688.0,1126.4,11947,21.7,70.6,1,0.354565
1561,Manu Ginóbili,1,1,0,2,13.3,3699.5,4016.6,14043,20.2,106.4,2,0.339316
761,Maurice Cheeks,1,1,0,4,11.1,3082.8,7376.7,12195,16.5,103.5,0,0.337977
3790,Guy Rodgers,1,1,0,4,11.7,3835.6,6957.6,10415,13.6,33.3,0,0.265893


In [51]:
X_validation[(X_validation['pred'] < 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
3944,Charlie Scott,1,1,0,5,20.7,2868.0,3513.3,14837,15.8,41.4,2,0.453777
1583,Gail Goodrich,1,1,0,5,18.6,3299.2,4845.7,19181,16.7,76.3,1,0.436125
1576,Tom Gola,1,1,0,5,11.3,5584.0,2931.6,7871,14.2,53.2,1,0.408013
967,Bob Dandridge,1,1,0,4,18.5,5705.2,2852.6,15530,16.7,80.3,1,0.371366
3730,Arnie Risen,1,1,0,4,12.0,6178.9,1082.9,7633,16.7,56.0,1,0.336515
3878,Ralph Sampson,1,1,0,4,15.4,4012.8,1048.8,7039,16.0,20.1,1,0.282623
3627,Frank Ramsey,1,1,0,0,13.4,3426.5,1121.4,8378,15.6,49.2,0,0.094992
2284,K.C. Jones,1,1,0,0,7.4,2366.0,2906.8,5011,10.4,38.6,0,0.093048


#### Over predictions

Larry Foust deserves to be in HOF. Also has 94% rating on Bball ref

Amare Stoudemire (72% bball ref), Chauncey (84% on bball ref), Shawn Marion (76% on bball ref) are newer to ballot

Larry Costello is in HOF as Contributor- this led to confusion as one of his key contributions was his play. 71% on bball ref

Tom Sanders in HOF as contributor but 15% rating on bball ref

Max Zaslofsky	

Legitimate wrong predictions:

Jimmy Jones- ABA Legend, very efficient. Think ABA was less respected when he was in it than Erving. Shorter career

Mack Calvin- Early ABA

Shawn Kemp (39% bball ref)- Lower longevity

Donnie Freeman- ABA Star i late 60s early 70s

Walter Davis (31% bball ref)

Jermaine O'Neal (32% bball ref)

In [52]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
4240,Amar'e Stoudemire,1,0,0,6,18.9,6598.8,1015.2,15994,21.8,92.5,5,0.794614
2363,Shawn Kemp,1,0,0,6,14.6,8828.4,1681.6,15347,19.1,89.5,3,0.689923
3298,Jermaine O'Neal,1,0,0,6,13.2,7279.2,1415.4,13309,17.9,66.0,3,0.653888
351,Chauncey Billups,1,0,0,5,15.2,3024.7,5632.2,15802,18.8,120.8,3,0.646232
2281,Jimmy Jones,1,0,0,6,16.6,2949.8,3087.0,11366,17.1,78.8,3,0.644861
670,Mack Calvin,1,0,0,5,16.1,1887.5,3624.0,12172,17.4,60.4,4,0.62855
2223,Kevin Johnson,1,0,0,3,17.9,2425.5,6688.5,13127,20.7,92.8,5,0.622989
1440,Donnie Freeman,1,0,0,5,18.9,2268.0,2268.0,12233,18.2,58.0,4,0.61744
2228,Marques Johnson,1,0,0,5,20.1,4837.0,2487.6,13892,20.1,79.8,3,0.584813
1028,Walter Davis,1,0,0,6,18.9,3099.0,3925.4,19521,19.1,76.9,2,0.580117


In [53]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,PTS_per_game,TRB_totals,AST_totals,PTS_totals,PER_advanced,WS_advanced,All_League,pred
1412,Larry Foust,1,0,0,8,13.7,8006.6,1388.9,11198,19.8,79.2,2,0.724164
2767,Shawn Marion,1,0,0,4,15.2,10118.1,2209.7,17700,18.8,124.9,2,0.53362


Model does well at handling these extreme cases

### Model Coefficients

In [54]:
for col, coef in zip(model_cols[3:], classifier.coef_[0]):
  print(f"{col}: {exp(coef)}")

Def_POY: 1.1140104059936384
All_Star: 1.6056111682510639
All_League: 0.9760283497016762
ChampChamp: 1.0575579492618166
PTS_per_game: 1.0276420359884002
WS_advanced: 1.023778245055249
TRB_totals: 0.9984134892261213
AST_totals: 1.1405611533931326
PER_advanced: 1.4468397729338855


Massive collinearity problem here causing points to hurt here

In [55]:
#for col, coef in zip(poly_reg.get_feature_names(), classifier.coef_[0]):
#  print(f"{col}: {exp(coef)}")

### Confusion Matrix

In [56]:
# Output confuson matrix for the training set
cm = confusion_matrix(y_train, y_train_pred)
#[00 01]
#[10 11]
print(cm)

[[2951   13]
 [  12   84]]


In [57]:
print(f"Accuracy: {get_scorer('accuracy')._score_func(y_train, y_train_pred)}")
print(f"F1 score: {get_scorer('f1')._score_func(y_train, y_train_pred)}")
print(f"Precision score: {get_scorer('precision')._score_func(y_train, y_train_pred)}")
print(f"Recall score: {get_scorer('recall')._score_func(y_train, y_train_pred)}")

Accuracy: 0.9918300653594772
F1 score: 0.8704663212435233
Precision score: 0.865979381443299
Recall score: 0.875


In [58]:
# Output confusion matrix for the validation set
cm = confusion_matrix(y_val, y_val_pred)
#[00 01]
#[10 11]
print(cm)
accuracy_score(y_val, y_val_pred)

[[987   2]
 [  8  23]]


0.9901960784313726

In [59]:
print(f"Accuracy: {get_scorer('accuracy')._score_func(y_val, y_val_pred)}")
print(f"F1 score: {get_scorer('f1')._score_func(y_val, y_val_pred)}")
print(f"Precision score: {get_scorer('precision')._score_func(y_val, y_val_pred)}")
print(f"Recall score: {get_scorer('recall')._score_func(y_val, y_val_pred)}")

Accuracy: 0.9901960784313726
F1 score: 0.8214285714285714
Precision score: 0.92
Recall score: 0.7419354838709677


### Other Metrics

In [196]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2964
           1       0.91      0.88      0.89        96

    accuracy                           0.99      3060
   macro avg       0.95      0.94      0.95      3060
weighted avg       0.99      0.99      0.99      3060



In [197]:
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       989
           1       0.95      0.68      0.79        31

    accuracy                           0.99      1020
   macro avg       0.97      0.84      0.89      1020
weighted avg       0.99      0.99      0.99      1020

