<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Packages and Data

In [1]:
# Load packages

from math import exp
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, get_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE, SelectFromModel, SequentialFeatureSelector

In [2]:
# Read in cleaned player data
model_df = pd.read_csv("https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv", index_col=0)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4977 entries, 0 to 4976
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          4977 non-null   object 
 1   Eligible        4977 non-null   int64  
 2   Position        4977 non-null   object 
 3   Hall_of_Fame    4977 non-null   int64  
 4   MVP             4977 non-null   int64  
 5   Finals_MVP      4977 non-null   int64  
 6   NBA_Champ       4977 non-null   int64  
 7   All_NBA         4977 non-null   int64  
 8   All_Defensive   4977 non-null   int64  
 9   Def_POY         4977 non-null   int64  
 10  All_Star        4977 non-null   int64  
 11  Scoring_Champ   4977 non-null   int64  
 12  TRB_Champ       4977 non-null   int64  
 13  AST_Champ       4977 non-null   int64  
 14  STL_Champ       4977 non-null   int64  
 15  BLK_Champ       4977 non-null   int64  
 16  All_ABA         4977 non-null   int64  
 17  ABA_Champ       4977 non-null   i

### Data Cleaning

In [3]:
# Replace all instances of -999 with NA (consider doing this in scraper to eliminate a step)
model_df = model_df.replace(-999, np.nan)

In [4]:
# Reduce number of possible positions to guard, forward, center
model_df.loc[model_df['Position'] == 'Center/Forward', 'Position'] = 'Center'
model_df.loc[model_df['Position'].isin(['PointGuard', 'ShootingGuard', 'Guard/Forward']), 'Position'] = 'Guard'
model_df.loc[model_df['Position'].isin(['SmallForward', 'PowerForward', 'Forward/Guard', 'Forward/Center']), 'Position'] = 'Forward'

In [5]:
# Combine individual ABA and NBA accolades
model_df['All_League'] = model_df['All_NBA'] + model_df['All_ABA']
model_df['Champ'] = model_df['NBA_Champ'] + model_df['ABA_Champ']

In [6]:
hofers = model_df['Hall_of_Fame'].value_counts()[1]

In [7]:
# Count number of NAs by column:
for col in model_df.columns:
  na_df = model_df[model_df[col].isna()]
  if len(na_df) > 0:
    try:
      na_rows = na_df['Hall_of_Fame'].value_counts()[1]
    except KeyError:
      na_rows = 0
    print(f"{col}:\t{len(model_df[model_df[col].isna()])} nulls \t{na_rows}/{hofers} HOFers are null")

3P_per_game:	1118 nulls 	53/140 HOFers are null
3PA_per_game:	1118 nulls 	53/140 HOFers are null
2P_per_game:	1118 nulls 	53/140 HOFers are null
2PA_per_game:	1118 nulls 	53/140 HOFers are null
ORB_per_game:	949 nulls 	39/140 HOFers are null
DRB_per_game:	949 nulls 	39/140 HOFers are null
TRB_per_game:	288 nulls 	1/140 HOFers are null
STL_per_game:	1180 nulls 	41/140 HOFers are null
BLK_per_game:	1180 nulls 	41/140 HOFers are null
GS_totals:	1689 nulls 	66/140 HOFers are null
FG%_totals:	34 nulls 	0/140 HOFers are null
3P_totals:	1118 nulls 	53/140 HOFers are null
3PA_totals:	1118 nulls 	53/140 HOFers are null
3P%_totals:	1627 nulls 	54/140 HOFers are null
2P_totals:	1118 nulls 	53/140 HOFers are null
2PA_totals:	1118 nulls 	53/140 HOFers are null
2P%_totals:	1162 nulls 	53/140 HOFers are null
eFG%_totals:	1146 nulls 	53/140 HOFers are null
FT%_totals:	241 nulls 	0/140 HOFers are null
ORB_totals:	949 nulls 	39/140 HOFers are null
DRB_totals:	949 nulls 	39/140 HOFers are null
TRB_totals

In [8]:
# Columns with -999s
for col in model_df.columns:
  if len(model_df[model_df[col] == -999]) > 0:
    print(col, "-", len(model_df[model_df[col] == -999]))

#### **What to do with NA values**
<br/>

Columns to drop:
`GS_totals`, `Trp_Dbl_totals`, `ORB_per_game`, `DRB_per_game`, `ORB_totals`, `DRB_totals`, `3P%_totals`, `2P%_totals`, `eFG%_totals`, `OWS_advanced`, `DWS_advanced`, `WS/48_advanced`, `OBPM_advanced`, `DBPM_advanced`
<br/>
<br/>

Columns to consider dropping: `3P_per_game`, `3PA_per_game`, `3P_totals`, `3PA_totals`, `FG%_totals` (these players never took a shot), `FT%_totals` (these players never took a FT)
<br/>
<br/>

Columns to impute from FGM, FGA, etc.:
`2P_per_game`, `2PA_per_game`, `2P_totals`, `2PA_totals`
<br/>
<br/>

Columns to fill with league average:
`PER_advanced`, `VORP_advanced` (consider some more advanced PER/VORP)
<br/>
<br/>

Columns to make 0:
`WS_advanced`, `BPM_advanced` (consider some more advanced imputation for BPM)
<br/>
<br/>

Columns to make 0 or fill with mean (undecided):,
        "`TS%_advanced` (these players never took a shot or free throw)
<br/>
<br/>

Columns to fill with mean by position:,
`PTS_per_game`, `TRB_per_game`, `AST_per_game`, `STL_per_game`, `BLK_per_game`, `TRB_totals`, `AST_totals`, `STL_totals`, `BLK_totals`

In [9]:
# Fill NAs accordingly
def fillNulls(model_df):
  cols_to_zero = ['WS_advanced', 'OWS_advanced', 'DWS_advanced', 'BPM_advanced',
                  '3P_per_game', '3PA_per_game', '3P_totals', '3PA_totals', 'FG%_totals', 'FT%_totals', 'TS%_advanced']
  model_df[cols_to_zero] = model_df[cols_to_zero].fillna(0) # fill cols with 0
  
  cols_to_avg = ['PER_advanced', 'VORP_advanced', '3P%_totals', '2P%_totals', 'eFG%_totals']
  model_df[cols_to_avg] = model_df[cols_to_avg].fillna(model_df[cols_to_avg].mean()) # fill cols with avg
  
  cols_to_position_avg = ['TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game']
  model_df[cols_to_position_avg] = model_df.groupby("Position")[cols_to_position_avg].transform(lambda x: x.fillna(x.mean())) # fills cols with avg by position

  cols_to_scale_avg = ['TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals']
  for col_total, col_avg in zip(cols_to_scale_avg, cols_to_position_avg):
    model_df[col_total] = model_df[col_total].fillna(model_df[col_avg] * model_df['G_totals'])
  
  cols_to_fill = ['2P_per_game', '2PA_per_game', '2P_totals', '2PA_totals']
  cols_to_fill_with = ['FG_per_game', 'FGA_per_game', 'FG_totals', 'FGA_totals']
  model_df[cols_to_fill] = model_df[cols_to_fill].fillna(model_df[cols_to_fill_with]) # fill 2P shooting columns with FG columns

  return model_df
fillNulls(model_df)

Unnamed: 0,Player,Eligible,Position,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,...,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced,All_League,Champ
0,Alaa Abdelnaby,1,Forward,0,0,0,0,0,0,0,...,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.500000,0,0
1,Zaid Abdul-Aziz,1,Center,0,0,0,0,0,0,0,...,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.700000,0,0
2,Kareem Abdul-Jabbar,1,Center,1,6,2,6,15,11,0,...,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.700000,15,6
3,Mahmoud Abdul-Rauf,1,Guard,0,0,0,0,0,0,0,...,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.500000,0,0
4,Tariq Abdul-Wahad,1,Guard,0,0,0,0,0,0,0,...,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.200000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4972,Jim Zoet,1,Center,0,0,0,0,0,0,0,...,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.100000,0,0
4973,Bill Zopf,1,Guard,0,0,0,0,0,0,0,...,-0.5,0.4,-0.1,-0.011,,,0.0,3.434634,0,0
4974,Ivica Zubac,0,Center,0,0,0,0,0,0,0,...,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.500000,0,0
4975,Matt Zunic,1,Guard,0,0,0,0,0,0,0,...,0.2,1.8,2.0,,,,0.0,3.434634,0,0


In [10]:
# Columns to exclude

#NBA and ABA were combined for All League, Championship
#Games played not a stat
#Name, eligibility, position not relevant
#Attempts does not mean anything- can just pass in makes and percentage
#FT, FG encoded in points

exclude_cols = ['Player', 'Eligible', 'Position', 'NBA_Champ', 'All_NBA', 'All_ABA', 'ABA_Champ', 'G_totals', 
                '3PA_totals', 'FTA_totals', 'FGA_totals', 
                'FTA_per_game', 'FGA_per_game', '3PA_per_game',
                'FT_per_game', 'FT_totals', 'FG_per_game', 'FG_totals']

# Count number of NAs by column:
for col in model_df.columns:
  na_df = model_df[model_df[col].isna()]
  if len(na_df) > 0:
    try:
      na_rows = na_df['Hall_of_Fame'].value_counts()[1]
    except KeyError:
      na_rows = 0
    print(f"{col}:\t{len(model_df[model_df[col].isna()])} nulls \t{na_rows}/{hofers} HOFers are null")
    exclude_cols.append(col)

2P_per_game:	1118 nulls 	53/140 HOFers are null
2PA_per_game:	1118 nulls 	53/140 HOFers are null
ORB_per_game:	949 nulls 	39/140 HOFers are null
DRB_per_game:	949 nulls 	39/140 HOFers are null
GS_totals:	1689 nulls 	66/140 HOFers are null
2P_totals:	1118 nulls 	53/140 HOFers are null
2PA_totals:	1118 nulls 	53/140 HOFers are null
ORB_totals:	949 nulls 	39/140 HOFers are null
DRB_totals:	949 nulls 	39/140 HOFers are null
Trp_Dbl_totals:	4526 nulls 	53/140 HOFers are null
WS/48_advanced:	344 nulls 	1/140 HOFers are null
OBPM_advanced:	1185 nulls 	41/140 HOFers are null
DBPM_advanced:	1185 nulls 	41/140 HOFers are null


In [11]:
exclude_cols

['Player',
 'Eligible',
 'Position',
 'NBA_Champ',
 'All_NBA',
 'All_ABA',
 'ABA_Champ',
 'G_totals',
 '3PA_totals',
 'FTA_totals',
 'FGA_totals',
 'FTA_per_game',
 'FGA_per_game',
 '3PA_per_game',
 'FT_per_game',
 'FT_totals',
 'FG_per_game',
 'FG_totals',
 '2P_per_game',
 '2PA_per_game',
 'ORB_per_game',
 'DRB_per_game',
 'GS_totals',
 '2P_totals',
 '2PA_totals',
 'ORB_totals',
 'DRB_totals',
 'Trp_Dbl_totals',
 'WS/48_advanced',
 'OBPM_advanced',
 'DBPM_advanced']

### Feature Selection

In [12]:
# All remaining columns
model_df.columns

Index(['Player', 'Eligible', 'Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP',
       'NBA_Champ', 'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'All_ABA', 'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game',
       '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',
       'FT_per_game', 'FTA_per_game', 'ORB_per_game', 'DRB_per_game',
       'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',
       'PTS_per_game', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals',
       'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals',
       '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals',
       'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals',
       'PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced',
       'WS_advanced', 'WS/48_advanced', 'OBPM_advanced

In [141]:
#Improve prediction of solid contributors to many championships
model_df['Champ PTS_per_game'] = model_df['Champ'] * model_df['PTS_per_game']
#model_df['Champ Champ'] = model_df['Champ'] * model_df['Champ']

In [142]:
# Split into two dataframes: one where the players are eligible for the Hall of Fame, one where they are not

all_cols_eligible_df = (model_df[model_df['Eligible'] == 1]).loc[:, ~model_df.columns.isin(exclude_cols)]
#all_cols_noneligible_df = (model_df[model_df['Eligible'] == 0]).loc[:, ~model_df.columns.isin(exclude_cols)]

In [143]:
#poly_reg = PolynomialFeatures(degree = 2)
#X_poly = poly_reg.fit_transform(all_cols_eligible_df.iloc[:,1:])
#poly_df = pd.DataFrame(X_poly, index=all_cols_eligible_df.index, columns=poly_reg.get_feature_names_out())
#X_all = pd.DataFrame(sc0.fit_transform(poly_df), index=poly_df.index, columns=poly_df.columns)

In [144]:
# Scale features and split eligible dataset into dependent and independent variables

sc0 = StandardScaler()

X_all = pd.DataFrame(sc0.fit_transform(all_cols_eligible_df.iloc[:,1:]), index=all_cols_eligible_df.index, columns=all_cols_eligible_df.columns[1:])
y_all = all_cols_eligible_df.iloc[:,0]

In [145]:
len(X_all.columns)

38

#### Experimenting with Various Feature Selectors

In [146]:
#Select top k features based on mutual info regression
kbest_selector = SelectKBest(mutual_info_regression, k = 10)
kbest_selector.fit(X_all, y_all)
kbest_cols = list(X_all.columns[kbest_selector.get_support()])

kbest_cols

['All_Star',
 'PTS_per_game',
 'TRB_totals',
 'PTS_totals',
 'PER_advanced',
 'OWS_advanced',
 'DWS_advanced',
 'WS_advanced',
 'BPM_advanced',
 'All_League']

In [147]:
#rfe_selector = RFE(estimator=LogisticRegression(max_iter=120),n_features_to_select = 10, step = 1)
rfe_selector = RFE(estimator=LogisticRegression(max_iter=120),n_features_to_select = 10, step = 1)
rfe_selector.fit(X_all, y_all)
rfe_cols = list(X_all.columns[rfe_selector.get_support()])

rfe_cols

['MVP',
 'All_Star',
 'Scoring_Champ',
 'TRB_per_game',
 'BLK_per_game',
 'FG%_totals',
 'TRB_totals',
 'PER_advanced',
 'WS_advanced',
 'Champ']

In [148]:
sfm_selector = SelectFromModel(estimator=LogisticRegression())
sfm_selector.fit(X_all, y_all)
sfm_cols = list(X_all.columns[sfm_selector.get_support()])

sfm_cols

['MVP',
 'All_Star',
 'Scoring_Champ',
 'TRB_per_game',
 'BLK_per_game',
 'PTS_per_game',
 'FG%_totals',
 'TRB_totals',
 'BLK_totals',
 'PER_advanced',
 'OWS_advanced',
 'DWS_advanced',
 'WS_advanced',
 'BPM_advanced',
 'Champ']

Variables found significant by each technique:

In [149]:
list(set(sfm_cols) & set(rfe_cols) & set(kbest_cols))

['TRB_totals', 'PER_advanced', 'WS_advanced', 'All_Star']

Variables found significant by 2/3 techniques

In [150]:
list(set(sfm_cols) & set(rfe_cols) | set(kbest_cols) & set(rfe_cols) | set(kbest_cols) & set(sfm_cols))

['TRB_per_game',
 'BLK_per_game',
 'PTS_per_game',
 'FG%_totals',
 'PER_advanced',
 'WS_advanced',
 'Scoring_Champ',
 'DWS_advanced',
 'OWS_advanced',
 'BPM_advanced',
 'TRB_totals',
 'MVP',
 'Champ',
 'All_Star']

The SFS technique takes awhile

In [117]:
#sfs_selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select = 10, cv =5, direction ='backward')
#sfs_selector.fit(X_all, y_all)
#X_all.columns[sfs_selector.get_support()]

#### Selecting Our Columns

In [265]:
# 'Finals_MVP', 'BLK_totals', 'VORP_advanced', 'STL_totals', 'TS%_advanced', 'All_Defensive', 'AST_Champ', 'TRB_totals', 'AST_totals'

# 'BPM_advanced', 'WS_advanced',
model_cols = ['Player', 'Eligible', #info for splitting/assessing predictions
              'Hall_of_Fame', 'Def_POY', 'All_Star', 'All_League', 'AST_Champ', # accolades
       'PTS_totals', 'TRB_totals', 'PER_advanced', 'Champ PTS_per_game'] # stats

eligible_df = (model_df[model_df['Eligible'] == 1]).loc[:, model_df.columns.isin(model_cols)]
noneligible_df = (model_df[model_df['Eligible'] == 0]).loc[:, model_df.columns.isin(model_cols)]

In [266]:
# Dropping players that are unpredictable (reasoning explained below)
extraneous_players = ['Maurice Stokes', 'Bill Bradley', 'Toni Kukoč',
       'Calvin Murphy', 'Vlade Divac', 'Buddy Jeannette',
       'Dražen Petrović', 'Al Cervi', 'Arvydas Sabonis',
       'Šarūnas Marčiulionis', 'Dino Radja', 'Chuck Cooper',
       'Bob Houbregs']

eligible_df = eligible_df[~eligible_df['Player'].isin(extraneous_players)]

### Train/Test Split

In [267]:
# Split training set into X and y
X_eligible = eligible_df.iloc[:, 3:].values
y_eligible = eligible_df.iloc[:, 2].values

In [268]:
# Train-test split dividing HOF eligible players into a training set and a validation set
X_training, X_validation, y_train, y_val = train_test_split(eligible_df, y_eligible, test_size = 0.25, random_state = 0)

In [269]:
X_train = X_training.iloc[:,3:].values
X_val = X_validation.iloc[:,3:].values

In [270]:
X_test = noneligible_df.iloc[:, 3:]
y_test = noneligible_df.iloc[:, 2]

### Feature Scaling

In [271]:
# Scale features
sc1 = StandardScaler()
X_eligible = sc1.fit_transform(X_eligible)

# Scale whole matrix of features to prevent information leakage
# Scale for training set and validation set
sc2 = StandardScaler()
X_train = sc2.fit_transform(X_train)
X_val = sc2.transform(X_val)

### Model Selection

In [272]:
# Define classifiers / regressors

CLASSIFIERS = [
    LogisticRegression(),
    XGBClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    GaussianNB(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    SVC(kernel = 'rbf')
  ]

In [273]:
# Define accuracy metrics

METRICS = [
    'f1',
    'accuracy',
    'precision',
    'recall',
    'average_precision'
]

Accuracy- Percentage of samples correctly classified

Precision- Of players model identified as HOF players, how many were truly HOF

Recall- Of true HOF players, how many did model identify as HOF

F1- Harmonic mean of precision and recall

In [274]:
# Function that fits, predicts, and evaluates on different model types
def get_metrics(classifier, X, y_true):
  print(classifier)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X)
  #clf_val_preds = classifier.predict(X_val)

  output_metrics = []

  for metric in METRICS:
    score = get_scorer(metric)._score_func(y_true, y_pred)
    output_metrics.append(score)

  return output_metrics


In [275]:
training_metrics = []
val_metrics = []

for classifier in CLASSIFIERS:
  clf_train_metrics = get_metrics(classifier, X_train, y_train)
  val_train_metrics = get_metrics(classifier, X_val, y_val)
  
  training_metrics.append([classifier] + clf_train_metrics)
  val_metrics.append([classifier] + val_train_metrics)

LogisticRegression()
LogisticRegression()
XGBClassifier()
XGBClassifier()
DecisionTreeClassifier()
DecisionTreeClassifier()
RandomForestClassifier()
RandomForestClassifier()
SVC()
SVC()
GaussianNB()
GaussianNB()
KNeighborsClassifier()
KNeighborsClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
SVC()
SVC()


In [276]:
train_metrics_df = pd.DataFrame(data = training_metrics, columns=['Classifier'] + METRICS)
val_metrics_df = pd.DataFrame(data = val_metrics, columns=['Classifier'] + METRICS)

In [277]:
# Output modeling metrics
train_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
2,DecisionTreeClassifier(),1.0,1.0,1.0,1.0,1.0
3,"(DecisionTreeClassifier(max_features='auto', r...",1.0,1.0,1.0,1.0,1.0
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.994764,0.999673,1.0,0.989583,0.98991
1,XGBClassifier(),0.979167,0.998693,0.979167,0.979167,0.959421
4,SVC(),0.93617,0.996078,0.956522,0.916667,0.879426
8,SVC(),0.93617,0.996078,0.956522,0.916667,0.879426
6,KNeighborsClassifier(),0.905028,0.994444,0.975904,0.84375,0.828321
0,LogisticRegression(),0.904255,0.994118,0.923913,0.885417,0.821643
5,GaussianNB(),0.66899,0.968954,0.502618,1.0,0.502618


In [278]:
val_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
3,"(DecisionTreeClassifier(max_features='auto', r...",0.814815,0.990196,0.956522,0.709677,0.687645
0,LogisticRegression(),0.8,0.989216,0.916667,0.709677,0.659361
1,XGBClassifier(),0.8,0.989216,0.916667,0.709677,0.659361
2,DecisionTreeClassifier(),0.745763,0.985294,0.785714,0.709677,0.566427
5,GaussianNB(),0.729412,0.977451,0.574074,1.0,0.574074
6,KNeighborsClassifier(),0.716981,0.985294,0.863636,0.612903,0.54109
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.716981,0.985294,0.863636,0.612903,0.54109
4,SVC(),0.692308,0.984314,0.857143,0.580645,0.510441
8,SVC(),0.692308,0.984314,0.857143,0.580645,0.510441


### Hyperparameter Optimization

In [279]:
logistic_classifier = LogisticRegression(random_state= 0)

In [280]:
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)

In [281]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py

def print_dataframe(filtered_cv_results):
    """Pretty print for filtered dataframe"""
    for mean_precision, std_precision, mean_recall, std_recall, params in zip(
        filtered_cv_results["mean_test_precision"],
        filtered_cv_results["std_test_precision"],
        filtered_cv_results["mean_test_recall"],
        filtered_cv_results["std_test_recall"],
        filtered_cv_results["params"],
    ):
        print(
            f"precision: {mean_precision:0.3f} (±{std_precision:0.03f}),"
            f" recall: {mean_recall:0.3f} (±{std_recall:0.03f}),"
            f" for {params}"
        )
    print()

In [282]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py

# TODO: 
# We may also try keeping all models with one standard deviation of the best by
# recall and then selecting the fastest model to predict.

def refit_strategy(cv_results):
  """Define the strategy to select the best estimator.

  The strategy defined here is to filter-out all results below a precision threshold
  of 0.9, rank the remaining by recall and select the model with the highest
  recall.

  Parameters
  ----------
  cv_results : dict of numpy (masked) ndarrays
      CV results as returned by the `GridSearchCV`.

  Returns
  -------
  best_index : int
      The index of the best estimator as it appears in `cv_results`.
  """
  
  precision_threshold = 0.89
  cv_results_ = pd.DataFrame(cv_results)
  print("All grid-search results:")
  print_dataframe(cv_results_)

  # Filter-out all results below the threshold
  high_precision_cv_results = cv_results_[
        cv_results_["mean_test_precision"] > precision_threshold
  ]

  print(f"Models with a precision higher than {precision_threshold}:")
  print_dataframe(high_precision_cv_results)

  high_precision_cv_results = high_precision_cv_results[
        [
            "mean_score_time",
            "mean_test_recall",
            "std_test_recall",
            "mean_test_precision",
            "std_test_precision",
            "rank_test_recall",
            "rank_test_precision",
            "params",
        ]
    ]

  # Select the most performant models in terms of recall
  # (within 1 sigma from the best)
  #best_recall_std = high_precision_cv_results["mean_test_recall"].std()
  #best_recall = high_precision_cv_results["mean_test_recall"].max()
  best_recall_index = high_precision_cv_results["mean_test_recall"].idxmax()
  #best_recall_threshold = best_recall - best_recall_std

  print(
        "\nThe selected final model is the fastest to predict out of the previously\n"
        "selected subset of best models based on precision and recall.\n"
        "Its scoring time is:\n\n"
        f"{high_precision_cv_results.loc[best_recall_index]}"
    )
  
  return best_recall_index

In [283]:
scorers = ['precision', 'recall']

# Define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logistic_classifier, param_grid=grid, n_jobs=-1, cv=cv, scoring=scorers,refit=refit_strategy,error_score=0)
grid_result = grid_search.fit(X_eligible, y_eligible)

All grid-search results:
precision: 0.886 (±0.081), recall: 0.869 (±0.087), for {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
precision: 0.886 (±0.081), recall: 0.869 (±0.087), for {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
precision: 0.886 (±0.081), recall: 0.869 (±0.087), for {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
precision: 0.886 (±0.081), recall: 0.869 (±0.087), for {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
precision: 0.886 (±0.081), recall: 0.869 (±0.087), for {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
precision: 0.886 (±0.081), recall: 0.874 (±0.084), for {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
precision: 0.890 (±0.083), recall: 0.866 (±0.084), for {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
precision: 0.890 (±0.083), recall: 0.866 (±0.084), for {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
precision: 0.891 (±0.082), recall: 0.872 (±0.097), for {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
precision: 0.914 (±0.071), recall: 0

In [284]:
# Summarize results
#print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#print()
#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))

### Modeling

In [285]:
#from sklearn.preprocessing import PolynomialFeatures
#poly_reg = PolynomialFeatures(degree = 2)
#X_poly = poly_reg.fit_transform(X_train)

In [286]:
# Fit a classifier with parameters found above
classifier = LogisticRegression(random_state = 1, **grid_result.best_params_)
classifier.fit(X_train, y_train)

LogisticRegression(C=0.01, random_state=1, solver='liblinear')

### Predictions

In [287]:
# Predict both class and probability for the training set 
y_train_pred_probs = classifier.predict_proba(X_train)[:, 1]
y_train_pred = classifier.predict(X_train)

In [288]:
# Predict both class and probability for the test set 
y_val_pred_probs = classifier.predict_proba(X_val)[:, 1]
y_val_pred = classifier.predict(X_val)

In [289]:
#eligible_df['pred'] = y_train_pred_probs
X_training['pred'] = y_train_pred_probs
X_validation['pred'] = y_val_pred_probs

#### Borderline Correct Positive HOF Predictions

In [290]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)[:5]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
4754,Jamaal Wilkes,1,1,0,3,0,5117.0,14644,16.5,0,70.8,0.500496
4481,Jack Twyman,1,1,0,6,0,5424.0,15840,17.8,2,0.0,0.505184
975,Adrian Dantley,1,1,0,6,0,5455.0,23177,21.5,2,0.0,0.524788
2539,Bob Lanier,1,1,0,8,0,9698.0,19248,21.7,0,0.0,0.530314
4574,Chet Walker,1,1,0,7,0,7314.0,18831,17.6,0,18.2,0.535663


In [291]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)[:5]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
3944,Charlie Scott,1,1,0,5,0,2846.0,14837,15.8,2,20.7,0.520618
4606,Bobby Wanzer,1,1,0,5,0,1979.0,6924,17.3,3,12.2,0.527236
2399,Bernard King,1,1,0,4,0,5060.0,19655,19.2,4,0.0,0.52753
3023,Reggie Miller,1,1,0,5,0,4182.0,25279,18.4,3,0.0,0.528351
289,Zelmo Beaty,1,1,0,5,0,9665.0,15207,18.7,2,17.1,0.55244


#### Under predictions

##### **Legitimately wrong:**

Guy Rodgers- Long NBA career, 4x all star, 2x ast champ

Ralph Sampson- 6 solid years in NBA (of his 9), 4x all star, legend in college

Bill Walton - MVP, Legend in college, 2x all star, 2x all league. Can be fixed by adding MVP variable but may hurt with other players

Walt Bellamy- 4x all star, tons of points (43) and rebounds (12) over career

Wes Unseld- Solid accolades, consistent 15-15 guy earlier in career

Jamaal Wilkes- Solid contributor to many championships, also won All D which is currently not on here

Players to look into:

 'Walt Bellamy',
       'Maurice Cheeks', 'Earl Monroe', 'Arnie Risen', 'Tom Gola',
       'Frank Ramsey', 'Wes Unseld', 'Andy Phillip', 'Bob Dandridge',
       'David Thompson'

In [294]:
#eligible_df[(eligible_df['pred'] < 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] < 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
3790,Guy Rodgers,1,1,0,4,2,3791.0,10415,13.6,0,0.0,0.492253
1561,Manu Ginóbili,1,1,0,2,0,3697.0,14043,20.2,2,53.2,0.482913
4937,George Yardley,1,1,0,6,0,4220.0,9063,20.6,2,0.0,0.478895
489,Carl Braun,1,1,0,5,0,2122.0,10625,15.8,2,13.5,0.467269
4492,Wes Unseld,1,1,0,5,0,13769.0,10624,16.0,1,10.8,0.460457
2029,Lou Hudson,1,1,0,6,0,3926.0,17940,17.4,1,0.0,0.423029
1452,Joe Fulks,1,1,0,2,0,1379.0,8003,10.9,4,16.4,0.415533
3068,Earl Monroe,1,1,0,4,0,2796.0,17454,17.2,1,18.8,0.372953
4601,Bill Walton,1,1,0,2,0,4923.0,6215,20.0,2,26.6,0.339929
308,Walt Bellamy,1,1,0,4,0,14241.0,20941,19.9,0,0.0,0.299861


In [295]:
X_validation[(X_validation['pred'] < 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
4376,David Thompson,1,1,0,5,0,2446.0,13422,19.9,3,0.0,0.484865
4401,Nate Thurmond,1,1,0,7,0,14464.0,14437,16.5,0,0.0,0.482138
967,Bob Dandridge,1,1,0,4,0,5715.0,15530,16.7,1,37.0,0.480104
1583,Gail Goodrich,1,1,0,5,0,3279.0,19181,16.7,1,18.6,0.447168
1576,Tom Gola,1,1,0,5,0,5617.0,7871,14.2,1,11.3,0.397994
3730,Arnie Risen,1,1,0,4,0,5011.0,7633,16.7,1,24.0,0.389145
3627,Frank Ramsey,1,1,0,0,0,3410.0,8378,15.6,0,93.8,0.382829
3878,Ralph Sampson,1,1,0,4,0,4011.0,7039,16.0,1,0.0,0.272598
2284,K.C. Jones,1,1,0,0,0,2399.0,5011,10.4,0,59.2,0.22146


#### Over predictions

Larry Foust deserves to be in HOF. Also has 94% rating on Bball ref

Amare Stoudemire (72% bball ref), Chauncey (84% on bball ref), Shawn Marion (76% on bball ref) are newer to ballot

Larry Costello is in HOF as Contributor- this led to confusion as one of his key contributions was his play. 71% on bball ref

Tom Sanders in HOF as contributor but 15% rating on bball ref

Max Zaslofsky	

Legitimate wrong predictions:

Jimmy Jones- ABA Legend, very efficient. Think ABA was less respected when he was in it than Erving. Shorter career

Mack Calvin- Early ABA

Shawn Kemp (39% bball ref)- Lower longevity

Donnie Freeman- ABA Star i late 60s early 70s

Walter Davis (31% bball ref)

Jermaine O'Neal (32% bball ref)

In [296]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
4240,Amar'e Stoudemire,1,0,0,6,0,6632.0,15994,21.8,5,0.0,0.728463
1440,Donnie Freeman,1,0,0,5,0,2292.0,12233,18.2,4,18.9,0.650592
2363,Shawn Kemp,1,0,0,6,0,8834.0,15347,19.1,3,0.0,0.604509
3298,Jermaine O'Neal,1,0,0,6,0,7261.0,13309,17.9,3,0.0,0.588332
351,Chauncey Billups,1,0,0,5,0,2992.0,15802,18.8,3,15.2,0.572868
670,Mack Calvin,1,0,0,5,0,1923.0,12172,17.4,4,0.0,0.554818
2281,Jimmy Jones,1,0,0,6,0,2930.0,11366,17.1,3,0.0,0.552543
2228,Marques Johnson,1,0,0,5,0,4817.0,13892,20.1,3,0.0,0.503293


In [297]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,AST_Champ,TRB_totals,PTS_totals,PER_advanced,All_League,Champ PTS_per_game,pred
1412,Larry Foust,1,0,0,8,0,8041.0,11198,19.8,2,0.0,0.647423


Model does well at handling these extreme cases

### Model Coefficients

In [298]:
for col, coef in zip(model_cols[3:], classifier.coef_[0]):
  print(f"{col}: {exp(coef)}")

Def_POY: 1.113601599676686
All_Star: 1.583488151908115
All_League: 1.1729059051456385
AST_Champ: 1.064178516884116
PTS_totals: 1.0497081223110234
TRB_totals: 1.0014209544503807
PER_advanced: 1.436103416711948
Champ PTS_per_game: 1.2387327122290968


Massive collinearity problem here causing points to hurt here

In [220]:
#for col, coef in zip(poly_reg.get_feature_names(), classifier.coef_[0]):
#  print(f"{col}: {exp(coef)}")

### Confusion Matrix

In [299]:
# Output confuson matrix for the training set
cm = confusion_matrix(y_train, y_train_pred)
#[00 01]
#[10 11]
print(cm)

[[2956    8]
 [  11   85]]


In [300]:
print(f"Accuracy: {get_scorer('accuracy')._score_func(y_train, y_train_pred)}")
print(f"F1 score: {get_scorer('f1')._score_func(y_train, y_train_pred)}")
print(f"Precision score: {get_scorer('precision')._score_func(y_train, y_train_pred)}")
print(f"Recall score: {get_scorer('recall')._score_func(y_train, y_train_pred)}")

Accuracy: 0.9937908496732026
F1 score: 0.8994708994708994
Precision score: 0.9139784946236559
Recall score: 0.8854166666666666


In [301]:
# Output confusion matrix for the validation set
cm = confusion_matrix(y_val, y_val_pred)
#[00 01]
#[10 11]
print(cm)
accuracy_score(y_val, y_val_pred)

[[988   1]
 [  9  22]]


0.9901960784313726

In [302]:
print(f"Accuracy: {get_scorer('accuracy')._score_func(y_val, y_val_pred)}")
print(f"F1 score: {get_scorer('f1')._score_func(y_val, y_val_pred)}")
print(f"Precision score: {get_scorer('precision')._score_func(y_val, y_val_pred)}")
print(f"Recall score: {get_scorer('recall')._score_func(y_val, y_val_pred)}")

Accuracy: 0.9901960784313726
F1 score: 0.8148148148148149
Precision score: 0.9565217391304348
Recall score: 0.7096774193548387


### Other Metrics

In [303]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2964
           1       0.91      0.89      0.90        96

    accuracy                           0.99      3060
   macro avg       0.96      0.94      0.95      3060
weighted avg       0.99      0.99      0.99      3060



In [304]:
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       989
           1       0.96      0.71      0.81        31

    accuracy                           0.99      1020
   macro avg       0.97      0.85      0.90      1020
weighted avg       0.99      0.99      0.99      1020

