<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Load packages

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Load the scraped player dataset from the project's GitHub repo;
# the CSV's first column is used as the DataFrame index.
model_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv",
    index_col=0,
)

In [13]:
# Preview the first five rows of the loaded frame
model_df.head(n=5)

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
0,Alaa Abdelnaby,1,0,0,0,0,0,0,0,0,...,13.0,0.532,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.5
1,Zaid Abdul-Aziz,1,0,0,0,0,0,0,0,0,...,15.1,0.479,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.7
2,Kareem Abdul-Jabbar,1,1,6,2,6,15,11,0,19,...,24.6,0.592,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.7
3,Mahmoud Abdul-Rauf,1,0,0,0,0,0,0,0,0,...,15.4,0.506,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.5
4,Tariq Abdul-Wahad,1,0,0,0,0,0,0,0,0,...,11.4,0.467,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.2


In [14]:
#Columns with NAs:
#THIS DOESN'T INCLUDE THE VALUES ENCODED AS -999

#>500 missing:
#GS_totals -> GS only started being recorded in 1981-82; probably best to just drop it, since it doesn't seem very helpful and there's no good way to estimate it
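#3P%_totals -> presumably players with no three-point attempts (0/0), so these could be made 0* like the other shooting percentages -- TODO confirm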

#Some missing:
#ORB_per_game -> gap around 1968 where TRB was being counted but not ORB and DRB
#DRB_per_game -> gap around 1968 where TRB was being counted but not ORB and DRB
#TRB_per_game -> gap around 1946 where TRB is blank
#STL_per_game -> gap around 1972 where STLs are there but blank
#BLK_per_game -> gap around 1972 where BLKs are there but blank
#FG%_totals -> I don't think these players took a shot so make 0*
#2P%_totals -> I don't think these players took a two-pointer so make 0*
#eFG%_totals -> I don't think these players took a shot or a FT so make 0*
#FT%_totals -> I don't think these players took a FT so make 0*
#ORB_totals -> same as ORB per game
#DRB_totals -> same as DRB per game
#TRB_totals -> same as TRB per game
#STL_totals -> seemingly a weird gap in the 70s where the STL category exists but is blank, so these didn't successfully become -999
#BLK_totals -> seemingly a weird gap in the 70s where the BLK category exists but is blank, so these didn't successfully become -999
#PER_advanced -> these players all seemingly played before 1950. missing a TON of stats including all rebounding, advanced, BPM/VORP, and 2pt/3pt shooting metrics
#TS%_advanced -> these players never took a shot so can make them all 0*
#WS/48_advanced -> this is a simple formula: (WS / total minutes played)*48 so we just need to scrape the MP_advanced data

#<5 missing:
#OWS_advanced, DWS_advanced, WS_advanced, OBPM_advanced, DBPM_advanced, BPM_advanced -> these can be taken care of by making them all 0*

# Per-column count of missing (NaN) values, skipping the first 7 columns by position
na_counts = model_df.isna().sum()
na_counts.iloc[7:]

All_Defensive        0
Def_POY              0
All_Star             0
Scoring_Champ        0
TRB_Champ            0
AST_Champ            0
STL_Champ            0
BLK_Champ            0
All_ABA              0
ABA_Champ            0
ROY                  0
FG_per_game          0
FGA_per_game         0
3P_per_game          0
3PA_per_game         0
2P_per_game          0
2PA_per_game         0
FT_per_game          0
FTA_per_game         0
ORB_per_game        66
DRB_per_game        66
TRB_per_game       288
AST_per_game         0
STL_per_game        34
BLK_per_game        80
PTS_per_game         0
G_totals             0
GS_totals         1687
FG_totals            0
FGA_totals           0
FG%_totals          34
3P_totals            0
3PA_totals           0
3P%_totals         509
2P_totals            0
2PA_totals           0
2P%_totals          44
eFG%_totals         28
FT_totals            0
FTA_totals           0
FT%_totals         241
ORB_totals          66
DRB_totals          66
TRB_totals 

In [15]:
# Inspect the rows whose advanced stats are NaN.
# A notebook cell only renders its LAST bare expression, so the first frame is
# shown explicitly with display() (available by default in Jupyter/Colab kernels);
# otherwise its result is silently discarded.

# JamesOn Curry, Alex Scales have missing BPM/OBPM/DBPM along with pretty much everything else
# We can assign these as 0s as they basically never stepped on the court
display(model_df[model_df['OBPM_advanced'].isna()])

# Dan King has missing OWS/DWS/WS
# We can assign these as 0s as he basically never stepped on the court
model_df[model_df['OWS_advanced'].isna()]

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
2400,Dan King,1,0,0,0,0,0,0,0,0,...,,0.36,,,,,-999.0,-999.0,-999.0,-999.0


In [28]:
# List every column label of model_df
model_df.columns

Index(['Player', 'Eligible', 'Hall_of_Fame', 'MVP', 'Finals_MVP', 'NBA_Champ',
       'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star', 'Scoring_Champ',
       'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ', 'All_ABA',
       'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game', '3P_per_game',
       '3PA_per_game', '2P_per_game', '2PA_per_game', 'FT_per_game',
       'FTA_per_game', 'ORB_per_game', 'DRB_per_game', 'TRB_per_game',
       'AST_per_game', 'STL_per_game', 'BLK_per_game', 'PTS_per_game',
       'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals', 'FG%_totals',
       '3P_totals', '3PA_totals', '3P%_totals', '2P_totals', '2PA_totals',
       '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals', 'FT%_totals',
       'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals', 'STL_totals',
       'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals', 'PER_advanced',
       'TS%_advanced', 'OWS_advanced', 'DWS_advanced', 'WS_advanced',
       'WS/48_advanced', 'OBPM_advanced', 'DBPM_adv

In [27]:
# Columns with -999s
# Print "<column> - <count>" for every column containing at least one -999 value.
# The whole frame is compared against the sentinel once, instead of building the
# same filtered frame twice per column; column order is preserved.
MISSING_SENTINEL = -999
sentinel_counts = model_df.eq(MISSING_SENTINEL).sum()
for col, count in sentinel_counts[sentinel_counts > 0].items():
  print(col, "-", int(count))

#3P_per_game -> before 3P line - make 0 or fill with the mean? (could standardize the cat and make -999s 0)
#3PA_per_game -> before 3P line - make 0 or fill with the mean? (could standardize the cat and make -999s 0)
#2P_per_game -> before 3P line - fill with player's FG per game
#2PA_per_game -> before 3P line - fill with player's FGA per game
#ORB_per_game -> before rebounds were recorded - fill with the mean? (could standardize the cat and make -999s 0)
#DRB_per_game -> before rebounds were recorded - fill with the mean? (could standardize the cat and make -999s 0)
#STL_per_game -> before steals were recorded - fill with the mean? (could standardize the cat and make -999s 0)
#BLK_per_game -> before blocks were recorded - fill with the mean? (could standardize the cat and make -999s 0)
#GS_totals -> vast majority of these were NAs, we should drop it
#3P_totals, 3PA_totals, 3P% totals -> pre-3PT line, make 0 or fill with the mean?
#2P_totals, 2PA_totals, 2P%_totals, eFG%_totals -> pre-3PT line, take FGM, FGA, FG% etc. and plug in
# ORB_totals, DRB_totals -> same as ORB_per_game and DRB_per_game
# STL_totals, BLK_totals -> same as STL_per_game and BLK_per_game
# Trp_Dbl_totals -> maybe drop, missing an insane amount and should theoretically be collinear with other stats
# OBPM_advanced, DBPM_advanced, BPM_advanced, VORP_advanced -> hard to calculate, maybe plug in 0 since it's the league average

3P_per_game - 1117
3PA_per_game - 1117
2P_per_game - 1117
2PA_per_game - 1117
ORB_per_game - 882
DRB_per_game - 882
STL_per_game - 1145
BLK_per_game - 1099
GS_totals - 1
3P_totals - 1117
3PA_totals - 1117
3P%_totals - 1117
2P_totals - 1117
2PA_totals - 1117
2P%_totals - 1117
eFG%_totals - 1117
ORB_totals - 882
DRB_totals - 882
STL_totals - 1145
BLK_totals - 1099
Trp_Dbl_totals - 4525
OBPM_advanced - 1182
DBPM_advanced - 1182
BPM_advanced - 1182
VORP_advanced - 1182
