<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load packages

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the scraped player data directly from the project's GitHub repository.
# index_col=0 makes the CSV's first column the DataFrame index.
model_df = pd.read_csv(
    "https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv",
    index_col=0,
)

In [None]:
# Preview the first five rows as a quick sanity check of the load.
model_df.head(5)

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
0,Alaa Abdelnaby,1,0,0,0,0,0,0,0,0,...,13.0,0.532,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.5
1,Zaid Abdul-Aziz,1,0,0,0,0,0,0,0,0,...,15.1,0.479,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.7
2,Kareem Abdul-Jabbar,1,1,6,2,6,15,11,0,19,...,24.6,0.592,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.7
3,Mahmoud Abdul-Rauf,1,0,0,0,0,0,0,0,0,...,15.4,0.506,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.5
4,Tariq Abdul-Wahad,1,0,0,0,0,0,0,0,0,...,11.4,0.467,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.2


In [None]:
# Audit of columns containing NaN values.
# NOTE: values encoded with the -999 sentinel are NOT counted by this check.
#
# More than 500 missing:
#   GS_totals      -> games started only began being recorded in 1981-82; probably
#                     worth dropping, since it does not seem very helpful and there
#                     is no good way to estimate it
#   3P%_totals
#
# Some missing:
#   ORB_per_game, DRB_per_game, TRB_per_game, STL_per_game, BLK_per_game
#   FG%_totals, 2P%_totals, eFG%_totals, FT%_totals
#   ORB_totals     -> same as ORB_per_game
#   DRB_totals     -> same as DRB_per_game
#   TRB_totals     -> same as TRB_per_game
#   STL_totals     -> seemingly a gap in the 70s where the STL category exists but
#                     is blank, so these were never converted to -999
#   BLK_totals     -> seemingly a gap in the 70s where the BLK category exists but
#                     is blank, so these were never converted to -999
#   PER_advanced   -> these players all seemingly played before 1950 and are missing
#                     a ton of stats: all rebounding, advanced, BPM/VORP, and
#                     2pt/3pt shooting metrics
#   TS%_advanced   -> these players never took a shot, so all can be set to 0
#   WS/48_advanced -> simple formula: (WS / total minutes played) * 48, so we only
#                     need to scrape the MP_advanced data
#
# Fewer than 5 missing:
#   OWS_advanced, DWS_advanced, WS_advanced, OBPM_advanced, DBPM_advanced,
#   BPM_advanced   -> can be handled by setting them all to 0

# Count the NaN cells in each column (sum the boolean mask down the rows).
model_df.isna().sum(axis=0)

Def_POY              0
All_Star             0
Scoring_Champ        0
TRB_Champ            0
AST_Champ            0
STL_Champ            0
BLK_Champ            0
All_ABA              0
ABA_Champ            0
ROY                  0
FG_per_game          0
FGA_per_game         0
3P_per_game          0
3PA_per_game         0
2P_per_game          0
2PA_per_game         0
FT_per_game          0
FTA_per_game         0
ORB_per_game        66
DRB_per_game        66
TRB_per_game       288
AST_per_game         0
STL_per_game        34
BLK_per_game        80
PTS_per_game         0
G_totals             0
GS_totals         1687
FG_totals            0
FGA_totals           0
FG%_totals          34
3P_totals            0
3PA_totals           0
3P%_totals         509
2P_totals            0
2PA_totals           0
2P%_totals          44
eFG%_totals         28
FT_totals            0
FTA_totals           0
FT%_totals         241
ORB_totals          66
DRB_totals          66
TRB_totals         288
AST_totals 

In [None]:
# A notebook cell renders only its final bare expression, so each table is
# shown explicitly with display() (provided by the IPython/Colab kernel).

# JamesOn Curry and Alex Scales have missing BPM/OBPM/DBPM along with pretty
# much everything else. These can be assigned 0 since they basically never
# stepped on the court.
display(model_df[model_df['OBPM_advanced'].isna()])

# Dan King has missing OWS/DWS/WS. These can be assigned 0 since he basically
# never stepped on the court.
display(model_df[model_df['OWS_advanced'].isna()])

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
2400,Dan King,1,0,0,0,0,0,0,0,0,...,,0.36,,,,,-999.0,-999.0,-999.0,-999.0


In [16]:
# Show every row whose ORB_totals value is NaN.
model_df.loc[model_df['ORB_totals'].isna()]

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
34,Matthew Aitch,1,0,0,0,0,0,0,0,0,...,11.9,0.449,0.0,0.5,0.5,0.040,-999.0,-999.0,-999.0,-999.0
58,Bill Allen,1,0,0,0,0,0,0,0,0,...,11.9,0.464,0.3,0.3,0.6,0.033,-999.0,-999.0,-999.0,-999.0
161,Johnny Austin,1,0,0,0,0,0,0,0,0,...,9.3,0.460,0.3,0.5,0.9,0.056,-999.0,-999.0,-999.0,-999.0
274,Howard Bayne,1,0,0,0,0,0,0,0,0,...,8.2,0.399,-1.1,1.9,0.8,0.031,-999.0,-999.0,-999.0,-999.0
281,Al Beard,1,0,0,0,0,0,0,0,0,...,6.3,0.539,0.1,0.1,0.3,0.107,-999.0,-999.0,-999.0,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4472,Herschell Turner,1,0,0,0,0,0,0,0,0,...,2.2,0.365,-0.8,0.2,-0.6,-0.055,-999.0,-999.0,-999.0,-999.0
4695,Dexter Westbrook,1,0,0,0,0,0,0,0,0,...,7.0,0.531,0.2,0.1,0.3,0.119,-999.0,-999.0,-999.0,-999.0
4734,Ron Widby,1,0,0,0,0,0,0,0,0,...,11.5,0.397,-0.2,0.2,0.0,-0.006,-999.0,-999.0,-999.0,-999.0
4746,Gene Wiley,1,0,0,0,0,0,0,0,0,...,9.3,0.481,2.6,5.7,8.3,0.062,-999.0,-999.0,-999.0,-999.0
