<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Load packages

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [8]:
# Read the scraped player data straight from the project's GitHub repo.
# The first CSV column is used as the row index.
DATA_URL = "https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv"
model_df = pd.read_csv(DATA_URL, index_col=0)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4976 entries, 0 to 4975
Data columns (total 66 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          4976 non-null   object 
 1   Eligible        4976 non-null   int64  
 2   Hall_of_Fame    4976 non-null   int64  
 3   MVP             4976 non-null   int64  
 4   Finals_MVP      4976 non-null   int64  
 5   NBA_Champ       4976 non-null   int64  
 6   All_NBA         4976 non-null   int64  
 7   All_Defensive   4976 non-null   int64  
 8   Def_POY         4976 non-null   int64  
 9   All_Star        4976 non-null   int64  
 10  Scoring_Champ   4976 non-null   int64  
 11  TRB_Champ       4976 non-null   int64  
 12  AST_Champ       4976 non-null   int64  
 13  STL_Champ       4976 non-null   int64  
 14  BLK_Champ       4976 non-null   int64  
 15  All_ABA         4976 non-null   int64  
 16  ABA_Champ       4976 non-null   int64  
 17  ROY             4976 non-null   i

In [3]:
# Columns with NAs: print "<column> - <number of NaN rows>" for every column
# that has at least one missing value, in the frame's column order.
na_counts = model_df.isna().sum()
for col, n_missing in na_counts[na_counts > 0].items():
  print(col, "-", n_missing)

ORB_per_game - 66
DRB_per_game - 66
TRB_per_game - 288
STL_per_game - 34
BLK_per_game - 80
GS_totals - 1687
FG%_totals - 34
3P%_totals - 509
2P%_totals - 44
eFG%_totals - 28
FT%_totals - 241
ORB_totals - 66
DRB_totals - 66
TRB_totals - 288
STL_totals - 34
BLK_totals - 80
PER_advanced - 344
TS%_advanced - 29
OWS_advanced - 1
DWS_advanced - 1
WS_advanced - 1
WS/48_advanced - 344
OBPM_advanced - 2
DBPM_advanced - 2
BPM_advanced - 2


In [4]:
# Columns with -999s: print "<column> - <number of rows equal to -999>" for
# every column that contains the value -999 at least once.
sentinel_counts = model_df.eq(-999).sum()
for col, n_sentinel in sentinel_counts[sentinel_counts > 0].items():
  print(col, "-", n_sentinel)

3P_per_game - 1117
3PA_per_game - 1117
2P_per_game - 1117
2PA_per_game - 1117
ORB_per_game - 882
DRB_per_game - 882
STL_per_game - 1145
BLK_per_game - 1099
GS_totals - 1
3P_totals - 1117
3PA_totals - 1117
3P%_totals - 1117
2P_totals - 1117
2PA_totals - 1117
2P%_totals - 1117
eFG%_totals - 1117
ORB_totals - 882
DRB_totals - 882
STL_totals - 1145
BLK_totals - 1099
Trp_Dbl_totals - 4525
OBPM_advanced - 1182
DBPM_advanced - 1182
BPM_advanced - 1182
VORP_advanced - 1182


In [5]:
# WHAT TO DO WITH NAs / -999

# Columns to drop:
# GS_totals, Trp_Dbl_totals, ORB_per_game, DRB_per_game, ORB_totals, DRB_totals

# Columns to consider dropping:
# OWS_advanced, DWS_advanced, WS/48_advanced, OBPM_advanced, DBPM_advanced

# Columns to take from FGM, FGA, FG%, etc.
# 2P_per_game, 2PA_per_game, 2P_totals, 2PA_totals

# Columns to fill with league average
# PER_advanced, VORP_advanced

# Columns to make 0
# WS_advanced, BPM_advanced

# Columns to make 0 or fill with mean (undecided):***
# 3P_per_game, 3PA_per_game, 3P_totals, 3PA_totals
# FG%_totals (these players never took a shot)
# FT%_totals (these players never took a FT)
# TS%_advanced (these players never took a shot or free throw?)

# Columns to fill with mean (potentially by position):
# TRB_per_game, AST_per_game, STL_per_game, BLK_per_game, TRB_totals, AST_totals, STL_totals, BLK_totals

# Columns where NA and -999 mean something different (undecided):***
# 3P%_totals (-999s were before the 3PT line, NAs never attempted a 3)
# 2P%_totals (-999s were before the 3PT line, NAs never attempted a 2)
# eFG%_totals (-999s were before the 3PT line, NAs never attempted a shot)

In [None]:
def fillNulls(model_df, sentinel=-999):
  """Fill missing values in the player-stats frame, one column group at a time.

  Fill rules (column lists are defined in the body):
    * cols_to_zero         -> 0
    * cols_to_avg          -> the column's own mean
    * cols_to_position_avg -> the column's mean within the row's POSITION group
    * cols_to_fill         -> the paired column in cols_to_fill_with
                              (2P_per_game <- FG_per_game, 2PA_per_game <- FGA_per_game, ...)

  Args:
    model_df: DataFrame containing every column named below plus "POSITION".
    sentinel: value that marks "missing" in addition to NaN. Cells equal to it
      in the handled columns are converted to NaN before any fill, so they are
      actually replaced and do not distort the means. Pass None to fill real
      NaNs only.

  Returns:
    The same DataFrame object that was passed in; it is modified in place.
  """
  cols_to_zero = ['WS_advanced', 'BPM_advanced', '3P_per_game', '3PA_per_game', '3P_totals', '3PA_totals', 'FG%_totals', 'FT%_totals', 'TS%_advanced']
  cols_to_avg = ['PER_advanced', 'VORP_advanced', '3P%_totals', '2P%_totals', 'eFG%_totals']
  cols_to_position_avg = ['TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals']
  cols_to_fill = ['2P_per_game', '2PA_per_game', '2P_totals', '2PA_totals']
  cols_to_fill_with = ['FG_per_game', 'FGA_per_game', 'FG_totals', 'FGA_totals']

  if sentinel is not None:
    # fillna() only touches NaN, so sentinel-coded cells would otherwise pass
    # through unchanged and also be averaged into the fill values.
    handled_cols = cols_to_zero + cols_to_avg + cols_to_position_avg + cols_to_fill
    model_df[handled_cols] = model_df[handled_cols].mask(model_df[handled_cols].eq(sentinel))

  model_df[cols_to_zero] = model_df[cols_to_zero].fillna(0) # fill cols with 0

  model_df[cols_to_avg] = model_df[cols_to_avg].fillna(model_df[cols_to_avg].mean()) # fill cols with avg

  # Per-position means, broadcast back to row shape. Filling from this frame
  # (instead of assigning the transform result) leaves rows without a usable
  # POSITION untouched rather than overwriting their existing values.
  position_means = model_df.groupby("POSITION")[cols_to_position_avg].transform("mean")
  model_df[cols_to_position_avg] = model_df[cols_to_position_avg].fillna(position_means) # fills cols with avg by position

  # fillna(DataFrame) aligns on column labels, and the 2P/FG labels differ, so
  # each target column is filled from its paired source column explicitly.
  for target_col, source_col in zip(cols_to_fill, cols_to_fill_with):
    model_df[target_col] = model_df[target_col].fillna(model_df[source_col]) # fill 2P shooting columns with FG columns

  return model_df

In [None]:
def standardizeCols(model_df, numeric_cols):
  """Rescale the given columns with a freshly fitted scikit-learn StandardScaler.

  The scaler is fitted on, and applied to, exactly the rows of `model_df`
  passed in; it is not returned, so the fitted parameters are discarded.

  Args:
    model_df: DataFrame holding the columns to rescale.
    numeric_cols: column labels to rescale.

  Returns:
    The same DataFrame object, with `numeric_cols` overwritten in place.
  """
  scaled_values = StandardScaler().fit_transform(model_df[numeric_cols])
  model_df[numeric_cols] = scaled_values
  return model_df