<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [164]:
# Load packages

from math import exp
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [45]:
# read in data
model_df = pd.read_csv("https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv", index_col=0)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4977 entries, 0 to 4976
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          4977 non-null   object 
 1   Eligible        4977 non-null   int64  
 2   Position        4977 non-null   object 
 3   Hall_of_Fame    4977 non-null   int64  
 4   MVP             4977 non-null   int64  
 5   Finals_MVP      4977 non-null   int64  
 6   NBA_Champ       4977 non-null   int64  
 7   All_NBA         4977 non-null   int64  
 8   All_Defensive   4977 non-null   int64  
 9   Def_POY         4977 non-null   int64  
 10  All_Star        4977 non-null   int64  
 11  Scoring_Champ   4977 non-null   int64  
 12  TRB_Champ       4977 non-null   int64  
 13  AST_Champ       4977 non-null   int64  
 14  STL_Champ       4977 non-null   int64  
 15  BLK_Champ       4977 non-null   int64  
 16  All_ABA         4977 non-null   int64  
 17  ABA_Champ       4977 non-null   i

In [61]:
model_df = model_df.replace(-999, np.nan)

In [62]:
# Reduce number of positions
model_df.loc[model_df['Position'] == 'Center/Forward', 'Position'] = 'Center'
model_df.loc[model_df['Position'].isin(['PointGuard', 'ShootingGuard', 'Guard/Forward']), 'Position'] = 'Guard'
model_df.loc[model_df['Position'].isin(['SmallForward', 'PowerForward', 'Forward/Guard', 'Forward/Center']), 'Position'] = 'Forward'

In [74]:
# Add ABA and NBA accolades
model_df['All_League'] = model_df['All_NBA'] + model_df['All_ABA']
model_df['Champ'] = model_df['NBA_Champ'] + model_df['ABA_Champ']

In [75]:
#Columns with NAs:
for col in model_df.columns:
  if len(model_df[model_df[col].isna()]) > 0:
    print(col, "-", len(model_df[model_df[col].isna()]))

2P_per_game - 1118
2PA_per_game - 1118
ORB_per_game - 949
DRB_per_game - 949
GS_totals - 1689
2P_totals - 1118
2PA_totals - 1118
ORB_totals - 949
DRB_totals - 949
Trp_Dbl_totals - 4526
OWS_advanced - 1
DWS_advanced - 1
WS/48_advanced - 344
OBPM_advanced - 1185
DBPM_advanced - 1185


In [76]:
# Columns with -999s
for col in model_df.columns:
  if len(model_df[model_df[col] == -999]) > 0:
    print(col, "-", len(model_df[model_df[col] == -999]))

In [77]:
# WHAT TO DO WITH NAs / -999

# Columns to drop:
# GS_totals, Trp_Dbl_totals, ORB_per_game, DRB_per_game, ORB_totals, DRB_totals, 3P%_totals, 2P%_totals, eFG%_totals, OWS_advanced, DWS_advanced, WS/48_advanced, OBPM_advanced, DBPM_advanced

# Columns to drop for now: (just for bare bones model, consider bringing back FG% FT% 2PT stuff at least)
# 3P_per_game, 3PA_per_game, 3P_totals, 3PA_totals
# FG%_totals (these players never took a shot)
# FT%_totals (these players never took a FT)
# Columns to take from FGM, FGA, etc.
# 2P_per_game, 2PA_per_game, 2P_totals, 2PA_totals

# Columns to fill with league average
# PER_advanced, VORP_advanced (consider some more advanced PER/VORP)

# Columns to make 0
# WS_advanced, BPM_advanced (consider some more advance imputation for BPM)

# Columns to make 0 or fill with mean (undecided):***
# TS%_advanced (these players never took a shot or free throw?)

# Columns to fill with mean (potentially by position):
# PTS_per_game, TRB_per_game, AST_per_game, STL_per_game, BLK_per_game, TRB_totals, AST_totals, STL_totals, BLK_totals

In [78]:
def fillNulls(model_df):
  cols_to_zero = ['WS_advanced', 'BPM_advanced', '3P_per_game', '3PA_per_game', '3P_totals', '3PA_totals', 'FG%_totals', 'FT%_totals', 'TS%_advanced']
  model_df[cols_to_zero] = model_df[cols_to_zero].fillna(0) # fill cols with 0
  
  cols_to_avg = ['PER_advanced', 'VORP_advanced', '3P%_totals', '2P%_totals', 'eFG%_totals']
  model_df[cols_to_avg] = model_df[cols_to_avg].fillna(model_df[cols_to_avg].mean()) # fill cols with avg
  
  cols_to_position_avg = ['TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals']
  model_df[cols_to_position_avg] = model_df.groupby("Position")[cols_to_position_avg].transform(lambda x: x.fillna(x.mean())) # fills cols with avg by position
  
  cols_to_fill = ['2P_per_game', '2PA_per_game', '2P_totals', '2PA_totals']
  cols_to_fill_with = ['FG_per_game', 'FGA_per_game', 'FG_totals', 'FGA_totals']
  model_df[cols_to_fill] = model_df[cols_to_fill].fillna(model_df[cols_to_fill_with]) # fill 2P shooting columns with FG columns

  return model_df
fillNulls(model_df)

Unnamed: 0,Player,Eligible,Position,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,...,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced,All_League,Champ
0,Alaa Abdelnaby,1,Forward,0,0,0,0,0,0,0,...,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.500000,0,0
1,Zaid Abdul-Aziz,1,Center,0,0,0,0,0,0,0,...,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.700000,0,0
2,Kareem Abdul-Jabbar,1,Center,1,6,2,6,15,11,0,...,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.700000,15,6
3,Mahmoud Abdul-Rauf,1,Guard,0,0,0,0,0,0,0,...,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.500000,0,0
4,Tariq Abdul-Wahad,1,Guard,0,0,0,0,0,0,0,...,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.200000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4972,Jim Zoet,1,Center,0,0,0,0,0,0,0,...,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.100000,0,0
4973,Bill Zopf,1,Guard,0,0,0,0,0,0,0,...,-0.5,0.4,-0.1,-0.011,,,0.0,3.434634,0,0
4974,Ivica Zubac,0,Center,0,0,0,0,0,0,0,...,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.500000,0,0
4975,Matt Zunic,1,Guard,0,0,0,0,0,0,0,...,0.2,1.8,2.0,,,,0.0,3.434634,0,0


In [226]:
model_df.columns

Index(['Player', 'Eligible', 'Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP',
       'NBA_Champ', 'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'All_ABA', 'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game',
       '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',
       'FT_per_game', 'FTA_per_game', 'ORB_per_game', 'DRB_per_game',
       'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',
       'PTS_per_game', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals',
       'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals',
       '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals',
       'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals',
       'PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced',
       'WS_advanced', 'WS/48_advanced', 'OBPM_advanced

In [298]:
# 'Finals_MVP', 'BLK_totals', 'VORP_advanced', 'STL_totals', 'TS%_advanced', 'All_Defensive', 'AST_Champ', 'TRB_totals', 'AST_totals'
model_cols = [
    'Eligible', 'Hall_of_Fame', 'Def_POY', 'All_Star', 'All_League', 'Champ', 'Scoring_Champ',  # accolades
    'PER_advanced', 'WS_advanced', 'BPM_advanced', 'PTS_totals'] # stats
df = model_df[model_cols]
df

Unnamed: 0,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PER_advanced,WS_advanced,BPM_advanced,PTS_totals
0,1,0,0,0,0,0,0,13.000000,4.8,-3.8,1465
1,1,0,0,0,0,0,0,15.100000,17.5,0.4,4557
2,1,1,0,19,15,6,2,24.600000,273.4,5.7,38387
3,1,0,0,0,0,0,0,15.400000,25.2,-0.8,8553
4,1,0,0,0,0,0,0,11.400000,3.5,-3.0,1830
...,...,...,...,...,...,...,...,...,...,...,...
4972,1,0,0,0,0,0,0,-0.800000,-0.1,-5.4,2
4973,1,0,0,0,0,0,0,9.600000,-0.1,0.0,118
4974,0,0,0,0,0,0,0,19.200000,26.1,0.6,3001
4975,1,0,0,0,0,0,0,11.212562,2.0,0.0,273


In [299]:
eligible_df = df[df['Eligible'] == 1]
noneligible_df = df[df['Eligible'] == 0]

In [300]:
X_eligible = eligible_df.iloc[:, 2:].values
y_eligible = eligible_df.iloc[:, 1].values


X_test = noneligible_df.iloc[:, 2:].values
y_test = noneligible_df.iloc[:, 1].values

In [301]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_eligible, y_eligible, test_size = 0.25, random_state = 0)

In [302]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Scale whole matrix of features to prevent information leakage
X_train = sc.fit_transform(X_eligible)
X_val = sc.transform(X_val)

In [303]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 1)
classifier.fit(X_train, y_eligible)

LogisticRegression(random_state=1)

In [304]:
y_train_pred_probs = classifier.predict_proba(X_train)[:, 1]
y_train_pred = classifier.predict(X_train)

In [305]:
eligible_df['pred'] = y_train_pred_probs
eligible_df['Player'] = model_df[model_df['Eligible'] == 1]['Player']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [306]:
eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)

Unnamed: 0,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PER_advanced,WS_advanced,BPM_advanced,PTS_totals,pred,Player
489,1,1,0,5,2,1,0,15.8,64.3,0.0,10625,0.510380,Carl Braun
4481,1,1,0,6,2,0,0,17.8,75.0,0.0,15840,0.569550,Jack Twyman
4401,1,1,0,7,0,0,0,16.5,78.0,1.1,14437,0.580738,Nate Thurmond
989,1,1,0,4,5,1,0,18.1,49.7,0.0,6594,0.592832,Bob Davies
2910,1,1,0,7,1,0,0,15.6,50.9,0.0,5921,0.594615,Dick McGuire
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3299,1,1,0,15,14,4,2,26.4,181.7,5.1,28596,1.000000,Shaquille O'Neal
587,1,1,0,18,15,5,2,22.9,172.7,4.6,33643,1.000000,Kobe Bryant
2,1,1,0,19,15,6,2,24.6,273.4,5.7,38387,1.000000,Kareem Abdul-Jabbar
744,1,1,0,13,10,2,7,26.2,247.3,0.0,31419,1.000000,Wilt Chamberlain


In [307]:
eligible_df[(eligible_df['pred'] < 0.5) & (eligible_df['Hall_of_Fame'] == 1)]

Unnamed: 0,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PER_advanced,WS_advanced,BPM_advanced,PTS_totals,pred,Player
308,1,1,0,4,0,0,0,19.9,130.0,-0.2,20941,0.24508,Walt Bellamy
468,1,1,0,1,0,2,0,12.2,38.8,-2.2,9217,0.021576,Bill Bradley
740,1,1,0,0,1,0,0,21.1,21.9,0.0,1591,0.00515,Al Cervi
761,1,1,0,4,0,1,0,16.5,103.5,2.8,12195,0.304994,Maurice Cheeks
872,1,1,0,0,0,0,0,11.1,11.6,0.0,2725,0.001548,Chuck Cooper
967,1,1,0,4,1,2,0,16.7,80.3,1.3,15530,0.456385,Bob Dandridge
1115,1,1,0,1,0,0,0,17.7,96.4,2.3,13398,0.017247,Vlade Divac
1576,1,1,0,5,1,1,0,14.2,53.2,0.0,7871,0.36244,Tom Gola
1583,1,1,0,5,1,1,0,16.7,76.3,-0.3,19181,0.429269,Gail Goodrich
2009,1,1,0,0,0,0,0,15.6,16.5,0.0,2611,0.00223,Bob Houbregs


In [308]:
eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 0)]

Unnamed: 0,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PER_advanced,WS_advanced,BPM_advanced,PTS_totals,pred,Player
351,1,0,0,5,3,1,0,18.8,120.8,3.2,15802,0.814125,Chauncey Billups
670,1,0,0,5,4,0,0,17.4,60.4,0.1,12172,0.514249,Mack Calvin
888,1,0,0,6,1,1,0,14.5,62.7,0.0,8622,0.598303,Larry Costello
1028,1,0,0,6,2,0,0,19.1,76.9,1.5,19521,0.577523,Walter Davis
1412,1,0,0,8,2,0,0,19.8,79.2,0.0,11198,0.898918,Larry Foust
1440,1,0,0,5,4,1,0,18.2,58.0,-0.9,12233,0.690312,Donnie Freeman
2228,1,0,0,5,3,0,0,20.1,79.8,3.3,13892,0.538101,Marques Johnson
2281,1,0,0,6,3,0,0,17.1,78.8,1.0,11366,0.689351,Jimmy Jones
2363,1,0,0,6,3,0,0,19.1,89.5,0.6,15347,0.73258,Shawn Kemp
2767,1,0,0,4,2,1,0,18.8,124.9,2.7,17700,0.579091,Shawn Marion


In [309]:
for col, coef in zip(model_cols[2:], classifier.coef_[0]):
  print(f"{col}: {exp(coef)}")

Def_POY: 1.0821724285828658
All_Star: 3.5297364638046997
All_League: 1.5373709456135096
Champ: 1.7213771542413352
Scoring_Champ: 2.0888968426033716
PER_advanced: 1.475346778483537
WS_advanced: 1.5637728243628777
BPM_advanced: 1.0226858758907538
PTS_totals: 0.8968657428202993


In [310]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_eligible, y_train_pred)
#[00 01]
#[10 11]
print(cm)
accuracy_score(y_eligible, y_train_pred)

[[3940   13]
 [  30  110]]


0.9894942584901051

In [None]:
def standardizeCols(model_df, numeric_cols):
  scaler = StandardScaler()
  model_df[numeric_cols] = scaler.fit_transform(model_df[numeric_cols])
  return model_df