<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Hall_of_Fame_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Packages and Data

In [314]:
# Load packages

from math import exp
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, accuracy_score, classification_report, f1_score, get_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [315]:
# read in data
model_df = pd.read_csv("https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Scraped%20Player%20Data.csv", index_col=0)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4977 entries, 0 to 4976
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Player          4977 non-null   object 
 1   Eligible        4977 non-null   int64  
 2   Position        4977 non-null   object 
 3   Hall_of_Fame    4977 non-null   int64  
 4   MVP             4977 non-null   int64  
 5   Finals_MVP      4977 non-null   int64  
 6   NBA_Champ       4977 non-null   int64  
 7   All_NBA         4977 non-null   int64  
 8   All_Defensive   4977 non-null   int64  
 9   Def_POY         4977 non-null   int64  
 10  All_Star        4977 non-null   int64  
 11  Scoring_Champ   4977 non-null   int64  
 12  TRB_Champ       4977 non-null   int64  
 13  AST_Champ       4977 non-null   int64  
 14  STL_Champ       4977 non-null   int64  
 15  BLK_Champ       4977 non-null   int64  
 16  All_ABA         4977 non-null   int64  
 17  ABA_Champ       4977 non-null   i

### Data Cleaning

In [316]:
model_df = model_df.replace(-999, np.nan)

In [317]:
# Reduce number of positions
model_df.loc[model_df['Position'] == 'Center/Forward', 'Position'] = 'Center'
model_df.loc[model_df['Position'].isin(['PointGuard', 'ShootingGuard', 'Guard/Forward']), 'Position'] = 'Guard'
model_df.loc[model_df['Position'].isin(['SmallForward', 'PowerForward', 'Forward/Guard', 'Forward/Center']), 'Position'] = 'Forward'

In [318]:
# Add ABA and NBA accolades
model_df['All_League'] = model_df['All_NBA'] + model_df['All_ABA']
model_df['Champ'] = model_df['NBA_Champ'] + model_df['ABA_Champ']

In [319]:
#Columns with NAs:
for col in model_df.columns:
  if len(model_df[model_df[col].isna()]) > 0:
    print(col, "-", len(model_df[model_df[col].isna()]))

3P_per_game - 1118
3PA_per_game - 1118
2P_per_game - 1118
2PA_per_game - 1118
ORB_per_game - 949
DRB_per_game - 949
TRB_per_game - 288
STL_per_game - 1180
BLK_per_game - 1180
GS_totals - 1689
FG%_totals - 34
3P_totals - 1118
3PA_totals - 1118
3P%_totals - 1627
2P_totals - 1118
2PA_totals - 1118
2P%_totals - 1162
eFG%_totals - 1146
FT%_totals - 241
ORB_totals - 949
DRB_totals - 949
TRB_totals - 288
STL_totals - 1180
BLK_totals - 1180
Trp_Dbl_totals - 4526
PER_advanced - 344
TS%_advanced - 29
OWS_advanced - 1
DWS_advanced - 1
WS_advanced - 1
WS/48_advanced - 344
OBPM_advanced - 1185
DBPM_advanced - 1185
BPM_advanced - 1185
VORP_advanced - 1183


In [320]:
# Columns with -999s
for col in model_df.columns:
  if len(model_df[model_df[col] == -999]) > 0:
    print(col, "-", len(model_df[model_df[col] == -999]))

In [321]:
# WHAT TO DO WITH NAs / -999

# Columns to drop:
# GS_totals, Trp_Dbl_totals, ORB_per_game, DRB_per_game, ORB_totals, DRB_totals, 3P%_totals, 2P%_totals, eFG%_totals, OWS_advanced, DWS_advanced, WS/48_advanced, OBPM_advanced, DBPM_advanced

# Columns to drop for now: (just for bare bones model, consider bringing back FG% FT% 2PT stuff at least)
# 3P_per_game, 3PA_per_game, 3P_totals, 3PA_totals
# FG%_totals (these players never took a shot)
# FT%_totals (these players never took a FT)
# Columns to take from FGM, FGA, etc.
# 2P_per_game, 2PA_per_game, 2P_totals, 2PA_totals

# Columns to fill with league average
# PER_advanced, VORP_advanced (consider some more advanced PER/VORP)

# Columns to make 0
# WS_advanced, BPM_advanced (consider some more advance imputation for BPM)

# Columns to make 0 or fill with mean (undecided):***
# TS%_advanced (these players never took a shot or free throw?)

# Columns to fill with mean (potentially by position):
# PTS_per_game, TRB_per_game, AST_per_game, STL_per_game, BLK_per_game, TRB_totals, AST_totals, STL_totals, BLK_totals

In [322]:
def fillNulls(model_df):
  cols_to_zero = ['WS_advanced', 'BPM_advanced', '3P_per_game', '3PA_per_game', '3P_totals', '3PA_totals', 'FG%_totals', 'FT%_totals', 'TS%_advanced']
  model_df[cols_to_zero] = model_df[cols_to_zero].fillna(0) # fill cols with 0
  
  cols_to_avg = ['PER_advanced', 'VORP_advanced', '3P%_totals', '2P%_totals', 'eFG%_totals']
  model_df[cols_to_avg] = model_df[cols_to_avg].fillna(model_df[cols_to_avg].mean()) # fill cols with avg
  
  cols_to_position_avg = ['TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals']
  model_df[cols_to_position_avg] = model_df.groupby("Position")[cols_to_position_avg].transform(lambda x: x.fillna(x.mean())) # fills cols with avg by position
  
  cols_to_fill = ['2P_per_game', '2PA_per_game', '2P_totals', '2PA_totals']
  cols_to_fill_with = ['FG_per_game', 'FGA_per_game', 'FG_totals', 'FGA_totals']
  model_df[cols_to_fill] = model_df[cols_to_fill].fillna(model_df[cols_to_fill_with]) # fill 2P shooting columns with FG columns

  return model_df
fillNulls(model_df)

Unnamed: 0,Player,Eligible,Position,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,...,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced,All_League,Champ
0,Alaa Abdelnaby,1,Forward,0,0,0,0,0,0,0,...,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.500000,0,0
1,Zaid Abdul-Aziz,1,Center,0,0,0,0,0,0,0,...,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.700000,0,0
2,Kareem Abdul-Jabbar,1,Center,1,6,2,6,15,11,0,...,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.700000,15,6
3,Mahmoud Abdul-Rauf,1,Guard,0,0,0,0,0,0,0,...,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.500000,0,0
4,Tariq Abdul-Wahad,1,Guard,0,0,0,0,0,0,0,...,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.200000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4972,Jim Zoet,1,Center,0,0,0,0,0,0,0,...,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.100000,0,0
4973,Bill Zopf,1,Guard,0,0,0,0,0,0,0,...,-0.5,0.4,-0.1,-0.011,,,0.0,3.434634,0,0
4974,Ivica Zubac,0,Center,0,0,0,0,0,0,0,...,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.500000,0,0
4975,Matt Zunic,1,Guard,0,0,0,0,0,0,0,...,0.2,1.8,2.0,,,,0.0,3.434634,0,0


### Feature Selection

In [323]:
model_df.columns

Index(['Player', 'Eligible', 'Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP',
       'NBA_Champ', 'All_NBA', 'All_Defensive', 'Def_POY', 'All_Star',
       'Scoring_Champ', 'TRB_Champ', 'AST_Champ', 'STL_Champ', 'BLK_Champ',
       'All_ABA', 'ABA_Champ', 'ROY', 'FG_per_game', 'FGA_per_game',
       '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',
       'FT_per_game', 'FTA_per_game', 'ORB_per_game', 'DRB_per_game',
       'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',
       'PTS_per_game', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals',
       'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals',
       '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals',
       'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals',
       'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals',
       'PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced',
       'WS_advanced', 'WS/48_advanced', 'OBPM_advanced

In [324]:
# 'Finals_MVP', 'BLK_totals', 'VORP_advanced', 'STL_totals', 'TS%_advanced', 'All_Defensive', 'AST_Champ', 'TRB_totals', 'AST_totals'

# 'BPM_advanced', 'WS_advanced',
model_cols = ['Player', 'Eligible', 'Hall_of_Fame', 'Def_POY', 'All_Star', 'All_League', 'Champ', 'Scoring_Champ', # accolades
       'PTS_totals'] # stats
df = model_df[model_cols]
df

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals
0,Alaa Abdelnaby,1,0,0,0,0,0,0,1465
1,Zaid Abdul-Aziz,1,0,0,0,0,0,0,4557
2,Kareem Abdul-Jabbar,1,1,0,19,15,6,2,38387
3,Mahmoud Abdul-Rauf,1,0,0,0,0,0,0,8553
4,Tariq Abdul-Wahad,1,0,0,0,0,0,0,1830
...,...,...,...,...,...,...,...,...,...
4972,Jim Zoet,1,0,0,0,0,0,0,2
4973,Bill Zopf,1,0,0,0,0,0,0,118
4974,Ivica Zubac,0,0,0,0,0,0,0,3001
4975,Matt Zunic,1,0,0,0,0,0,0,273


In [325]:
eligible_df = df[df['Eligible'] == 1]
noneligible_df = df[df['Eligible'] == 0]

In [326]:
extraneous_players = ['Maurice Stokes', 'Bill Bradley', 'Toni Kukoč',
       'Calvin Murphy', 'Vlade Divac', 'Buddy Jeannette',
       'Dražen Petrović', 'Al Cervi', 'Arvydas Sabonis',
       'Šarūnas Marčiulionis', 'Dino Radja', 'Chuck Cooper',
       'Bob Houbregs']

eligible_df = eligible_df[~eligible_df['Player'].isin(extraneous_players)]

### Train/Test Split

In [327]:
X_eligible = eligible_df.iloc[:, 3:].values
y_eligible = eligible_df.iloc[:, 2].values

In [328]:
# Train test split
X_training, X_validation, y_train, y_val = train_test_split(eligible_df, y_eligible, test_size = 0.25, random_state = 0)

In [329]:
X_train = X_training.iloc[:,3:].values
X_val = X_validation.iloc[:,3:].values

In [330]:
X_test = noneligible_df.iloc[:, 3:]
y_test = noneligible_df.iloc[:, 2]

### Feature Scaling

In [331]:
sc1 = StandardScaler()
X_eligible = sc1.fit_transform(X_eligible)

# Scale whole matrix of features to prevent information leakage
# Scale for train/val
sc2 = StandardScaler()
X_train = sc2.fit_transform(X_train)
X_val = sc2.transform(X_val)

### Model Selection

In [332]:
CLASSIFIERS = [
    LogisticRegression(),
    XGBClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    GaussianNB(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    SVC(kernel = 'rbf')
  ]

In [333]:
METRICS = [
    'f1',
    'accuracy',
    'precision',
    'recall',
    'average_precision'
]

Accuracy- Percentage of samples correctly classified

Precision- Of players model identified as HOF players, how many were truly HOF

Recall- Of true HOF players, how many did model identify as HOF

F1- Harmonic mean of precision and recall

In [334]:
def get_metrics(classifier, X, y_true):
  print(classifier)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X)
  #clf_val_preds = classifier.predict(X_val)

  output_metrics = []

  for metric in METRICS:
    score = get_scorer(metric)._score_func(y_true, y_pred)
    output_metrics.append(score)

  return output_metrics


In [335]:
training_metrics = []
val_metrics = []

for classifier in CLASSIFIERS:
  clf_train_metrics = get_metrics(classifier, X_train, y_train)
  val_train_metrics = get_metrics(classifier, X_val, y_val)
  
  training_metrics.append([classifier] + clf_train_metrics)
  val_metrics.append([classifier] + val_train_metrics)

LogisticRegression()
LogisticRegression()
XGBClassifier()
XGBClassifier()
DecisionTreeClassifier()
DecisionTreeClassifier()
RandomForestClassifier()
RandomForestClassifier()
SVC()
SVC()
GaussianNB()
GaussianNB()
KNeighborsClassifier()
KNeighborsClassifier()
AdaBoostClassifier()
AdaBoostClassifier()
SVC()
SVC()


In [336]:
train_metrics_df = pd.DataFrame(data = training_metrics, columns=['Classifier'] + METRICS)
val_metrics_df = pd.DataFrame(data = val_metrics, columns=['Classifier'] + METRICS)

In [337]:
train_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
2,DecisionTreeClassifier(),1.0,1.0,1.0,1.0,1.0
3,"(DecisionTreeClassifier(max_features='auto', r...",1.0,1.0,1.0,1.0,1.0
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.989474,0.999346,1.0,0.979167,0.97982
1,XGBClassifier(),0.958333,0.997386,0.958333,0.958333,0.91971
4,SVC(),0.912088,0.994771,0.965116,0.864583,0.838672
8,SVC(),0.912088,0.994771,0.965116,0.864583,0.838672
0,LogisticRegression(),0.893617,0.993464,0.913043,0.875,0.802835
6,KNeighborsClassifier(),0.888889,0.993464,0.952381,0.833333,0.79888
5,GaussianNB(),0.78481,0.983333,0.659574,0.96875,0.639943


In [338]:
val_metrics_df.sort_values(by='f1', ascending=False)

Unnamed: 0,Classifier,f1,accuracy,precision,recall,average_precision
5,GaussianNB(),0.810811,0.986275,0.697674,0.967742,0.676149
0,LogisticRegression(),0.792453,0.989216,0.954545,0.677419,0.656431
3,"(DecisionTreeClassifier(max_features='auto', r...",0.754717,0.987255,0.909091,0.645161,0.597295
1,XGBClassifier(),0.72,0.986275,0.947368,0.580645,0.56283
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.716981,0.985294,0.863636,0.612903,0.54109
4,SVC(),0.692308,0.984314,0.857143,0.580645,0.510441
8,SVC(),0.692308,0.984314,0.857143,0.580645,0.510441
2,DecisionTreeClassifier(),0.690909,0.983333,0.791667,0.612903,0.49698
6,KNeighborsClassifier(),0.612245,0.981373,0.833333,0.483871,0.418912


### Hyperparameter Optimization

In [None]:
logistic_classifier = LogisticRegression(random_state= 0)

In [199]:
#https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)

In [201]:
# define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logistic_classifier, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(X_eligible, y_eligible)

In [202]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.887643 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}

0.887643 (0.053954) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.887643 (0.053954) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.886320 (0.053418) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.887643 (0.053954) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.887643 (0.053954) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.886708 (0.048007) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.883583 (0.058514) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.883583 (0.058514) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.884749 (0.049557) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.870411 (0.060474) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.870411 (0.060474) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.880227 (0.052501) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.806675 (0.078770) wi

### Modeling

In [203]:
#from sklearn.preprocessing import PolynomialFeatures
#poly_reg = PolynomialFeatures(degree = 2)
#X_poly = poly_reg.fit_transform(X_train)

In [204]:
classifier = LogisticRegression(random_state = 1, **grid_result.best_params_)
classifier.fit(X_train, y_train)

LogisticRegression(C=100, random_state=1, solver='newton-cg')

### Predictions

In [205]:
y_train_pred_probs = classifier.predict_proba(X_train)[:, 1]
y_train_pred = classifier.predict(X_train)

In [206]:
y_val_pred_probs = classifier.predict_proba(X_val)[:, 1]
y_val_pred = classifier.predict(X_val)

In [207]:
#eligible_df['pred'] = y_train_pred_probs
X_training['pred'] = y_train_pred_probs
X_validation['pred'] = y_val_pred_probs

### Borderline Correct Positive HOF Predictions

In [208]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=True)[:5]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
4754,Jamaal Wilkes,1,1,0,3,0,4,0,14644,0.539708
4698,Paul Westphal,1,1,0,5,4,1,0,12809,0.563565
2673,Clyde Lovellette,1,1,0,4,1,3,0,11947,0.583186
2910,Dick McGuire,1,1,0,7,1,0,0,5921,0.637571
4481,Jack Twyman,1,1,0,6,2,0,0,15840,0.64807


In [209]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)[:20]

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
3299,Shaquille O'Neal,1,1,0,15,14,4,2,28596,1.0
246,Rick Barry,1,1,0,12,10,1,1,25279,1.0
1844,John Havlicek,1,1,0,13,11,8,0,26395,1.0
1856,Elvin Hayes,1,1,0,12,6,1,1,27313,1.0
899,Bob Cousy,1,1,0,13,12,6,0,16960,1.0
273,Elgin Baylor,1,1,0,11,10,0,0,23149,0.999967
3516,Scottie Pippen,1,1,0,7,7,6,0,18940,0.999956
2399,Bernard King,1,1,0,4,4,0,1,19655,0.999729
2999,Vern Mikkelsen,1,1,0,6,4,4,0,10063,0.990526
4715,Jo Jo White,1,1,0,7,2,2,0,14399,0.983239


### Under predictions

#### Expected Under Predictions:

<br/>

#### **Old players:**

<br/>


Bob Houbregs- A few seasons with mediocre stats, great college player but no NBA accolades and no other league. No clue why he is in. From 50s

<br/>

Chuck Cooper- 6.7 PPG and no accolades. Instrumental to NBA as broke the color barrier

<br/>

Al Cervi- 4 years of recorded stats with Syracuse (49-53). Dominated NBL with scoring champion, championship, 4 all league

<br/>

Buddy Jeannette	- From 40s, 3 seasons in BAA, but played in late 30s, early 40s with no stats for NBL- seeral first team/titles

<br/>
<br/>


#### **European players:**

<br/>

Dino Radja - Croatian player, only 4 years in NBA (with very good stats) but made it for Europe play

<br/>

Šarūnas Marčiulionis - 8 solid years in NBA, 8 years before in USSR league, made it for foreign/olympic play

<br/>

Arvydas Sabonis	- Many great years in Lithuania/Spain unaccounted for, solid 7 years in NBA in 30s


<br/>

Dražen Petrović	- 5 solid years in NBA, played in Yugoslavia/Spain


<br/>

Vlade Divac	- Many years in NBA with solid stats, no accolades. Did well in Yugoslav league, won Europa POY in 7 years before NBA

<br/>

Toni Kukoc- Very solid NBA career and picked up some championships. Played in Italy/Yugoslavia before (9 years). Won Europa POY, Euroscar POY, many Euroleage Championships

<br/>
<br/>


#### **Other:**

Bill Bradley- Not great individual stats, exceptional college player and 2 time champ but not above average. 70s so not that old

<br/>

Calvin Murphy- Very good pro career with not many accolades. Three time all american in college

<br/>

Maurice Stokes - From 50s, 3 years in NBA putting up 17-17 type numbers. Injured and paralyzed after three years with 3 all NBA. Good college player too

<br/>



#### **Legitimately wrong:**

Guy Rodgers- Long NBA career, 4x all star, 2x ast champ

Ralph Sampson- 6 solid years in NBA (of his 9), 4x all star, legend in college

Bill Walton - MVP, Legend in college, 2x all star, 2x all league. Can be fixed by adding MVP variable but may hurt with other players

Walt Bellamy- 4x all star, tons of points (43) and rebounds (12) over career

Players to look into:

 'Walt Bellamy',
       'Maurice Cheeks', 'Earl Monroe', 'Arnie Risen', 'Tom Gola',
       'Frank Ramsey', 'Wes Unseld', 'Andy Phillip', 'Bob Dandridge',
       'David Thompson'

In [210]:
#eligible_df[(eligible_df['pred'] < 0.5) & (eligible_df['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] < 0.5) & (X_training['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
3136,Chris Mullin,1,1,0,5,4,0,0,17911,0.455059
1781,Tim Hardaway,1,1,0,5,5,0,0,15373,0.420206
489,Carl Braun,1,1,0,5,2,1,0,10625,0.398661
4492,Wes Unseld,1,1,0,5,1,1,0,10624,0.353971
3497,Andy Phillip,1,1,0,5,2,1,0,6384,0.275673
1561,Manu Ginóbili,1,1,0,2,2,4,0,14043,0.272633
3068,Earl Monroe,1,1,0,4,1,1,0,17454,0.240384
308,Walt Bellamy,1,1,0,4,0,0,0,20941,0.120441
761,Maurice Cheeks,1,1,0,4,0,1,0,12195,0.116169
3790,Guy Rodgers,1,1,0,4,0,0,0,10415,0.033385


In [211]:
X_validation[(X_validation['pred'] < 0.5) & (X_validation['Hall_of_Fame'] == 1)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
2211,Gus Johnson,1,1,0,5,4,1,0,10243,0.479975
4663,Chris Webber,1,1,0,5,5,0,0,17182,0.478713
1848,Connie Hawkins,1,1,0,5,3,1,0,11528,0.474414
967,Bob Dandridge,1,1,0,4,1,2,0,15530,0.425849
4606,Bobby Wanzer,1,1,0,5,3,1,0,6924,0.330723
4376,David Thompson,1,1,0,5,3,0,0,13422,0.277253
1576,Tom Gola,1,1,0,5,1,1,0,7871,0.276501
2284,K.C. Jones,1,1,0,0,0,8,0,5011,0.265924
3730,Arnie Risen,1,1,0,4,1,2,0,7633,0.208794
3627,Frank Ramsey,1,1,0,0,0,7,0,8378,0.157316


### Over predictions

Larry Foust deserves to be in HOF. Also has 94% rating on Bball ref

Amare Stoudemire (72% bball ref), Chauncey (84% on bball ref) are newer to ballot

Larry Costello is in HOF as Contributor- this led to confusion as one of his key contributions was his play. 71% on bball ref

Tom Sanders in HOF as contributor but 15% rating on bball ref

In [212]:
#eligible_df[(eligible_df['pred'] > 0.5) & (eligible_df['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)
X_training[(X_training['pred'] > 0.5) & (X_training['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
4240,Amar'e Stoudemire,1,0,0,6,5,0,0,15994,0.768891
1028,Walter Davis,1,0,0,6,2,0,0,19521,0.748807
2363,Shawn Kemp,1,0,0,6,3,0,0,15347,0.67623
888,Larry Costello,1,0,0,6,1,1,0,8622,0.640863
3298,Jermaine O'Neal,1,0,0,6,3,0,0,13309,0.615338
351,Chauncey Billups,1,0,0,5,3,1,0,15802,0.612269
2281,Jimmy Jones,1,0,0,6,3,0,0,11366,0.553678
1440,Donnie Freeman,1,0,0,5,4,1,0,12233,0.544945
3196,Willie Naulls,1,0,0,4,0,3,0,11305,0.515346


In [213]:
X_validation[(X_validation['pred'] > 0.5) & (X_validation['Hall_of_Fame'] == 0)].sort_values(by='pred', ascending=False)

Unnamed: 0,Player,Eligible,Hall_of_Fame,Def_POY,All_Star,All_League,Champ,Scoring_Champ,PTS_totals,pred
1412,Larry Foust,1,0,0,8,2,0,0,11198,0.947278
4956,Max Zaslofsky,1,0,0,1,4,0,1,7990,0.913626


In [129]:
#eligible_df.sort_values(by='PER_advanced', ascending=False)

### Model Coefficients

In [214]:
for col, coef in zip(model_cols[2:], classifier.coef_[0]):
  print(f"{col}: {exp(coef)}")

Hall_of_Fame: 1.2595486404292338
Def_POY: 10.714385916690674
All_Star: 1.2494410817153363
All_League: 2.2487051046157944
Champ: 14.744648364339774
Scoring_Champ: 1.8343782809291445


In [131]:
#for col, coef in zip(poly_reg.get_feature_names(), classifier.coef_[0]):
#  print(f"{col}: {exp(coef)}")

### Confusion Matrix

In [215]:
cm = confusion_matrix(y_train, y_train_pred)
#[00 01]
#[10 11]
print(cm)
accuracy_score(y_train, y_train_pred)

[[2955    9]
 [  11   85]]


0.9934640522875817

In [218]:
cm = confusion_matrix(y_val, y_val_pred)
#[00 01]
#[10 11]
print(cm)
accuracy_score(y_val, y_val_pred)

[[987   2]
 [ 12  19]]


0.9862745098039216

### Other Metrics

In [220]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2964
           1       0.90      0.89      0.89        96

    accuracy                           0.99      3060
   macro avg       0.95      0.94      0.95      3060
weighted avg       0.99      0.99      0.99      3060



In [221]:
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       989
           1       0.90      0.61      0.73        31

    accuracy                           0.99      1020
   macro avg       0.95      0.81      0.86      1020
weighted avg       0.99      0.99      0.98      1020

