In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('data.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season,allstar,was_allstar,previous_allstars,scaled_gp,western_conf
6226,Kirk Hinrich,ATL,30.0,193.04,86.18248,10.2,2.5,4.0,-9.5,0.011,0.09,0.171,0.543,0.221,2010,0,0,0,0.878049,False
6227,Kwame Brown,CHA,29.0,210.82,122.46984,7.9,6.8,0.7,-7.3,0.102,0.228,0.151,0.55,0.041,2010,0,0,0,0.804878,False
6228,Kobe Bryant,LAL,32.0,198.12,92.98636,25.3,5.1,4.7,7.7,0.035,0.135,0.35,0.548,0.258,2010,1,1,12,1.0,True
6230,Kris Humphries,NJN,26.0,205.74,106.59412,10.0,10.4,1.1,-5.3,0.125,0.322,0.173,0.555,0.069,2010,0,0,0,0.902439,False
6231,Kurt Thomas,CHI,38.0,205.74,104.32616,4.1,5.8,1.2,6.8,0.077,0.227,0.096,0.527,0.074,2010,0,0,0,0.634146,False


In [4]:
#split off 2019 data for later use
is_2019 = (data['season'] == 2019)
data_2019 = data[is_2019]
data = data[~is_2019]

In [5]:
feature_cols = ['pts', 'reb', 'ast', 'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'was_allstar', 'previous_allstars', 'scaled_gp']
X = data[feature_cols].values
y = data['allstar'].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100, stratify=y)

In [11]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       468
           1       0.85      0.62      0.72        47

    accuracy                           0.96       515
   macro avg       0.91      0.80      0.85       515
weighted avg       0.95      0.96      0.95       515

[[463   5]
 [ 18  29]]


Not bad for the first simple implementation. There is a lot of optimalisation left to do, like trying out different classifiers such as Logistic Regression or SVMs and also choosing hyper parameters. We can worry about choosing the absolute best model later, let's first try to do some more evaluation of the results and a get a feel for how things work.

In [13]:
rf.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
feat_imp = rf.feature_importances_
feat_dict = dict(zip(feature_cols,feat_imp))
print(feat_dict)

{'pts': 0.252710591480562, 'reb': 0.06086955812512853, 'ast': 0.06480343767702544, 'oreb_pct': 0.03822602122786939, 'dreb_pct': 0.048532622639559826, 'usg_pct': 0.12446505906543213, 'ts_pct': 0.05657740733943112, 'ast_pct': 0.05016562170816975, 'was_allstar': 0.16170466062737454, 'previous_allstars': 0.09328938190306743, 'scaled_gp': 0.048655638206379866}


In [24]:
X_2019 = data_2019[feature_cols].values
y_2019 = data_2019['allstar'].values
probs = rf.predict_proba(X_2019)[:,1]
data_2019['probs'] = probs

In [36]:
data_2019[data_2019.western_conf].sort_values('probs', ascending=False)[['player_name', 'probs','allstar']].head(n=25)

Unnamed: 0,player_name,probs,allstar
10746,James Harden,1.0,1
10813,Anthony Davis,0.99,1
11042,LeBron James,0.99,1
11070,Kawhi Leonard,0.98,1
10893,Russell Westbrook,0.95,1
11069,Karl-Anthony Towns,0.9,0
10878,Damian Lillard,0.83,1
11107,Nikola Jokic,0.8,1
11037,LaMarcus Aldridge,0.76,0
10873,D'Angelo Russell,0.74,0


In [35]:
data_2019[~data_2019.western_conf].sort_values('probs', ascending=False)[['player_name', 'probs','allstar']].head(n=25)

Unnamed: 0,player_name,probs,allstar
10638,Giannis Antetokounmpo,0.98,1
10709,Joel Embiid,0.96,1
10766,Bradley Beal,0.96,0
11076,Kemba Walker,0.89,1
11036,Kyrie Irving,0.86,0
11034,Kyle Lowry,0.76,1
11024,Khris Middleton,0.71,1
10772,Ben Simmons,0.65,1
11108,Nikola Vucevic,0.62,0
11006,Trae Young,0.6,1


We are not doing so well on predicting the 2019-20 allstars. Here are two of the problems I noticed immediately: players on bad teams get overrated by the algorithm. This could be fixed by adding a team record feature. The second problem is that the stats used are from the end of the season, and not at the allstar break. Kyrie Irving was injured before the allstar break and only played in 11 games, but the input data does not reflect this.