In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
df = pd.read_csv('Resources/final_nba_player_dataset.csv')
df = df.drop(['Unnamed: 0','season','player','team','abbreviation','league','season_id','player_id','games_started'],axis=1)
df.head()

Unnamed: 0,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,...,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,strength_of_schedule,playoffs
0,0,1,0,14.9,0.537,0.382,0.358,5.3,18.5,11.7,...,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0,0.29,False
1,0,0,1,14.1,0.564,0.311,0.214,6.7,13.9,10.3,...,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0,-0.11,False
2,1,0,0,9.2,0.503,0.417,0.19,1.4,6.8,4.1,...,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0,-0.08,False
3,1,0,0,9.4,0.573,0.607,0.157,4.6,16.6,10.6,...,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0,-0.14,False
4,1,0,0,13.4,0.605,0.371,0.319,2.3,17.3,9.9,...,55.0,62.0,19.0,10.0,9.0,19.0,34.0,160.0,-0.15,False


In [3]:
le = LabelEncoder()

X = df.drop('playoffs', axis=1)
X = pd.get_dummies(X)

# Create our target
y = df['playoffs'].ravel()
#y.describe()

In [4]:
X.describe()

Unnamed: 0,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,...,blocks,turnovers,personal_fouls,points,strength_of_schedule,position_C,position_PF,position_PG,position_SF,position_SG
count,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,...,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0,19023.0
mean,0.806655,0.103664,0.095726,12.576881,0.508783,0.189556,0.313033,5.99867,13.994291,9.995847,...,23.927667,70.131367,104.755191,484.076644,0.00939,0.201651,0.210377,0.196446,0.188456,0.20307
std,0.394931,0.304832,0.294223,6.409196,0.099961,0.20675,0.237317,4.898227,6.631821,4.997567,...,35.791854,65.860241,79.549514,480.205819,0.398617,0.401243,0.407587,0.39732,0.391086,0.402295
min,0.0,0.0,0.0,-90.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.03,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,9.7,0.477,0.007,0.19,2.4,9.2,6.1,...,3.0,16.0,33.0,92.0,-0.28,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,12.8,0.521,0.107,0.279,5.0,13.0,9.2,...,11.0,52.0,95.0,335.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,15.8,0.558,0.336,0.387,8.8,18.2,13.4,...,29.0,106.0,164.0,747.0,0.31,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,133.8,1.5,1.0,6.0,100.0,100.0,86.4,...,456.0,464.0,386.0,3041.0,1.17,1.0,1.0,1.0,1.0,1.0


In [5]:
Counter(y)

Counter({False: 8952, True: 10071})

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=1)
Counter(y_train)

Counter({True: 7029, False: 6287})

# Balanced Random Forest

In [7]:
from sklearn.preprocessing import StandardScaler
#from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators=128, random_state=1) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
predictions = rf_model.predict(X_test_scaled)

#Counter(predictions)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)

#print(cm)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2035,630
Actual 1,697,2345


In [9]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

print(acc_score)

0.7674785351322937


In [10]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       0.74      0.76      0.75      2665
        True       0.79      0.77      0.78      3042

    accuracy                           0.77      5707
   macro avg       0.77      0.77      0.77      5707
weighted avg       0.77      0.77      0.77      5707



In [11]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
importances
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.13506410389772974, 'strength_of_schedule'),
 (0.057436750142002446, 'win_shares_per_48_min'),
 (0.05567231906971915, 'defensive_win_shares'),
 (0.03686669345045784, 'defensive_box_plus/minus'),
 (0.02604524877711593, 'player_efficiency_rating'),
 (0.0257334405311383, 'win_shares'),
 (0.024198914171930777, 'games_started'),
 (0.02369780674211855, 'usage_percentage'),
 (0.02355286340445707, 'defensive_rebound_percentage'),
 (0.023386477564773425, 'defensive_rebounds'),
 (0.022693608533829575, 'total_rebounds'),
 (0.022468801490223518, 'games'),
 (0.02199418115371864, 'true_shooting_percentage'),
 (0.021627430002324224, 'box_plus/minus'),
 (0.020341952887349645, 'offensive_box_plus/minus'),
 (0.020331981062769428, 'minutes_played'),
 (0.019025047797719415, 'total_rebound_percentage'),
 (0.01867119923017513, 'assist_percentage'),
 (0.01848367492685059, 'free_throw_attempt_rate'),
 (0.018020950145088664, 'age'),
 (0.017926282793827738, 'offensive_win_shares'),
 (0.017647050537946296, 's

# Easy Ensemble AdaBoost Classifier

In [12]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# Create a easy ensemble classifier
ada_model = EasyEnsembleClassifier(n_estimators=128, random_state=1) 

# Fitting the model
ada_model = ada_model.fit(X_train_scaled, y_train)
predictions = ada_model.predict(X_test_scaled)

In [13]:
# Calculated the balanced accuracy score
y_pred2 = ada_model.predict(X_test)
confusion_matrix(y_test, y_pred)

print(cm)

[[2035  630]
 [ 697 2345]]


In [14]:
# Display the confusion matrix
acc_score = accuracy_score(y_test, predictions)

print(acc_score)

0.7730856842474154


In [15]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

      False       0.74      0.78      0.76      0.76      0.77      0.60      2665
       True       0.80      0.76      0.78      0.78      0.77      0.60      3042

avg / total       0.77      0.77      0.77      0.77      0.77      0.60      5707

