In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

### Load, merge and filter data

In [2]:
per_poss = pd.read_csv(Path('../data/per_100_data.csv'))
advanced = pd.read_csv(Path('../data/advanced_data.csv'))

data = per_poss.merge(advanced, on=['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'season', 'all_nba_1st_team'])
data.dropna(axis=1, how='all', inplace=True)

data.shape

(8295, 52)

In [3]:
data.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'season', 'all_nba_1st_team', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%',
       'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
       'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [4]:
# transfer minutes to per game stat instead of season total
data['MP'] = np.round(data['MP'] / data['G'], 1)

In [5]:
data.drop(columns=['Player', 'Pos', 'Age', 'Tm', 'season'], inplace=True)
data.drop(columns=['GS'], inplace=True)

data = data[data.G > 40]
data = data[data.MP > 25]

data.drop(columns=['FG%', '2P%', '3P%', 'FT%'], inplace=True)

In [6]:
data.head()

Unnamed: 0,G,MP,FG,FGA,3P,3PA,2P,2PA,FT,FTA,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,82,38.3,12.3,20.3,0.0,0.0,12.3,20.3,5.3,7.0,...,15.7,24.1,9.5,5.3,14.8,0.227,4.8,2.4,7.2,7.3
2,75,28.9,9.8,18.5,0.0,0.0,9.8,18.4,4.0,5.0,...,18.2,21.9,3.1,3.9,7.0,0.155,1.7,1.9,3.6,3.1
10,73,25.9,7.3,15.1,0.0,0.1,7.3,15.0,2.4,3.5,...,18.8,17.4,0.4,2.7,3.2,0.08,-1.6,1.4,-0.3,0.8
13,82,36.0,11.0,23.2,0.9,2.3,10.1,20.9,4.8,5.7,...,14.0,25.3,5.6,5.6,11.2,0.182,3.0,1.5,4.5,4.8
15,79,26.3,8.1,14.3,0.1,0.2,7.9,14.1,4.8,6.4,...,19.8,18.5,3.0,0.7,3.7,0.086,-0.4,-1.4,-1.8,0.1


### split data and scale it

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = data.drop(columns=['all_nba_1st_team'])
y = data['all_nba_1st_team']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=27, stratify=y)

In [10]:
np.bincount(y_train), np.bincount(y_test)

(array([1394,   27]), array([687,  13]))

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Create model without sampling data

In [13]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [14]:
def create_model(X_train, y_train, estimator, grid, scoring, cv=10):
    clf = GridSearchCV(estimator=estimator, param_grid=grid, scoring=scoring, cv=cv)
    clf.fit(X_train, y_train)

    return clf.best_estimator_, clf.best_params_


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, log_loss, roc_auc_score
from sklearn.dummy import DummyClassifier

In [16]:
def evaluate_model(estimator, X_test, y_test):
    y_predicted = estimator.predict(X_test)

    print('confusion matrix:')
    print(confusion_matrix(y_test, y_predicted))
    print('-------------------------')
    print('classification report:')
    print(classification_report(y_test, y_predicted))
    print('-------------------------')
    print('roc auc score:', roc_auc_score(y_test, y_predicted))
    print('-------------------------')
    print('Log loss:', log_loss(y_test, y_predicted))
    print('-------------------------')


In [17]:
grid_svc = {
    'kernel': ['rbf'],
    'C': [10**i for i in range(-5, 5)],
    'gamma': ['scale', 'auto'] + [10**i for i in range(-3, 3)],
    'probability': [True],
    'random_state': [27]
}

grid_rfc = {
    'n_estimators': np.linspace(start=50, stop=300, num=6, dtype=int),
    'criterion': ['gini'],
    'max_depth': np.linspace(start=10, stop=100, num=10, dtype=int),
    'random_state': [27]
}

grid_gbc = {
    'loss': ['deviance'],
    'n_estimators': np.linspace(start=50, stop=300, num=6, dtype=int),
    'max_depth': np.linspace(start=10, stop=100, num=10, dtype=int),
    'max_features': [None, 'sqrt'],
    'random_state': [27]
}

In [18]:
model_svc, params = create_model(X_train, y_train, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_svc, X_test, y_test)

{'C': 10000, 'gamma': 0.001, 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[681   6]
 [  8   5]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.45      0.38      0.42        13

    accuracy                           0.98       700
   macro avg       0.72      0.69      0.70       700
weighted avg       0.98      0.98      0.98       700

-------------------------
roc auc score: 0.687940880080618
-------------------------
Log loss: 0.6907823815904734
-------------------------


In [19]:
model_rfc, params = create_model(X_train, y_train, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 150, 'random_state': 27}
confusion matrix:
[[687   0]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       1.00      0.23      0.38        13

    accuracy                           0.99       700
   macro avg       0.99      0.62      0.68       700
weighted avg       0.99      0.99      0.98       700

-------------------------
roc auc score: 0.6153846153846154
-------------------------
Log loss: 0.4934110913558679
-------------------------


In [20]:
model_gbc, params = create_model(X_train, y_train, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 20, 'max_features': None, 'n_estimators': 100, 'random_state': 27}
confusion matrix:
[[679   8]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.27      0.23      0.25        13

    accuracy                           0.97       700
   macro avg       0.63      0.61      0.62       700
weighted avg       0.97      0.97      0.97       700

-------------------------
roc auc score: 0.6095621990818498
-------------------------
Log loss: 0.8881491026969065
-------------------------


### undersampling

In [21]:
from imblearn.under_sampling import RandomUnderSampler

In [22]:
ratio = 1/4
rus = RandomUnderSampler(random_state=27, sampling_strategy=ratio)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

In [23]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([108,  27]))

In [24]:
model_undersampling_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_svc, X_test, y_test)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[633  54]
 [  2  11]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       687
           1       0.17      0.85      0.28        13

    accuracy                           0.92       700
   macro avg       0.58      0.88      0.62       700
weighted avg       0.98      0.92      0.95       700

-------------------------
roc auc score: 0.883775613033255
-------------------------
Log loss: 2.763163794823185
-------------------------


In [25]:
model_undersampling_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100, 'random_state': 27}
confusion matrix:
[[657  30]
 [  1  12]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       687
           1       0.29      0.92      0.44        13

    accuracy                           0.96       700
   macro avg       0.64      0.94      0.71       700
weighted avg       0.99      0.96      0.97       700

-------------------------
roc auc score: 0.9397044004030904
-------------------------
Log loss: 1.5296086516644822
-------------------------


In [26]:
model_undersampling_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 250, 'random_state': 27}
confusion matrix:
[[666  21]
 [  2  11]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       687
           1       0.34      0.85      0.49        13

    accuracy                           0.97       700
   macro avg       0.67      0.91      0.74       700
weighted avg       0.98      0.97      0.97       700

-------------------------
roc auc score: 0.9077930802821634
-------------------------
Log loss: 1.1348694980414007
-------------------------


### oversampling

In [27]:
from imblearn.over_sampling import RandomOverSampler

In [28]:
ratio = 1/4
ros = RandomOverSampler(random_state=27, sampling_strategy=ratio)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [29]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([1394,  348]))

In [30]:
model_oversampling_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_svc, X_test, y_test)

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[668  19]
 [  5   8]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       687
           1       0.30      0.62      0.40        13

    accuracy                           0.97       700
   macro avg       0.64      0.79      0.69       700
weighted avg       0.98      0.97      0.97       700

-------------------------
roc auc score: 0.7938640689732394
-------------------------
Log loss: 1.184208322612901
-------------------------


In [31]:
model_oversampling_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[686   1]
 [  9   4]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       0.80      0.31      0.44        13

    accuracy                           0.99       700
   macro avg       0.89      0.65      0.72       700
weighted avg       0.98      0.99      0.98       700

-------------------------
roc auc score: 0.6531183518083081
-------------------------
Log loss: 0.49341223363791104
-------------------------


In [32]:
model_oversampling_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 10, 'max_features': None, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[682   5]
 [  8   5]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.50      0.38      0.43        13

    accuracy                           0.98       700
   macro avg       0.74      0.69      0.71       700
weighted avg       0.98      0.98      0.98       700

-------------------------
roc auc score: 0.6886686821184638
-------------------------
Log loss: 0.6414401301728435
-------------------------


### SMOTE

In [33]:
from imblearn.over_sampling import SMOTE

In [34]:
smote = SMOTE(random_state=27, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [35]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([1394, 1394]))

In [36]:
model_smote_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_svc, X_test, y_test)

{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[644  43]
 [  1  12]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       687
           1       0.22      0.92      0.35        13

    accuracy                           0.94       700
   macro avg       0.61      0.93      0.66       700
weighted avg       0.98      0.94      0.96       700

-------------------------
roc auc score: 0.9302429739110962
-------------------------
Log loss: 2.1710579200936704
-------------------------


In [37]:
model_smote_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[672  15]
 [  6   7]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       687
           1       0.32      0.54      0.40        13

    accuracy                           0.97       700
   macro avg       0.65      0.76      0.69       700
weighted avg       0.98      0.97      0.97       700

-------------------------
roc auc score: 0.7583137386630836
-------------------------
Log loss: 1.0361804260779686
-------------------------


In [38]:
model_smote_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200, 'random_state': 27}
confusion matrix:
[[678   9]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.25      0.23      0.24        13

    accuracy                           0.97       700
   macro avg       0.62      0.61      0.61       700
weighted avg       0.97      0.97      0.97       700

-------------------------
roc auc score: 0.6088343970440041
-------------------------
Log loss: 0.9374913541145364
-------------------------
