In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

### Load, merge and filter data

In [2]:
per_game = pd.read_csv(Path('../data/per_game_data.csv'))
advanced = pd.read_csv(Path('../data/advanced_data.csv'))
advanced.drop(columns=['MP'], inplace=True)

data = per_game.merge(advanced, on=['Player', 'Pos', 'Age', 'Tm', 'G', 'season', 'all_nba_1st_team'])
data.dropna(axis=1, how='all', inplace=True)

data.shape

(8295, 51)

In [3]:
data.drop(columns=['Player', 'Pos', 'Age', 'Tm', 'season'], inplace=True)
data.drop(columns=['GS'], inplace=True)

data = data[data.G > 40]
data = data[data.MP > 25]

data.drop(columns=['FG%', '2P%', '3P%', 'FT%'], inplace=True)

### split data and scale it

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = data.drop(columns=['all_nba_1st_team'])
y = data['all_nba_1st_team']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=27, stratify=y)

In [7]:
np.bincount(y_train), np.bincount(y_test)

(array([1394,   27]), array([687,  13]))

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Create model without sampling data

In [10]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [11]:
def create_model(X_train, y_train, estimator, grid, scoring, cv=10):
    clf = GridSearchCV(estimator=estimator, param_grid=grid, scoring=scoring, cv=cv)
    clf.fit(X_train, y_train)

    return clf.best_estimator_, clf.best_params_


In [12]:
from sklearn.metrics import confusion_matrix, classification_report, log_loss, roc_auc_score

In [13]:
def evaluate_model(estimator, X_test, y_test):
    y_predicted = estimator.predict(X_test)

    print('confusion matrix:')
    print(confusion_matrix(y_test, y_predicted))
    print('-------------------------')
    print('classification report:')
    print(classification_report(y_test, y_predicted))
    print('-------------------------')
    print('roc auc score:', roc_auc_score(y_test, y_predicted))
    print('-------------------------')
    print('Log loss:', log_loss(y_test, y_predicted))
    print('-------------------------')


In [14]:
grid_svc = {
    'kernel': ['rbf'],
    'C': [10**i for i in range(-5, 5)],
    'gamma': ['scale', 'auto'] + [10**i for i in range(-3, 3)],
    'probability': [True],
    'random_state': [27]
}

grid_rfc = {
    'n_estimators': np.linspace(start=50, stop=300, num=6, dtype=int),
    'criterion': ['gini'],
    'max_depth': np.linspace(start=10, stop=100, num=10, dtype=int),
    'random_state': [27]
}

grid_gbc = {
    'loss': ['deviance'],
    'n_estimators': np.linspace(start=50, stop=300, num=6, dtype=int),
    'max_depth': np.linspace(start=10, stop=100, num=10, dtype=int),
    'max_features': [None, 'sqrt'],
    'random_state': [27]
}

In [15]:
model_svc, params = create_model(X_train, y_train, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_svc, X_test, y_test)

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[685   2]
 [  8   5]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       0.71      0.38      0.50        13

    accuracy                           0.99       700
   macro avg       0.85      0.69      0.75       700
weighted avg       0.98      0.99      0.98       700

-------------------------
roc auc score: 0.690852088232001
-------------------------
Log loss: 0.49341337591995416
-------------------------


In [16]:
model_rfc, params = create_model(X_train, y_train, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[687   0]
 [ 11   2]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       687
           1       1.00      0.15      0.27        13

    accuracy                           0.98       700
   macro avg       0.99      0.58      0.63       700
weighted avg       0.98      0.98      0.98       700

-------------------------
roc auc score: 0.5769230769230769
-------------------------
Log loss: 0.5427522004914546
-------------------------


In [17]:
model_gbc, params = create_model(X_train, y_train, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 10, 'max_features': None, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[685   2]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       0.60      0.23      0.33        13

    accuracy                           0.98       700
   macro avg       0.79      0.61      0.66       700
weighted avg       0.98      0.98      0.98       700

-------------------------
roc auc score: 0.613929011308924
-------------------------
Log loss: 0.5920955941911276
-------------------------


### undersampling

In [18]:
from imblearn.under_sampling import RandomUnderSampler

In [19]:
ratio = 1/4
rus = RandomUnderSampler(random_state=27, sampling_strategy=ratio)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

In [20]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([108,  27]))

In [21]:
model_undersampling_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_svc, X_test, y_test)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[643  44]
 [  1  12]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       687
           1       0.21      0.92      0.35        13

    accuracy                           0.94       700
   macro avg       0.61      0.93      0.66       700
weighted avg       0.98      0.94      0.95       700

-------------------------
roc auc score: 0.9295151718732505
-------------------------
Log loss: 2.2204001715113
-------------------------


In [22]:
model_undersampling_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[663  24]
 [  2  11]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       687
           1       0.31      0.85      0.46        13

    accuracy                           0.96       700
   macro avg       0.66      0.91      0.72       700
weighted avg       0.98      0.96      0.97       700

-------------------------
roc auc score: 0.9056096741686263
-------------------------
Log loss: 1.28289625229429
-------------------------


In [23]:
model_undersampling_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_undersampling_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[658  29]
 [  3  10]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       687
           1       0.26      0.77      0.38        13

    accuracy                           0.95       700
   macro avg       0.63      0.86      0.68       700
weighted avg       0.98      0.95      0.97       700

-------------------------
roc auc score: 0.8635091255178591
-------------------------
Log loss: 1.5789486185180261
-------------------------


### oversampling

In [24]:
from imblearn.over_sampling import RandomOverSampler

In [25]:
ratio = 1/4
ros = RandomOverSampler(random_state=27, sampling_strategy=ratio)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [26]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([1394,  348]))

In [27]:
model_oversampling_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_svc, X_test, y_test)

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[672  15]
 [  4   9]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       687
           1       0.38      0.69      0.49        13

    accuracy                           0.97       700
   macro avg       0.68      0.84      0.74       700
weighted avg       0.98      0.97      0.98       700

-------------------------
roc auc score: 0.8352368155861605
-------------------------
Log loss: 0.937498207806795
-------------------------


In [28]:
model_oversampling_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[687   0]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       1.00      0.23      0.38        13

    accuracy                           0.99       700
   macro avg       0.99      0.62      0.68       700
weighted avg       0.99      0.99      0.98       700

-------------------------
roc auc score: 0.6153846153846154
-------------------------
Log loss: 0.4934110913558678
-------------------------


In [29]:
model_oversampling_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_oversampling_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 10, 'max_features': None, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[682   5]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.38      0.23      0.29        13

    accuracy                           0.98       700
   macro avg       0.68      0.61      0.64       700
weighted avg       0.97      0.98      0.98       700

-------------------------
roc auc score: 0.6117456051953869
-------------------------
Log loss: 0.7401223484440169
-------------------------


### SMOTE

In [30]:
from imblearn.over_sampling import SMOTE

In [31]:
smote = SMOTE(random_state=27, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [32]:
np.bincount(y_train), np.bincount(y_train_resampled)

(array([1394,   27]), array([1394, 1394]))

In [33]:
model_smote_svc, params = create_model(X_train_resampled, y_train_resampled, SVC(), grid_svc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_svc, X_test, y_test)

{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 27}
confusion matrix:
[[645  42]
 [  2  11]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       687
           1       0.21      0.85      0.33        13

    accuracy                           0.94       700
   macro avg       0.60      0.89      0.65       700
weighted avg       0.98      0.94      0.96       700

-------------------------
roc auc score: 0.8925092374874034
-------------------------
Log loss: 2.1710567778116268
-------------------------


In [34]:
model_smote_rfc, params = create_model(X_train_resampled, y_train_resampled, RandomForestClassifier(), grid_rfc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_rfc, X_test, y_test)

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 50, 'random_state': 27}
confusion matrix:
[[669  18]
 [  4   9]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       687
           1       0.33      0.69      0.45        13

    accuracy                           0.97       700
   macro avg       0.66      0.83      0.72       700
weighted avg       0.98      0.97      0.97       700

-------------------------
roc auc score: 0.8330534094726234
-------------------------
Log loss: 1.0855249620596845
-------------------------


In [35]:
model_smote_gbc, params = create_model(X_train_resampled, y_train_resampled, GradientBoostingClassifier(), grid_gbc, scoring='recall', cv=10)

print(params)

evaluate_model(model_smote_gbc, X_test, y_test)

{'loss': 'deviance', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 250, 'random_state': 27}
confusion matrix:
[[680   7]
 [ 10   3]]
-------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       687
           1       0.30      0.23      0.26        13

    accuracy                           0.98       700
   macro avg       0.64      0.61      0.62       700
weighted avg       0.97      0.98      0.97       700

-------------------------
roc auc score: 0.6102900011196954
-------------------------
Log loss: 0.8388068512792766
-------------------------
