# Import Libraries

In [138]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set_theme(context='talk')

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Load the Dataset

In [61]:
# Load the dataset
df = pd.read_csv('../data/nba_data.csv')
print(df.shape)
df.head()

(10749, 28)


Unnamed: 0,Season,H_win,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,...,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,2013,1,0.445,0.353,0.474,0.763,9.7,32.4,42.0,21.0,...,0.779,10.2,34.5,44.7,20.1,6.7,5.4,15.1,20.4,96.7
1,2013,1,0.432,0.348,0.456,0.779,11.4,32.7,44.1,22.7,...,0.76,7.6,29.2,36.9,22.5,8.9,4.5,14.8,19.5,102.2
2,2013,1,0.474,0.352,0.525,0.73,10.5,32.5,43.0,24.6,...,0.757,9.1,32.0,41.0,24.5,7.5,5.4,15.1,19.8,103.0
3,2013,1,0.459,0.369,0.497,0.753,8.8,29.4,38.1,20.9,...,0.751,12.1,32.1,44.1,21.2,7.1,3.7,14.2,20.0,98.2
4,2013,1,0.435,0.333,0.47,0.777,12.0,30.5,42.5,21.0,...,0.782,11.4,31.1,42.5,21.2,7.0,4.2,14.1,23.0,101.3


# Split and Scale the Dataset

In [62]:
# Separate features and labels and split dataset
y = df['H_win']
X = df.drop(['H_win', 'Season'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [63]:
print(X_train.shape)
print(X_test.shape)

(7524, 26)
(3225, 26)


In [64]:
# Scaling
scaler = MinMaxScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
scaled_X_train.head()

Unnamed: 0,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,STL_V,BLK_V,...,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,0.326316,0.317308,0.246032,0.0,0.7,0.361538,0.635135,0.112903,0.333333,0.254902,...,0.520468,0.414286,0.207692,0.371622,0.346774,0.466667,0.313725,0.560606,0.170732,0.237589
1,0.642105,0.413462,0.730159,0.538012,0.371429,0.369231,0.493243,0.580645,0.355556,0.352941,...,0.532164,0.828571,0.215385,0.567568,0.330645,0.777778,0.431373,0.212121,0.365854,0.620567
2,0.526316,0.259615,0.285714,0.614035,0.4,0.223077,0.385135,0.298387,0.666667,0.352941,...,0.204678,0.742857,0.223077,0.540541,0.290323,0.466667,0.45098,0.348485,0.292683,0.234043
3,0.673684,0.442308,0.634921,0.25731,0.514286,0.376923,0.567568,0.274194,0.466667,0.627451,...,0.666667,0.542857,0.146154,0.378378,0.258065,0.333333,0.352941,0.454545,0.780488,0.333333
4,0.442105,0.192308,0.333333,0.631579,0.614286,0.3,0.547297,0.129032,0.6,0.490196,...,0.298246,0.5,0.415385,0.594595,0.25,0.333333,0.27451,0.121212,0.158537,0.333333


In [78]:
# Scale the test data 
scaled_X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)
scaled_X_test.head(3)

Unnamed: 0,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,STL_V,BLK_V,...,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,0.284211,0.201923,0.166667,0.637427,0.628571,0.1,0.378378,0.241935,0.355556,0.352941,...,0.54386,0.428571,0.038462,0.22973,0.16129,0.488889,0.411765,0.287879,0.670732,0.237589
1,0.621053,0.355769,0.460317,0.766082,0.542857,0.138462,0.371622,0.459677,0.555556,0.411765,...,0.461988,0.257143,0.353846,0.425676,0.169355,0.266667,0.509804,0.378788,0.268293,0.312057
2,0.715789,0.490385,0.634921,0.695906,0.157143,0.376923,0.398649,0.637097,0.422222,0.54902,...,0.526316,0.457143,0.323077,0.493243,0.41129,0.311111,0.156863,0.818182,0.207317,0.531915


# Logistic Regression

In [91]:
# Train the model
lr = LogisticRegression()

# Fit data in the model
lr.fit(scaled_X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
# Make predictions
y_pred = lr.predict(scaled_X_test)

In [105]:
# Check performance
print("LR model score: %.3f" % lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred))

LR model score: 0.657
              precision    recall  f1-score   support

           0       0.63      0.51      0.56      1404
           1       0.67      0.77      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.64      3225
weighted avg       0.65      0.66      0.65      3225



In [139]:
# Optimization: define params_grid dictionary

# add different tolerance, not good practice to max out n of iterations
params_grid_lr = {
    'penalty': ['l1', 'l2'],
    'tol': [0.0001, 0.001, 0.01],
    'C': [1.0, 2, 3, 4, 5, 6],
    'solver': ['liblinear', 'saga']
}

In [140]:
# Define GridSearch 
gs_lr = GridSearchCV(estimator=lr, 
                      param_grid=params_grid_lr,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_lr.fit(scaled_X_train, y_train)



In [141]:
gs_lr.best_params_

{'C': 5, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.001}

In [142]:
# Store the best estimators
best_lr = gs_lr.best_estimator_

# Fit the data 
best_lr.fit(scaled_X_train, y_train)
y_pred_best_lr = best_lr.predict(scaled_X_test)

In [143]:
# Check performance
print("LR model score: %.3f" % best_lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_lr))

LR model score: 0.658
              precision    recall  f1-score   support

           0       0.63      0.51      0.57      1404
           1       0.67      0.77      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.64      3225
weighted avg       0.65      0.66      0.65      3225



In [160]:
# Check performance
best_lr.score(scaled_X_train, y_train)

0.6771664008506114

# RandomForestClassifier

In [118]:
# Instantiate classifier
rfc = RandomForestClassifier()

# Fit the training data
rfc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_rf = rfc.predict(scaled_X_test)

# Check the performance 
print('RFC model score: %.3f' % rfc.score(scaled_X_test, y_test))

RFC model score: 0.615


In [153]:
# Create the random grid
params_grid_rfc = {'n_estimators': [100, 200, 500],
               'max_features': ['log2', 'sqrt'],
               'max_depth': [5, 7, 9],
               'min_samples_split': [10, 12, 15, 17],
               'min_samples_leaf': [1, 2, 4, 5]}

# Define GridSearch 
gs_rfc = GridSearchCV(estimator=rfc, 
                      param_grid=params_grid_rfc,
                      cv=2,
                      n_jobs=-1)

In [154]:
# Fit GridSearch to the data
gs_rfc.fit(scaled_X_train, y_train)

In [155]:
gs_rfc.best_params_

{'max_depth': 7,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 12,
 'n_estimators': 200}

In [156]:
# Store best estimators
best_rfc = gs_rfc.best_estimator_

In [157]:
# Fit the data 
best_rfc.fit(scaled_X_train, y_train)
y_pred_best_rfc = best_rfc.predict(scaled_X_test)

In [158]:
# Check performance
print("RFC model score: %.3f" % best_rfc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_rfc))

RFC model score: 0.644
              precision    recall  f1-score   support

           0       0.63      0.45      0.52      1404
           1       0.65      0.79      0.72      1821

    accuracy                           0.64      3225
   macro avg       0.64      0.62      0.62      3225
weighted avg       0.64      0.64      0.63      3225



In [159]:
# Check performance for trainset as well
best_rfc.score(scaled_X_train, y_train)

0.7222222222222222

# SVM

In [161]:
# Instantiate SVM classifier
svc = SVC()

# Fit the data
svc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_svc = svc.predict(scaled_X_test)

In [162]:
# Check performance
print("SVC model score: %.3f" % svc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_svc))

SVC model score: 0.646
              precision    recall  f1-score   support

           0       0.62      0.48      0.54      1404
           1       0.66      0.77      0.71      1821

    accuracy                           0.65      3225
   macro avg       0.64      0.63      0.63      3225
weighted avg       0.64      0.65      0.64      3225



In [163]:
# Parameter grid for svc model
params_grid_svc = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.75, 1, 1.25, 2],
    'gamma': ['scale', 'auto']}

In [164]:
# Perform GridSearch for SVM
gs_svc = GridSearchCV(estimator=svc, 
                      param_grid=params_grid_svc,
                      cv=2,
                      n_jobs=-1)

In [165]:
# Fit GridSearch to the data
gs_svc.fit(scaled_X_train, y_train)

