# Import Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


# Load the basic stats dataset

In [3]:
# Load the dataset
df = pd.read_csv('../data/nba_data.csv')
print(df.shape)
df.head()

(10749, 28)


Unnamed: 0,Season,H_win,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,STL_V,BLK_V,TOV_V,PF_V,PTS_V,FG%_H,3P%_H,2P%_H,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,2013,1,0.445,0.353,0.474,0.763,9.7,32.4,42.0,21.0,7.7,4.3,14.9,20.5,96.5,0.449,0.357,0.477,0.779,10.2,34.5,44.7,20.1,6.7,5.4,15.1,20.4,96.7
1,2013,1,0.432,0.348,0.456,0.779,11.4,32.7,44.1,22.7,7.2,5.2,14.9,19.1,93.7,0.501,0.364,0.558,0.76,7.6,29.2,36.9,22.5,8.9,4.5,14.8,19.5,102.2
2,2013,1,0.474,0.352,0.525,0.73,10.5,32.5,43.0,24.6,8.6,4.8,13.9,21.5,107.9,0.45,0.381,0.478,0.757,9.1,32.0,41.0,24.5,7.5,5.4,15.1,19.8,103.0
3,2013,1,0.459,0.369,0.497,0.753,8.8,29.4,38.1,20.9,8.6,3.8,14.5,21.7,98.5,0.437,0.356,0.461,0.751,12.1,32.1,44.1,21.2,7.1,3.7,14.2,20.0,98.2
4,2013,1,0.435,0.333,0.47,0.777,12.0,30.5,42.5,21.0,7.1,4.2,15.4,21.3,96.2,0.445,0.372,0.475,0.782,11.4,31.1,42.5,21.2,7.0,4.2,14.1,23.0,101.3


# Split and Scale the Dataset

In [4]:
# Separate features and labels and split dataset
y = df['H_win']
X = df.drop(['H_win', 'Season'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
print(X_train.shape)
print(X_test.shape)

(7524, 26)
(3225, 26)


In [6]:
X_train.columns

Index(['FG%_V', '3P%_V', '2P%_V', 'FT%_V', 'ORB_V', 'DRB_V', 'TRB_V', 'AST_V',
       'STL_V', 'BLK_V', 'TOV_V', 'PF_V', 'PTS_V', 'FG%_H', '3P%_H', '2P%_H',
       'FT%_H', 'ORB_H', 'DRB_H', 'TRB_H', 'AST_H', 'STL_H', 'BLK_H', 'TOV_H',
       'PF_H', 'PTS_H'],
      dtype='object')

In [7]:
# Scaling
scaler = MinMaxScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
scaled_X_train.head()

Unnamed: 0,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,STL_V,BLK_V,TOV_V,PF_V,PTS_V,FG%_H,3P%_H,2P%_H,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,0.326316,0.317308,0.246032,0.0,0.7,0.361538,0.635135,0.112903,0.333333,0.254902,0.363636,0.292683,0.358156,0.473684,0.384615,0.253968,0.520468,0.414286,0.207692,0.371622,0.346774,0.466667,0.313725,0.560606,0.170732,0.237589
1,0.642105,0.413462,0.730159,0.538012,0.371429,0.369231,0.493243,0.580645,0.355556,0.352941,0.5,0.109756,0.56383,0.389474,0.355769,0.428571,0.532164,0.828571,0.215385,0.567568,0.330645,0.777778,0.431373,0.212121,0.365854,0.620567
2,0.526316,0.259615,0.285714,0.614035,0.4,0.223077,0.385135,0.298387,0.666667,0.352941,0.333333,0.304878,0.22695,0.252632,0.307692,0.150794,0.204678,0.742857,0.223077,0.540541,0.290323,0.466667,0.45098,0.348485,0.292683,0.234043
3,0.673684,0.442308,0.634921,0.25731,0.514286,0.376923,0.567568,0.274194,0.466667,0.627451,0.757576,0.463415,0.560284,0.389474,0.576923,0.206349,0.666667,0.542857,0.146154,0.378378,0.258065,0.333333,0.352941,0.454545,0.780488,0.333333
4,0.442105,0.192308,0.333333,0.631579,0.614286,0.3,0.547297,0.129032,0.6,0.490196,0.651515,1.0,0.560284,0.431579,0.173077,0.34127,0.298246,0.5,0.415385,0.594595,0.25,0.333333,0.27451,0.121212,0.158537,0.333333


In [8]:
# Scale the test data 
scaled_X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)
scaled_X_test.head(3)

Unnamed: 0,FG%_V,3P%_V,2P%_V,FT%_V,ORB_V,DRB_V,TRB_V,AST_V,STL_V,BLK_V,TOV_V,PF_V,PTS_V,FG%_H,3P%_H,2P%_H,FT%_H,ORB_H,DRB_H,TRB_H,AST_H,STL_H,BLK_H,TOV_H,PF_H,PTS_H
0,0.284211,0.201923,0.166667,0.637427,0.628571,0.1,0.378378,0.241935,0.355556,0.352941,0.651515,0.573171,0.152482,0.431579,0.576923,0.261905,0.54386,0.428571,0.038462,0.22973,0.16129,0.488889,0.411765,0.287879,0.670732,0.237589
1,0.621053,0.355769,0.460317,0.766082,0.542857,0.138462,0.371622,0.459677,0.555556,0.411765,0.439394,0.426829,0.485816,0.610526,0.576923,0.492063,0.461988,0.257143,0.353846,0.425676,0.169355,0.266667,0.509804,0.378788,0.268293,0.312057
2,0.715789,0.490385,0.634921,0.695906,0.157143,0.376923,0.398649,0.637097,0.422222,0.54902,0.318182,0.390244,0.620567,0.526316,0.375,0.555556,0.526316,0.457143,0.323077,0.493243,0.41129,0.311111,0.156863,0.818182,0.207317,0.531915


# Logistic Regression

In [12]:
# Train the model
lr = LogisticRegression()

# Fit data in the model
lr.fit(scaled_X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Make predictions
y_pred = lr.predict(scaled_X_test)

In [16]:
# Check performance
print("LR model score: %.4f" % lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred))

LR model score: 0.6571
              precision    recall  f1-score   support

           0       0.63      0.51      0.56      1404
           1       0.67      0.77      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.64      3225
weighted avg       0.65      0.66      0.65      3225



In [17]:
# Optimization: define params_grid dictionary

# add different tolerance, not good practice to max out n of iterations
params_grid_lr = {
    'penalty': ['l1', 'l2'],
    'tol': [0.0001, 0.001, 0.01],
    'C': [1.0, 2, 3, 4, 5, 6],
    'solver': ['liblinear', 'saga']
}

In [18]:
# Define GridSearch 
gs_lr = GridSearchCV(estimator=lr, 
                      param_grid=params_grid_lr,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_lr.fit(scaled_X_train, y_train)



In [19]:
gs_lr.best_params_

{'C': 4, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.0001}

In [20]:
# Store the best estimators
best_lr = gs_lr.best_estimator_

# Fit the data 
best_lr.fit(scaled_X_train, y_train)
y_pred_best_lr = best_lr.predict(scaled_X_test)

In [21]:
# Check performance
print("LR model score: %.3f" % best_lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_lr))

LR model score: 0.657
              precision    recall  f1-score   support

           0       0.63      0.51      0.56      1404
           1       0.67      0.77      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.64      3225
weighted avg       0.65      0.66      0.65      3225



In [22]:
# Check performance
best_lr.score(scaled_X_train, y_train)

0.6779638490164805

# RandomForestClassifier

In [24]:
# Instantiate classifier
rfc = RandomForestClassifier()

# Fit the training data
rfc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_rf = rfc.predict(scaled_X_test)

# Check the performance 
print('RFC model score: %.4f' % rfc.score(scaled_X_test, y_test))
print(classification_report(y_test, y_pred_rf))

RFC model score: 0.6195
              precision    recall  f1-score   support

           0       0.57      0.50      0.53      1404
           1       0.65      0.71      0.68      1821

    accuracy                           0.62      3225
   macro avg       0.61      0.61      0.61      3225
weighted avg       0.62      0.62      0.62      3225



In [51]:
# Create the random grid
params_grid_rfc = {'n_estimators': [100, 200, 500, 600],
               'max_features': ['log2', 'sqrt'],
               'max_depth': [3, 5, 7, 9],
               'min_samples_split': [10, 12, 15, 17],
               'min_samples_leaf': [1, 2, 4, 5]}

# Define GridSearch 
gs_rfc = GridSearchCV(estimator=rfc, 
                      param_grid=params_grid_rfc,
                      cv=2,
                      n_jobs=-1)

In [52]:
# Fit GridSearch to the data
gs_rfc.fit(scaled_X_train, y_train)



In [54]:
gs_rfc.best_params_

{'max_depth': 7,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 15,
 'n_estimators': 500}

In [55]:
# Store best estimators
best_rfc = gs_rfc.best_estimator_

In [56]:
# Fit the data 
best_rfc.fit(scaled_X_train, y_train)
y_pred_best_rfc = best_rfc.predict(scaled_X_test)

In [57]:
# Check performance
print("RFC model score: %.3f" % best_rfc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_rfc))

RFC model score: 0.644
              precision    recall  f1-score   support

           0       0.63      0.45      0.53      1404
           1       0.65      0.79      0.72      1821

    accuracy                           0.64      3225
   macro avg       0.64      0.62      0.62      3225
weighted avg       0.64      0.64      0.63      3225



In [31]:
# Check performance for trainset as well
best_rfc.score(scaled_X_train, y_train)

0.6828814460393408

# SVM

In [32]:
# Instantiate SVM classifier
svc = SVC()

# Fit the data
svc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_svc = svc.predict(scaled_X_test)

In [33]:
# Check performance
print("SVC model score: %.3f" % svc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_svc))

SVC model score: 0.646
              precision    recall  f1-score   support

           0       0.62      0.48      0.54      1404
           1       0.66      0.77      0.71      1821

    accuracy                           0.65      3225
   macro avg       0.64      0.63      0.63      3225
weighted avg       0.64      0.65      0.64      3225



In [34]:
# Parameter grid for svc model
params_grid_svc = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.75, 1, 1.25, 2],
    'gamma': ['scale', 'auto']}

In [35]:
# Perform GridSearch for SVM
gs_svc = GridSearchCV(estimator=svc, 
                      param_grid=params_grid_svc,
                      cv=2,
                      n_jobs=-1)

In [36]:
# Fit GridSearch to the data
gs_svc.fit(scaled_X_train, y_train)



In [37]:
gs_svc.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}

In [38]:
# store best estimator
best_svc = gs_svc.best_estimator_

# Fit on the dataset 
best_svc.fit(scaled_X_train, y_train)
y_pred_best_svc = best_svc.predict(scaled_X_test)

In [39]:
# Check performance
print("SVC model score: %.3f" % best_svc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_svc))

SVC model score: 0.656
              precision    recall  f1-score   support

           0       0.63      0.50      0.56      1404
           1       0.67      0.78      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.64      3225
weighted avg       0.65      0.66      0.65      3225



In [40]:
best_svc.score(scaled_X_train, y_train)

0.6794258373205742

# Naive Bayes

In [26]:
# Instantiate NB classifier
gnb = GaussianNB()

# Fit the data
gnb.fit(scaled_X_train, y_train)

# Predictions
y_pred_gnb = gnb.predict(scaled_X_test)

In [27]:
# Check performance
print("Naive Bayes model score: %.3f" % gnb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_gnb))

Naive Bayes model score: 0.633
              precision    recall  f1-score   support

           0       0.58      0.56      0.57      1404
           1       0.67      0.69      0.68      1821

    accuracy                           0.63      3225
   macro avg       0.63      0.62      0.62      3225
weighted avg       0.63      0.63      0.63      3225



In [42]:
# Params grid for GaussianNB
params_grid_gnb = {'var_smoothing': [1e-10, 5e-10, 1e-9, 5e-8]}

In [37]:
# Perform GridSearch for NB
gs_gnb = GridSearchCV(estimator=gnb, 
                      param_grid=params_grid_gnb,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_gnb.fit(scaled_X_train, y_train)

In [38]:
gs_gnb.best_params_

{'var_smoothing': 1e-10}

In [39]:
# store best estimator
best_gnb = gs_gnb.best_estimator_

# Fit on the dataset 
best_gnb.fit(scaled_X_train, y_train)
y_pred_best_gnb = best_gnb.predict(scaled_X_test)

In [41]:
# Check performance
print("Naive Bayes best model score: %.3f" % best_gnb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_gnb))

Naive Bayes best model score: 0.633
              precision    recall  f1-score   support

           0       0.58      0.56      0.57      1404
           1       0.67      0.69      0.68      1821

    accuracy                           0.63      3225
   macro avg       0.63      0.62      0.62      3225
weighted avg       0.63      0.63      0.63      3225



# XGBoost Classifier

In [15]:
# Instantiate XGBClassifier
xgb = XGBClassifier(use_label_encoder=False)

# Fit the data
xgb.fit(scaled_X_train, y_train)



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [16]:
# Make predictions
y_pred_xgb = xgb.predict(scaled_X_test)

In [17]:
# Check performance
print("XGB model score: %.3f" % xgb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_xgb))

XGB model score: 0.615
              precision    recall  f1-score   support

           0       0.56      0.51      0.53      1404
           1       0.65      0.70      0.67      1821

    accuracy                           0.62      3225
   macro avg       0.61      0.60      0.60      3225
weighted avg       0.61      0.62      0.61      3225



In [30]:
# Create parameters dictionary
params_grid_xgb = {
    'max_depth': [3, 4, 5],
    'n_estimators': [80, 90, 100, 110],
    #'subsample': [0.75, 1],
    #'colsample_bytree': [1, 2, 3]    
}



In [26]:
gs_xgb = GridSearchCV(estimator=xgb,
                     param_grid=params_grid_xgb,
                     cv=2,
                     verbose=1)

In [27]:
# Perform the gridsearch
gs_xgb.fit(scaled_X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [28]:
gs_xgb.best_params_

{'max_depth': 4, 'n_estimators': 100}



In [31]:
# Store best estimator combination
best_xgb = gs_xgb.best_estimator_

# Fit on the dataset 
best_xgb.fit(scaled_X_train, y_train)
y_pred_best_xgb = best_xgb.predict(scaled_X_test)



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [32]:
# Check performance
print("XGB best model score: %.3f" % best_xgb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_xgb))

XGB best model score: 0.636
              precision    recall  f1-score   support

           0       0.59      0.51      0.55      1404
           1       0.66      0.73      0.69      1821

    accuracy                           0.64      3225
   macro avg       0.63      0.62      0.62      3225
weighted avg       0.63      0.64      0.63      3225



# Load adv stats dataset

In [10]:
data = pd.read_csv('../data/nba_adv_data.csv')
print(data.shape)
data.head()

(10749, 16)


Unnamed: 0,Season,H_win,ORtg_V,DRtg_V,TS%_V,TOV%_V,ORB%_V,DRB%_V,W%_V,ORtg_H,DRtg_H,TS%_H,TOV%_H,ORB%_H,DRB%_H,W%_H
0,2013,1,101.7,107.4,0.525,13.9,22.4,75.5,0.280488,104.1,99.3,0.535,14.3,24.9,76.8,0.682927
1,2013,1,102.5,100.5,0.518,14.2,27.2,75.4,0.585366,110.9,105.8,0.59,14.6,20.6,73.0,0.658537
2,2013,1,112.1,104.8,0.567,12.7,25.0,72.5,0.695122,104.2,110.6,0.542,13.7,20.2,71.0,0.329268
3,2013,1,106.7,107.7,0.555,14.1,21.7,72.3,0.536585,104.2,107.7,0.518,13.0,27.2,75.8,0.402439
4,2013,1,102.9,107.7,0.517,14.2,27.4,74.2,0.304878,108.8,105.3,0.545,13.2,27.2,74.9,0.585366


In [11]:
# Separate label and features, split dataset
y = data['H_win']
X = data.drop(['Season', 'W%_V', 'W%_H', 'H_win'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = MinMaxScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
scaled_X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

In [12]:
scaled_X_test.head(3)

Unnamed: 0,ORtg_V,DRtg_V,TS%_V,TOV%_V,ORB%_V,DRB%_V,ORtg_H,DRtg_H,TS%_H,TOV%_H,ORB%_H,DRB%_H
0,0.324561,0.467742,0.198276,0.704918,0.703704,0.301887,0.561404,0.543011,0.405172,0.42623,0.533333,0.311321
1,0.671053,0.698925,0.525862,0.491803,0.688889,0.462264,0.618421,0.33871,0.594828,0.540984,0.392593,0.745283
2,0.635965,0.483871,0.612069,0.344262,0.155556,0.537736,0.526316,0.88172,0.508621,0.770492,0.496296,0.603774


# Logistic regression

In [13]:
lr = LogisticRegression()

# Fit the data 
lr.fit(scaled_X_train, y_train)

# Make predictions
y_pred = lr.predict(scaled_X_test)

In [14]:
# Check performance
print("LR model score: %.4f" % lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred))

LR model score: 0.6682
              precision    recall  f1-score   support

           0       0.64      0.54      0.58      1404
           1       0.68      0.77      0.72      1821

    accuracy                           0.67      3225
   macro avg       0.66      0.65      0.65      3225
weighted avg       0.67      0.67      0.66      3225



In [29]:
params_grid_lr = {
    'penalty': ['l1', 'l2'],
    'tol': [0.0005, 0.001, 0.005, 0.01],
    'C': [1.0, 2, 3, 4, 5, 6],
    'solver': ['liblinear', 'saga']}

In [31]:
gs_lr = GridSearchCV(estimator=lr, 
                      param_grid=params_grid_lr,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_lr.fit(scaled_X_train, y_train)



In [32]:
gs_lr.best_params_

{'C': 2, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.0005}

In [33]:
best_lr = gs_lr.best_estimator_

# Make fit the data
best_lr.fit(scaled_X_train, y_train)

# Make pred
y_pred_best_lr = best_lr.predict(scaled_X_test)

In [34]:
# Check performance
print("LR model score: %.4f" % best_lr.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_lr))

LR model score: 0.6695
              precision    recall  f1-score   support

           0       0.64      0.54      0.59      1404
           1       0.68      0.77      0.72      1821

    accuracy                           0.67      3225
   macro avg       0.66      0.65      0.66      3225
weighted avg       0.67      0.67      0.66      3225



# RandomForest Classifier

In [35]:
# Instantiate classifier
rfc = RandomForestClassifier()

# Fit the training data
rfc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_rf = rfc.predict(scaled_X_test)

# Check the performance 
print('RFC model score: %.4f' % rfc.score(scaled_X_test, y_test))
print(classification_report(y_test, y_pred_rf))

RFC model score: 0.6226
              precision    recall  f1-score   support

           0       0.58      0.51      0.54      1404
           1       0.65      0.71      0.68      1821

    accuracy                           0.62      3225
   macro avg       0.61      0.61      0.61      3225
weighted avg       0.62      0.62      0.62      3225



In [40]:
# Create the random grid
params_grid_rfc = {'n_estimators': [80, 90, 100, 110],
               'max_features': ['log2', 'sqrt'],
               'max_depth': [3, 5, 7, 9],
               'min_samples_split': [10, 12, 15, 17],
               'min_samples_leaf': [4, 5, 6, 7]}

# Define GridSearch 
gs_rfc = GridSearchCV(estimator=rfc, 
                      param_grid=params_grid_rfc,
                      cv=2,
                      n_jobs=-1)

In [41]:
# Fit GridSearch to the data
gs_rfc.fit(scaled_X_train, y_train)

In [39]:
gs_rfc.best_params_

{'max_depth': 5,
 'max_features': 'log2',
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}

In [38]:
# Store best estimators
best_rfc = gs_rfc.best_estimator_

# Fit the data 
best_rfc.fit(scaled_X_train, y_train)
y_pred_best_rfc = best_rfc.predict(scaled_X_test)

# Check performance
print("RFC model score: %.4f" % best_rfc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_rfc))

RFC model score: 0.6552
              precision    recall  f1-score   support

           0       0.64      0.48      0.55      1404
           1       0.66      0.79      0.72      1821

    accuracy                           0.66      3225
   macro avg       0.65      0.64      0.63      3225
weighted avg       0.65      0.66      0.65      3225



# SVM

In [42]:
# Instantiate SVM classifier
svc = SVC()

# Fit the data
svc.fit(scaled_X_train, y_train)

# Make predictions
y_pred_svc = svc.predict(scaled_X_test)

# Check performance
print("SVC model score: %.3f" % svc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_svc))

SVC model score: 0.665
              precision    recall  f1-score   support

           0       0.65      0.50      0.57      1404
           1       0.67      0.79      0.73      1821

    accuracy                           0.66      3225
   macro avg       0.66      0.65      0.65      3225
weighted avg       0.66      0.66      0.66      3225



In [43]:
# Parameter grid for svc model
params_grid_svc = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.75, 1, 1.25, 2],
    'gamma': ['scale', 'auto']}

In [44]:
# Perform GridSearch for SVM
gs_svc = GridSearchCV(estimator=svc, 
                      param_grid=params_grid_svc,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_svc.fit(scaled_X_train, y_train)

gs_svc.best_params_



{'C': 0.75, 'gamma': 'scale', 'kernel': 'linear'}

In [45]:
# store best estimator
best_svc = gs_svc.best_estimator_

# Fit on the dataset 
best_svc.fit(scaled_X_train, y_train)
y_pred_best_svc = best_svc.predict(scaled_X_test)

# Check performance
print("SVC model score: %.3f" % best_svc.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_svc))

SVC model score: 0.670
              precision    recall  f1-score   support

           0       0.65      0.53      0.58      1404
           1       0.68      0.78      0.73      1821

    accuracy                           0.67      3225
   macro avg       0.67      0.65      0.66      3225
weighted avg       0.67      0.67      0.66      3225



# Naive Bayes

In [46]:
# Instantiate NB classifier
gnb = GaussianNB()

# Fit the data
gnb.fit(scaled_X_train, y_train)

# Predictions
y_pred_gnb = gnb.predict(scaled_X_test)

# Check performance
print("Naive Bayes model score: %.3f" % gnb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_gnb))

Naive Bayes model score: 0.663
              precision    recall  f1-score   support

           0       0.63      0.56      0.59      1404
           1       0.69      0.74      0.71      1821

    accuracy                           0.66      3225
   macro avg       0.66      0.65      0.65      3225
weighted avg       0.66      0.66      0.66      3225



In [52]:
# Params grid for GaussianNB
params_grid_gnb = {'var_smoothing': [1e-10, 5e-10, 1e-9, 5e-9]}

# Perform GridSearch for NB
gs_gnb = GridSearchCV(estimator=gnb, 
                      param_grid=params_grid_gnb,
                      cv=2,
                      n_jobs=-1)

# Fit GridSearch to the data
gs_gnb.fit(scaled_X_train, y_train)
gs_gnb.best_params_

{'var_smoothing': 1e-10}

In [53]:
# store best estimator
best_gnb = gs_gnb.best_estimator_

# Fit on the dataset 
best_gnb.fit(scaled_X_train, y_train)
y_pred_best_gnb = best_gnb.predict(scaled_X_test)

# Check performance
print("Naive Bayes best model score: %.3f" % best_gnb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_gnb))

Naive Bayes best model score: 0.663
              precision    recall  f1-score   support

           0       0.63      0.56      0.59      1404
           1       0.69      0.74      0.71      1821

    accuracy                           0.66      3225
   macro avg       0.66      0.65      0.65      3225
weighted avg       0.66      0.66      0.66      3225



# XGBoost

In [54]:
# Instantiate XGBClassifier
xgb = XGBClassifier(use_label_encoder=False)

# Fit the data
xgb.fit(scaled_X_train, y_train)

# Make predictions
y_pred_xgb = xgb.predict(scaled_X_test)

# Check performance
print("XGB model score: %.3f" % xgb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_xgb))

XGB model score: 0.628
              precision    recall  f1-score   support

           0       0.58      0.53      0.55      1404
           1       0.66      0.71      0.68      1821

    accuracy                           0.63      3225
   macro avg       0.62      0.62      0.62      3225
weighted avg       0.63      0.63      0.63      3225



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [59]:
params_dist_xgb = {
    'max_depth': [3, 4, 5],
    'n_estimators': [80, 90, 100, 110],
    'subsample': [0.75, 1],
    #'colsample_bytree': [1]
}

In [60]:
rs_xgb = RandomizedSearchCV(estimator=xgb,
                     param_distributions=params_dist_xgb,
                     cv=10,
                     verbose=1)

# Perform the gridsearch
rs_xgb.fit(scaled_X_train, y_train)
rs_xgb.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


{'subsample': 1, 'n_estimators': 90, 'max_depth': 3}

In [61]:
# store best estimator
best_xgb = rs_xgb.best_estimator_

# Fit on the dataset 
best_xgb.fit(scaled_X_train, y_train)
y_pred_best_xgb = best_xgb.predict(scaled_X_test)

# Check performance
print("XGBClassifier best model score: %.3f" % best_xgb.score(scaled_X_test, y_test))

print(classification_report(y_test, y_pred_best_xgb))

XGBClassifier best model score: 0.663
              precision    recall  f1-score   support

           0       0.63      0.56      0.59      1404
           1       0.69      0.74      0.71      1821

    accuracy                           0.66      3225
   macro avg       0.66      0.65      0.65      3225
weighted avg       0.66      0.66      0.66      3225

