In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from utils import print_metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, \
AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

%matplotlib inline
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
credits = pd.read_excel('credit-cards.xls', header=1)
credits.drop(columns=['ID'], inplace=True)  # Remove ID column

In [48]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier

class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X[self.names]

numeric = [
    'LIMIT_BAL',
    'AGE',
    'BILL_AMT1',
    'BILL_AMT2',
    'BILL_AMT3',
    'BILL_AMT4',
    'BILL_AMT5',
    'BILL_AMT6',
    'PAY_0',
    'PAY_2',
    'PAY_3',
    'PAY_4',
    'PAY_5',
    'PAY_6',
    'PAY_AMT1',
    'PAY_AMT2',
    'PAY_AMT3',
    'PAY_AMT4',
    'PAY_AMT5',
    'PAY_AMT6',
]
categorical = ['SEX', 'EDUCATION', 'MARRIAGE']

features = FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical),OneHotEncoder(sparse=False)))
    ])
pipe = Pipeline([
    ("features", features),
    ('model', KNeighborsClassifier())
])

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X = credits.iloc[:, :-1]
y = credits.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)

In [51]:
pipe.fit(X_train, y_train)

y_hat_test = pipe.predict(X_test)

In [52]:
print_metrics(y_test, y_hat_test)

Precision Score: 0.528169014084507
Recall Score: 0.34650924024640656
Accuracy Score: 0.7915555555555556
F1 Score: 0.4184748915065096
ROC_AUC_Score: 0.6305007914221256


In [53]:
# Fix issue with libomp conflicts
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [54]:
from xgboost import XGBClassifier

pipe = Pipeline([
    ("features", features),
    ('model', XGBClassifier())
])

pipe.fit(X_train, y_train)

y_hat_test = pipe.predict(X_test)

print_metrics(y_test, y_hat_test)

Precision Score: 0.6564472497745717
Recall Score: 0.3737166324435318
Accuracy Score: 0.8221111111111111
F1 Score: 0.4762839385017992
ROC_AUC_Score: 0.6598447030623784


In [55]:
tested_models = {
    'dt_cls': DecisionTreeClassifier(),
    'rf_clf': RandomForestClassifier(),
    'adaboost_clf': AdaBoostClassifier(),
    'knn_clf': KNeighborsClassifier(),
    'xgb_clf': XGBClassifier(),
    'sv_clf': SVC()
}

for model in tested_models:
    pipe = Pipeline([
        ("features", features),
        ('model', tested_models[model])
    ])

    pipe.fit(X_train, y_train)

    y_hat_test = pipe.predict(X_test)
    
    print('======================')
    print('Tested Model: ', model)
    print('======================')
    print_metrics(y_test, y_hat_test)
    print('\n')
    


Tested Model:  dt_cls
Precision Score: 0.38806660499537465
Recall Score: 0.4306981519507187
Accuracy Score: 0.7297777777777777
F1 Score: 0.40827250608272503
ROC_AUC_Score: 0.6215458995715023


Tested Model:  rf_clf
Precision Score: 0.5772811918063314
Recall Score: 0.3182751540041068
Accuracy Score: 0.802
F1 Score: 0.41032428855062875
ROC_AUC_Score: 0.6269481272005786


Tested Model:  adaboost_clf
Precision Score: 0.6613756613756614
Recall Score: 0.32084188911704314
Accuracy Score: 0.8174444444444444
F1 Score: 0.4320774282751469
ROC_AUC_Score: 0.6377323455795086


Tested Model:  knn_clf
Precision Score: 0.528169014084507
Recall Score: 0.34650924024640656
Accuracy Score: 0.7915555555555556
F1 Score: 0.4184748915065096
ROC_AUC_Score: 0.6305007914221256


Tested Model:  xgb_clf
Precision Score: 0.6564472497745717
Recall Score: 0.3737166324435318
Accuracy Score: 0.8221111111111111
F1 Score: 0.4762839385017992
ROC_AUC_Score: 0.6598447030623784


Tested Model:  sv_clf
Precision Score: 0.66041

In [56]:
# Deal with imbalance using SMOTE
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=1)
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train)

# Transform resample X_train into DataFrame
X_train_resampled = pd.DataFrame(X_train_resampled)
X_train_resampled.columns = X.columns

In [57]:
for model in tested_models:
    pipe = Pipeline([
        ("features", features),
        ('model', tested_models[model])
    ])

    pipe.fit(X_train_resampled, y_train_resampled)

    y_hat_test = pipe.predict(X_test)
    
    print('======================')
    print('Tested Model: ', model)
    print('======================')
    print_metrics(y_test, y_hat_test)
    print('\n')

Tested Model:  dt_cls
Precision Score: 0.33261802575107297
Recall Score: 0.47741273100616016
Accuracy Score: 0.6795555555555556
F1 Score: 0.3920741989881956
ROC_AUC_Score: 0.6064034727067102


Tested Model:  rf_clf
Precision Score: 0.4631578947368421
Recall Score: 0.42915811088295686
Accuracy Score: 0.7687777777777778
F1 Score: 0.4455102584598987
ROC_AUC_Score: 0.6458751416581545


Tested Model:  adaboost_clf
Precision Score: 0.41063515509601184
Recall Score: 0.5708418891170431
Accuracy Score: 0.7297777777777777
F1 Score: 0.47766323024054985
ROC_AUC_Score: 0.6722615571506941


Tested Model:  knn_clf
Precision Score: 0.3598023064250412
Recall Score: 0.5605749486652978
Accuracy Score: 0.689
F1 Score: 0.4382901866345575
ROC_AUC_Score: 0.642525137406954


Tested Model:  xgb_clf
Precision Score: 0.4467026130236416
Recall Score: 0.5528747433264887
Accuracy Score: 0.755
F1 Score: 0.49415003441156224
ROC_AUC_Score: 0.6818542746694837


Tested Model:  sv_clf
Precision Score: 0.4304231795653832


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C':[1,10,100,1000],
    'gamma':[1,0.1,0.001,0.0001],
    'kernel':['linear','rbf']
}

grid = GridSearchCV(SVC(),param_grid,refit = True, verbose=2)


grid.fit(X_train_resampled,y_train_resampled)

grid.best_params_

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] C=1, gamma=1, kernel=linear .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [19]:
# specify parameters and distributions to sample from
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

from time import time
from scipy.stats import randint as sp_randint


param_dist = {
    'max_depth': [3, None],
    'max_features': sp_randint(1, 11),
    'min_samples_split': sp_randint(2, 11),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

tuned_models = {
#     'dt_cls': DecisionTreeClassifier(),
    'rf_clf': RandomForestClassifier(),
#     'adaboost_clf': AdaBoostClassifier(),
#     'knn_clf': KNeighborsClassifier(),
#     'xgb_clf': XGBClassifier(),
#     'sv_clf': SVC()
}

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False)

start = time()
random_search.fit(X_train_resampled, y_train_resampled)
print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), n_iter_search))
report(random_search.cv_results_)



RandomizedSearchCV took 90.93 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.825 (std: 0.063)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_split': 5}

Model with rank: 2
Mean validation score: 0.825 (std: 0.068)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.823 (std: 0.066)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_split': 3}



In [58]:
pipe = Pipeline([
    ("features", features),
    ('model', RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features=5, min_samples_split=9))
])

pipe.fit(X_train_resampled, y_train_resampled)

y_hat_test = pipe.predict(X_test)

print('======================')
print('Tested Model: ', 'RF')
print('======================')
print_metrics(y_test, y_hat_test)
print('\n')



Tested Model:  RF
Precision Score: 0.43325842696629213
Recall Score: 0.4948665297741273
Accuracy Score: 0.7505555555555555
F1 Score: 0.4620177330457704
ROC_AUC_Score: 0.6580260045353904




In [59]:
# AdaBoostClassifier(base_estimator=)

param_dist = {
 'n_estimators': [5, 10, 20, 30],
 'learning_rate' : [0.01,0.05,0.1,0.3,1]
 }

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(AdaBoostClassifier(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False)

start = time()
random_search.fit(X_train, y_train)
print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), n_iter_search))
report(random_search.cv_results_)


RandomizedSearchCV took 44.67 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 5, 'learning_rate': 0.01}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 10, 'learning_rate': 0.01}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 20, 'learning_rate': 0.01}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 30, 'learning_rate': 0.01}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 5, 'learning_rate': 0.05}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 10, 'learning_rate': 0.05}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 20, 'learning_rate': 0.05}

Model with rank: 1
Mean validation score: 0.818 (std: 0.005)
Parameters: {'n_estimators': 5, 'learni

In [60]:
pipe = Pipeline([
    ("features", features),
    ('model', AdaBoostClassifier(learning_rate=.8))
])

pipe.fit(X_train_resampled, y_train_resampled)

y_hat_test = pipe.predict(X_test)

print('======================')
print('Tested Model: ', 'AdaBoost')
print('======================')
print_metrics(y_test, y_hat_test)
print('\n')

Tested Model:  AdaBoost
Precision Score: 0.4171641791044776
Recall Score: 0.5739219712525667
Accuracy Score: 0.7342222222222222
F1 Score: 0.4831460674157304
ROC_AUC_Score: 0.676212261859976




In [None]:
# SVC(base_estimator=)

# GRID SEARCH FOR 20 COMBINATIONS OF PARAMETERS
param_dist = {
    "C": np.arange(2, 10, 2),
    "gamma": np.arange(0.1, 1, 0.2)
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(SVC(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False)

start = time()
random_search.fit(X_train_resampled, y_train_resampled)
print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), n_iter_search))
report(random_search.cv_results_)