In [2]:
#imports
#numpy, pandas, scipy, math, matplotlib 
import numpy as np 
import pandas as pd 
import scipy 
from math import sqrt 
import matplotlib.pyplot as plt

#estimators 
from sklearn.covariance import EmpiricalCovariance
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

#feature selection
from sklearn.feature_selection import RFECV

#model metrics 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

#cross validation 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

In [7]:
rawData = pd.read_csv('default of credit cleaned.csv', header=0)
rawData.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_1        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
default      30000 non-null int64
dtypes: int64(24)
memory usage: 5.5 MB


In [9]:
#random sampling to 10,000 observations
tenData = rawData.sample(n=10000)

In [10]:
tenData.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
5493,60000,1,2,1,24,0,0,0,0,0,...,18118,16179,12919,1400,1500,1295,3655,330,507,0
21459,330000,2,3,1,27,-1,-1,-1,-1,0,...,46353,2175,30824,4885,906,46789,10,30824,6269,0
11562,80000,2,1,2,27,0,0,0,0,0,...,36084,13693,79701,3565,1752,1105,1499,78000,0,0
21071,340000,1,1,2,35,0,-1,-1,-1,0,...,29103,58997,12518,16591,27283,29255,58276,12565,13507,0
20313,270000,2,1,1,60,-1,-1,-1,-1,-1,...,836,1972,986,836,836,836,1972,0,836,1


In [11]:
#select the features
features = tenData.drop('default', axis=1)
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
5493,60000,1,2,1,24,0,0,0,0,0,...,18571,18118,16179,12919,1400,1500,1295,3655,330,507
21459,330000,2,3,1,27,-1,-1,-1,-1,0,...,906,46353,2175,30824,4885,906,46789,10,30824,6269
11562,80000,2,1,2,27,0,0,0,0,0,...,61018,36084,13693,79701,3565,1752,1105,1499,78000,0
21071,340000,1,1,2,35,0,-1,-1,-1,0,...,27050,29103,58997,12518,16591,27283,29255,58276,12565,13507
20313,270000,2,1,1,60,-1,-1,-1,-1,-1,...,836,836,1972,986,836,836,836,1972,0,836


In [12]:
#select dependent variable
depVar = tenData['default']

In [13]:
#train and test splitting of data
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=0.3, random_state=42)

In [14]:
X_train.shape, X_test.shape

((7000, 23), (3000, 23))

In [15]:
#scale features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [48]:
emp_cov = EmpiricalCovariance(assume_centered = False).fit(X_train)
emp_cov

EmpiricalCovariance(assume_centered=False, store_precision=True)

In [50]:
covMat = tenData.cov()
print(covMat)

              LIMIT_BAL          SEX     EDUCATION     MARRIAGE  \
LIMIT_BAL  1.664582e+10   123.256326 -21879.965597 -6882.463446   
SEX        1.232563e+02     0.238682      0.003120    -0.009538   
EDUCATION -2.187997e+04     0.003120      0.626277    -0.059024   
MARRIAGE  -6.882463e+03    -0.009538     -0.059024     0.274590   
AGE        1.742577e+05    -0.466494      1.251439    -1.998721   
PAY_1     -3.959133e+04    -0.028848      0.093054     0.019693   
PAY_2     -4.659082e+04    -0.040095      0.105534     0.026819   
PAY_3     -4.496118e+04    -0.036189      0.102597     0.022864   
PAY_4     -4.218687e+04    -0.031802      0.092414     0.025429   
PAY_5     -3.780610e+04    -0.031731      0.078499     0.026086   
PAY_6     -3.515156e+04    -0.024186      0.069064     0.027872   
BILL_AMT1  2.576608e+09 -1684.730497   1844.149505  -994.487250   
BILL_AMT2  2.459342e+09 -1490.135112   1437.968455  -958.144729   
BILL_AMT3  2.383428e+09 -1067.354078   1027.085722 -1316.45230

Random Forest - default

In [35]:
modelRF = RandomForestClassifier(n_estimators=100)
modelRF.fit(X_train, y_train)
print(cross_val_score(modelRF, X_train, y_train, cv=3))
pred_RF = modelRF.predict(X_test)

[0.81362468 0.8058294  0.80925847]


In [36]:
modelRF.score(X_train, y_train)

0.9998571428571429

In [37]:
print(classification_report(y_test, pred_RF))
print(confusion_matrix(y_test, pred_RF))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      2355
           1       0.62      0.35      0.45       645

    accuracy                           0.81      3000
   macro avg       0.73      0.65      0.67      3000
weighted avg       0.79      0.81      0.79      3000

[[2217  138]
 [ 417  228]]


Gradient Boosting - default

In [52]:
modelGB = GradientBoostingClassifier(n_estimators=100, random_state=42)
modelGB.fit(X_train, y_train)
print(cross_val_score(modelGB, X_train, y_train, cv=3))
pred_GB = modelGB.predict(X_test)

[0.81576692 0.81311616 0.81225889]


In [53]:
modelGB.score(X_train, y_train)

0.834

In [54]:
print(classification_report(y_test, pred_GB))
print(confusion_matrix(y_test, pred_GB))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      2355
           1       0.68      0.37      0.48       645

    accuracy                           0.83      3000
   macro avg       0.76      0.66      0.69      3000
weighted avg       0.81      0.83      0.81      3000

[[2241  114]
 [ 406  239]]


SVC - default

In [83]:
modelSVC = SVC()
modelSVC.fit(X_train, y_train)
print(cross_val_score(modelSVC, X_train, y_train))
modelSVC.score(X_train, y_train)

[0.80857143 0.81571429 0.79714286 0.81       0.82928571]


0.8247142857142857

In [84]:
pred_SVC = modelSVC.predict(X_test)

In [85]:
print(classification_report(y_test, pred_SVC))
print(confusion_matrix(y_test, pred_SVC))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      2356
           1       0.68      0.32      0.43       644

    accuracy                           0.82      3000
   macro avg       0.76      0.64      0.66      3000
weighted avg       0.80      0.82      0.79      3000

[[2258   98]
 [ 439  205]]


KNN - default

In [52]:
modelKNN = KNeighborsClassifier(n_neighbors=5)
modelKNN.fit(X_train, y_train)
print(cross_val_score(modelKNN, X_train, y_train, cv=10, scoring='accuracy'))

[0.80047619 0.7947619  0.80190476 0.78857143 0.78238095 0.80142857
 0.79       0.79428571 0.79095238 0.79047619]


In [53]:
modelKNN.score(X_train, y_train)

0.841047619047619

In [54]:
pred_KNN = modelKNN.predict(X_test)
print(classification_report(y_test, pred_KNN))
print(confusion_matrix(y_test, pred_KNN))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      7040
           1       0.54      0.34      0.42      1960

    accuracy                           0.79      9000
   macro avg       0.69      0.63      0.65      9000
weighted avg       0.77      0.79      0.78      9000

[[6472  568]
 [1291  669]]


In [61]:
modelKNN2 = KNeighborsClassifier(n_neighbors=3)
modelKNN2.fit(X_train, y_train)
print(cross_val_score(modelKNN2, X_train, y_train, cv=10, scoring='accuracy'))

[0.77619048 0.77619048 0.77619048 0.77428571 0.7552381  0.78190476
 0.78714286 0.78714286 0.75904762 0.76333333]


In [62]:
modelKNN2.score(X_train, y_train)

0.8645238095238095

In [63]:
pred_KNN2 = modelKNN2.predict(X_test)
print(classification_report(y_test, pred_KNN2))
print(confusion_matrix(y_test, pred_KNN2))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      7040
           1       0.48      0.35      0.41      1960

    accuracy                           0.78      9000
   macro avg       0.66      0.62      0.63      9000
weighted avg       0.76      0.78      0.76      9000

[[6285  755]
 [1265  695]]


Random Forest w/ RFE

In [16]:
modelRF_RFE = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
modelRF_RFE.fit(X_train, y_train)
print(cross_val_score(modelRF_RFE, X_train, y_train, cv=3))
pred_RF_RFE = modelRF_RFE.predict(X_test)

[0.81062554 0.80368624 0.80925847]


In [17]:
modelRF_RFE.score(X_train, y_train)

0.9988571428571429

In [18]:
print(classification_report(y_test, pred_RF_RFE))
print(confusion_matrix(y_test, pred_RF_RFE))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      2355
           1       0.65      0.35      0.45       645

    accuracy                           0.82      3000
   macro avg       0.74      0.65      0.67      3000
weighted avg       0.80      0.82      0.80      3000

[[2233  122]
 [ 422  223]]


Gradient Boosting w/ RFE

In [59]:
modelGB_RFE = RFECV(GradientBoostingClassifier(n_estimators=100, random_state=42), scoring='accuracy')
modelGB_RFE.fit(X_train, y_train)
print(cross_val_score(modelGB_RFE, X_train, y_train))
modelGB_RFE.score(X_train, y_train)

[0.81285714 0.82428571 0.80642857 0.81785714 0.82428571]


0.8367142857142857

In [60]:
pred_GB_RFE = modelGB_RFE.predict(X_test)
print(classification_report(y_test, pred_GB_RFE))
print(confusion_matrix(y_test, pred_GB_RFE))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      2356
           1       0.65      0.34      0.45       644

    accuracy                           0.82      3000
   macro avg       0.75      0.65      0.67      3000
weighted avg       0.80      0.82      0.80      3000

[[2236  120]
 [ 422  222]]


SVC w/ RFE

In [88]:
modelSVC_RFE = RFECV(SVC(kernel='linear'), scoring='accuracy')
modelSVC_RFE.fit(X_train, y_train)
print(cross_val_score(modelSVC_RFE, X_train, y_train))
modelSVC_RFE.score(X_train, y_train)

[0.80142857 0.80071429 0.80214286 0.79857143 0.81857143]


0.8035714285714286

In [89]:
pred_SVC_RFE = modelSVC_RFE.predict(X_test)
print(classification_report(y_test, pred_SVC_RFE))
print(confusion_matrix(y_test, pred_SVC_RFE))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2356
           1       0.66      0.28      0.40       644

    accuracy                           0.81      3000
   macro avg       0.75      0.62      0.64      3000
weighted avg       0.79      0.81      0.78      3000

[[2263   93]
 [ 462  182]]


Random Forest w/ RFE tuning

In [45]:
RFgrid = {'max_depth': [90,100,110], 'min_samples_leaf': [3,4], 'min_samples_split':[5,10], 'n_estimators': [100,200,300]}

In [46]:
searchRF_RFE = GridSearchCV(modelRF, param_grid=RFgrid, cv=3)
searchRF_RFE.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [47]:
searchRF_RFE.best_params_

{'max_depth': 110,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 300}

In [48]:
modelRF_RFEbest = RFECV(RandomForestClassifier(max_depth=110, min_samples_leaf=4, min_samples_split=5, n_estimators=300), scoring='accuracy')
modelRF_RFEbest.fit(X_train, y_train)
print(cross_val_score(modelRF_RFEbest, X_train, y_train, cv=3))
pred_RF_RFEbest = modelRF_RFEbest.predict(X_test)

[0.80848329 0.810973   0.81397342]


In [49]:
modelRF_RFEbest.score(X_train, y_train)

0.889

In [50]:
print(classification_report(y_test, pred_RF_RFEbest))
print(confusion_matrix(y_test, pred_RF_RFEbest))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      2355
           1       0.66      0.35      0.45       645

    accuracy                           0.82      3000
   macro avg       0.75      0.65      0.67      3000
weighted avg       0.80      0.82      0.80      3000

[[2239  116]
 [ 421  224]]


Gradient Boosting w/ RFE tuning

In [51]:
GBgrid = {'learning_rate': [0.05, 0.1, 0.5, 1],'max_depth': [90,100,110], 'min_samples_leaf': [3,4],  'n_estimators': [100,200,300]}

In [55]:
searchGB_RFE = GridSearchCV(modelGB, param_grid=GBgrid, cv=3)
searchGB_RFE.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [56]:
searchGB_RFE.best_params_

{'learning_rate': 0.1,
 'max_depth': 90,
 'min_samples_leaf': 3,
 'n_estimators': 300}

In [57]:
modelGB_RFEbest = RFECV(GradientBoostingClassifier(learning_rate=0.1, max_depth=90, min_samples_leaf=3, n_estimators=300, random_state=42), scoring='accuracy')
modelGB_RFEbest.fit(X_train, y_train)
print(cross_val_score(modelGB_RFEbest, X_train, y_train))
modelGB_RFEbest.score(X_train, y_train)

[0.8        0.815      0.80214286 0.79714286 0.80642857]


0.9988571428571429

In [60]:
pred_GB_RFEbest = modelGB_RFEbest.predict(X_test)
print(classification_report(y_test, pred_GB_RFEbest))
print(confusion_matrix(y_test, pred_GB_RFEbest))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88      2355
           1       0.59      0.36      0.45       645

    accuracy                           0.81      3000
   macro avg       0.72      0.65      0.67      3000
weighted avg       0.79      0.81      0.79      3000

[[2194  161]
 [ 410  235]]


SVC w/ RFE tuning

In [16]:
modelSVC2 = SVC(kernel = 'rbf', class_weight='balanced', random_state = 42)
grid = {'C': [0.1,1,10]}
gridsearch = GridSearchCV(modelSVC2, param_grid=grid, scoring='accuracy', refit=True, cv=3)
gridsearch.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None, param_grid={'C': [0.1, 1, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [18]:
gridsearch.best_params_

{'C': 0.1}

In [62]:
modelSVC_RFEbest = RFECV(SVC(kernel='linear', C=0.1), scoring='accuracy')
modelSVC_RFEbest.fit(X_train, y_train)
print(cross_val_score(modelSVC_RFEbest, X_train, y_train, cv=3))
modelSVC_RFEbest.score(X_train, y_train)

[0.80891174 0.80282898 0.80368624]


0.8044285714285714

In [63]:
pred_SVC_RFEbest = modelSVC_RFEbest.predict(X_test)
print(classification_report(y_test, pred_SVC_RFEbest))
print(confusion_matrix(y_test, pred_SVC_RFEbest))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      2355
           1       0.65      0.27      0.38       645

    accuracy                           0.81      3000
   macro avg       0.74      0.61      0.63      3000
weighted avg       0.79      0.81      0.78      3000

[[2264   91]
 [ 474  171]]
