In [8]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import seaborn           as sns
import statsmodels.api   as sm
import scipy.stats       as stats
import lightgbm          as lgb
import datetime          as dt
import matplotlib
import pydotplus

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline


from math                      import sqrt

from sklearn.datasets          import load_boston
from sklearn.model_selection   import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model      import LinearRegression, LogisticRegression
from sklearn.linear_model      import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.linear_model      import ElasticNetCV, ElasticNet
from sklearn.metrics           import mean_squared_error
from sklearn.metrics           import r2_score, roc_auc_score, roc_curve
from sklearn.metrics           import confusion_matrix, accuracy_score, classification_report
from sklearn.tree              import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.externals.six     import StringIO
from sklearn.preprocessing     import StandardScaler
from sklearn.ensemble          import RandomForestClassifier, VotingClassifier
from sklearn.neighbors         import KNeighborsClassifier,KNeighborsRegressor
from sklearn.naive_bayes       import BernoulliNB
from sklearn.svm               import SVC, SVR
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster           import KMeans, AgglomerativeClustering
from sklearn.feature_selection import RFECV

from collections               import Counter as count

from IPython.display           import Image

from scipy.stats               import randint as sp_randint
from scipy.cluster.hierarchy   import dendrogram, linkage

from statsmodels.stats.outliers_influence import variance_inflation_factor

from imblearn.over_sampling    import SMOTE

from category_encoders         import TargetEncoder

from scipy.stats               import ttest_1samp,ttest_ind, wilcoxon

from statsmodels.stats.power   import ttest_power

from mlxtend.feature_selection import SequentialFeatureSelector as sfs



In [9]:
df = pd.read_csv('credit_card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [11]:
import featuretools as ft


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID           30000 non-null int64
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_1        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
DEFAULT      30000 non-null int64
dtypes: int64(25)
memory usage: 5.7 MB


In [4]:
X =df.drop('DEFAULT',axis=1)
y = df['DEFAULT']

### Logistic Regression

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

logreg = LogisticRegression(solver='liblinear', fit_intercept=True)
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
y_pred = logreg.predict(x_test)

In [7]:
accuracy_score(y_test,y_pred)

0.7777777777777778

In [8]:
y_pred_train = logreg.predict(x_train)

In [9]:
print('Confussion matrix - Train: ','\n',confusion_matrix(y_train,y_pred_train))
print('Overall accuracy - Train: ',accuracy_score(y_train,y_pred_train))
print('Confussion matrix - Test: ','\n',confusion_matrix(y_test,y_pred))
print('Overall accuracy - Test: ',accuracy_score(y_test,y_pred))

Confussion matrix - Train:  
 [[16363     1]
 [ 4635     1]]
Overall accuracy - Train:  0.7792380952380953
Confussion matrix - Test:  
 [[7000    0]
 [2000    0]]
Overall accuracy - Test:  0.7777777777777778


In [10]:
print(classification_report(y_test,y_pred))
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.78      1.00      0.88      7000
          1       0.00      0.00      0.00      2000

avg / total       0.60      0.78      0.68      9000

             precision    recall  f1-score   support

          0       0.78      1.00      0.88     16364
          1       0.50      0.00      0.00      4636

avg / total       0.72      0.78      0.68     21000



### Random Forest

In [12]:
rfc = RandomForestClassifier(n_estimators=10,random_state=1)

rfc.fit(x_train,y_train)

y_pred_train = rfc.predict(x_train)
y_prob_train = rfc.predict_proba(x_train)[:,1]

y_pred = rfc.predict(x_test)
y_prob = rfc.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Random Forest_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of Random Forest_test: ', accuracy_score(y_pred, y_test))

print('AUC of Random Forest_train: ', roc_auc_score(y_train, y_prob_train))
print('AUC of Random Forest_test: ', roc_auc_score(y_test, y_prob))

Accuracy of Random Forest_train:  0.9817619047619047
Accuracy of Random Forest_test:  0.805
AUC of Random Forest_train:  0.9993014361688328
AUC of Random Forest_test:  0.7379866071428571


In [15]:
#Classification for test
print(classification_report(y_test,y_pred))
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.83      0.94      0.88      7000
          1       0.62      0.32      0.42      2000

avg / total       0.78      0.81      0.78      9000

             precision    recall  f1-score   support

          0       0.98      1.00      0.99     16364
          1       1.00      0.92      0.96      4636

avg / total       0.98      0.98      0.98     21000



### Random Forest with Hyperparameter Tuning

In [16]:
rfc = RandomForestClassifier(random_state=1)

params = {'n_estimators': sp_randint(5,30),
          'criterion' : ['gini','entropy'],
          'max_depth' : sp_randint(2,10),
          'min_samples_split' : sp_randint(2,20),
          'min_samples_leaf' : sp_randint(1,20),
          'max_features' : sp_randint(2,18)}

rand_search_rfc = RandomizedSearchCV(rfc, param_distributions=params, random_state=1, cv=3)

rand_search_rfc.fit(X,y)

rand_search_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 2,
 'max_features': 15,
 'min_samples_leaf': 16,
 'min_samples_split': 12,
 'n_estimators': 13}

In [17]:
rfc = RandomForestClassifier(**rand_search_rfc.best_params_)

rfc.fit(x_train,y_train)

y_pred_train = rfc.predict(x_train)
y_prob_train = rfc.predict_proba(x_train)[:,1]

y_pred = rfc.predict(x_test)
y_prob = rfc.predict_proba(x_test)[:,1]

print('Accuracy of RandomForest_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of RandomForest_test: ', accuracy_score(y_pred, y_test))

print('AUC of RandomForest_train: ', roc_auc_score(y_train, y_prob_train))
print('AUC of RandomForest_test: ', roc_auc_score(y_test, y_prob))

Accuracy of RandomForest_train:  0.8204285714285714
Accuracy of RandomForest_test:  0.8176666666666667
AUC of RandomForest_train:  0.7350142830207264
AUC of RandomForest_test:  0.7251073214285714


In [18]:
print('Confussion matrix - Train: ','\n',confusion_matrix(y_train,y_pred_train))
print('Overall accuracy - Train: ',accuracy_score(y_train,y_pred_train))
print('Confussion matrix - Test: ','\n',confusion_matrix(y_test,y_pred))
print('Overall accuracy - Test: ',accuracy_score(y_test,y_pred))

Confussion matrix - Train:  
 [[15683   681]
 [ 3090  1546]]
Overall accuracy - Train:  0.8204285714285714
Confussion matrix - Test:  
 [[6728  272]
 [1369  631]]
Overall accuracy - Test:  0.8176666666666667


In [19]:
#Classification for test
print(classification_report(y_test,y_pred))
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.83      0.96      0.89      7000
          1       0.70      0.32      0.43      2000

avg / total       0.80      0.82      0.79      9000

             precision    recall  f1-score   support

          0       0.84      0.96      0.89     16364
          1       0.69      0.33      0.45      4636

avg / total       0.80      0.82      0.80     21000



### LGBM:

In [24]:
import lightgbm as lgb
lgbc = lgb.LGBMClassifier()

In [25]:
lgbc.fit(x_train, y_train)

y_pred = lgbc.predict(x_test)
y_prob = lgbc.predict_proba(x_test)[:,1]

y_pred_train = lgbc.predict(x_train)
y_prob_train = lgbc.predict_proba(x_train)[:,1]

In [26]:
print('Accuracy on Train Set: ', accuracy_score(y_train, y_pred_train))
print('Accuracy on Test Set: ', accuracy_score(y_test, y_pred))
print('AUC of Train Set: ', roc_auc_score(y_train, y_prob_train))
print('AUC of Test Set: ', roc_auc_score(y_test, y_prob))

Accuracy on Train Set:  0.851
Accuracy on Test Set:  0.8186666666666667
AUC of Train Set:  0.886816544883031
AUC of Test Set:  0.7777822857142856


In [27]:
#Classification for test
print(classification_report(y_test,y_pred))
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.84      0.95      0.89      7000
          1       0.67      0.35      0.47      2000

avg / total       0.80      0.82      0.80      9000

             precision    recall  f1-score   support

          0       0.86      0.97      0.91     16364
          1       0.79      0.44      0.57      4636

avg / total       0.84      0.85      0.83     21000



### LGBM with Tuning

In [28]:
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint as sp_randint

lgbc = lgb.LGBMClassifier(random_state=1)

params = {'n_estimators': sp_randint(5,250),
          'max_depth' : sp_randint(2,20),
          'min_child_samples' : sp_randint(1,20),
          'num_leaves' : sp_randint(5,50)}

rand_search_lgbc = RandomizedSearchCV(lgbc, param_distributions=params, random_state=1, cv=3)

rand_search_lgbc.fit(x_train, y_train)

rand_search_lgbc.best_params_

{'max_depth': 3, 'min_child_samples': 1, 'n_estimators': 65, 'num_leaves': 22}

In [29]:
lgbc = lgb.LGBMClassifier(**rand_search_lgbc.best_params_, random_state=1)

lgbc.fit(x_train,y_train)

y_pred_train = lgbc.predict(x_train)
y_prob_train = lgbc.predict_proba(x_train)[:,1]

y_pred = lgbc.predict(x_test)
y_prob = lgbc.predict_proba(x_test)[:,1]

print('Accuracy of LGBC_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of LGBM_test: ', accuracy_score(y_pred, y_test))

print('AUC of LGBC_train: ', roc_auc_score(y_train, y_prob_train))
print('AUC of LGBC_test: ', roc_auc_score(y_test, y_prob))

Accuracy of LGBC_train:  0.8253333333333334
Accuracy of LGBM_test:  0.8201111111111111
AUC of LGBC_train:  0.8013530392690535
AUC of LGBC_test:  0.7803508214285714


In [30]:
#Classification for test
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.84      0.95      0.89      7000
          1       0.68      0.35      0.47      2000

avg / total       0.80      0.82      0.80      9000



In [31]:
#classification for train
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.84      0.95      0.89     16364
          1       0.69      0.37      0.49      4636

avg / total       0.81      0.83      0.80     21000



### Naive Bayes:

In [25]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

bnb.fit(x_train, y_train)

y_pred_train = bnb.predict(x_train)
y_proba_train = bnb.predict_proba(x_train)[:,1]

y_pred = bnb.predict(x_test)
y_proba = bnb.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

Accuracy of Bayes_train:  0.770095238095238
Accuracy of Bayes_test:  0.771
AUC of Bayes_train:  0.7366726759681441
AUC of Bayes_test:  0.7342563214285713


In [26]:
#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.85      0.85      0.85      7000
          1       0.48      0.48      0.48      2000

avg / total       0.77      0.77      0.77      9000

             precision    recall  f1-score   support

          0       0.86      0.85      0.85     16364
          1       0.48      0.50      0.49      4636

avg / total       0.77      0.77      0.77     21000



### KNN:

In [27]:

knn=KNeighborsClassifier()
knn.fit(x_train,y_train)

y_pred_train = knn.predict(x_train)
y_proba_train = knn.predict_proba(x_train)[:,1]

y_pred = knn.predict(x_test)
y_proba = knn.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

Accuracy of Bayes_train:  0.8144285714285714
Accuracy of Bayes_test:  0.75
AUC of Bayes_train:  0.8292686362074707
AUC of Bayes_test:  0.5934543214285715


In [28]:
#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.79      0.91      0.85      7000
          1       0.37      0.17      0.24      2000

avg / total       0.70      0.75      0.71      9000

             precision    recall  f1-score   support

          0       0.83      0.95      0.89     16364
          1       0.66      0.34      0.44      4636

avg / total       0.80      0.81      0.79     21000



### KNN with Tuning:

In [29]:
knn=KNeighborsClassifier()

params ={'n_neighbors':sp_randint(5,30),
         'p':sp_randint(1,5)}

rand_search_knn=RandomizedSearchCV(knn,param_distributions=params,cv=3,random_state=1)

rand_search_knn.fit(X,y)
rand_search_knn.best_params_

{'n_neighbors': 23, 'p': 1}

In [30]:
knn=KNeighborsClassifier(**rand_search_knn.best_params_)

knn.fit(x_train,y_train)

y_pred_train = knn.predict(x_train)
y_proba_train = knn.predict_proba(x_train)[:,1]

y_pred = knn.predict(x_test)
y_proba = knn.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

Accuracy of Bayes_train:  0.7863809523809524
Accuracy of Bayes_test:  0.7806666666666666
AUC of Bayes_train:  0.7295936923767719
AUC of Bayes_test:  0.6491554642857144


In [31]:
#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       0.79      0.98      0.87      7000
          1       0.54      0.09      0.16      2000

avg / total       0.73      0.78      0.71      9000

             precision    recall  f1-score   support

          0       0.79      0.98      0.88     16364
          1       0.59      0.10      0.18      4636

avg / total       0.75      0.79      0.72     21000



## SMOTE:

In [39]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
x_train_sm, y_train_sm = smote.fit_sample(x_train, y_train)


### Logistic Regression:

In [34]:
logreg.fit(x_train_sm, y_train_sm)

y_prob = logreg.predict_proba(x_test)


y_prob = logreg.predict_proba(x_test)[:,1]


THRESHOLD = 0.50
y_pred = np.where(logreg.predict_proba(x_test)[:,1] > THRESHOLD, 1, 0)

#Accuracy for test 
accuracy_score(y_test,y_pred)

#For train data
y_prob_train = logreg.predict_proba(x_train_sm)[:,1]
y_pred_train = np.where(logreg.predict_proba(x_train_sm)[:,1] > THRESHOLD, 1, 0)

print('Confussion matrix - Train: ','\n',confusion_matrix(y_train_sm,y_pred_train))
print('Overall accuracy - Train: ',accuracy_score(y_train_sm,y_pred_train))
print('Confussion matrix - Test: ','\n',confusion_matrix(y_test,y_pred))
print('Overall accuracy - Test: ',accuracy_score(y_test,y_pred))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Confussion matrix - Train:  
 [[ 8626  7738]
 [ 4630 11734]]
Overall accuracy - Train:  0.6220972867269616
Confussion matrix - Test:  
 [[3677 3323]
 [ 560 1440]]
Overall accuracy - Test:  0.5685555555555556
             precision    recall  f1-score   support

          0       0.87      0.53      0.65      7000
          1       0.30      0.72      0.43      2000

avg / total       0.74      0.57      0.60      9000

             precision    recall  f1-score   support

          0       0.65      0.53      0.58     16364
          1       0.60      0.72      0.65     16364

avg / total       0.63      0.62      0.62     32728



### Random Forest:

In [35]:
rfc = RandomForestClassifier(n_estimators=10,random_state=1)

rfc.fit(x_train_sm,y_train_sm)

y_pred_train = rfc.predict(x_train_sm)
y_prob_train = rfc.predict_proba(x_train_sm)[:,1]

y_pred = rfc.predict(x_test)
y_prob = rfc.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Random Forest_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of Random Forest_test: ', accuracy_score(y_pred, y_test))

print('AUC of Random Forest_train: ', roc_auc_score(y_train_sm, y_prob_train))
print('AUC of Random Forest_test: ', roc_auc_score(y_test, y_prob))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of Random Forest_train:  0.9898863358592032
Accuracy of Random Forest_test:  0.8037777777777778
AUC of Random Forest_train:  0.9997954780097204
AUC of Random Forest_test:  0.7289125714285715
             precision    recall  f1-score   support

          0       0.84      0.93      0.88      7000
          1       0.60      0.36      0.45      2000

avg / total       0.78      0.80      0.78      9000

             precision    recall  f1-score   support

          0       0.98      1.00      0.99     16364
          1       1.00      0.98      0.99     16364

avg / total       0.99      0.99      0.99     32728



### Random Forest with Tuning:

In [36]:
rfc = RandomForestClassifier(random_state=1)

params = {'n_estimators': sp_randint(5,30),
          'criterion' : ['gini','entropy'],
          'max_depth' : sp_randint(2,10),
          'min_samples_split' : sp_randint(2,20),
          'min_samples_leaf' : sp_randint(1,20),
          'max_features' : sp_randint(2,18)}

rand_search_rfc = RandomizedSearchCV(rfc, param_distributions=params, random_state=1, cv=3)

rand_search_rfc.fit(X,y)

rand_search_rfc.best_params_

rfc = RandomForestClassifier(**rand_search_rfc.best_params_)

rfc.fit(x_train_sm,y_train_sm)

y_pred_train = rfc.predict(x_train_sm)
y_prob_train = rfc.predict_proba(x_train_sm)[:,1]

y_pred = rfc.predict(x_test)
y_prob = rfc.predict_proba(x_test)[:,1]

print('Accuracy of RandomForest_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of RandomForest_test: ', accuracy_score(y_pred, y_test))

print('AUC of RandomForest_train: ', roc_auc_score(y_train_sm, y_prob_train))
print('AUC of RandomForest_test: ', roc_auc_score(y_test, y_prob))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of RandomForest_train:  0.7522305059887558
Accuracy of RandomForest_test:  0.7768888888888889
AUC of RandomForest_train:  0.8280683911348046
AUC of RandomForest_test:  0.7255195
             precision    recall  f1-score   support

          0       0.86      0.85      0.86      7000
          1       0.50      0.51      0.50      2000

avg / total       0.78      0.78      0.78      9000

             precision    recall  f1-score   support

          0       0.71      0.85      0.77     16364
          1       0.81      0.65      0.73     16364

avg / total       0.76      0.75      0.75     32728



### LGBM:

In [37]:
lgbc.fit(x_train_sm, y_train_sm)

y_pred = lgbc.predict(x_test)
y_prob = lgbc.predict_proba(x_test)[:,1]

y_pred_train = lgbc.predict(x_train_sm)
y_prob_train = lgbc.predict_proba(x_train_sm)[:,1]

print('Accuracy on Train Set: ', accuracy_score(y_train_sm, y_pred_train))
print('Accuracy on Test Set: ', accuracy_score(y_test, y_pred))
print('AUC of Train Set: ', roc_auc_score(y_train_sm, y_prob_train))
print('AUC of Test Set: ', roc_auc_score(y_test, y_prob))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy on Train Set:  0.8724945001222195
Accuracy on Test Set:  0.8174444444444444
AUC of Train Set:  0.9316294286795257
AUC of Test Set:  0.7732020714285714
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      7000
          1       0.65      0.39      0.49      2000

avg / total       0.80      0.82      0.80      9000

             precision    recall  f1-score   support

          0       0.83      0.94      0.88     16364
          1       0.93      0.81      0.86     16364

avg / total       0.88      0.87      0.87     32728



### LGBM with Tuning:

In [38]:
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint as sp_randint

lgbc = lgb.LGBMClassifier(random_state=1)

params = {'n_estimators': sp_randint(5,250),
          'max_depth' : sp_randint(2,20),
          'min_child_samples' : sp_randint(1,20),
          'num_leaves' : sp_randint(5,50)}

rand_search_lgbc = RandomizedSearchCV(lgbc, param_distributions=params, random_state=1, cv=3)

rand_search_lgbc.fit(x_train_sm, y_train_sm)

rand_search_lgbc.best_params_

lgbc = lgb.LGBMClassifier(**rand_search_lgbc.best_params_, random_state=1)

lgbc.fit(x_train_sm,y_train_sm)

y_pred_train = lgbc.predict(x_train_sm)
y_prob_train = lgbc.predict_proba(x_train_sm)[:,1]

y_pred = lgbc.predict(x_test)
y_prob = lgbc.predict_proba(x_test)[:,1]

print('Accuracy of LGBC_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of LGBM_test: ', accuracy_score(y_pred, y_test))

print('AUC of LGBC_train: ', roc_auc_score(y_train_sm, y_prob_train))
print('AUC of LGBC_test: ', roc_auc_score(y_test, y_prob))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of LGBC_train:  0.8724945001222195
Accuracy of LGBM_test:  0.8174444444444444
AUC of LGBC_train:  0.9316294286795257
AUC of LGBC_test:  0.7732020714285714
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      7000
          1       0.65      0.39      0.49      2000

avg / total       0.80      0.82      0.80      9000

             precision    recall  f1-score   support

          0       0.83      0.94      0.88     16364
          1       0.93      0.81      0.86     16364

avg / total       0.88      0.87      0.87     32728



### Naive Bayes:

In [39]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

bnb.fit(x_train_sm, y_train_sm)

y_pred_train = bnb.predict(x_train_sm)
y_proba_train = bnb.predict_proba(x_train_sm)[:,1]

y_pred = bnb.predict(x_test)
y_proba = bnb.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train_sm, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of Bayes_train:  0.7283365925201662
Accuracy of Bayes_test:  0.7904444444444444
AUC of Bayes_train:  0.7891965440231316
AUC of Bayes_test:  0.7099918214285714
             precision    recall  f1-score   support

          0       0.85      0.89      0.87      7000
          1       0.54      0.44      0.48      2000

avg / total       0.78      0.79      0.78      9000

             precision    recall  f1-score   support

          0       0.67      0.88      0.76     16364
          1       0.83      0.57      0.68     16364

avg / total       0.75      0.73      0.72     32728



### KNN:

In [40]:

knn=KNeighborsClassifier()
knn.fit(x_train_sm,y_train_sm)

y_pred_train = knn.predict(x_train_sm)
y_proba_train = knn.predict_proba(x_train_sm)[:,1]

y_pred = knn.predict(x_test)
y_proba = knn.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train_sm, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of Bayes_train:  0.8402896602297727
Accuracy of Bayes_test:  0.5913333333333334
AUC of Bayes_train:  0.9525702013786694
AUC of Bayes_test:  0.5856885714285713
             precision    recall  f1-score   support

          0       0.82      0.61      0.70      7000
          1       0.28      0.52      0.36      2000

avg / total       0.70      0.59      0.62      9000

             precision    recall  f1-score   support

          0       0.95      0.72      0.82     16364
          1       0.77      0.96      0.86     16364

avg / total       0.86      0.84      0.84     32728



### KNN with Tuning:

In [41]:
knn=KNeighborsClassifier()

params ={'n_neighbors':sp_randint(5,30),
         'p':sp_randint(1,5)}

rand_search_knn=RandomizedSearchCV(knn,param_distributions=params,cv=3,random_state=1)

rand_search_knn.fit(X,y)
rand_search_knn.best_params_

knn=KNeighborsClassifier(**rand_search_knn.best_params_)

knn.fit(x_train_sm,y_train_sm)

y_pred_train = knn.predict(x_train_sm)
y_proba_train = knn.predict_proba(x_train_sm)[:,1]

y_pred = knn.predict(x_test)
y_proba = knn.predict_proba(x_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

print('Accuracy of Bayes_train: ', accuracy_score(y_pred_train, y_train_sm))
print('Accuracy of Bayes_test: ', accuracy_score(y_pred, y_test))

print('AUC of Bayes_train: ', roc_auc_score(y_train_sm, y_proba_train))
print('AUC of Bayes_test: ', roc_auc_score(y_test, y_proba))

#Classification for test
print(classification_report(y_test,y_pred))

#classification for train
print(classification_report(y_train_sm,y_pred_train))

Accuracy of Bayes_train:  0.7160229772671718
Accuracy of Bayes_test:  0.5704444444444444
AUC of Bayes_train:  0.8106014506000467
AUC of Bayes_test:  0.6365498928571428
             precision    recall  f1-score   support

          0       0.84      0.55      0.67      7000
          1       0.29      0.64      0.40      2000

avg / total       0.72      0.57      0.61      9000

             precision    recall  f1-score   support

          0       0.79      0.58      0.67     16364
          1       0.67      0.85      0.75     16364

avg / total       0.73      0.72      0.71     32728



In [34]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier

### Adaboost

In [35]:
# using adaboost
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(x_train,y_train)
y_pred = ada.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
y_pred_train = ada.predict(x_train)
print(classification_report(y_train,y_pred_train))

0.8153333333333334
             precision    recall  f1-score   support

          0       0.83      0.96      0.89      7000
          1       0.68      0.32      0.43      2000

avg / total       0.80      0.82      0.79      9000

             precision    recall  f1-score   support

          0       0.84      0.96      0.89     16364
          1       0.69      0.34      0.45      4636

avg / total       0.80      0.82      0.80     21000



### GradientBoost

In [37]:
# using Gradientboost
grad = GradientBoostingClassifier()
grad.fit(x_train,y_train)
y_pred = grad.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
y_pred_train = grad.predict(x_train)
print(classification_report(y_train,y_pred_train))

0.8182222222222222
             precision    recall  f1-score   support

          0       0.84      0.95      0.89      7000
          1       0.67      0.35      0.46      2000

avg / total       0.80      0.82      0.80      9000

[[6660  340]
 [1296  704]]
             precision    recall  f1-score   support

          0       0.85      0.95      0.90     16364
          1       0.70      0.38      0.50      4636

avg / total       0.81      0.83      0.81     21000



### XGBoost

In [97]:
xgb = grid.best_estimator_
xgb.fit(x_train,y_train)
y_pred = xgb.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.8185555555555556
             precision    recall  f1-score   support

          0       0.84      0.95      0.89      7000
          1       0.68      0.34      0.46      2000

avg / total       0.80      0.82      0.79      9000

[[6682  318]
 [1315  685]]


In [1]:
(685)/(685+1315)

0.3425

In [95]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, eta=0.05, gamma=0.3,
       gpu_id=-1, importance_type='gain', interaction_constraints=None,
       learning_rate=0.0500000007, max_delta_step=0, max_depth=6,
       min_child_weight=7, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

# SMOTE

### Adaboost

In [40]:
# using adaboost
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(x_train_sm,y_train_sm)
y_pred = ada.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
y_pred_train = ada.predict(x_train)
print(classification_report(y_train,y_pred_train))

0.8163333333333334
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      7000
          1       0.65      0.37      0.47      2000

avg / total       0.80      0.82      0.80      9000

             precision    recall  f1-score   support

          0       0.85      0.94      0.89     16364
          1       0.66      0.39      0.49      4636

avg / total       0.80      0.82      0.80     21000



### GradientBoost

In [41]:
# using Gradientboost
grad = GradientBoostingClassifier()
grad.fit(x_train_sm,y_train_sm)
y_pred = grad.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
y_pred_train = grad.predict(x_train)
print(classification_report(y_train,y_pred_train))

0.8188888888888889
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      7000
          1       0.66      0.38      0.48      2000

avg / total       0.80      0.82      0.80      9000

[[6614  386]
 [1244  756]]
             precision    recall  f1-score   support

          0       0.85      0.94      0.89     16364
          1       0.67      0.40      0.50      4636

avg / total       0.81      0.82      0.81     21000



### XGBoost

In [None]:
xgb = grid.best_estimator_
xgb.fit(pd.DataFrame(x_train_sm,columns=x_train.columns),y_train_sm)
y_pred = xgb.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))