## Import Common Package

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, PowerTransformer, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

### Import Data

In [2]:
df = pd.read_csv('CreditCardClean.csv')
pd.options.display.max_columns = 999
pd.set_option('display.float_format', lambda x: '%.2f' % x) #Change Decimal
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,def_pay,SEX_CAT,EDUCATION_CAT,MARRIAGE_CAT,AGE_GROUP,LIMIT_GROUP
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,Female,University,Married,21-30 Years,10K-100K
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,Female,University,Single,21-30 Years,101K-200K
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,Female,University,Single,31-40 Years,10K-100K
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,Female,University,Married,31-40 Years,10K-100K
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,Male,University,Married,51-60 Years,10K-100K


### Data Preparation

In [3]:
df_1 = df.copy()

In [4]:
# Check Missing Value to ensure
df_1.isna().sum()

ID               0
LIMIT_BAL        0
SEX              0
EDUCATION        0
MARRIAGE         0
AGE              0
PAY_0            0
PAY_2            0
PAY_3            0
PAY_4            0
PAY_5            0
PAY_6            0
BILL_AMT1        0
BILL_AMT2        0
BILL_AMT3        0
BILL_AMT4        0
BILL_AMT5        0
BILL_AMT6        0
PAY_AMT1         0
PAY_AMT2         0
PAY_AMT3         0
PAY_AMT4         0
PAY_AMT5         0
PAY_AMT6         0
def_pay          0
SEX_CAT          0
EDUCATION_CAT    0
MARRIAGE_CAT     0
AGE_GROUP        0
LIMIT_GROUP      0
dtype: int64

In [5]:
# Check Data Types to Ensure
df_1.dtypes

ID                int64
LIMIT_BAL         int64
SEX               int64
EDUCATION         int64
MARRIAGE          int64
AGE               int64
PAY_0             int64
PAY_2             int64
PAY_3             int64
PAY_4             int64
PAY_5             int64
PAY_6             int64
BILL_AMT1         int64
BILL_AMT2         int64
BILL_AMT3         int64
BILL_AMT4         int64
BILL_AMT5         int64
BILL_AMT6         int64
PAY_AMT1          int64
PAY_AMT2          int64
PAY_AMT3          int64
PAY_AMT4          int64
PAY_AMT5          int64
PAY_AMT6          int64
def_pay           int64
SEX_CAT          object
EDUCATION_CAT    object
MARRIAGE_CAT     object
AGE_GROUP        object
LIMIT_GROUP      object
dtype: object

In [7]:
# Remove ID, EDUCATION, MARRIAGE, SEX_CAT, AGE_GROUP, LIMIT_GROUP 
# EDUCATION column is removed due to recategorize done in EDA File, so we utilize EDUCATION_CAT columns
# MARRIAGE column is removed due to recategorize done in EDA File, so we utilize MARRIAGE_CAT columns
df_1.drop(columns = ['ID', 'EDUCATION', 'MARRIAGE', 'SEX_CAT', 'AGE_GROUP', 'LIMIT_GROUP'], inplace = True)

In [8]:
df_1.head()

Unnamed: 0,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,def_pay,EDUCATION_CAT,MARRIAGE_CAT
0,20000,2,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,University,Married
1,120000,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,University,Single
2,90000,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,University,Single
3,50000,2,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,University,Married
4,50000,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,University,Married


In [9]:
### One Hot Encoding Column EDUCATION_CAT, MARRIAGE_CAT
df_1 =  pd.get_dummies(data=df_1, columns=['EDUCATION_CAT', 'MARRIAGE_CAT'])

In [10]:
df_1.head()

Unnamed: 0,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,def_pay,EDUCATION_CAT_Graduate School,EDUCATION_CAT_High School,EDUCATION_CAT_Others,EDUCATION_CAT_University,MARRIAGE_CAT_Married,MARRIAGE_CAT_Others,MARRIAGE_CAT_Single
0,20000,2,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,0,0,0,1,1,0,0
1,120000,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,0,0,0,1,0,0,1
2,90000,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,0,0,0,1,0,0,1
3,50000,2,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,0,0,0,1,1,0,0
4,50000,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,0,0,0,1,1,0,0


### Cek Imbalance Data

In [12]:
df_1['def_pay'].value_counts()

0    23364
1     6636
Name: def_pay, dtype: int64

In [13]:
pd.crosstab(index=df_1['def_pay'], columns='count', normalize=True)*100
## Dataset is Imbalanced => For Imbalanced Handling i prefere to use algorithm who have class weight and tuning class_weight value

col_0,count
def_pay,Unnamed: 1_level_1
0,77.88
1,22.12


### FOCUS MACHINE LEARNING

Dataset is imbalance

0: Not-Default
1: Default

Because this dataset is about Credit Card Default, we will focus to get the smallest False Negative and get the best score of recall 1

It's more risk to predict customers don't default but actually they are default => Take more business losses than if we predict customers default but actual don't default

### Splitting Data

In [14]:
X = df_1.drop(columns='def_pay')
y = df_1['def_pay']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = .20, random_state = 42)

## Machine Learning Modelling

## 1. Random Forest

### 1a. Random Forest - Base Algorithm

In [17]:
RF_Base = RandomForestClassifier()

In [18]:
RF_Base.fit(X_train, y_train)

RandomForestClassifier()

In [19]:
y_pred_RFBase = RF_Base.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred_RFBase))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4673
           1       0.63      0.36      0.46      1327

    accuracy                           0.81      6000
   macro avg       0.74      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000



In [21]:
cm_RF_Base = confusion_matrix(y_test, y_pred_RFBase , labels=[1,0])

In [22]:
df_RF_Base = pd.DataFrame(cm_RF_Base, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF_Base

Unnamed: 0,Pred 1,Pred 0
Akt 1,483,844
Akt 0,281,4392


### 1b. Random Forest - Class Weight Tuning

In [23]:
RF_1 = RandomForestClassifier(class_weight={0 : .1 , 1 : .9})

In [25]:
RF_1.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 0.1, 1: 0.9})

In [26]:
y_pred_RF1 = RF_1.predict(X_test)

In [27]:
print(classification_report(y_test, y_pred_RF1))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      4673
           1       0.66      0.33      0.44      1327

    accuracy                           0.81      6000
   macro avg       0.75      0.64      0.67      6000
weighted avg       0.80      0.81      0.79      6000



In [29]:
cm_RF_1 = confusion_matrix(y_test, y_pred_RF1 , labels=[1,0])

In [30]:
df_RF_1 = pd.DataFrame(cm_RF_1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF_1

Unnamed: 0,Pred 1,Pred 0
Akt 1,443,884
Akt 0,229,4444


### 1c. RF Base SMOTE Algorithm

In [36]:
from imblearn.over_sampling import SMOTE

In [37]:
sm = SMOTE(random_state=42)

In [38]:
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

In [39]:
df_SMOTE = pd.concat([X_train_sm, y_train_sm], axis=1)

In [40]:
df_SMOTE['def_pay'].value_counts()

1    18691
0    18691
Name: def_pay, dtype: int64

In [41]:
RF_SMOTE = RandomForestClassifier()

In [42]:
RF_SMOTE.fit(X_train_sm, y_train_sm)

RandomForestClassifier()

In [43]:
y_predSMOTE = RF_SMOTE.predict(X_test)

In [44]:
print(classification_report(y_test, y_predSMOTE))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      4673
           1       0.56      0.43      0.49      1327

    accuracy                           0.80      6000
   macro avg       0.71      0.67      0.68      6000
weighted avg       0.78      0.80      0.79      6000



In [45]:
cm_SMOTE = confusion_matrix(y_test, y_predSMOTE, labels=[1,0])

In [46]:
df_SMOTE = pd.DataFrame(cm_SMOTE, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_SMOTE

Unnamed: 0,Pred 1,Pred 0
Akt 1,568,759
Akt 0,441,4232


### 1d. Random Forest - Hyper Parameter Tuning

In [77]:
param_RF = {
    "n_estimators" : np.arange(100, 1000, 100),
    "max_depth" : [None, 5, 10, 25, 40, 50, 60, 80],
    "min_samples_leaf" :np.arange(1,51),
    "max_features" : ['auto', 0.3, 0.5, 0.8],
    "class_weight" : [{0:x, 1: 1 - x} for x in [.1, .20]]
}

In [78]:
RF_rand = RandomForestClassifier(random_state=42)

In [79]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [80]:
RF_RS= RandomizedSearchCV(RF_rand, param_RF, cv=skf, n_iter=50, n_jobs=-1, verbose=1, random_state=42, scoring='recall')

In [81]:
RF_RS.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 20.1min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
                   estimator=RandomForestClassifier(random_state=42), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'class_weight': [{0: 0.1, 1: 0.9},
                                                         {0: 0.2, 1: 0.8}],
                                        'max_depth': [None, 5, 10, 25, 40, 50,
                                                      60, 80],
                                        'max_features': ['auto', 0.3, 0.5, 0.8],
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]),
                                        'n_estimators': array([100, 200, 300, 400, 500, 600, 700, 800, 900])},
                   random_state=42, sc

In [82]:
RF_RS.best_params_

{'n_estimators': 100,
 'min_samples_leaf': 12,
 'max_features': 'auto',
 'max_depth': 5,
 'class_weight': {0: 0.1, 1: 0.9}}

In [83]:
Model_RF_Tuned = RF_RS.best_estimator_

In [84]:
y_pred_RF_Tuned = Model_RF_Tuned.predict(X_test)

In [85]:
print(classification_report(y_test, y_pred_RF_Tuned))

              precision    recall  f1-score   support

           0       0.94      0.19      0.32      4673
           1       0.25      0.95      0.40      1327

    accuracy                           0.36      6000
   macro avg       0.59      0.57      0.36      6000
weighted avg       0.78      0.36      0.34      6000



In [86]:
cm_RF_Tuned = confusion_matrix(y_test, y_pred_RF_Tuned, labels=[1,0])

In [87]:
df_RF_Tuned = pd.DataFrame(cm_RF_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,1265,62
Akt 0,3768,905


### 1d. Random Forest - Hyper Parameter Tuning (FINE TUNING)

In [66]:
param_RF2 = {
    "n_estimators" : [700, 750, 800],
    "max_depth" : [3,4],
    "min_samples_leaf" :[20, 25,30],
#     "max_features" : ['auto', 0.2, 0.1, 0.3],
    "class_weight" : [{0:x, 1: 1 - x} for x in [.15, .20]]
}

In [67]:
RF_rand2 = RandomForestClassifier()

In [68]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [69]:
RF_GS1 = GridSearchCV(RF_rand2, param_RF2, cv=skf, n_jobs=-1, verbose=1, scoring='recall')

In [70]:
RF_GS1.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.6min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': [{0: 0.15, 1: 0.85}, {0: 0.2, 1: 0.8}],
                         'max_depth': [3, 4], 'min_samples_leaf': [20, 25, 30],
                         'n_estimators': [700, 750, 800]},
             scoring='recall', verbose=1)

In [71]:
RF_GS1.best_params_

{'class_weight': {0: 0.15, 1: 0.85},
 'max_depth': 3,
 'min_samples_leaf': 30,
 'n_estimators': 750}

In [72]:
Model_GS1_Tuned = RF_GS1.best_estimator_

In [73]:
y_pred_RF2_Tuned = Model_GS1_Tuned.predict(X_test)

In [74]:
print(classification_report(y_test, y_pred_RF2_Tuned))

              precision    recall  f1-score   support

           0       0.91      0.47      0.62      4673
           1       0.31      0.84      0.45      1327

    accuracy                           0.55      6000
   macro avg       0.61      0.66      0.54      6000
weighted avg       0.78      0.55      0.58      6000



In [75]:
cm_RF2_Tuned = confusion_matrix(y_test, y_pred_RF2_Tuned, labels=[1,0])

In [76]:
df_RF2_Tuned = pd.DataFrame(cm_RF2_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF2_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,1116,211
Akt 0,2469,2204


In [None]:
# BELUM DIPAKE

### 1e. Random Forest - Hyper Parameter Tuning (FINE TUNING2)

In [196]:
param_RF3 = {
    "n_estimators" : [1000],
    "max_depth" : [2],
    "min_samples_leaf" :[20],
#     "max_features" : [0.3],
    "class_weight" : [{0:x, 1: 1 - x} for x in [.2, .25, .3, .4, .45, .5]]
}

In [197]:
RF_rand3 = RandomForestClassifier()

In [198]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [199]:
RF_GS2 = GridSearchCV(RF_rand3, param_RF3, cv=skf, n_jobs=-1, verbose=1, scoring='recall')

In [184]:
# {'n_estimators': 100,
#  'min_samples_leaf': 12,
#  'max_features': 'auto',
#  'max_depth': 5,
#  'class_weight': {0: 0.1, 1: 0.9}}

# {'class_weight': {0: 0.2, 1: 0.8},
#  'max_depth': 3,
#  'min_samples_leaf': 20,
#  'n_estimators': 800}
# , .25, .3, .4, .45, .5

In [200]:
RF_GS2.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   38.8s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': [{0: 0.2, 1: 0.8}, {0: 0.25, 1: 0.75},
                                          {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6},
                                          {0: 0.45, 1: 0.55},
                                          {0: 0.5, 1: 0.5}],
                         'max_depth': [2], 'min_samples_leaf': [20],
                         'n_estimators': [1000]},
             scoring='recall', verbose=1)

In [201]:
RF_GS2.best_params_

{'class_weight': {0: 0.2, 1: 0.8},
 'max_depth': 2,
 'min_samples_leaf': 20,
 'n_estimators': 1000}

In [202]:
Model_GS2_Tuned = RF_GS2.best_estimator_

In [203]:
y_pred_RF3_Tuned = Model_GS2_Tuned.predict(X_test)

In [204]:
print(classification_report(y_test, y_pred_RF3_Tuned))

              precision    recall  f1-score   support

           0       0.87      0.80      0.84      4673
           1       0.46      0.59      0.52      1327

    accuracy                           0.76      6000
   macro avg       0.67      0.70      0.68      6000
weighted avg       0.78      0.76      0.77      6000



In [205]:
cm_RF3_Tuned = confusion_matrix(y_test, y_pred_RF3_Tuned, labels=[1,0])

In [206]:
df_RF3_Tuned = pd.DataFrame(cm_RF3_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF3_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,782,545
Akt 0,918,3755


In [None]:
{'class_weight': {0: 0.15, 1: 0.85},
 'max_depth': 3,
 'min_samples_leaf': 30,
 'n_estimators': 750}

### 1f. Random Forest - Hyper Parameter Tuning (FINE TUNING3)

In [242]:
param_RF4 = {
    "n_estimators" : [650],
    "max_depth" : [3],
    "min_samples_leaf" :[35],
#     "max_features" : [0.3],
    "class_weight" : [{0:x, 1: 1 - x} for x in [.15, .2]]
}

In [243]:
RF_rand4 = RandomForestClassifier()

In [244]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [245]:
RF_GS3 = GridSearchCV(RF_rand4, param_RF4, cv=skf, n_jobs=-1, verbose=1, scoring='recall')

In [184]:
# {'n_estimators': 100,
#  'min_samples_leaf': 12,
#  'max_features': 'auto',
#  'max_depth': 5,
#  'class_weight': {0: 0.1, 1: 0.9}}

# {'class_weight': {0: 0.2, 1: 0.8},
#  'max_depth': 3,
#  'min_samples_leaf': 20,
#  'n_estimators': 800}
# , .25, .3, .4, .45, .5

In [246]:
RF_GS3.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   10.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': [{0: 0.15, 1: 0.85}, {0: 0.2, 1: 0.8}],
                         'max_depth': [3], 'min_samples_leaf': [35],
                         'n_estimators': [650]},
             scoring='recall', verbose=1)

In [247]:
RF_GS3.best_params_

{'class_weight': {0: 0.15, 1: 0.85},
 'max_depth': 3,
 'min_samples_leaf': 35,
 'n_estimators': 650}

In [248]:
Model_GS3_Tuned = RF_GS3.best_estimator_

In [249]:
y_pred_RF4_Tuned = Model_GS3_Tuned.predict(X_test)

In [250]:
print(classification_report(y_test, y_pred_RF4_Tuned))

              precision    recall  f1-score   support

           0       0.91      0.46      0.61      4673
           1       0.31      0.85      0.45      1327

    accuracy                           0.54      6000
   macro avg       0.61      0.65      0.53      6000
weighted avg       0.78      0.54      0.57      6000



In [251]:
cm_RF4_Tuned = confusion_matrix(y_test, y_pred_RF4_Tuned, labels=[1,0])

In [252]:
df_RF4_Tuned = pd.DataFrame(cm_RF3_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_RF4_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,1194,133
Akt 0,3180,1493


===============================================================================================================

## 2. XGBoost

### 2a. XGBoost - Base Algorithm

In [47]:
XG_Base = XGBClassifier()

In [48]:
XG_Base.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
y_pred_XGBase = XG_Base.predict(X_test)

In [50]:
print(classification_report(y_test, y_pred_XGBase))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4673
           1       0.64      0.36      0.46      1327

    accuracy                           0.81      6000
   macro avg       0.74      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000



In [51]:
cm_XG_Base = confusion_matrix(y_test, y_pred_XGBase , labels=[1,0])

In [52]:
df_XG_Base = pd.DataFrame(cm_XG_Base, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_XG_Base

Unnamed: 0,Pred 1,Pred 0
Akt 1,479,848
Akt 0,273,4400


### 2b. XGBoost - Class Weight Tuning

In [53]:
XG_1 = XGBClassifier(scale_pos_weight =0.99)

In [55]:
XG_1.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=0.99, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
y_pred_XG1 = XG_1.predict(X_test)

In [57]:
print(classification_report(y_test, y_pred_XG1))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4673
           1       0.64      0.36      0.46      1327

    accuracy                           0.81      6000
   macro avg       0.74      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000



In [58]:
cm_XG_1 = confusion_matrix(y_test, y_pred_XG1 , labels=[1,0])

In [59]:
df_XG_1 = pd.DataFrame(cm_XG_1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_XG_1

Unnamed: 0,Pred 1,Pred 0
Akt 1,478,849
Akt 0,271,4402


### 2c. XGBoost- Hyper Parameter Tuning

In [136]:
param_XG = {
    'learning_rate': [1],
    "n_estimators" : np.arange(100, 10000, 100),
 "max_depth" : [None, 5, 10, 25, 30, 40, 50, 60, 80, 95, 100],
    'min_child_weight': [None, 5, 10, 25, 30, 40, 50, 60, 80, 95, 100],
    'eta':[.3],
    'subsample': [None, 5, 10, 25, 30, 40, 50, 60, 80, 95, 100],
    'colsample_bytree': [None, 5, 10, 25, 30, 40, 50, 60, 80, 95, 100],
    # Other parameters
    'objective': ['binary:logistic'],
    'scale_pos_weight' : [0.99]
}

In [137]:
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [138]:
XG_rand = XGBClassifier(random_state=42)

In [139]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [140]:
XG_RS= RandomizedSearchCV(XG_rand, param_XG, cv=skf, n_iter=50, n_jobs=-1, verbose=1, random_state=42, scoring='recall')

In [141]:
XG_RS.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 135 out of 150 | elapsed:    2.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.0min finished




RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None...
       5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600,
       6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700,
       7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800,
       8900, 9000, 9100, 9200, 9300, 9400, 9500, 

In [142]:
XG_RS.best_params_

{'subsample': None,
 'scale_pos_weight': 0.99,
 'objective': 'binary:logistic',
 'n_estimators': 3600,
 'min_child_weight': 40,
 'max_depth': 40,
 'learning_rate': 1,
 'eta': 0.3,
 'colsample_bytree': None}

In [143]:
Model_XG_Tuned = XG_RS.best_estimator_

In [144]:
y_pred_XG_Tuned = Model_XG_Tuned.predict(X_test)

In [145]:
print(classification_report(y_test, y_pred_XG_Tuned))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      4673
           1       0.49      0.38      0.43      1327

    accuracy                           0.78      6000
   macro avg       0.66      0.63      0.64      6000
weighted avg       0.76      0.78      0.76      6000



In [146]:
cm_XG_Tuned = confusion_matrix(y_test, y_pred_XG_Tuned, labels=[1,0])

In [147]:
df_XG_Tuned = pd.DataFrame(cm_XG_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_XG_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,501,826
Akt 0,519,4154


## 3.Logistic Regression

### 3a. LR Base

In [113]:
LR_Base = LogisticRegression()

In [114]:
LR_Base.fit(X_train, y_train)

LogisticRegression()

In [115]:
y_pred_LRBase = LR_Base.predict(X_test)

In [116]:
print(classification_report(y_test, y_pred_LRBase))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88      4673
           1       0.00      0.00      0.00      1327

    accuracy                           0.78      6000
   macro avg       0.39      0.50      0.44      6000
weighted avg       0.61      0.78      0.68      6000



In [117]:
cm_LR_Base = confusion_matrix(y_test, y_pred_LRBase , labels=[1,0])

In [118]:
df_LR_Base = pd.DataFrame(cm_LR_Base, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_LR_Base

Unnamed: 0,Pred 1,Pred 0
Akt 1,0,1327
Akt 0,0,4673


### 3b. LR with weighted parameter

In [119]:
LR_1 = LogisticRegression(class_weight={0:.1, 1:.9})

In [120]:
LR_1.fit(X_train,y_train)

LogisticRegression(class_weight={0: 0.1, 1: 0.9})

In [121]:
y_pred_LR1 = LR_1.predict(X_test)

In [122]:
print(classification_report(y_test,y_pred_LR1))

              precision    recall  f1-score   support

           0       0.94      0.10      0.17      4673
           1       0.23      0.98      0.38      1327

    accuracy                           0.29      6000
   macro avg       0.59      0.54      0.28      6000
weighted avg       0.78      0.29      0.22      6000



In [123]:
cm_LR1 = confusion_matrix(y_test,y_pred_LR1,labels=[1,0])

In [124]:
df_LR1 = pd.DataFrame(cm_LR1, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_LR1

Unnamed: 0,Pred 1,Pred 0
Akt 1,1296,31
Akt 0,4225,448


### 3c. LR With Hyperparameter tuning

In [125]:
param_LR = {
    'penalty' : ['none', 'l1', 'l2'],
    'C' : np.logspace(-4 , 4, 14),
    'class_weight' : [{0 : x, 1 : 1 - x} for x in [.05,.1,.15, .2,]]
}

In [126]:
LR_HP = LogisticRegression()

In [127]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [128]:
LR_GS = GridSearchCV(LR_HP, param_LR, cv=skf, n_jobs=-1, verbose=1, scoring='recall')

In [129]:
LR_GS.fit(X_train, y_train)

Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed:   22.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 4.12462638e-04, 1.70125428e-03, 7.01703829e-03,
       2.89426612e-02, 1.19377664e-01, 4.92388263e-01, 2.03091762e+00,
       8.37677640e+00, 3.45510729e+01, 1.42510267e+02, 5.87801607e+02,
       2.42446202e+03, 1.00000000e+04]),
                         'class_weight': [{0: 0.05, 1: 0.95}, {0: 0.1, 1: 0.9},
                                          {0: 0.15, 1: 0.85},
                                          {0: 0.2, 1: 0.8}],
                         'penalty': ['none', 'l1', 'l2']},
             scoring='recall', verbose=1)

In [130]:
LR_GS.best_params_

{'C': 587.8016072274924, 'class_weight': {0: 0.05, 1: 0.95}, 'penalty': 'l2'}

In [131]:
LR_GS_Tuned = LR_GS.best_estimator_

In [132]:
y_pred_LR_Tuned = LR_GS_Tuned.predict(X_test)

In [133]:
print(classification_report(y_test, y_pred_LR_Tuned))

              precision    recall  f1-score   support

           0       0.96      0.02      0.04      4673
           1       0.22      1.00      0.37      1327

    accuracy                           0.24      6000
   macro avg       0.59      0.51      0.20      6000
weighted avg       0.80      0.24      0.11      6000



In [134]:
cm_LR_Tuned = confusion_matrix(y_test, y_pred_LR_Tuned, labels=[1,0])

In [135]:
df_LR_Tuned = pd.DataFrame(cm_LR_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_LR_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,1323,4
Akt 0,4579,94


### 4a. KNN Base

In [88]:
KNN_Base = KNeighborsClassifier()

In [89]:
KNN_Base.fit(X_train, y_train)

KNeighborsClassifier()

In [90]:
y_pred_KNNBase = KNN_Base.predict(X_test)

In [91]:
print(classification_report(y_test, y_pred_KNNBase))

              precision    recall  f1-score   support

           0       0.80      0.92      0.85      4673
           1       0.37      0.17      0.24      1327

    accuracy                           0.75      6000
   macro avg       0.58      0.55      0.54      6000
weighted avg       0.70      0.75      0.72      6000



In [92]:
cm_KNN_Base = confusion_matrix(y_test, y_pred_KNNBase , labels=[1,0])

In [93]:
df_KNN_Base = pd.DataFrame(cm_KNN_Base, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_KNN_Base

Unnamed: 0,Pred 1,Pred 0
Akt 1,231,1096
Akt 0,390,4283


### 4b. KNN With Hyperparameter tuning

In [253]:
param_KNN = {
    'n_neighbors': np.arange(20,73,2),
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
}

In [255]:
KNN_HP = KNeighborsClassifier()

In [256]:
skf = StratifiedKFold(n_splits=3, random_state=42)

In [257]:
KNN_GS = GridSearchCV(KNN_HP, param_KNN, cv=skf, n_jobs=-1, verbose=1, scoring='recall')

In [258]:
KNN_GS.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  3.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': array([20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52,
       54, 56, 58, 60, 62, 64, 66, 68, 70, 72]),
                         'p': [1, 2], 'weights': ['uniform', 'distance']},
             scoring='recall', verbose=1)

In [259]:
KNN_GS.best_params_

{'n_neighbors': 22, 'p': 1, 'weights': 'distance'}

In [260]:
KNN_GS_Tuned = KNN_GS.best_estimator_

In [261]:
y_pred_KNN_Tuned = KNN_GS_Tuned.predict(X_test)

In [262]:
print(classification_report(y_test, y_pred_KNN_Tuned))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87      4673
           1       0.48      0.12      0.19      1327

    accuracy                           0.78      6000
   macro avg       0.64      0.54      0.53      6000
weighted avg       0.72      0.78      0.72      6000



In [263]:
cm_KNN_Tuned = confusion_matrix(y_test, y_pred_KNN_Tuned, labels=[1,0])

In [264]:
df_KNN_Tuned = pd.DataFrame(cm_KNN_Tuned, index=['Akt 1', 'Akt 0'], columns =  ['Pred 1', 'Pred 0'])
df_KNN_Tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,159,1168
Akt 0,172,4501


# Recommendation

- Create countermeasure program for customers who are predicted will have credit card default
- Please go to EDA file to get full insight

# Summary

- We suggest to use Random Forest - Hyper Parameter Tuning (FINE TUNING3), because after rough calculation, it has the lowest loss of money
- The model has recall 1 score of 0.85 with False Positive 3180 and False Negative 133
- With ML predict, we could make early prediction and decide strategy to prevent

# Export

In [265]:
import joblib

In [266]:
joblib.dump(Model_GS3_Tuned, 'ModelCreditCard1')

['ModelCreditCard1']

# Business Loss Rough Calculation

- False Positive Loss

We don't have proit loss on FP

- False Negative Loss

Assume 1 false negative = 10000 (mode credit limit)

In [267]:
10000 * 133

1330000

In [None]:
It was the lowest compared other model