# Default Credit Card 💳
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

import xgboost as xgb

## [1] Load Data
---

In [None]:
# Location of the dataset CSV under the Colab Google Drive folder.
# NOTE(review): no visible cell mounts Drive — presumably done manually; confirm.
colab_path = '/content/drive/MyDrive/Colab Notebooks'
file_path = f'{colab_path}/python-mldlnl/natural-language-processing/Data/default of credit card clients.csv'

In [None]:
# header=1 takes the column names from the second line of the file (the line
# above it is skipped); the ID column is dropped right after loading.
data = (
    pd.read_csv(file_path, header=1)
    .drop(columns="ID")
)

In [None]:
# Print each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   30000 non-null  int64
 1   SEX                         30000 non-null  int64
 2   EDUCATION                   30000 non-null  int64
 3   MARRIAGE                    30000 non-null  int64
 4   AGE                         30000 non-null  int64
 5   PAY_0                       30000 non-null  int64
 6   PAY_2                       30000 non-null  int64
 7   PAY_3                       30000 non-null  int64
 8   PAY_4                       30000 non-null  int64
 9   PAY_5                       30000 non-null  int64
 10  PAY_6                       30000 non-null  int64
 11  BILL_AMT1                   30000 non-null  int64
 12  BILL_AMT2                   30000 non-null  int64
 13  BILL_AMT3                   30000 non-null  int64
 14  BILL_A

In [None]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [None]:
# Separate the target column from the features: pick the label series first,
# then drop that same column (by its name) to obtain the feature table.
y = data['default payment next month']
X = data.drop(columns=y.name)

In [None]:
# Columns to treat as categorical. Casting them to object dtype is what makes
# `preprocessing` one-hot encode them (it selects columns by object dtype).
category = ["SEX", "EDUCATION", "MARRIAGE"]

X = X.astype({name: "object" for name in category})

In [None]:
# Confirm that the categorical columns now have object dtype.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   LIMIT_BAL  30000 non-null  int64 
 1   SEX        30000 non-null  object
 2   EDUCATION  30000 non-null  object
 3   MARRIAGE   30000 non-null  object
 4   AGE        30000 non-null  int64 
 5   PAY_0      30000 non-null  int64 
 6   PAY_2      30000 non-null  int64 
 7   PAY_3      30000 non-null  int64 
 8   PAY_4      30000 non-null  int64 
 9   PAY_5      30000 non-null  int64 
 10  PAY_6      30000 non-null  int64 
 11  BILL_AMT1  30000 non-null  int64 
 12  BILL_AMT2  30000 non-null  int64 
 13  BILL_AMT3  30000 non-null  int64 
 14  BILL_AMT4  30000 non-null  int64 
 15  BILL_AMT5  30000 non-null  int64 
 16  BILL_AMT6  30000 non-null  int64 
 17  PAY_AMT1   30000 non-null  int64 
 18  PAY_AMT2   30000 non-null  int64 
 19  PAY_AMT3   30000 non-null  int64 
 20  PAY_AMT4   30000 non-null  i

## [2] Preprocessing
---

In [None]:
def preprocessing(df:pd.DataFrame):
    """One-hot encode the object-dtype columns of ``df`` and min-max scale the result.

    Every column whose dtype is ``object`` is treated as categorical and is
    replaced by one indicator column per distinct value. The output columns
    are laid out as: all remaining (non-object) columns in their original
    order, followed by the indicator columns of each categorical column, in
    the order the categorical columns appear in ``df``. Finally every column
    is rescaled with ``MinMaxScaler`` using its default settings.

    The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table. Non-object columns must be convertible to float.

    Returns
    -------
    numpy.ndarray
        2-D float array with one row per row of ``df``.
    """
    categorical_cols = [col for col in df.columns if df[col].dtype == "object"]

    # Work on plain arrays and stack them positionally. Concatenating DataFrames
    # would align on index labels (producing NaN rows whenever ``df`` does not
    # have a default RangeIndex) and would give the indicator columns
    # duplicated integer names.
    blocks = [df.drop(columns=categorical_cols).to_numpy(dtype=float)]

    for col in categorical_cols:
        # OneHotEncoder returns a sparse matrix by default; densify it with
        # .toarray() instead of passing `sparse=False`, because that keyword
        # was renamed to `sparse_output` and is rejected by newer scikit-learn.
        ohe = OneHotEncoder()
        ohe_arr = ohe.fit_transform(df[col].values.reshape(-1, 1)).toarray()
        blocks.append(ohe_arr)

    features = np.hstack(blocks)

    return MinMaxScaler().fit_transform(features)

In [None]:
# Encode and scale the whole feature table; `preprocessing` returns the
# scaler's output rather than a DataFrame.
X_pre = preprocessing(X)



In [None]:
# Check the number of feature columns after one-hot encoding.
X_pre.shape

(30000, 33)

## [3] PCA
---

In [None]:
# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain that fraction of the variance.
# NOTE(review): PCA is fitted on all rows before the train/test split below,
# so test rows influence the projection — consider fitting on the training
# split only.
pca = PCA(n_components=0.95)
pca.fit(X_pre)

# Project the scaled features onto the retained components.
X_pca = pca.transform(X_pre)

In [None]:
# The second dimension is the number of principal components that were kept.
X_pca.shape

(30000, 7)

In [None]:
# Print the fitted PCA's per-component statistics, one labelled line each.
# The printed labels are kept exactly as before. Note that the first label
# reads "principal component matrix" although the value shown is
# `singular_values_`.
pca_report = (
    # singular value of each retained component
    ("주성분 행렬 : ", pca.singular_values_),
    # variance explained by each retained component
    ("분산 정도 : ", pca.explained_variance_),
    # fraction of the total variance explained by each retained component
    ("분산 정도 비율 : ", pca.explained_variance_ratio_),
)

for label, values in pca_report:
    print(label, values)

주성분 행렬 :  [125.9595317  119.21788538 107.88352473  77.37351817  41.88450817
  26.53589807  23.47956079]
분산 정도 :  [0.52887775 0.47377927 0.38797476 0.19956203 0.05847902 0.02347258
 0.01837694]
분산 정도 비율 :  [0.29966878 0.26844929 0.21983138 0.11307436 0.03313495 0.01329986
 0.0104126 ]


## [4] train test split
---

In [None]:
# Split the PCA features into train and test sets. `stratify=y` keeps the class
# proportions of the target in both splits; no test_size is given, so
# scikit-learn's default split ratio applies. The fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    random_state=42,
    stratify=y,
)

## [5] Make Model
---

In [None]:
# Hyper-parameters for the classifier; these match the best_params_ recorded
# from the grid search further down in this notebook.
tuned_params = {
    "colsample_bytree": 0.75,
    "max_depth": 5,
    "min_child_weight": 5,
    "reg_alpha": 10,
}

model = xgb.XGBClassifier(**tuned_params)

In [None]:
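# Hyper-parameter search that produced the values passed to XGBClassifier above.
# It is left commented out; the output recorded below comes from an earlier run.
# params = {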
#     "max_depth": [3, 5, 7],
#     "min_child_weight": [1, 3, 5],
#     "reg_alpha": [0.01, 0.1, 1, 10],
#     "colsample_bytree": [0.5, 0.75, 1]
# }

# grid = GridSearchCV(
#     model,
#     params,
#     cv=5
# )

# grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=XGBClassifier(),
             param_grid={'colsample_bytree': [0.5, 0.75, 1],
                         'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5],
                         'reg_alpha': [0.01, 0.1, 1, 10]})

In [None]:
# grid.best_params_
# The call above is commented out because the grid-search cell is disabled.
# The literal below restates the same values that are passed to XGBClassifier.
{'colsample_bytree': 0.75,
 'max_depth': 5,
 'min_child_weight': 5,
 'reg_alpha': 10}

{'colsample_bytree': 0.75,
 'max_depth': 5,
 'min_child_weight': 5,
 'reg_alpha': 10}

## [6] Train Model
---

In [None]:
# Fit the classifier on the PCA-transformed training split.
model.fit(X_train, y_train)

XGBClassifier(colsample_bytree=0.75, max_depth=5, min_child_weight=5,
              reg_alpha=10)

In [None]:
# Evaluate with the estimator's own `score` method on both splits, then report
# the two numbers side by side to make any train/test gap visible.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train score : {train_score}")
print(f"test score : {test_score}")

train score : 0.8160444444444445
test score : 0.8030666666666667
