# Classification 종합실습

 ## 신용대출 심사

* 고객사는 ## 은행입니다. 신용평가 업무를 인공지능으로 전환하고자 여러분에게 모델링을 의뢰하였습니다.
* 대출업무는
    * 은행 창구에서 신청을 받고
    * 본사의 심사부서에서는 신용평가를 통해 대출 신청에 대한 승인 여부를 결정해 왔습니다.

* 현장의 요구
    * 경쟁사의 공격적인 대출상품 판매로, 본사에서는 자사 은행의 대출 실적이 줄어들고 있는 것에 부담을 느끼고 있습니다.
    * 그런데, 자사 은행에서는 신용평가 결과의 정확성에 의문을 품고 있으며, 신용평가 기준을 완화하여 가급적 대출승인 범위를 더 확대해 주기를 요구합니다. 

* 신용평가 업무를 인공지능으로 전환
    * 현장의 요구를 감안하여, 과거 사람이 하던 평가방식을 개선하고자 인공지능에 의한 예측 모델을 만들고, 정확도를 높이고자 합니다.
    * 최적의 모델을 생성해 봅시다.


## 1.환경준비

### (1) import

In [2]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import * 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) 데이터 준비

In [3]:
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(path)
data.loc[data['Payment'] == 4, 'Payment'] = 3
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,Apartment,CreditCount,Occupation,Dependents,Telephone,ForeignWorker
0,1,3,24,2,0,1249,2,4,2,1,28,2,1,3,1,1,1
1,1,2,9,2,0,276,3,4,4,1,22,1,1,2,1,1,1
2,1,1,18,3,2,1049,2,2,4,2,21,1,1,3,1,1,1
3,1,1,24,3,1,6419,5,2,4,4,44,3,2,4,2,2,1
4,1,3,12,2,2,1424,5,2,4,1,55,2,1,4,1,2,1


|	칼럼명	|	설명	|	 	|	값 의미	|
|	-----	|	-----	|	-----	|	-----	|
|	Creditability	|	Creditability(Target)	|	향후 신용도	|	0 : Bad, 1 : Good	|
|	AccountBalance	|	Account Balance	|	은행잔고	|	1: No account, 2 : None (No balance), 3 : Some Balance	|
|	CreditDuration	|	Duration of Credit (month)	|	신청한 대출기간(월)	|	숫자	|
|	Payment	|	Payment Status of Previous Credit	|	과거 대출 납입 상태	|	0 : 연체, 1 : 기타신용, 2 : 완납, 3 : 정상 대출상환 중 |
|	Purpose	|	Purpose	|	신청한 대출목적	|	1 : New Car , 2 : Used Car , 3 : Furniture , 4 : TV , 5 : Appliances , 6 : Repair , 8 : Vacation , 9 :Retraining , 10 : Business , 0 : Other	|
|	CreditAmount	|	Credit Amount($)	|	신청한 대출금액	|		|
|	Employment	|	Length of current employment(Month)	|	현 직업 근무 기간	|	1: Unemployed,  2: <1 Year,  3: [1, 4),  4: [4, 7),  5: Above 7	|
|	SexMarital	|	Sex & Marital Status	|	성별 & 결혼상태	|	1: Male, Divorced, 2: Male, Single , 3: Male, Married/Widowed , 4: Female	|
|	CurrentAddress	|	Duration in Current address	|	현 거주지 거주기간	|	1: <1 Year , 2: [1, 4) , 3: [4, 7) , 4: Above 7	|
|	MostValuable	|	Most valuable available asset	|	가장 가치있는 자산	|	1: None , 2: Car , 3: Life Insurance , 4: Real Estate	|
|	Age	|	Age (years)	|	나이	|		|
|	AppartmentType	|	Type of apartment	|	주거환경	|	1: free apartment, 2: Rented, 3: Owned	|
|	NoCredits	|	No of Credits at this Bank	|	현재 총 대출 건수	|	1 : one, 2 : 2 ~ 3, 3 : 4 ~ 5, 4 : 6 ~	|
|	Occupation	|	Occupation	|	직업	|	1: Unemployed, unskilled, 2: Unskilled Permanent Resident, 3: Skilled, 4: Executive	|
|	Telephone	|	Telephone	|	전화기 소유 여부	|	2: Yes , 1: No	|
|	ForeignWorker	|	Foreign Worker	|	외국인 근로자 여부	|	2: Yes , 1: No	|


## 2.데이터 준비

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [4]:
target = 'Creditability'
x = data.drop(target, axis = 1)
y = data.loc[:, target]

### (3) NA 조치

### (4) 가변수화

In [5]:
dummy_vars = ['Employment', 'CurrentAddress', 'CreditCount', 'Dependents', 'Telephone', 'AccountBalance', 'Payment', 'Purpose', 'SexMarital', 'MostValuableAsset', 'Apartment','Occupation','ForeignWorker']
x = pd.get_dummies(x, columns = dummy_vars, drop_first = True)
x.head()

Unnamed: 0,Duration,CreditAmount,Age,Employment_2,Employment_3,Employment_4,Employment_5,CurrentAddress_2,CurrentAddress_3,CurrentAddress_4,...,SexMarital_4,MostValuableAsset_2,MostValuableAsset_3,MostValuableAsset_4,Apartment_2,Apartment_3,Occupation_2,Occupation_3,Occupation_4,ForeignWorker_2
0,24,1249,28,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,0
1,9,276,22,0,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,18,1049,21,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,24,6419,44,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,0,1,0
4,12,1424,55,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0


### (5) 데이터분할2 : train : validation 나누기

In [6]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### (6) Scaling

In [7]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## 3.모델링

* 사용 알고리즘 : LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, SVC
* 3가지의 알고리즘을 선정하고 성능을 튜닝해 봅시다.

### (1) 로지스틱 회귀

* 함수 생성

In [23]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    # 
    for s in range(0, len(features)) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = sm.Logit(y_train, x_tr).fit(disp=False)
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)

        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)
        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

* 전진선택법 수행

In [24]:
selected, step_df = forward_stepwise_logistic(x_train, y_train)

* 선택된 변수

In [33]:
selected

['AccountBalance_3',
 'Payment_3',
 'Purpose_1',
 'CreditAmount',
 'Payment_2',
 'Duration',
 'Employment_2',
 'AccountBalance_2',
 'ForeignWorker_2',
 'Purpose_3',
 'SexMarital_3',
 'CreditCount_2',
 'CurrentAddress_2',
 'MostValuableAsset_4',
 'Telephone_2',
 'Occupation_4']

In [26]:
step_df

Unnamed: 0,step,feature,aic
0,1.0,[AccountBalance_3],766.998784
1,1.0,[Age],830.197635
2,1.0,[Apartment_2],844.952933
3,1.0,[Payment_3],861.716743
4,1.0,[SexMarital_3],866.666531
...,...,...,...
531,16.0,"[AccountBalance_3, Payment_3, Purpose_1, Credi...",686.277690
532,16.0,"[AccountBalance_3, Payment_3, Purpose_1, Credi...",686.292828
533,16.0,"[AccountBalance_3, Payment_3, Purpose_1, Credi...",686.295740
534,16.0,"[AccountBalance_3, Payment_3, Purpose_1, Credi...",686.306342


* 전체 변수로 모델링

In [35]:
model_total = LogisticRegression().fit(x_train, y_train)
pred_total = model_total.predict(x_val)

print(classification_report(y_val, pred_total))

              precision    recall  f1-score   support

           0       0.59      0.46      0.52        97
           1       0.77      0.85      0.81       203

    accuracy                           0.72       300
   macro avg       0.68      0.66      0.66       300
weighted avg       0.71      0.72      0.71       300



* 전진선택법 변수로 모델링

In [36]:
model_forwards = LogisticRegression().fit(x_train[selected], y_train)
pred_forwards = model_forwards.predict(x_val[selected])

print(classification_report(y_val, pred_forwards))

              precision    recall  f1-score   support

           0       0.62      0.46      0.53        97
           1       0.77      0.86      0.81       203

    accuracy                           0.73       300
   macro avg       0.69      0.66      0.67       300
weighted avg       0.72      0.73      0.72       300



### (2) 의사결정나무

In [38]:
model_rs = RandomizedSearchCV(estimator=DecisionTreeClassifier(),
                              param_distributions={'max_depth': range(1, 11),
                                                   'min_samples_leaf': range(20, 300, 20)},
                              cv=5,).fit(x_train, y_train)

In [39]:
model_rs.best_params_

{'min_samples_leaf': 80, 'max_depth': 8}

In [40]:
model_rs.best_score_

0.7428571428571429

In [42]:
model_rs.cv_results_

{'mean_fit_time': array([0.0028368 , 0.00161057, 0.0008564 , 0.00101833, 0.00103502,
        0.00118561, 0.00142031, 0.00102816, 0.00125604, 0.00080051]),
 'std_fit_time': array([3.42760590e-03, 4.81386498e-04, 1.93557125e-05, 1.63905794e-05,
        2.16073011e-05, 1.70180262e-04, 9.36563283e-04, 6.97928355e-05,
        4.63569108e-05, 1.06609102e-05]),
 'mean_score_time': array([0.00070205, 0.0006855 , 0.00048466, 0.00048838, 0.00046573,
        0.00048103, 0.00051608, 0.00050025, 0.00054979, 0.0004591 ]),
 'std_score_time': array([2.74431595e-04, 3.24986804e-04, 1.31233348e-05, 1.32124340e-05,
        6.78819188e-06, 2.00535963e-05, 3.73660867e-05, 4.13633629e-05,
        1.17377222e-04, 8.55999069e-06]),
 'param_min_samples_leaf': masked_array(data=[140, 60, 260, 100, 80, 60, 200, 100, 40, 260],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked

In [43]:
pred_rs = model_rs.predict(x_val)
print(classification_report(y_val, pred_rs))

              precision    recall  f1-score   support

           0       0.55      0.46      0.50        97
           1       0.76      0.82      0.79       203

    accuracy                           0.70       300
   macro avg       0.66      0.64      0.65       300
weighted avg       0.69      0.70      0.70       300



### (3) KNN

In [47]:
KNN_rs = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                            param_distributions={'n_neighbors': [3, 5, 10, 15],
                                                 'metric': ['minkowski', 'euclidean', 'cityblock', 'chevyshev']},
                            cv=5,
                            n_iter=20).fit(x_train_s, y_train)

In [48]:
KNN_rs.best_params_

{'n_neighbors': 5, 'metric': 'minkowski'}

In [49]:
KNN_rs.best_score_

0.7214285714285715

In [51]:
KNN_rs.cv_results_

{'mean_fit_time': array([0.0005836 , 0.00019879, 0.00020838, 0.0001966 , 0.00018239,
        0.00018592, 0.00020385, 0.00019865, 0.00022397, 0.0001852 ,
        0.00019393, 0.00024738, 0.0002039 , 0.00016971, 0.00016227,
        0.00015545]),
 'std_fit_time': array([6.12178698e-04, 1.73776890e-05, 1.74003099e-05, 1.13569358e-05,
        7.02586827e-06, 1.02972540e-05, 4.00787234e-05, 7.41692571e-06,
        3.55985686e-05, 1.26378940e-05, 8.55733404e-06, 3.40314775e-05,
        3.20438118e-05, 2.74088056e-06, 3.29190361e-06, 9.41677936e-07]),
 'mean_score_time': array([0.01222262, 0.00267863, 0.00295849, 0.00285544, 0.00277405,
        0.00309787, 0.00343633, 0.00394716, 0.00352068, 0.00385618,
        0.00414844, 0.00438046, 0.        , 0.        , 0.        ,
        0.        ]),
 'std_score_time': array([1.67179902e-02, 1.10331195e-04, 1.89966526e-04, 1.48904312e-04,
        6.33966434e-05, 5.67019065e-04, 7.00889231e-04, 8.31001021e-04,
        3.25520436e-04, 1.14136002e-03, 4.29

In [52]:
pred_KNN_rs = KNN_rs.predict(x_val_s)
print(classification_report(y_val, pred_KNN_rs))

              precision    recall  f1-score   support

           0       0.49      0.29      0.36        97
           1       0.72      0.86      0.78       203

    accuracy                           0.67       300
   macro avg       0.60      0.57      0.57       300
weighted avg       0.64      0.67      0.65       300



### (4) SVM

In [59]:
pred_KNN_rs == KNN_rs.best_estimator_.predict(x_val_s)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,