In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
sys.path.append('mymodule')
from mymodule.evals import print_eval_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

## 분류 실습 : 산탄데르 고객만족 예측
-  XGBoost와 LightGBM을 이용

#### 예제 데이터
- Kaggle의 산탄데르 고객만족(Santander Customer Satisfaction) 데이터 세트
- 산탄데르 은행이 캐글에 의뢰한 데이터
- https://www.kaggle.com/c/santander-customer-satisfaction/data
- features : 370개, 모두 익명 처리
- target : 1이면 불만, 0이면 만족
- 모델의 성능평가:  ROC-AUC

#### 데이터 준비 및 파악

In [3]:
cust_df = pd.read_csv('data/santander/train.csv', encoding='latin-1')
cust_df.shape

(76020, 371)

In [4]:
cust_df.head(3)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


In [5]:
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [6]:
cust_df.isna().sum() > 0

ID                         False
var3                       False
var15                      False
imp_ent_var16_ult1         False
imp_op_var39_comer_ult1    False
                           ...  
saldo_medio_var44_hace3    False
saldo_medio_var44_ult1     False
saldo_medio_var44_ult3     False
var38                      False
TARGET                     False
Length: 371, dtype: bool

In [7]:
cust_df.TARGET.value_counts()

TARGET
0    73012
1     3008
Name: count, dtype: int64

In [8]:
np.round(cust_df[cust_df.TARGET == 1]['TARGET'].count() / cust_df.shape[0], 4)

np.float64(0.0396)

In [9]:
desc = cust_df.describe().T
desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,76020.0,75964.050723,43781.947379,1.00,38104.7500,76043.00,113748.7500,151838.00
var3,76020.0,-1523.199277,39033.462364,-999999.00,2.0000,2.00,2.0000,238.00
var15,76020.0,33.212865,12.956486,5.00,23.0000,28.00,40.0000,105.00
imp_ent_var16_ult1,76020.0,86.208265,1614.757313,0.00,0.0000,0.00,0.0000,210000.00
imp_op_var39_comer_ult1,76020.0,72.363067,339.315831,0.00,0.0000,0.00,0.0000,12888.03
...,...,...,...,...,...,...,...,...
saldo_medio_var44_hace3,76020.0,1.858575,147.786584,0.00,0.0000,0.00,0.0000,24650.01
saldo_medio_var44_ult1,76020.0,76.026165,4040.337842,0.00,0.0000,0.00,0.0000,681462.90
saldo_medio_var44_ult3,76020.0,56.614351,2852.579397,0.00,0.0000,0.00,0.0000,397884.30
var38,76020.0,117235.809430,182664.598503,5163.75,67870.6125,106409.16,118756.2525,22034738.76


In [10]:
desc.loc[desc[desc['min'] < 0].index]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
var3,76020.0,-1523.199,39033.46,-999999.0,2.0,2.0,2.0,238.0
saldo_var1,76020.0,48.44911,10937.47,-0.9,0.0,0.0,0.0,3000000.0
saldo_var5,76020.0,1028.468,9852.14,-2895.72,0.0,3.0,90.0,619329.2
saldo_var8,76020.0,141.2268,2515.656,-4942.26,0.0,0.0,0.0,240045.0
saldo_var30,76020.0,13679.67,63014.08,-4942.26,0.0,3.0,235.995,3458077.0
saldo_var40,76020.0,4.368602,113.9688,-0.9,0.0,0.0,0.0,8192.61
saldo_var42,76020.0,7191.725,49145.31,-4942.26,0.0,3.0,120.0,3008077.0
delta_imp_aport_var13_1y3,76020.0,48671400.0,695953700.0,-1.0,0.0,0.0,0.0,10000000000.0
delta_imp_aport_var17_1y3,76020.0,5130229.0,226443500.0,-1.0,0.0,0.0,0.0,10000000000.0
delta_imp_aport_var33_1y3,76020.0,131544.3,36269040.0,-1.0,0.0,0.0,0.0,10000000000.0


#### 데이터 전처리

- 결측치 처리 / 피처 삭제

In [11]:
cust_df[cust_df['var3'] == -999999].index

Index([  782,  3361,  4103,  4422,  7053,  8214,  9210, 11015, 11055, 11163,
       ...
       70334, 70567, 71191, 72194, 74231, 74796, 74846, 75012, 75783, 75968],
      dtype='int64', length=116)

In [12]:
# var3 피처의 값 -999999 : 결측치를 나타냄
cust_df['var3'].replace(-999999, 2, inplace=True)

In [13]:
cust_df.drop('ID', axis=1, inplace=True)

In [14]:
cust_df.shape

(76020, 370)

#### 학습/테스트 데이터 준비

In [15]:
X = cust_df.iloc[:, :-1]
y = cust_df.iloc[:,-1] # y = cust_df.TARGET
X.shape

(76020, 369)

#### 학습/검증 데이터 세트

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

print(f'Train: {X_train.shape}, Test: {X_test.shape}')
print(f'Train target :\n {y_train.value_counts()}')
print(f'Test target :\n {y_test.value_counts()}')
print(f'Train target :\n {y_train.value_counts()/y_train.count()}')

Train: (60816, 369), Test: (15204, 369)
Train target :
 TARGET
0    58410
1     2406
Name: count, dtype: int64
Test target :
 TARGET
0    14602
1      602
Name: count, dtype: int64
Train target :
 TARGET
0    0.960438
1    0.039562
Name: count, dtype: float64


In [17]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

### XGBoost 모델 학습과 하이퍼 파라미터 튜닝

#### 디본 파라미터 설정 후 학습/성능예측

In [18]:
print_eval_score(y_test, pred, pred_proba)

NameError: name 'pred' is not defined

### HyperOpt를 이용한 모델 튜닝

검색 공간 설정

In [None]:
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin

In [None]:
param_search_space = {'n_estimators':hp.quniform('n_estimators',100,600,50),
                     'max_depth':hp.quniform('max_depth',5,20,1),
                     'min_child_weight':hp.quniform('min_child_weight',1,6,1),
                     'learning_rate':hp.uniform('learning_rate',0.01,0.2),
                     'colsample_bytree':hp.uniform('colsample_bytree',0.5,0.95)
                     }

#### 목적 함수 설정

- 조기 중단을 위해 KFold 사용(cross_val_score)

In [None]:
def objective_func(search_space):
    xgb = XGBClassifier(n_estimators=int(search_space['n_estimators']),
                      max_depth= int(search_space['max_depth']),
                      min_child_weight=search_space['min_child_weight'],
                      learning_rate=search_space['learning_rate'],
                      colsample_bytree=search_space['colsample_bytree'],
                      eval_metric='auc')

    scores = cross_val_score(xgb, X_train, y_train, scoring='roc_auc', cv=3)
    return {'loss':-1*np.mean(scores), 'status':STATUS_OK}

#### fmin()함수를 사용하여 최적 파라미터 추출

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
trial_val = Trials()
best = fmin(fn=objective_func, 
               space= param_search_space, 
               algo = tpe.suggest, 
               max_evals = 50,
               trials=trial_val,
               rstate=np.random.default_rng(seed=30))
# print(f'best_params:{best}')

for k in best.keys():
    print(f'{k}: \t{best[k]:.5f}')

#### 최적으로 찾은 하이퍼파라미터로 학습과 예측

In [None]:
xgb2 = XGBClassifier(n_estimators=int(best['n_estimators']),
                      max_depth= int(best['max_depth']),
                      min_child_weight=int(best['min_child_weight']),
                      learning_rate=best['learning_rate'],
                      colsample_bytree=best['colsample_bytree'],
                      ealry_stopping_round=50,
                      eval_metric='auc')
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb2.fit(X_tr, y_tr, eval_set=evals, verbose=False)
pred = xgb2.predict(X_test)
pred_proba = xgb2.predict_proba(X_test)[:,1]
print_eval_score(y_test, pred, pred_proba, target_names=['satisfied', 'Dissatisfied'])

#### 피처 중요도 시각화

In [None]:
from mymodule.dtVisual import plot_ftr_importances

plot_ftr_importances(xgb2, X.columns, top_n=20)

In [None]:
from xgboost import plot_importance

fig, axs = plt.subplots(figsize=(12,8))
plot_importance(xgb2, ax=axs, max_num_features=20, height=0.4)
plt.show()

### LightGBM 모델 학습과 하이퍼 파라미터 튜닝

In [None]:
from lightgbm import LGBMClassifier, early_stopping
lgbm = LGBMClassifier(n_estimators=150)
eval_set = [(X_tr, y_tr),(X_val, y_val)]
lgbm.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=100)], eval_metric='auc',
        eval_set=eval_set)
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)[:,1]
print_eval_score(y_test, pred, pred_proba, target_names=['Satisfied', 'Dissatisfied'])

#### 검색 공간 설정

In [None]:
lgbm_search_space = {'n_estimators':hp.quniform('n_estimators',100,600,50),
                     'max_depth':hp.quniform('max_depth',50,160,1),
                     'num_leaves':hp.quniform('num_leaves',32,64,1),
                     'min_child_samples':hp.quniform('min_child_samples',60, 100, 1),
                     'learning_rate':hp.uniform('learning_rate',0.01,0.2),
                     'subsample':hp.uniform('subsample',0.6, 1)
                     }

#### 목적 함수 설정

In [None]:
from sklearn.model_selection import KFold
def lgbm_objective_func(search_space):
    lgbm_clf = LGBMClassifier(n_estimators=int(search_space['n_estimators']),
                              max_depth=int(search_space['max_depth']),
                              num_leaves=int(search_space['num_leaves']),
                              min_child_samples=int(search_space['min_child_samples']),
                              learning_rate=search_space['learning_rate'],
                              subsample=search_space['subsample']
                             )
    scores = []
    kf = KFold(n_splits=3)
    for tr_idx, val_idx in kf.split(X_train):
        X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]
        lgbm_clf.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=30)],
                     eval_set = [(X_tr, y_tr), (X_val, y_val)], eval_metric='auc')
        pred_proba = lgbm_clf.predict_proba(X_val)[:,1]
        scores.append(roc_auc_score(y_val, pred_proba))

    return {'loss':-1*np.mean(scores), 'status':STATUS_OK}                 

#### fmin()함수로 최적 파라미터 추출

In [None]:
trial_val_lgbm = Trials()
best_lgbm = fmin(fn=lgbm_objective_func, 
               space= lgbm_search_space, 
               algo = tpe.suggest, 
               max_evals = 100,
               trials=trial_val,
               rstate=np.random.default_rng(seed=30))
# print(f'best_params:{best}')

for k in best_lgbm.keys():
    print(f'{k}: \t{best_lgbm[k]:.5f}')

#### 최적의 하이퍼파라미터로 학습 및 예측

In [19]:
lgbm2 = LGBMClassifier(n_estimators=int(best_lgbm['n_estimators']),
                              max_depth=int(best_lgbm['max_depth']),
                              num_leaves=int(best_lgbm['num_leaves']),
                              min_child_samples=int(best_lgbm['min_child_samples']),
                              learning_rate=best_lgbm['learning_rate'],
                              subsample=best_lgbm['subsample'], verbosity=-1
                             )

lgbm2.fit(X_tr, y_tr, callbacks=[early_stopping(stopping_rounds=100)],
                     eval_set = [(X_tr, y_tr), (X_val, y_val)], eval_metric='auc')
pred2 = lgbm2.predict(X_test)
pred_proba2 = lgbm2.predict_proba(X_test)[:,1]
print_eval_score(y_test, pred2, pred_proba2, target_names=['Satisfied', 'Dissatisfied'])        

NameError: name 'LGBMClassifier' is not defined

----