# [모듈 2.2] 고객 이탈 데이터 준비, Autogluon 훈련 및 평가

### [알림] <font coler="red"> conda_python3 커널 </font> 과 함께 사용해야 합니다.

* 이 노트북은 `0.1.Install_Package` 반드시 먼저 실행해야 합니다.



# 0. 환경 셋업

In [1]:
import pandas as pd
pd.set_option("display.max_columns", 500)

import numpy as np
import matplotlib.pyplot as plt
import os
import time
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")


import sagemaker
from sagemaker import get_execution_role

sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "ml_data_prep_workshop/autogluon-bankfraud"
role = get_execution_role()

In [2]:
%load_ext autoreload
%autoreload 2

# 1. 데이타 준비

In [3]:
fraud = pd.read_csv('data/PS_20174392719_1491204439457_log.csv')
fraud

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


###  레이블 컬럼을 숫자형으로 변경 혹은 유지

레이블 컬럼이 숫자형으로 1, 0 이면 데이터 프레임의 가장 앞에 위치 시킴.
이유는 "상관 계수 분석시" 에 레이블 컬럼도 포함시기키 위함 입니다.

In [4]:
from src.tabular_utils import change_y

#    isChange 이면 레이블 컬럼을 데이터 프레임의 가장 맨 앞에 위치 함.
#    그렇지 않으면 'True.' --> 1 로 바꾸고, 아니면 0으로 합니다.
fraud = change_y(fraud, col='isFraud', isChange=False)
fraud

Unnamed: 0,isFraud,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0
1,0,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0
2,1,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,0
3,1,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,0
4,0,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,1,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,0
6362616,1,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,0
6362617,1,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,0
6362618,1,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,0


## 피쳐 제거
- "데이터 프로 파일링 분석" 과 "상관 계수 분석" 에 기반하여 아래 컬럼을 삭제 합니다.

In [5]:
drop_cols = ['step','isFlaggedFraud','nameOrig','nameDest']
fraud_cl = fraud.drop(columns=drop_cols)
fraud_cl

Unnamed: 0,isFraud,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,0,PAYMENT,9839.64,170136.00,160296.36,0.00,0.00
1,0,PAYMENT,1864.28,21249.00,19384.72,0.00,0.00
2,1,TRANSFER,181.00,181.00,0.00,0.00,0.00
3,1,CASH_OUT,181.00,181.00,0.00,21182.00,0.00
4,0,PAYMENT,11668.14,41554.00,29885.86,0.00,0.00
...,...,...,...,...,...,...,...
6362615,1,CASH_OUT,339682.13,339682.13,0.00,0.00,339682.13
6362616,1,TRANSFER,6311409.28,6311409.28,0.00,0.00,0.00
6362617,1,CASH_OUT,6311409.28,6311409.28,0.00,68488.84,6379898.11
6362618,1,TRANSFER,850002.52,850002.52,0.00,0.00,0.00


# 3. 데이터 세트 분리
- 전체 데이타를 8:1:1 의 비율로 훈련, 검증, 테스트 데이터 셋으로 분리 합니다.
- 훈련과 검증 데이터는 CSV 데이터로 로컬에 저장 합니다.

In [6]:
train_data, validation_data, test_data = np.split(
    fraud_cl.sample(frac=1, random_state=1024),
    [int(0.8 * len(fraud_cl)), int(0.9 * len(fraud_cl))],
)
train_data.to_csv("train.csv", header=True, index=False)
validation_data.to_csv("validation.csv", header=True, index=False)

# Autogluon

In [7]:
train_df = train_data.copy()
val_df = validation_data.copy()
test_df = test_data.copy()

In [8]:
train_df.head()

Unnamed: 0,isFraud,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
185334,0,PAYMENT,1855.14,0.0,0.0,0.0,0.0
1267555,0,CASH_IN,183881.02,69630.0,253511.02,0.0,0.0
1974680,0,TRANSFER,2304730.15,59251.0,0.0,5251.04,2309981.19
201403,0,PAYMENT,8271.71,0.0,0.0,0.0,0.0
5661861,0,CASH_IN,87252.58,11602.0,98854.58,732047.92,644795.35


## 오토글루온 데이터 타입으로 변경

TBD

In [9]:
from autogluon.tabular import TabularPredictor as task
from autogluon.tabular import TabularDataset

# train_data = TabularDataset(auto_train_file)
train_data = TabularDataset(train_df)
val_data = TabularDataset(val_df)
test_data = TabularDataset(test_df)



In [10]:

import autogluon.core as ag

eval_metric = 'roc_auc' 

save_path = 'basic_fraud_autogluon_models'

presets = 'medium_quality_faster_train'
# presets = 'optimize_for_deployment'
label = 'isFraud'

tabular_predictor = task(label=label,
                         path = save_path,
                         eval_metric = eval_metric,
                        )

exclude_model_list = ['KNN','GBM','NN','FASTAI','RF','CAT','XT']
# exclude_model_list = ['KNN','NN','FASTAI','RF']





In [11]:
%%time

predictor = tabular_predictor.fit(
                                   train_data = train_data, 
                                   tuning_data = val_data,    
                                   presets=presets,
                                   excluded_model_types= exclude_model_list    
                                   )


Presets specified: ['medium_quality_faster_train']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "basic_fraud_autogluon_models/"
AutoGluon Version:  0.3.1
Train Data Rows:    5090096
Train Data Columns: 6
Tuning Data Rows:    636262
Tuning Data Columns: 6
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:    

CPU times: user 11min 6s, sys: 7.05 s, total: 11min 13s
Wall time: 1min 10s


## 모델 예측

In [12]:
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(labels=[label],axis=1)  # delete label column to prove we're not cheating


In [13]:
# predictor = task.load(dir)  # unnecessary, just demonstrates how to load previously-trained predictor from file

prediction = predictor.predict(test_data_nolab)
prediction_prob = predictor.predict_proba(test_data_nolab)
# print("Predictions:  ", prediction)
# perf = predictor.evaluate_predictions(y_true=y_test, y_pred=prediction, auxiliary_metrics=True)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=prediction_prob, auxiliary_metrics=True)

Evaluation: roc_auc on test data: 0.9997979435687551
Evaluations on test data:
{
    "roc_auc": 0.9997979435687551,
    "accuracy": 0.9996730906450487,
    "balanced_accuracy": 0.8884538049734524,
    "mcc": 0.8622507041484996,
    "f1": 0.8577291381668946,
    "precision": 0.9572519083969465,
    "recall": 0.7769516728624535
}


In [14]:
from src.tabular_utils import get_prediction_set, compute_f1

threshold = 500
df_pred = get_prediction_set(prediction, prediction_prob, threshold )    
df_pred

Unnamed: 0,score,pred
4859644,0,0
2524773,0,0
3149940,0,0
5920777,0,0
1981300,0,0
...,...,...
4063833,0,0
5261925,0,0
3521121,0,0
4924305,0,0


In [15]:
from sklearn.metrics import classification_report, roc_auc_score
from IPython.display import display as dp
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#from src import p_utils
%matplotlib inline
%config InlineBackend.figure_format='retina'



compute_f1(y_test, df_pred.pred.values)

- ROC_AUC SCORE
	-0.888

- F1 SCORE
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    635455
           1       0.96      0.78      0.86       807

    accuracy                           1.00    636262
   macro avg       0.98      0.89      0.93    636262
weighted avg       1.00      1.00      1.00    636262

[[635427     28]
 [   180    627]]


## 리더 보드 생성

In [16]:
 predictor.leaderboard(test_data,  extra_info=False, silent=True)


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.999798,0.99991,1.540726,1.952909,42.876859,1.540726,1.952909,42.876859,1,True,1
1,WeightedEnsemble_L2,0.999798,0.99991,1.546977,2.137329,43.069138,0.00625,0.18442,0.192279,2,True,2


## Feature Importance

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

fea_importance_raw = predictor.feature_importance(test_data)

fea_importance = fea_importance_raw['importance']

f, ax = plt.subplots(figsize=(10,5))
plot = sns.barplot(x=fea_importance.index, y = fea_importance.values)

ax.set_title('Feature Importance')
plot.set_xticklabels(plot.get_xticklabels(),rotation='vertical')
plt.show()

Computing feature importance via permutation shuffling for 6 features using 1000 rows with 3 shuffle sets...
	1.49s	= Expected runtime (0.5s per shuffle set)


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [18]:
# fea_importance_raw

In [31]:
test_data.isFraud.value_counts()

0    635455
1       807
Name: isFraud, dtype: int64