In [1]:
from pycaret.classification import *
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the training and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Print column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

# 실험 환경 구축 (Set up the environment)
PyCaret에서는 모델 학습 전 실험 환경을 구축 해주어야 합니다. setup 함수를 통해 환경을 구축할 수 있습니다.  
setup 단계에서는 PyCaret이 자동으로 컬럼 형태를 인식합니다. 그 후 사용자에게 제대로 인식되었는지 확인을 받게 됩니다.   그 때 enter를 눌러주시면 됩니다.  
또한 주어진 데이터의 얼마를 사용하여 train / validation을 구축할지 묻게 되는데, 전체 데이터를 사용하고 싶다면 enter 눌러주시면 됩니다.  
In PyCaret you have to set up the environment before experimenting with the models. This is done with the 'setup' function.  
In the setup stage, PyCaret automatically infers the column types of the given data and asks the user to confirm that it has interpreted them correctly. You can override how each column is interpreted by using the parameters of the setup function. In this tutorial we will just go with the automatic interpretation by pressing 'enter'.  
It also asks what fraction of the dataset should be used to construct the train/validation sets. We will use 100% of the dataset, so just press 'enter' again.  

In [6]:
# Build the PyCaret experiment on the training frame with 'credit' as target.
# NOTE: run the missing-value cell first -- it was executed as In [5], before
# this cell (In [6]), even though it appears later in the notebook.
# NOTE(review): the 'index' column is passed to setup as a feature; consider
# ignore_features=['index'] -- confirm before changing, it alters the results.
# session_id pins the random seed to the value reported in the recorded
# output, so the scores below can be reproduced on a fresh kernel.
clf = setup(data=train, target='credit', session_id=1713)

Unnamed: 0,Description,Value
0,session_id,1713
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26457, 20)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,13
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
# Handle missing values: fill them with the literal string 'NaN', in place,
# in both frames.
# NOTE: this cell was executed as In [5], i.e. before the setup cell (In [6]).
for frame in (train, test):
    frame.fillna('NaN', inplace=True)

In [9]:
# Register log loss as a custom metric computed on predicted probabilities,
# because the submission is made with predict_proba output.
# Lower is better, hence greater_is_better=False.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target="pred_proba",
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function                   <function log_loss at 0x7fe066778ca0>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

# 모델 학습 및 비교
환경 구축을 했으니 PyCaret에서 제공하는 기본 모델에 대해 학습하고 비교해보겠습니다.  
compare_models 함수를 통해 15개의 기본 모델을 학습하고 성능을 비교할 수 있습니다.  
log_loss 기준  
Now that we have constructed the environment, we will train and compare the default models provided in PyCaret.  
By using the 'compare_models' function we can easily train and compare the 15 default models provided in the package, ranked here by log loss.  

In [15]:
# svm and ridge do not support predict_proba, so they are excluded.
# NOTE: despite its name, best_3 receives the top 4 models (n_select=4).
best_3 = compare_models(
    exclude=['svm', 'ridge'],
    n_select=4,
    sort="logloss",
    fold=5,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
catboost,CatBoost Classifier,0.7015,0.6985,0.442,0.6864,0.6358,0.2599,0.3255,0.7721,4.996
xgboost,Extreme Gradient Boosting,0.6991,0.7017,0.4428,0.677,0.6354,0.258,0.3191,0.7733,7.24
lightgbm,Light Gradient Boosting Machine,0.6977,0.6918,0.4243,0.6889,0.6212,0.2321,0.3132,0.7741,0.966
gbc,Gradient Boosting Classifier,0.6942,0.6464,0.4102,0.6662,0.6089,0.2103,0.3029,0.7973,3.134
nb,Naive Bayes,0.645,0.606,0.3347,0.539,0.507,0.0048,0.0389,0.8629,0.052
lda,Linear Discriminant Analysis,0.6451,0.6048,0.3376,0.5279,0.5127,0.013,0.0506,0.863,0.13
lr,Logistic Regression,0.644,0.5892,0.3333,0.4147,0.5045,0.0,0.0,0.8678,1.074
rf,Random Forest Classifier,0.698,0.7369,0.5266,0.6733,0.6786,0.3572,0.3653,0.9886,0.956
ada,Ada Boost Classifier,0.6918,0.6226,0.4033,0.6249,0.6018,0.196,0.2966,1.0786,0.348
et,Extra Trees Classifier,0.6695,0.6997,0.5094,0.6466,0.6534,0.307,0.3118,2.4807,1.06


# 모델 앙상블

In [17]:
# Blend the models selected by compare_models with soft voting, using
# 5-fold cross-validation and the custom 'logloss' metric as the objective.
blended = blend_models(
    estimator_list=best_3,
    method='soft',
    optimize='logloss',
    fold=5,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.7006,0.704,0.4258,0.6929,0.6236,0.2362,0.3241,0.761
1,0.6947,0.6998,0.4153,0.6721,0.6134,0.2197,0.3033,0.7711
2,0.7038,0.7093,0.4323,0.6891,0.6302,0.253,0.3333,0.761
3,0.6984,0.7042,0.4261,0.718,0.6221,0.2316,0.3164,0.766
4,0.6965,0.7055,0.4187,0.6825,0.6166,0.2245,0.3093,0.7677
Mean,0.6988,0.7046,0.4236,0.6909,0.6212,0.233,0.3173,0.7654
SD,0.0032,0.003,0.006,0.0153,0.0058,0.0115,0.0106,0.0039


# 모델 예측 (prediction)

In [None]:
# Score the blended model; no data is passed, so predict_model uses its
# default evaluation data.
pred_holdout = predict_model(estimator=blended)

In [None]:
# Final training using the entire dataset.
final_model = finalize_model(estimator=blended)

In [None]:
# Use the blend of the models that rank in the top 4 on Accuracy, AUC and LogLoss.
# Attach the finalized model to PyCaret's preprocessing pipeline so the raw
# test frame can be transformed and scored in a single call.
prep_pipe = get_config("prep_pipe")

# The original referenced `final_model_custom`, which is never defined; the
# finalized estimator produced by finalize_model is `final_model`.
inference_step = ('trained_model', final_model)

# get_config returns the pipeline object itself, so appending unconditionally
# would stack a second 'trained_model' step every time this cell is re-run.
if prep_pipe.steps and prep_pipe.steps[-1][0] == 'trained_model':
    prep_pipe.steps[-1] = inference_step
else:
    prep_pipe.steps.append(inference_step)

# Class probabilities for the test set (name kept as-is: later cells use it).
prections = prep_pipe.predict_proba(test)
prections

In [None]:
# Split the predicted probabilities into one list per column (0, 1 and 2).
i = [probs[0] for probs in prections]
j = [probs[1] for probs in prections]
k = [probs[2] for probs in prections]

# Sanity check: the three lists should have the same length.
len(i), len(j), len(k)

In [None]:
# NOTE(review): `submit` and `PATH` are not defined in any visible cell --
# presumably a sample-submission frame and an output directory; confirm and
# define them before this cell.
# Write the per-class probability lists into the submission frame.
submit['0'] = i
submit['1'] = j
submit['2'] = k

# "\s" is an invalid escape sequence, and a backslash is only a path separator
# on Windows; "/" is accepted on every platform. The file name itself is kept
# unchanged.
submit.to_csv(PATH + "/submisison.csv", index=False)

# Preview as the cell's last expression so it is actually displayed.
submit.head(10)

In [None]:
# Submit results - predict with the test set.
# Disabled alternative that goes through predict_model; left commented out.
#predictions = predict_model(final_model, data = test)
#submission['credit'] = predictions['Score']
#submission.to_csv('submission_proba.csv', index = False)