In [43]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import missingno

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import model_selection, svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss, accuracy_score

%matplotlib inline # plt.show()를 하지 않아도 자동으로 생성되도록 만든다.

UsageError: unrecognized arguments: # plt.show()를 하지 않아도 자동으로 생성되도록 만든다.


## Pipeline 사용자함수

In [100]:
def run_pipeline(data_df, target):
    #자동으로 num과 cat 변수 갈라서 df 생성
    data_df_cat = data_df.select_dtypes(include=np.object)
    data_df_num = data_df.select_dtypes(exclude=np.object)

    # binary df 만들어주기
    data_df_bi = data_df[['gender','car','reality','work_phone','phone','email','dup']]

    # cat인데 num df에 들어간 변수 num df에서 drop해주기
    data_df_num = data_df_num.drop(columns=['gender','car','reality','work_phone','phone','email','dup'])

    x_train, x_test, y_train, y_test = model_selection.train_test_split(data_df,
                                                                    target,
                                                                   test_size = 0.2,
                                                                   random_state=0)
    
    binary_features = data_df_bi.columns
    
    numeric_features = data_df_num.columns
    numeric_transformer = StandardScaler() # cf) RobustScaler

    categorical_features = data_df_cat.columns
    categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') # categories='auto' : just for ignoring warning messages

    preprocessor = ColumnTransformer(
        transformers=[ # List of (name, transformer, column(s))
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
             ('bi','passthrough',binary_features)])

    preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
    preprocessor_pipe.fit(x_train)

    x_train_transformed = preprocessor_pipe.transform(x_train)
    x_test_transformed = preprocessor_pipe.transform(x_test)
    y_test = pd.DataFrame(y_test)  # typeerror: 'numpy.float64' object is not callable 해결
    
    return x_train_transformed, x_test_transformed, y_train, y_test

In [116]:
# Gradient Boosting 모델에 대해서 score와 logloss를 출력, logloss반환
from sklearn.metrics import log_loss, accuracy_score

def get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test):  
    params = {'n_estimators':1000, 'max_depth':4, 'min_samples_split':2,
      'learning_rate': 0.1, 'random_state': 0}
    
#     params = {'n_estimators':10, 'max_depth':3, 'min_samples_split':2,
#       'learning_rate': 1, 'random_state': 0}


    # 모델 학습
    model = GradientBoostingClassifier(**params)
    model.fit(x_train_transformed, y_train)

    # test셋에 대해서 예측 정확도 출력
    x_pred = model.predict(x_test_transformed)
    accuracy = accuracy_score(x_pred, y_test)
    print("model score:", round(accuracy, 4))
    
    proba_result = model.predict_proba(x_test_transformed)
    print("proba_result :",proba_result)

    pro_logloss = log_loss(y_test, proba_result)
    print('logloss: ',pro_logloss)
    
    return pro_logloss

# 머신러닝 피처별 모델 비교

    
피처들을 제거했을때보다 그대로 활용했을때 logloss가 적게 나옴

In [52]:
data_df_groupby = pd.read_csv('pre_credit_df(fill_groupby).csv')
data_df_ml = pd.read_csv('pre_credit_df(fill_ML).csv')

In [53]:
data_df_groupby['occyp_type'].value_counts()

Laborers                 7850
Unemployed               4440
Core staff               3039
Sales staff              2539
Managers                 2167
Drivers                  1575
High skill tech staff    1040
Accountants               902
Medicine staff            864
Cooking staff             457
Security staff            424
Cleaning staff            403
Private service staff     243
Low-skill Laborers        127
Waiters/barmen staff      124
Secretaries                97
Realty agents              63
HR staff                   62
IT staff                   41
Name: occyp_type, dtype: int64

In [54]:
data_df_ml['occyp_type'].value_counts()

Laborers                 5669
Unemployed               4439
Core staff               3225
Sales staff              3168
Managers                 2692
Drivers                  1810
High skill tech staff    1153
Accountants              1072
Medicine staff            970
Cooking staff             494
Cleaning staff            466
Security staff            460
Private service staff     252
Low-skill Laborers        142
Waiters/barmen staff      131
Secretaries               110
Realty agents              92
HR staff                   66
IT staff                   46
Name: occyp_type, dtype: int64

# 01. Gradient Boosting Classifier

### 직업열을 최빈값으로 채웠을때

In [106]:
data_df = pd.read_csv('pre_credit_df(fill_groupby).csv')
target = data_df['credit']
data_df = data_df.drop(['credit'], axis=1)

In [107]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df, target)

In [108]:
log_loss = get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)

model score: 0.7113
proba_result : [[0.19253043 0.54913261 0.25833696]
 [0.2124928  0.1122863  0.6752209 ]
 [0.09996602 0.18176523 0.71826876]
 ...
 [0.16646437 0.14562803 0.68790759]
 [0.04555345 0.0822485  0.87219805]
 [0.10149636 0.25429551 0.64420813]]
logloss:  0.750289981989438


### 직업열을 ML로 채웠을때

In [117]:
data_df = pd.read_csv('pre_credit_df(fill_ML).csv')
target = data_df['credit']
data_df = data_df.drop(['credit'], axis=1)

In [118]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df, target)

In [119]:
log_loss = get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)

model score: 0.7099
proba_result : [[0.24639096 0.56067398 0.19293506]
 [0.18534298 0.10426497 0.71039206]
 [0.1234076  0.23043188 0.64616051]
 ...
 [0.22919343 0.1334627  0.63734386]
 [0.06669923 0.08847617 0.8448246 ]
 [0.12243065 0.23232722 0.64524212]]
logloss:  0.7518756527434367


## Gradient Boosting Classifier (occyp_type 제거)

In [120]:
data_df_del_occyp = data_df.drop(['occyp_type'], axis=1)

In [121]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df_del_occyp, target)

In [122]:
log_loss = get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)

model score: 0.7073
proba_result : [[0.19978546 0.62226832 0.17794622]
 [0.20078572 0.08143067 0.71778361]
 [0.13221115 0.18697141 0.68081744]
 ...
 [0.19819948 0.15221955 0.64958097]
 [0.06225055 0.06812585 0.8696236 ]
 [0.08484535 0.17060914 0.74454551]]


TypeError: 'numpy.float64' object is not callable

## Gradient Boosting Classifier (adult_num 추가)
- adult_num = family_size - child_num

In [None]:
data_df_adult = data_df.copy()
data_df_adult['adult_num'] = data_df['family_size'] - data_df['child_num']
data_df_adult = data_df_adult.drop(['child_num', 'family_size'], axis=1)
data_df_adult

In [None]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df_adult, target)

In [None]:
log_loss = get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)

## Gradient Boosting Classifier (child_num 제거)

In [None]:
data_df_del_child = data_df.drop(['child_num'],axis=1).copy()
data_df_del_child.head()

In [None]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df_del_child, target)

In [None]:
log_loss = get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)

## Gradient Boosting (occpy_type 결측치 최빈값으로 채웠을때)

In [None]:
# occyp_type 
data_df = pd.read_csv('pre_credit_df(fill_groupby).csv')
target = data_df['credit']
data_df = data_df.drop(['credit'], axis=1)

In [None]:
data_df

In [None]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df, target)

In [None]:
get_logloss_GBC(x_train_transformed, x_test_transformed, y_train, y_test)