In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import catboost
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting
#from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import SimpleImputer
from tqdm import tqdm

from pathlib import Path
import warnings
warnings.filterwarnings(action = 'ignore')



### 데이터 불러오기

In [2]:
y_train = pd.read_csv('train.csv').credit

In [3]:
test_A_ct = pd.read_csv('test/test_A_ct.csv')
test_A_nm = pd.read_csv('test/test_A_nm.csv')
test_B_ct = pd.read_csv('test/test_B_ct.csv')
test_B_nm = pd.read_csv('test/test_B_nm.csv')
test_C_ct = pd.read_csv('test/test_C_ct.csv')
test_C_nm = pd.read_csv('test/test_C_nm.csv')
test_D_ct = pd.read_csv('test/test_D_ct.csv')
test_D_nm = pd.read_csv('test/test_D_nm.csv')
test_E_ct = pd.read_csv('test/test_E_ct.csv')
test_E_nm = pd.read_csv('test/test_E_nm.csv')

In [4]:
train_A_ct = pd.read_csv('train/train_A_ct.csv')
train_A_nm = pd.read_csv('train/train_A_nm.csv')
train_B_ct = pd.read_csv('train/train_B_ct.csv')
train_B_nm = pd.read_csv('train/train_B_nm.csv')
train_C_ct = pd.read_csv('train/train_C_ct.csv')
train_C_nm = pd.read_csv('train/train_C_nm.csv')
train_D_ct = pd.read_csv('train/train_D_ct.csv')
train_D_nm = pd.read_csv('train/train_D_nm.csv')
train_E_ct = pd.read_csv('train/train_E_ct.csv')
train_E_nm = pd.read_csv('train/train_E_nm.csv')

In [5]:
X_test_cat = pd.concat([test_A_ct, test_B_ct, test_C_ct, test_D_ct, test_E_ct], axis = 1)
X_test_num = pd.concat([test_A_nm, test_B_nm, test_C_nm, test_D_nm, test_E_nm], axis = 1)

In [6]:
X_train_cat = pd.concat([train_A_ct, train_B_ct, train_C_ct, train_D_ct, train_E_ct], axis = 1)
X_train_num = pd.concat([train_A_nm, train_B_nm, train_C_nm, train_D_nm, train_E_nm], axis = 1)

In [7]:
X_train_cat.shape, X_test_cat.shape, X_train_num.shape, X_test_num.shape

((26457, 46), (10000, 46), (26457, 6725), (10000, 6725))

In [8]:
pd.set_option('display.max_rows', None)

In [9]:
X_train_num.isnull().sum()

Unnamed: 0                                                                  6
work_phone                                                                  6
phone                                                                       6
email                                                                       6
family_size                                                                 6
begin_month                                                                 6
before_EMPLOYED                                                             6
income_total_befofeEMP_ratio                                                6
before_EMPLOYED_m                                                           6
before_EMPLOYED_w                                                           6
Age                                                                         6
DAYS_BIRTH_m                                                                6
DAYS_BIRTH_w                                                    

In [10]:
X_train_cat.isnull().sum()

Unnamed: 0           6
gender               6
car                  6
reality              6
income_type          6
edu_type             6
family_type          6
house_type           6
occyp_type           6
ID                   6
Unnamed: 0           0
income_total         0
DAYS_BIRTH           0
DAYS_EMPLOYED        0
family_size          0
begin_month          0
Unnamed: 0           0
gender               0
car                  0
reality              0
income_type          0
edu_type             0
family_type          0
house_type           0
occyp_type           0
personal_id          0
personal_begin_id    0
g_r_c                0
p_w_e                0
Unnamed: 0           0
gender               0
car                  0
reality              0
income_type          0
family_type          0
occyp_type           0
CODE                 0
index                0
gender               0
car                  0
reality              0
income_type          0
edu_type             0
family_type

In [11]:
X_tr_col = [X_train_cat.columns]

In [12]:
X_train_num.fillna(X_train_num.mean, inplace=True)

In [13]:
imputer_mode = SimpleImputer(strategy='most_frequent')
imputer_mode.fit(X_train_cat)
X_train_cat = imputer_mode.transform(X_train_cat)

In [14]:
X_train_cat = pd.DataFrame(X_train_cat)

In [15]:
X_train_cat.columns = X_tr_col

In [16]:
X_test_cat.columns = X_tr_col

In [17]:
X_train_cat.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,occyp_type,ID,...,CODE,index,gender.1,car.1,reality.1,income_type.1,edu_type.1,family_type.1,house_type.1,occyp_type.1
0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,F-13899202500.0Commercial associate,0,0,0,0,0,1,1,2,18
1,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,...,F-11380247500.0Commercial associate,1,0,0,1,0,4,0,1,8
2,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,3.0,3.0,...,M-19087450000.0Working,2,1,1,1,4,1,1,1,10
3,3.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,4.0,4.0,...,F-15088202500.0Commercial associate,3,0,0,1,0,4,1,1,14
4,4.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0,3.0,5.0,...,F-15037157500.0State servant,4,0,1,1,2,1,1,1,10


In [18]:
df_num = pd.concat([X_train_num, X_test_num])
df_cat = pd.concat([X_train_cat, X_test_cat])

### Feature 중복 제거

In [19]:
df_num_columns = list(pd.RangeIndex(len(df_num.columns)))
df_cat_columns = list(pd.RangeIndex(len(df_cat.columns)))

In [20]:
df_num = df_num.loc[:,~df_num.T.duplicated()]
df_cat = df_cat.loc[:,~df_cat.T.duplicated()]

In [21]:
len(df_num.columns), len(df_cat.columns)

(5927, 39)

In [22]:
df_num_columns = list(pd.RangeIndex(len(df_num.columns)))
df_cat_columns = list(pd.RangeIndex(len(df_cat.columns)))

In [23]:
df_num.drop(['Unnamed: 0'], axis=1, inplace=True)
df_cat.drop(['Unnamed: 0'], axis=1, inplace=True)

#### Numeric features

In [24]:
#df_num = df_num.apply(lambda x:x.clip(x.quantile(.05), x.quantile(.95)), axis=0)

In [25]:
df_num.shape, df_cat.shape

((36457, 5924), (36457, 36))

In [26]:
36451-10000

26451

In [27]:
X_train_num = df_num.iloc[:26451,:]
X_test_num = df_num.iloc[26451:,:]
X_train_cat = df_cat.iloc[:26451,:]
X_test_cat = df_cat.iloc[26451:,:]

In [28]:
X_train_cat.shape, X_train_num.shape, X_test_cat.shape, X_test_num.shape

((26451, 36), (26451, 5924), (10006, 36), (10006, 5924))

In [29]:
import klib

In [30]:
# 결측값 파악
klib.missingval_plot(X_train_cat)

No missing values found in the dataset.


In [31]:
klib.missingval_plot(X_train_num)

No missing values found in the dataset.


In [32]:
klib.missingval_plot(X_test_cat)

No missing values found in the dataset.


In [33]:
klib.missingval_plot(X_test_num)

No missing values found in the dataset.


#### 추가 전처리
1. 'Unnamed: 0' 컬럼 모두 삭제
2. nan값 포함된 행 모두 삭제(train data 6개만 존재) -> 위에서 처리
3. 중복되는 이름의 컬럼 정리

In [34]:
X_cat = pd.concat([X_train_cat, X_test_cat], axis=0)
X_num = pd.concat([X_train_num, X_test_num], axis=0)

In [35]:
X_cat.shape, X_num.shape

((36457, 36), (36457, 5924))

In [36]:
col_list = []

for col in X_num.columns:
    if col+'2' in col_list:
        col_list.append(col+'3')
    elif col in col_list:
        col_list.append(col+'2')
    else:
        col_list.append(col)

In [37]:
X_num.columns = col_list

In [38]:
alist = []

for col in X_cat.columns:
    alist.append(col[0])
    
X_cat.columns = alist

In [39]:
col_list2 = []

for col in X_cat.columns:
    if col+'2' in col_list2:
        col_list2.append(col+'3')
    elif col in col_list2:
        col_list2.append(col+'2')
    else:
        col_list2.append(col)

In [40]:
X_cat.columns = col_list2

In [41]:
len(X_num.columns), len(set(X_num.columns))

(5924, 5924)

In [42]:
len(X_cat.columns), len(set(X_cat.columns))

(36, 35)

In [43]:
col_list2[-1] = 'occyp_type4'

In [44]:
X_cat.columns = col_list2

In [45]:
len(X_cat.columns), len(set(X_cat.columns))

(36, 36)

X_num.to_csv('X_num.csv', index=False)
X_cat.to_csv('X_cat.csv', index=False)

* * *

## SHAP

In [46]:
X_num.shape, X_cat.shape

((36457, 5924), (36457, 36))

In [None]:
X_num.iloc[2]

In [None]:
# DF, based on which importance is checked
X_importance = X_test

# Explain model predictions using shap library:
model = CatBoostClassifier().fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

# Plot summary_plot as barplot:
shap.summary_plot(shap_values, X_importance, plot_type='bar')

shap_sum = pd.DataFrame(np.abs(shap_values).mean(axis=0)).T.iloc[0]
importance_df = pd.DataFrame([X_importance.columns.tolist(), shap_sum.tolist()]).T
importance_df.columns = ['column_name', 'shap_importance']
importance_df = importance_df.sort_values('shap_importance', ascending=False)
importance_df

In [47]:
model = LGBMClassifier(random_state=42).fit(X_num, y_train)
X_importance = X_num

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

shap_sum = np.abs(shap_values).mean(axis=0)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: work_phone, phone, email, family_size, begin_month, before_EMPLOYED, income_total_befofeEMP_ratio, before_EMPLOYED_m, before_EMPLOYED_w, Age, DAYS_BIRTH_m, DAYS_BIRTH_w, EMPLOYED, DAYS_EMPLOYED_m, DAYS_EMPLOYED_w, ability, income_mean

In [None]:
importance_df = pd.DataFrame([X_train.columns, shap_sum]).T
importance_df.columns = ['feature_name', 'shap_importance']
importance_df = importance_df.sort_values('shap_importance', ascending=False)