In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
train.head(5)
test.head(5)

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,M,Y,N,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,-21990,365243,1,0,1,0,,2.0,-60.0
1,26458,F,N,Y,0,135000.0,State servant,Higher education,Married,House / apartment,-18964,-8671,1,0,1,0,Core staff,2.0,-36.0
2,26459,F,N,Y,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,-15887,-217,1,1,1,0,Laborers,2.0,-40.0
3,26460,M,Y,N,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-19270,-2531,1,1,0,0,Drivers,2.0,-41.0
4,26461,F,Y,Y,0,225000.0,State servant,Higher education,Married,House / apartment,-17822,-9385,1,1,0,0,Managers,2.0,-8.0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [8]:
train.shape#26457,20
test.shape#10000,19

(10000, 19)

In [9]:
#결측치 처리
train.fillna('NaN',inplace = True)
test.fillna('NaN',inplace = True)

#이상치 처리
train['family_size']>7인 데이터 제거

In [10]:
train = train[(train['family_size']<=7)].reset_index(drop = True)
train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26446,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26447,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26448,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26449,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


## Feature Engineering
1.의미없는 변수 제거<br>
-index제거  
-FLAG_MOBIL 삭제 : 모든 값이 1로 동일

In [11]:
train.drop(['index','FLAG_MOBIL'],axis = 1,inplace = True)
test.drop(['index','FLAG_MOBIL'],axis = 1, inplace = True)

In [12]:
train.shape #
test.shape#

(10000, 17)

2.DAYS_EMPLOYED<br>
-양수인 데이터는 현재 무직자임. 0처리

In [13]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x : 0 if x>0 else x)
test['DAYS_EMNPLOYED'] = test['DAYS_EMPLOYED'].map(lambda x : 0 if x>0 else x)
#map은 series만 사용가능 / apply 는 dataframe 사용할떄, 즉 두개 이상의 변수 사용할떄 

3. 현재기준으로 음수 처리 되어있는 변수들 양수로 변환<br>
DAYS_BIRTH,begin_month,DAYS_EMPLOYED

In [14]:
feats = ['DAYS_BIRTH','begin_month','DAYS_EMPLOYED']
for i in feats:
    train[i] = np.abs(train[i])
    test[i] = np.abs(test[i])

In [15]:
train.head(3)

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22.0,2.0


4.파생변수 생성<br>
-numeric 변수는 최대한 다양한 특징을 보일 수 있또록 생성  
-category변수는 여러가지 조합, but 전체 변수를 합친 ID하나만 만들었을때 가장 log_loss낮았음.
>카테고리 변수를 파생변수로 생성시 성능이 안좋아질 수도 있음. 따라서 numeric변수보다 신중하게 생성해야함.<br>
>의미없는 카테고리 변수 생성시 모델 성능 저하<br>  
날짜 변수 처리시, 해당 변수의 월, 주 파생 변수를 생성.
----

In [22]:
train['before_EMPLOYED'] = train['DAYS_BIRTH'] - train['DAYS_EMPLOYED']
np.floor(train['before_EMPLOYED']/30)
(np.floor(train['DAYS_BIRTH']/30)/12)

0        38.583333
1        31.583333
2        53.000000
3        41.833333
4        41.750000
           ...    
26446    33.500000
26447    42.416667
26448    28.000000
26449    28.166667
26450    54.333333
Name: DAYS_BIRTH, Length: 26451, dtype: float64

In [198]:
for df in [train,test]:
    #before EMPLOYED : 고용되기 전까지의 일수 == 취직된 나이
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_beforeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED'] #무슨 의미가 있는 변수일까? 집안수준?
    df['before_EMPLOYED_m'] = np.floor(df['before_EMPLOYED']/30) - ((np.floor(df['before_EMPLOYED']/30)/12).astype(int)*12)
    df['before_EMPLOYED_w'] = np.floor(df['before_EMPLOYED']/7) - ((np.floor(df['before_EMPLOYED']/7)/4).astype(int)*4)
    
    #DAY_BIRTH 파생변수 > 1. AGE,태어난 월, 태어난 주 
    df['Age'] = df["DAYS_BIRTH"] //365 #// operate : round(,0)
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH']/30)-((np.floor(df['DAYS_BIRTH']/30)/12).astype(int)*12)
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH']/ 7)-((np.floor(df['DAYS_BIRTH']/7)/4).astype(int)*4)
    
    #DAYS_EMPLOYED_m 파생변수 > 1.EMPLOYED(근속연수),DAYS_EMPLOYED_m(고용된달),DAYS_EMPLOYED_w(고용된주)
    df['EMPLOYED'] = df['DAYS_EMPLOYED']//365
    df["DAYS_EMPLOYED_m"] = np.floor(df['DAYS_EMPLOYED']/30)-((np.floor(df["DAYS_EMPLOYED"]/30)/12).astype(int)*12)
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED']/7) -((np.floor(df['DAYS_EMPLOYED']/7)/4).astype(int)*4)
    
    #??ability : 소득/(살아온 일수 + 근무일수)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])
    
    #income_mean : 소득 / 가족 수 
    df['income_mean'] = df['income_total'] + df['family_size']
    
    #***ID생성*** 고유한 사람 파악.index()개념
    df['ID'] = \
    df['child_num'].astype(str)+'_'+df['income_total'].astype(str)+'_'+\
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str)+'_'+\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str)+'_'+\
    df['email'].astype(str) +'_'+df['family_size'].astype(str)+'_'+\
    df['gender'].astype(str)+'_'+df['car'].astype(str)+'_'+\
    df['reality'].astype(str)+'_'+df['income_type'].astype(str)+'_'+\
    df['edu_type'].astype(str)+'_'+df['family_type'].astype(str)+'_'+\
    df['house_type'].astype(str)+'_'+df['occyp_type'].astype(str)
    

5.파생변수와 다중공선을 보이는 컬럼삭제  
cols = ['child_num','DAYS_BIRTH',"DAYS_EMPLOYED"]

In [199]:
cols = ['child_num','DAYS_BIRTH',"DAYS_EMPLOYED"]
train.drop(cols,axis = 1,inplace = True)
test.drop(cols,axis = 1,inplace = True)

##Scaling,Encoding.  
1.Numeric,Category 컬럼 분류

In [200]:
#df.dtypes : 데이터 타입 반환
numeric_features = train.dtypes[train.dtypes != 'object'].index.tolist()#tolist() 리스트로
numeric_features.remove('credit')
print("Num of numerical features",len(numeric_features))

categoric_features = train.dtypes[train.dtypes == 'object'].index.tolist()
print('Num of Categoric features',len(categoric_features))

Num of numerical features 18
Num of Categoric features 9


In [201]:
numeric_features

['income_total',
 'work_phone',
 'phone',
 'email',
 'family_size',
 'begin_month',
 'before_EMPLOYED',
 'income_total_beforeEMP_ratio',
 'before_EMPLOYED_m',
 'before_EMPLOYED_w',
 'Age',
 'DAYS_BIRTH_m',
 'DAYS_BIRTH_w',
 'EMPLOYED',
 'DAYS_EMPLOYED_m',
 'DAYS_EMPLOYED_w',
 'ability',
 'income_mean']

In [202]:
categoric_features

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type',
 'ID']

In [203]:
train['ID']

0        0_202500.0_13899_4709_0_0_0_2.0_F_N_N_Commerci...
1        1_247500.0_11380_1540_0_0_1_3.0_F_N_Y_Commerci...
2        0_450000.0_19087_4434_0_1_0_2.0_M_Y_Y_Working_...
3        0_202500.0_15088_2092_0_1_0_2.0_F_N_Y_Commerci...
4        0_157500.0_15037_2105_0_0_0_2.0_F_Y_Y_State se...
                               ...                        
26446    2_225000.0_12079_1984_0_0_0_4.0_F_N_N_State se...
26447    1_180000.0_15291_2475_0_0_0_2.0_F_N_Y_Working_...
26448    0_292500.0_10082_2015_0_0_0_2.0_F_Y_N_Working_...
26449    0_171000.0_10145_107_0_0_0_1.0_M_N_Y_Working_I...
26450    0_81000.0_19569_1013_0_0_0_2.0_F_N_N_Working_S...
Name: ID, Length: 26451, dtype: object

In [204]:
for df in [train,test]:
    df['income_total'] = np.log1p(1+df['income_total'])

3.OrdinalEncoder  
    -카테고리 변수는 ordinal_encoder 변환
    -ID는 변환 후 정수 처리

In [205]:
#credit(tartget)이 순서형이므로 ordinal_encoder?!
encoder = OrdinalEncoder(categoric_features)
train[categoric_features] = encoder.fit_transform(train[categoric_features],train['credit'])
test[categoric_features] = encoder.transform(test[categoric_features])

In [206]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26451 entries, 0 to 26450
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gender                        26451 non-null  int64  
 1   car                           26451 non-null  int64  
 2   reality                       26451 non-null  int64  
 3   income_total                  26451 non-null  float64
 4   income_type                   26451 non-null  int64  
 5   edu_type                      26451 non-null  int64  
 6   family_type                   26451 non-null  int64  
 7   house_type                    26451 non-null  int64  
 8   work_phone                    26451 non-null  int64  
 9   phone                         26451 non-null  int64  
 10  email                         26451 non-null  int64  
 11  occyp_type                    26451 non-null  int64  
 12  family_size                   26451 non-null  float64
 13  b

In [207]:
for col in categoric_features:
    print(col)
    print(train.loc[:,col].value_counts())

gender
1    17694
2     8757
Name: gender, dtype: int64
car
1    16407
2    10044
Name: car, dtype: int64
reality
2    17826
1     8625
Name: reality, dtype: int64
income_type
2    13639
1     6202
4     4449
3     2154
5        7
Name: income_type, dtype: int64
edu_type
2    17989
1     7162
3     1020
4      257
5       23
Name: edu_type, dtype: int64
family_type
1    18194
4     3495
2     2123
3     1536
5     1103
Name: family_type, dtype: int64
house_type
2    23647
3     1257
1      818
5      429
6      190
4      110
Name: house_type, dtype: int64
occyp_type
1     8171
2     4512
6     2646
4     2539
3     2167
7     1572
5     1040
9      902
8      864
14     457
11     424
12     401
13     243
18     127
19     123
15      97
10      63
16      62
17      41
Name: occyp_type, dtype: int64
ID
131     35
386     24
728     24
1493    21
285     20
        ..
8703     1
8641     1
4617     1
2570     1
2047     1
Name: ID, Length: 8756, dtype: int64


In [208]:
train['ID'] = train['ID'].astype('int64')
test['ID'] = test['ID'].astype('int64')

4.Clustering

In [210]:
kmean_train = train.drop(['credit'],axis = 1)
kmeans = KMeans(n_clusters = 36, random_state = 42).fit(kmean_train)
train['cluster'] = kmeans.predict(kmean_train)
test['cluster'] = kmeans.predict(test)

In [211]:
train.head(5)

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,...,Age,DAYS_BIRTH_m,DAYS_BIRTH_w,EMPLOYED,DAYS_EMPLOYED_m,DAYS_EMPLOYED_w,ability,income_mean,ID,cluster
0,1,1,1,12.218505,1,1,1,1,0,0,...,38,7.0,1.0,12,0.0,0.0,10.882416,202502.0,1,31
1,1,1,2,12.419174,1,2,2,2,0,0,...,31,7.0,1.0,4,3.0,0.0,19.156347,247503.0,2,18
2,2,2,2,13.017007,2,1,1,2,0,1,...,52,0.0,2.0,12,3.0,1.0,19.13184,450002.0,3,4
3,1,1,2,12.218505,1,2,1,2,0,1,...,41,10.0,3.0,5,9.0,2.0,11.786962,202502.0,4,31
4,1,2,2,11.967193,3,1,1,2,0,0,...,41,9.0,0.0,5,10.0,0.0,9.187959,157502.0,5,10


5.numeric 변수들 scaling

In [212]:
numeric_features.remove("income_total")
scaler = StandardScaler()
train[numeric_features] = scaler.fit_transform(train[numeric_features])
test[numeric_features] = scaler.transform(test[numeric_features])

전처리 완료<br>
-----

# 모델 학습

In [213]:
#Modeling with catmoost
n_est =500
seed = 42
n_fold =5
n_class = 3

target = 'credit'
X=train.drop(target,axis = 1)
y=train[target]
X_test = test

In [218]:
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skfold.split(X, y):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115127
0:	learn: 1.0346960	test: 1.0333820	best: 1.0333820 (0)	total: 20.9ms	remaining: 20.9s
100:	learn: 0.7049036	test: 0.6440103	best: 0.6439612 (96)	total: 5.33s	remaining: 47.4s
200:	learn: 0.6850821	test: 0.6423158	best: 0.6423158 (200)	total: 10.6s	remaining: 42.2s
300:	learn: 0.6656269	test: 0.6411788	best: 0.6411788 (300)	total: 16.1s	remaining: 37.3s
400:	learn: 0.6488328	test: 0.6419315	best: 0.6411743 (301)	total: 21.5s	remaining: 32.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6411743032
bestIteration = 301

Shrink model to first 302 iterations.


CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=26]=-0.7342287299181643 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [219]:
def plot_feature_importance(importance,names,model_type):
    
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    plt.figure(figsize=(10,8))

    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [220]:
plot_feature_importance(model_cat.get_feature_importance(),X_test.columns,'CATBOOST')

ValueError: arrays must all be same length