In [None]:
#import matplotlib_hangul
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the preprocessed train/test tables and the sample submission file.
# All three CSVs are decoded with the same cp949 codec.
CSV_ENCODING = 'cp949'

X_train = pd.read_csv('data/preprocessing_train.csv', encoding=CSV_ENCODING)
X_test = pd.read_csv('data/preprocessing_test.csv', encoding=CSV_ENCODING)

submission = pd.read_csv('data/sample_submission.csv', encoding=CSV_ENCODING)

## __Feature Engineering__

### DAYS_EMPLOYED

- 양수값은 고용되지 않은 상태를 의미하므로 무직자라고 생각함. 0으로 처리하기

In [None]:
# A positive DAYS_EMPLOYED is treated as "not employed": set those rows to 0
# and leave every other value as it is.
for frame in (X_train, X_test):
    frame.loc[frame['DAYS_EMPLOYED'] > 0, 'DAYS_EMPLOYED'] = 0

### DAYS_BIRTH, DAYS_EMPLOYED, begin_month 
- 음수값 -> 양수값으로 변환하기

In [None]:
# Convert the day/month counters to non-negative magnitudes in both frames.
new_features = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']
for frame in (X_train, X_test):
    for column in new_features:
        frame[column] = frame[column].abs()

In [None]:
# Preview the first rows of the training frame to check the converted columns.
X_train.head()

- 양수값으로 잘 변환된 것을 알 수 있음.

### 'age' Feature 생성 -> DAYS_BIRTH으로 변환 작업

In [None]:
# Age in whole years: DAYS_BIRTH divided by 365, rounded to 0 decimals.
train_age_years = (X_train['DAYS_BIRTH'] / 365).round(0)
X_train['age'] = train_age_years.astype(np.int32)
X_train.head(1)

In [None]:
# Age in whole years: DAYS_BIRTH divided by 365, rounded to 0 decimals.
test_age_years = (X_test['DAYS_BIRTH'] / 365).round(0)
X_test['age'] = test_age_years.astype(np.int32)
X_test.head(1)

### 시각화해서 분포 확인하기

In [None]:
# Histogram of training ages using bin edges 10, 20, ..., 70.
age_ax = X_train['age'].plot.hist(bins=range(10, 80, 10), color='c', edgecolor='k')

age_ax.set_xlabel('age')
age_ax.set_title('Age distribution')
plt.show()

- 시각화한 결과, 데이터는 주로 30, 40, 50대로 이루어져 있음

### 'age_group' feature 생성 -> age으로 변환 작업

In [None]:
# Start 'age_group' as an independent copy of the integer 'age' column.
for frame in (X_train, X_test):
    frame['age_group'] = frame['age'].copy()

In [None]:
# Label every training row with its age decade ('20대', '30대', ...).
# The decade is derived arithmetically from the integer 'age' column so that
# every age receives a string label. Enumerating ages explicitly leaves any
# unlisted age (e.g. 20, or 70 and above) as a raw integer inside a string
# column. Reading from 'age' also keeps this cell safe to re-run.
train_decade = X_train['age'] // 10 * 10
X_train['age_group'] = train_decade.astype(str) + '대'

In [None]:
# Number of training rows per age-group label.
X_train['age_group'].value_counts()

In [None]:
# Label every test row with its age decade ('20대', '30대', ...).
# The decade is derived arithmetically from the integer 'age' column so that
# every age receives a string label. Enumerating ages explicitly leaves any
# unlisted age (e.g. 20, or 70 and above) as a raw integer inside a string
# column. Reading from 'age' also keeps this cell safe to re-run.
test_decade = X_test['age'] // 10 * 10
X_test['age_group'] = test_decade.astype(str) + '대'

In [None]:
# Number of test rows per age-group label.
X_test['age_group'].value_counts()

### 'year of service' feature 생성 -> DAYS_EMPLOYED으로 변환 작업
- 근무년수

In [None]:
# Years of service: DAYS_EMPLOYED divided by 365, rounded to 0 decimals.
train_service_years = (X_train['DAYS_EMPLOYED'] / 365).round(0)
X_train['year_of_service'] = train_service_years.astype(np.int64)
X_train.head(1)

In [None]:
# Years of service: DAYS_EMPLOYED divided by 365, rounded to 0 decimals.
test_service_years = (X_test['DAYS_EMPLOYED'] / 365).round(0)
X_test['year_of_service'] = test_service_years.astype(np.int64)
X_test.head(5)

### 'years_group' feature 생성 -> year_of_service으로 변환 작업
- 근무경력 기준

In [None]:
# Start 'years_group' as an independent copy of 'year_of_service'.
for frame in (X_train, X_test):
    frame['years_group'] = frame['year_of_service'].copy()

In [None]:
# Frequency of each 'years_group' value in train, most frequent first.
X_train['years_group'].value_counts(ascending = False)

In [None]:
# Bucket training years of service into labelled ranges.
# pd.cut uses right-closed bins: (-1, 0], (0, 3], (3, 6], (6, 9], (9, 15],
# (15, 20], (20, 25], (25, 30], (30, inf]. Every non-negative year count
# therefore lands in exactly one bucket, including 42 and anything above 43,
# which an explicit value list can leave behind as raw integers.
# Reading from 'year_of_service' keeps this cell safe to re-run.
years_bins = [-1, 0, 3, 6, 9, 15, 20, 25, 30, np.inf]
years_labels = ['근무경력없음', '1~3년', '4~6년', '7~9년', '10~15년',
                '16~20년', '21년~25년', '26년~30년', '31년이상']
X_train['years_group'] = pd.cut(
    X_train['year_of_service'], bins=years_bins, labels=years_labels
).astype(object)  # plain strings rather than a Categorical column

In [None]:
# Number of training rows per 'years_group' value.
X_train['years_group'].value_counts()

In [None]:
# Bucket test years of service into labelled ranges.
# pd.cut uses right-closed bins: (-1, 0], (0, 3], (3, 6], (6, 9], (9, 15],
# (15, 20], (20, 25], (25, 30], (30, inf]. Every non-negative year count
# therefore lands in exactly one bucket, including values above 43, which an
# explicit value list can leave behind as raw integers.
# Reading from 'year_of_service' keeps this cell safe to re-run.
years_bins = [-1, 0, 3, 6, 9, 15, 20, 25, 30, np.inf]
years_labels = ['근무경력없음', '1~3년', '4~6년', '7~9년', '10~15년',
                '16~20년', '21년~25년', '26년~30년', '31년이상']
X_test['years_group'] = pd.cut(
    X_test['year_of_service'], bins=years_bins, labels=years_labels
).astype(object)  # plain strings rather than a Categorical column

In [None]:
# Number of test rows per 'years_group' value.
X_test['years_group'].value_counts()

### DAYS_BIRTH_month, DAYS_BIRTH_week 피처 생성 
- DAYS_BIRTH 변환 작업

In [None]:
# DAYS_BIRTH
# Count whole 30-day "months" and 7-day "weeks", then keep what is left after
# removing whole groups of 12 months / 4 weeks. np.fmod returns the remainder
# with the sign of the dividend (truncating toward zero).
train_birth_months = np.floor(X_train['DAYS_BIRTH'] / 30)
train_birth_weeks = np.floor(X_train['DAYS_BIRTH'] / 7)
X_train['DAYS_BIRTH_month'] = np.fmod(train_birth_months, 12)
X_train['DAYS_BIRTH_week'] = np.fmod(train_birth_weeks, 4)

In [None]:
# Store the month/week remainders as integers.
for column in ('DAYS_BIRTH_month', 'DAYS_BIRTH_week'):
    X_train[column] = X_train[column].astype(int)

In [None]:
# Count whole 30-day "months" and 7-day "weeks", then keep what is left after
# removing whole groups of 12 months / 4 weeks. np.fmod returns the remainder
# with the sign of the dividend (truncating toward zero).
test_birth_months = np.floor(X_test['DAYS_BIRTH'] / 30)
test_birth_weeks = np.floor(X_test['DAYS_BIRTH'] / 7)
X_test['DAYS_BIRTH_month'] = np.fmod(test_birth_months, 12)
X_test['DAYS_BIRTH_week'] = np.fmod(test_birth_weeks, 4)

In [None]:
# Store the month/week remainders as integers.
for column in ('DAYS_BIRTH_month', 'DAYS_BIRTH_week'):
    X_test[column] = X_test[column].astype(int)

### DAYS_EMPLOYED_month, DAYS_EMPLOYED_week 피처 생성
- DAYS_EMPLOYED 변환 작업

In [None]:
# DAYS_EMPLOYED
# Count whole 30-day "months" and 7-day "weeks", then keep what is left after
# removing whole groups of 12 months / 4 weeks. np.fmod returns the remainder
# with the sign of the dividend (truncating toward zero).
train_employed_months = np.floor(X_train['DAYS_EMPLOYED'] / 30)
train_employed_weeks = np.floor(X_train['DAYS_EMPLOYED'] / 7)
X_train['DAYS_EMPLOYED_month'] = np.fmod(train_employed_months, 12)
X_train['DAYS_EMPLOYED_week'] = np.fmod(train_employed_weeks, 4)

In [None]:
# Store the month/week remainders as integers.
for column in ('DAYS_EMPLOYED_month', 'DAYS_EMPLOYED_week'):
    X_train[column] = X_train[column].astype(int)

In [None]:
# Count whole 30-day "months" and 7-day "weeks", then keep what is left after
# removing whole groups of 12 months / 4 weeks. np.fmod returns the remainder
# with the sign of the dividend (truncating toward zero).
test_employed_months = np.floor(X_test['DAYS_EMPLOYED'] / 30)
test_employed_weeks = np.floor(X_test['DAYS_EMPLOYED'] / 7)
X_test['DAYS_EMPLOYED_month'] = np.fmod(test_employed_months, 12)
X_test['DAYS_EMPLOYED_week'] = np.fmod(test_employed_weeks, 4)

In [None]:
# Store the month/week remainders as integers.
for column in ('DAYS_EMPLOYED_month', 'DAYS_EMPLOYED_week'):
    X_test[column] = X_test[column].astype(int)

### before_EMPLOYED, before_EMPLOYED_month, before_EMPLOYED_week 피처 생성
- DAYS_BIRTH - DAYS_EMPLOYED 변환 작업

In [None]:
# before_EMPLOYED: number of days up to the point of being employed,
# i.e. DAYS_BIRTH minus DAYS_EMPLOYED.
train_birth_days = X_train['DAYS_BIRTH']
train_employed_days = X_train['DAYS_EMPLOYED']
X_train['before_EMPLOYED'] = train_birth_days - train_employed_days

In [None]:
# Count whole 30-day "months" and 7-day "weeks", then keep what is left after
# removing whole groups of 12 months / 4 weeks. np.fmod returns the remainder
# with the sign of the dividend (truncating toward zero).
train_before_months = np.floor(X_train['before_EMPLOYED'] / 30)
train_before_weeks = np.floor(X_train['before_EMPLOYED'] / 7)
X_train['before_EMPLOYED_month'] = np.fmod(train_before_months, 12)
X_train['before_EMPLOYED_week'] = np.fmod(train_before_weeks, 4)

In [None]:
# Store the before_EMPLOYED columns as integers.
for column in ('before_EMPLOYED', 'before_EMPLOYED_month', 'before_EMPLOYED_week'):
    X_train[column] = X_train[column].astype(int)

In [None]:
# before_EMPLOYED: number of days up to the point of being employed,
# i.e. DAYS_BIRTH minus DAYS_EMPLOYED.
X_test['before_EMPLOYED'] = X_test['DAYS_BIRTH'] - X_test['DAYS_EMPLOYED']

# Month/week remainders: floor-count minus the whole multiples of 12 (or 4).
# Both terms must come from the test rows. Taking the subtracted term from
# X_train would index-align test rows with unrelated train rows, and would
# give NaN for any test index that does not exist in train.
test_before_months = np.floor(X_test['before_EMPLOYED'] / 30)
test_before_weeks = np.floor(X_test['before_EMPLOYED'] / 7)
X_test['before_EMPLOYED_month'] = test_before_months - (test_before_months / 12).astype(int) * 12
X_test['before_EMPLOYED_week'] = test_before_weeks - (test_before_weeks / 4).astype(int) * 4

In [None]:
# Store the before_EMPLOYED columns as integers.
for column in ('before_EMPLOYED', 'before_EMPLOYED_month', 'before_EMPLOYED_week'):
    X_test[column] = X_test[column].astype(int)

### DAYS_BIRTH 범주화

In [None]:
# Quartile class (1-4) of DAYS_BIRTH. Each frame is bucketed against its own
# quartiles (train against train, test against test).
# NOTE(review): test rows are cut at test-set quartiles, not the train ones;
# confirm that is intended.
for frame in (X_train, X_test):
    birth_days = frame['DAYS_BIRTH']
    # Compute the three cut points once per frame rather than once per row.
    q25, q50, q75 = birth_days.quantile([0.25, 0.5, 0.75])
    # Each satisfied "<= cut point" lowers the class by one: values up to q25
    # satisfy all three (class 1); values above q75 satisfy none (class 4).
    frame['DAYS_BIRTH_class'] = (
        4
        - (birth_days <= q25).astype(int)
        - (birth_days <= q50).astype(int)
        - (birth_days <= q75).astype(int)
    )

### begin_month 범주화

In [None]:
# Quartile class (1-4) of begin_month. Each frame is bucketed against its own
# quartiles (train against train, test against test).
# NOTE(review): test rows are cut at test-set quartiles, not the train ones;
# confirm that is intended.
for frame in (X_train, X_test):
    begin_months = frame['begin_month']
    # Compute the three cut points once per frame rather than once per row.
    q25, q50, q75 = begin_months.quantile([0.25, 0.5, 0.75])
    # Each satisfied "<= cut point" lowers the class by one: values up to q25
    # satisfy all three (class 1); values above q75 satisfy none (class 4).
    frame['begin_month_class'] = (
        4
        - (begin_months <= q25).astype(int)
        - (begin_months <= q50).astype(int)
        - (begin_months <= q75).astype(int)
    )


### income_total*10000 (income_total을 10000으로 나눈 값)

In [None]:
# Cast income to float, then add a copy scaled down by 10,000.
# Despite the '*' in its name, 'income_total*10000' holds income_total / 10000.
for frame in (X_train, X_test):
    frame['income_total'] = frame['income_total'].astype(float)

for frame in (X_train, X_test):
    frame['income_total*10000'] = frame['income_total'] / 10000

### income_total_dev & income_total_log

In [None]:
def numeric_process(data):
    """Add two features derived from the 'income_total*10000' column.

    The frame is modified in place and the same object is returned.

    Columns written:
        income_total_dev: squared deviation of each value from the column mean
            of the frame that was passed in.
        income_total_log: log(1 + value) of the same column.

    Args:
        data: DataFrame that contains an 'income_total*10000' column.

    Returns:
        The same DataFrame, with both columns added.
    """
    scaled_income = data['income_total*10000']

    # Squared deviation from this frame's own mean.
    centered = scaled_income - scaled_income.mean()
    data['income_total_dev'] = centered ** 2

    # Log transform.
    data['income_total_log'] = np.log1p(scaled_income)

    return data


# Add the income deviation and log columns to both frames; each call uses the
# statistics of the frame it receives.
X_train = numeric_process(X_train)
X_test = numeric_process(X_test)

### 연간소득/가족규모

In [None]:
# Income divided by family size.
for frame in (X_train, X_test):
    household_size = frame['family_size']
    frame['소득/가족'] = frame['income_total'] / household_size

### 연간소득/자녀수

In [None]:
# Income divided by number of children.
# Rows with child_num == 0 have no defined per-child income. Dividing by zero
# directly would store +inf, which many estimators reject, so the zero
# denominators are turned into NaN and those rows come out as missing.
# NOTE(review): if a later step relies on inf for childless rows, adjust it
# to handle NaN instead.
for frame in (X_train, X_test):
    child_count = frame['child_num'].replace(0, np.nan)
    frame['소득/자녀'] = frame['income_total'] / child_count

### 소득분류별 연간소득

In [None]:
# Summary statistics of income_total per income_type, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
income_type_stats = [
    ('income_type__총income_total', np.sum),
    ('income_type_평균income_total', np.mean),
    ('income_type_최대income_total', np.max),
    ('income_type__최소income_total', np.min),
    ('income_type__income_total표준편차', np.std),
    ('income_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object1 = (
    X_train.groupby('income_type')['income_total']
    .agg(income_type_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object1, on='income_type', how='left')

object2 = (
    X_test.groupby('income_type')['income_total']
    .agg(income_type_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object2, on='income_type', how='left')

### 교육수준별 연간소득

In [None]:
# Summary statistics of income_total per edu_type, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
edu_type_stats = [
    ('edu_type__총income_total', np.sum),
    ('edu_type_평균income_total', np.mean),
    ('edu_type_최대income_total', np.max),
    ('edu_type__최소income_total', np.min),
    ('edu_type__income_total표준편차', np.std),
    ('edu_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object3 = (
    X_train.groupby('edu_type')['income_total']
    .agg(edu_type_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object3, on='edu_type', how='left')

object4 = (
    X_test.groupby('edu_type')['income_total']
    .agg(edu_type_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object4, on='edu_type', how='left')

### 결혼여부별 연간소득

In [None]:
# Summary statistics of income_total per family_type, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
family_type_stats = [
    ('family_type__총income_total', np.sum),
    ('family_type_평균income_total', np.mean),
    ('family_type_최대income_total', np.max),
    ('family_type__최소income_total', np.min),
    ('family_type__income_total표준편차', np.std),
    ('family_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object5 = (
    X_train.groupby('family_type')['income_total']
    .agg(family_type_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object5, on='family_type', how='left')

object6 = (
    X_test.groupby('family_type')['income_total']
    .agg(family_type_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object6, on='family_type', how='left')

### 생활방식별 연간소득

In [None]:
# Summary statistics of income_total per house_type, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
house_type_stats = [
    ('house_type__총income_total', np.sum),
    ('house_type_평균income_total', np.mean),
    ('house_type_최대income_total', np.max),
    ('house_type__최소income_total', np.min),
    ('house_type__income_total표준편차', np.std),
    ('house_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object7 = (
    X_train.groupby('house_type')['income_total']
    .agg(house_type_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object7, on='house_type', how='left')

object8 = (
    X_test.groupby('house_type')['income_total']
    .agg(house_type_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object8, on='house_type', how='left')

### 나이대별 연간소득

In [None]:
# Summary statistics of income_total per age_group, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
age_group_stats = [
    ('Age_type__총income_total', np.sum),
    ('Age_type_평균income_total', np.mean),
    ('Age_type_최대income_total', np.max),
    ('Age_type__최소income_total', np.min),
    ('Age_type__income_total표준편차', np.std),
    ('Age_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object9 = (
    X_train.groupby('age_group')['income_total']
    .agg(age_group_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object9, on='age_group', how='left')

object10 = (
    X_test.groupby('age_group')['income_total']
    .agg(age_group_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object10, on='age_group', how='left')

### 직업유형별 연간소득

In [None]:
# Summary statistics of income_total per occyp_type, joined back onto every
# row. Train and test statistics are each computed from their own rows.
# NOTE(review): the same category can therefore carry different values in
# train and test; confirm this is intended.
occyp_type_stats = [
    ('occyp_type__총income_total', np.sum),
    ('occyp_type_평균income_total', np.mean),
    ('occyp_type_최대income_total', np.max),
    ('occyp_type__최소income_total', np.min),
    ('occyp_type__income_total표준편차', np.std),
    ('occyp_type__income_total변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object11 = (
    X_train.groupby('occyp_type')['income_total']
    .agg(occyp_type_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object11, on='occyp_type', how='left')

object12 = (
    X_test.groupby('occyp_type')['income_total']
    .agg(occyp_type_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object12, on='occyp_type', how='left')

### EMPLOYED_RATIO

In [None]:
# DAYS_EMPLOYED as a fraction of DAYS_BIRTH.
for frame in (X_train, X_test):
    employed_days, birth_days = frame['DAYS_EMPLOYED'], frame['DAYS_BIRTH']
    frame['EMPLOYED_RATIO'] = employed_days / birth_days

### income_per_days_birth

In [None]:
# Income divided by DAYS_BIRTH.
for frame in (X_train, X_test):
    income, birth_days = frame['income_total'], frame['DAYS_BIRTH']
    frame['income_per_days_birth'] = income / birth_days

### income_per_days_birth_X_DAYS_BIRTH

In [None]:
# Product of income_per_days_birth and DAYS_BIRTH.
# NOTE(review): if income_per_days_birth is income_total / DAYS_BIRTH, this
# product reproduces income_total up to floating-point rounding; confirm the
# feature adds information.
for frame in (X_train, X_test):
    per_day_income, birth_days = frame['income_per_days_birth'], frame['DAYS_BIRTH']
    frame['income_per_days_birth_X_DAYS_BIRTH'] = per_day_income * birth_days

### begin_month_X_DAYS_BIRTH

In [None]:
# Product of begin_month and DAYS_BIRTH.
for frame in (X_train, X_test):
    begin_months, birth_days = frame['begin_month'], frame['DAYS_BIRTH']
    frame['begin_month_X_DAYS_BIRTH'] = begin_months * birth_days

### BIRTH*id

In [None]:
# Product of DAYS_BIRTH and id.
# NOTE(review): 'id' looks like a row identifier; confirm that multiplying by
# it is intended.
for frame in (X_train, X_test):
    birth_days, row_id = frame['DAYS_BIRTH'], frame['id']
    frame['BIRTH*id'] = birth_days * row_id

### EMP*id

In [None]:
# Product of DAYS_EMPLOYED and id.
# NOTE(review): 'id' looks like a row identifier; confirm that multiplying by
# it is intended.
for frame in (X_train, X_test):
    employed_days, row_id = frame['DAYS_EMPLOYED'], frame['id']
    frame['EMP*id'] = employed_days * row_id

### BIRTH*EMP

In [None]:
# Product of DAYS_BIRTH and DAYS_EMPLOYED.
for frame in (X_train, X_test):
    birth_days, employed_days = frame['DAYS_BIRTH'], frame['DAYS_EMPLOYED']
    frame['BIRTH*EMP'] = birth_days * employed_days

### possible

In [None]:
# Income divided by the sum of DAYS_BIRTH and DAYS_EMPLOYED.
for frame in (X_train, X_test):
    total_days = frame['DAYS_BIRTH'] + frame['DAYS_EMPLOYED']
    frame['possible'] = frame['income_total'] / total_days

### possible_class

In [None]:
# Bucket 'possible' (clipped to [0, 35]) into classes 1-7 of width 5.
# pd.cut uses right-closed bins: (-inf, 5] -> 1, (5, 10] -> 2, ...,
# (30, 35] -> 7. This is the same mapping as a chain of "<=" checks, done in
# one vectorised call instead of a per-row Python loop.
# A NaN in 'possible' makes the final integer cast raise, so missing values
# are never bucketed silently.
possible_bins = [-np.inf, 5, 10, 15, 20, 25, 30, 35]
possible_labels = [1, 2, 3, 4, 5, 6, 7]
for frame in (X_train, X_test):
    clipped_possible = frame['possible'].clip(0, 35)
    frame['possible_class'] = pd.cut(
        clipped_possible, bins=possible_bins, labels=possible_labels
    ).astype(int)

### car_reality

In [None]:
# Combine the 'car' and 'reality' columns with '+'.
# NOTE(review): this is a numeric sum if both columns are numeric and a string
# concatenation if they are text; confirm which encoding the inputs use.
for frame in (X_train, X_test):
    car_flag, reality_flag = frame['car'], frame['reality']
    frame['car_reality'] = car_flag + reality_flag

### ID + DAYS_BIRTH

In [None]:
# Summary statistics of DAYS_BIRTH per id, joined back onto every row.
# Train and test statistics are each computed from their own rows.
# NOTE(review): if 'id' is unique per row, every group has one member, so
# sum/mean/max/min all equal DAYS_BIRTH and the spread columns are 0.
id_birth_stats = [
    ('id_총DAYS_BIRTH', np.sum),
    ('id_평균DAYS_BIRTH', np.mean),
    ('id_최대DAYS_BIRTH', np.max),
    ('id_최소DAYS_BIRTH', np.min),
    ('id_DAYS_BIRTH표준편차', np.std),
    ('id_DAYS_BIRTH변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object1 = (
    X_train.groupby('id')['DAYS_BIRTH']
    .agg(id_birth_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object1, on='id', how='left')

# 'object1' is deliberately reused for the test-side table.
object1 = (
    X_test.groupby('id')['DAYS_BIRTH']
    .agg(id_birth_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object1, on='id', how='left')

###  ID + begin_month

In [None]:
# Summary statistics of begin_month per id, joined back onto every row.
# Train and test statistics are each computed from their own rows.
# NOTE(review): if 'id' is unique per row, every group has one member, so
# sum/mean/max/min all equal begin_month and the spread columns are 0.
id_begin_month_stats = [
    ('id_총begin_months5', np.sum),
    ('id_평균begin_month', np.mean),
    ('id_최대begin_month', np.max),
    ('id_최소begin_month', np.min),
    ('id_begin_month표준편차', np.std),
    ('id_begin_month변동계수', lambda grp: np.std(grp) / np.mean(grp)),
]

# Missing statistics (e.g. the std of a single-row group) are filled with 0.
object90 = (
    X_train.groupby('id')['begin_month']
    .agg(id_begin_month_stats)
    .reset_index()
    .fillna(0)
)
X_train = X_train.merge(object90, on='id', how='left')

object91 = (
    X_test.groupby('id')['begin_month']
    .agg(id_begin_month_stats)
    .reset_index()
    .fillna(0)
)
X_test = X_test.merge(object91, on='id', how='left')

### possible 범주화 + begin_month_class 범주화 + DAYS_BIRTH_class 범주화

In [None]:
# Sum of the three class columns.
class_columns = ['possible_class', 'begin_month_class', 'DAYS_BIRTH_class']
for frame in (X_train, X_test):
    first, second, third = (frame[name] for name in class_columns)
    frame['pos+beg+dBirth'] = first + second + third

### income/before_EMPLOYED

In [None]:
# Income divided by before_EMPLOYED.
for frame in (X_train, X_test):
    income, days_before = frame['income_total'], frame['before_EMPLOYED']
    frame['income/before_EMPLOYED'] = income / days_before

### edu_type 별 신뢰도 0의 비율

In [None]:
# Fixed numeric value per education level. Any edu_type not listed below,
# including missing values, receives the default 0.12.
# NOTE(review): keys are matched case-sensitively. Confirm that
# 'Lower Secondary' is spelled exactly as in the data; otherwise those rows
# silently receive the default.
edu_type_rates = {
    'Academic degree': 0.08,
    'Higher education': 0.126,
    'Incomplete higher': 0.111,
    'Lower Secondary': 0.108,
}
for frame in (X_train, X_test):
    frame['edu_type_num'] = frame['edu_type'].map(edu_type_rates).fillna(0.12)


### edu_type_num / DAYS_BIRTH 

In [None]:
# edu_type_num divided by DAYS_BIRTH, scaled up by 10,000.
for frame in (X_train, X_test):
    edu_rate_per_day = frame['edu_type_num'] / frame['DAYS_BIRTH']
    frame['edu_type_num/DAYS_BIRTH'] = edu_rate_per_day * 10000

### edu_type_num * income_per_days_birth

In [None]:
# Product of edu_type_num and income_per_days_birth.
for frame in (X_train, X_test):
    edu_rate, per_day_income = frame['edu_type_num'], frame['income_per_days_birth']
    frame['edu_type_num*income_per_days_birth'] = edu_rate * per_day_income

### 데이터 타입 변경

In [None]:
# Final integer casts: family_size to the default int, age and
# year_of_service to int64.
for frame in (X_train, X_test):
    frame['family_size'] = frame['family_size'].astype(int)
    for column in ('age', 'year_of_service'):
        frame[column] = frame[column].astype('int64')

In [None]:
# Write the engineered tables as UTF-8 CSV files.
# index=False is not passed, so the DataFrame index is written as the first
# (unnamed) column.
output_targets = (
    (X_train, 'data/feature_enginerring_train.csv'),
    (X_test, 'data/feature_enginerring_test.csv'),
)
for frame, csv_path in output_targets:
    frame.to_csv(csv_path, encoding='utf-8')