In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import joblib
%matplotlib inline
import seaborn as sns

In [2]:
# Read the raw application table from the current working directory.
# nrows=None loads every row; set an integer here for a quick partial run.
X = pd.read_csv('application_train.csv', nrows=None)
# Preview the first rows of the loaded frame.
X.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Hand-crafted ratio features built from the raw application columns.
# Every right-hand side is evaluated from columns that already exist in X before
# any new column is written, so the entries do not depend on each other; the
# dict order determines the order in which the new columns are appended to X.
ratio_features = {
    # annuity payment as a share of total income
    'annuity_income_percentage': X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL'],
    # car age relative to DAYS_BIRTH
    'car_to_birth_ratio': X['OWN_CAR_AGE'] / X['DAYS_BIRTH'],
    # car age relative to DAYS_EMPLOYED
    'car_to_employ_ratio': X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED'],
    # share of children among family members
    'children_ratio': X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS'],
    # credit amount relative to the annuity payment
    'credit_to_annuity_ratio': X['AMT_CREDIT'] / X['AMT_ANNUITY'],
    # credit amount relative to the goods price
    'credit_to_goods_ratio': X['AMT_CREDIT'] / X['AMT_GOODS_PRICE'],
    # credit amount relative to total income
    'credit_to_income_ratio': X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL'],
    # DAYS_EMPLOYED relative to DAYS_BIRTH
    'days_employed_percentage': X['DAYS_EMPLOYED'] / X['DAYS_BIRTH'],
    # total income relative to the credit amount
    'income_credit_percentage': X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT'],
    # total income divided by (1 + number of children)
    'income_per_child': X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN']),
    # total income divided by the number of family members
    'income_per_person': X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS'],
    # annuity payment relative to the credit amount
    'payment_rate': X['AMT_ANNUITY'] / X['AMT_CREDIT'],
    # DAYS_LAST_PHONE_CHANGE relative to DAYS_BIRTH
    'phone_to_birth_ratio': X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH'],
    # DAYS_LAST_PHONE_CHANGE relative to DAYS_EMPLOYED
    'phone_to_employ_ratio': X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED'],
}

# Write the features onto X one column at a time, in the order declared above.
for feature_name, feature_values in ratio_features.items():
    X[feature_name] = feature_values

In [16]:
# Group-level aggregation recipes.
# Each entry is (groupby_columns, [(value_column, aggregation), ...]): the value
# column is aggregated within every combination of the groupby columns.
AGGREGATION_RECIPIES = [
    (
        ['CODE_GENDER', 'NAME_EDUCATION_TYPE'],
        [
            ('AMT_ANNUITY', 'max'),
            ('AMT_CREDIT', 'max'),
            ('EXT_SOURCE_1', 'mean'),
            ('EXT_SOURCE_2', 'mean'),
            ('OWN_CAR_AGE', 'max'),
            ('OWN_CAR_AGE', 'sum'),
        ],
    ),
    (
        ['CODE_GENDER', 'ORGANIZATION_TYPE'],
        [
            ('AMT_ANNUITY', 'mean'),
            ('AMT_INCOME_TOTAL', 'mean'),
            ('DAYS_REGISTRATION', 'mean'),
            ('EXT_SOURCE_1', 'mean'),
        ],
    ),
    (
        ['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'],
        [
            ('AMT_ANNUITY', 'mean'),
            ('CNT_CHILDREN', 'mean'),
            ('DAYS_ID_PUBLISH', 'mean'),
        ],
    ),
    (
        ['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'],
        [
            ('EXT_SOURCE_1', 'mean'),
            ('EXT_SOURCE_2', 'mean'),
        ],
    ),
    (
        ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'],
        [
            ('AMT_CREDIT', 'mean'),
            ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),
            ('APARTMENTS_AVG', 'mean'),
            ('BASEMENTAREA_AVG', 'mean'),
            ('EXT_SOURCE_1', 'mean'),
            ('EXT_SOURCE_2', 'mean'),
            ('EXT_SOURCE_3', 'mean'),
            ('NONLIVINGAREA_AVG', 'mean'),
            ('OWN_CAR_AGE', 'mean'),
            ('YEARS_BUILD_AVG', 'mean'),
        ],
    ),
    (
        ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'],
        [
            ('ELEVATORS_AVG', 'mean'),
            ('EXT_SOURCE_1', 'mean'),
        ],
    ),
    (
        ['OCCUPATION_TYPE'],
        [
            ('AMT_ANNUITY', 'mean'),
            ('CNT_CHILDREN', 'mean'),
            ('CNT_FAM_MEMBERS', 'mean'),
            ('DAYS_BIRTH', 'mean'),
            ('DAYS_EMPLOYED', 'mean'),
            ('DAYS_ID_PUBLISH', 'mean'),
            ('DAYS_REGISTRATION', 'mean'),
            ('EXT_SOURCE_1', 'mean'),
            ('EXT_SOURCE_2', 'mean'),
            ('EXT_SOURCE_3', 'mean'),
        ],
    ),
]

In [17]:
# Broadcast every group-level statistic listed in AGGREGATION_RECIPIES back onto
# the rows of X, one new column per (groupby columns, value column, aggregation).
#
# groupby(...)[col].transform(agg) is used instead of agg + reset_index + merge:
#   * the result is aligned to X's own index, so no join is needed and X keeps
#     its index (merge would hand back a freshly numbered one);
#   * re-running the cell overwrites the same columns, whereas a repeated merge
#     appends '_x' / '_y' suffixes to the colliding columns and breaks the later
#     X[groupby_aggregate_name] lookups;
#   * it avoids copying the whole frame once per aggregation.
# Rows whose groupby key is missing belong to no group and end up with NaN, the
# same outcome the left merge produced for them.
groupby_aggregate_names = []
for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
    group_object = X.groupby(groupby_cols)
    for select, agg in tqdm(specs):
        # Column name pattern: <groupby columns joined by '_'>_<agg>_<value column>
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        X[groupby_aggregate_name] = group_object[select].transform(agg)
        groupby_aggregate_names.append(groupby_aggregate_name)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):


  0%|          | 0/7 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for select, agg in tqdm(specs):


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
# For every aggregate that is on the same scale as its source column
# (mean / median / max / min -- 'sum' is skipped), add the signed and the
# absolute deviation of each row's own value from its group statistic.
diff_feature_names = []
for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
    for select, agg in tqdm(specs):
        if agg not in ('mean', 'median', 'max', 'min'):
            continue

        # Must match the column names produced by the aggregation cell.
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        diff_name = '{}_diff'.format(groupby_aggregate_name)
        abs_diff_name = '{}_abs_diff'.format(groupby_aggregate_name)

        # Row value minus group statistic; NaN wherever either side is missing.
        deviation = X[select] - X[groupby_aggregate_name]
        X[diff_name] = deviation
        X[abs_diff_name] = np.abs(deviation)

        diff_feature_names.extend([diff_name, abs_diff_name])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):


  0%|          | 0/7 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for select, agg in tqdm(specs):


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [28]:
# Binary flag: 1 where DAYS_EMPLOYED is below -2000, otherwise 0.
# A missing DAYS_EMPLOYED compares as False and therefore maps to 0.
X['long_employment'] = X['DAYS_EMPLOYED'].lt(-2000).astype(int)
# Columns produced by this cell (the name feature_names is reassigned further down).
feature_names = ['long_employment']

In [32]:
# Binary flag: 1 where DAYS_BIRTH is below -14000, otherwise 0.
# NOTE(review): if DAYS_BIRTH counts days backwards from the application date,
# -14000 is only about 38 years -- confirm the threshold matches the column name.
X['retirement_age'] = X['DAYS_BIRTH'].lt(-14000).astype(int)
# Columns associated with this cell; replaces the list set by the long_employment cell.
feature_names = ['DAYS_BIRTH', 'retirement_age']

In [36]:
# Assemble the engineered-feature table from its five column groups.
#
# NOTE(review): X_agg, X_eng, X_age, X_employment and X_diff were not assigned by
# any cell in this notebook, so this cell raised NameError on a fresh kernel
# (Restart & Run All). They are rebuilt here from X and the feature-name lists.
# The selections are a reconstruction based on the saved output of the next cell
# (SK_ID_CURR index, aggregates first, diffs last, TARGET and DAYS_BIRTH still
# present so they can be dropped there) -- confirm against the original intent.
engineered_ratio_names = [
    'annuity_income_percentage',
    'car_to_birth_ratio',
    'car_to_employ_ratio',
    'children_ratio',
    'credit_to_annuity_ratio',
    'credit_to_goods_ratio',
    'credit_to_income_ratio',
    'days_employed_percentage',
    'income_credit_percentage',
    'income_per_child',
    'income_per_person',
    'payment_rate',
    'phone_to_birth_ratio',
    'phone_to_employ_ratio',
]

# set_index returns a new frame, so X itself is left untouched and the cell can
# be re-run; skip it when X is already keyed by SK_ID_CURR.
X_indexed = X if X.index.name == 'SK_ID_CURR' else X.set_index('SK_ID_CURR')

X_agg = X_indexed[['TARGET'] + groupby_aggregate_names]
X_eng = X_indexed[engineered_ratio_names]
X_age = X_indexed[['DAYS_BIRTH', 'retirement_age']]
X_employment = X_indexed[['long_employment']]
X_diff = X_indexed[diff_feature_names]

# All five frames share X_indexed's index, so the column-wise concat is a plain
# side-by-side join.
application_eng = pd.concat([X_agg, X_eng, X_age, X_employment, X_diff], axis=1)

In [37]:
# Remove the TARGET and DAYS_BIRTH columns from the engineered table
# (presumably so the saved features carry neither the label nor the raw age column).
# Re-running this cell on its own raises KeyError, because the columns are
# already gone after the first run.
application_eng = application_eng.drop(['TARGET', 'DAYS_BIRTH'], axis='columns')
# Display the resulting frame (pandas truncates the rendering).
application_eng

Unnamed: 0_level_0,CODE_GENDER_NAME_EDUCATION_TYPE_max_AMT_ANNUITY,CODE_GENDER_NAME_EDUCATION_TYPE_max_AMT_CREDIT,CODE_GENDER_NAME_EDUCATION_TYPE_mean_EXT_SOURCE_1,CODE_GENDER_NAME_EDUCATION_TYPE_mean_EXT_SOURCE_2,CODE_GENDER_NAME_EDUCATION_TYPE_max_OWN_CAR_AGE,CODE_GENDER_NAME_EDUCATION_TYPE_sum_OWN_CAR_AGE,CODE_GENDER_ORGANIZATION_TYPE_mean_AMT_ANNUITY,CODE_GENDER_ORGANIZATION_TYPE_mean_AMT_INCOME_TOTAL,CODE_GENDER_ORGANIZATION_TYPE_mean_DAYS_REGISTRATION,CODE_GENDER_ORGANIZATION_TYPE_mean_EXT_SOURCE_1,...,OCCUPATION_TYPE_mean_DAYS_ID_PUBLISH_diff,OCCUPATION_TYPE_mean_DAYS_ID_PUBLISH_abs_diff,OCCUPATION_TYPE_mean_DAYS_REGISTRATION_diff,OCCUPATION_TYPE_mean_DAYS_REGISTRATION_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_1_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_1_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_2_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_2_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_3_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_3_abs_diff
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,173704.5,2961000.0,0.385275,0.496320,91.0,521062.0,29725.858312,208566.954548,-4442.857420,0.401004,...,733.826804,733.826804,1061.988820,1061.988820,-0.339016,0.339016,-0.235471,0.235471,-0.362717,0.362717
100003,230161.5,4050000.0,0.578213,0.555785,65.0,171528.0,26463.578694,153236.991435,-4965.608351,0.591029,...,2459.544868,2459.544868,3189.710229,3189.710229,-0.200252,0.200252,0.094487,0.094487,,
100004,173704.5,2961000.0,0.385275,0.496320,91.0,521062.0,28003.280337,180212.879701,-4677.001719,0.426982,...,322.826804,322.826804,449.988820,449.988820,,,0.057493,0.057493,0.227474,0.227474
100006,216589.5,4027680.0,0.535930,0.503957,65.0,349652.0,28160.113645,175499.426888,-4709.777998,0.520902,...,416.826804,416.826804,-5123.011180,5123.011180,,,0.152022,0.152022,,
100007,173704.5,2961000.0,0.385275,0.496320,91.0,521062.0,26599.295455,179284.090909,-4276.931818,0.550301,...,-707.455132,707.455132,64.710229,64.710229,,,-0.205020,0.205020,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,173704.5,2961000.0,0.385275,0.496320,91.0,521062.0,33933.438202,213583.146067,-4288.348315,0.393967,...,660.391502,660.391502,-3911.408852,3911.408852,-0.314011,0.314011,0.186275,0.186275,,
456252,216589.5,4027680.0,0.535930,0.503957,65.0,349652.0,23592.609662,133968.829812,-6692.617415,0.712356,...,,,,,,,,,,
456253,230161.5,4050000.0,0.578213,0.555785,65.0,171528.0,26463.578694,153236.991435,-4965.608351,0.591029,...,-2237.159375,2237.159375,-2321.812830,2321.812830,0.189994,0.189994,-0.027934,0.027934,-0.280824,0.280824
456254,216589.5,4027680.0,0.535930,0.503957,65.0,349652.0,27064.791182,175089.798898,-4905.762167,0.524846,...,1922.826804,1922.826804,2147.988820,2147.988820,,,0.015743,0.015743,0.158931,0.158931


In [39]:
# Persist the engineered features as a Parquet file in the working directory.
# index=True stores the frame's index in the file alongside the columns.
output_file_path = 'application_eng.parquet'
application_eng.to_parquet(output_file_path, index=True)