In [1]:
import pandas as pd

from utils import path_join, load_train_test

In [2]:
train_filename = "application_train.csv"
test_filename = "application_test.csv"
path_to_data = "../data/"

### load train/test

In [3]:
df_application_train = load_train_test(
    filename=train_filename,
    path_to_train_folder=path_to_data,
)
df_application_test = load_train_test(
    filename=test_filename,
    path_to_train_folder=path_to_data,
)

### binary features

In [4]:
binary_columns = df_application_train.columns[df_application_train.nunique() == 2].tolist()

In [5]:
print(f"number of binary features: {len(binary_columns)}")

number of binary features: 37


In [6]:
binary_columns

['TARGET',
 'NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EMERGENCYSTATE_MODE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

In [7]:
# target included
df_application_train[binary_columns].to_csv(
    path_join(
        filename='application_train_binary.csv',
        path_to_folder=path_to_data,
    )
)

# target excluded
df_application_test[binary_columns[1:]].to_csv(
    path_join(
        filename='application_test_binary.csv',
        path_to_folder=path_to_data,
    )
)

### categorical features

In [8]:
categorical_columns = [
    'CODE_GENDER',
    
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
               
    'OCCUPATION_TYPE',
    'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
]

In [9]:
print(f"number of categorical features: {len(categorical_columns)}")

number of categorical features: 14


In [10]:
categorical_columns

['CODE_GENDER',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE']

In [11]:
# add target column
df_application_train[['TARGET'] + categorical_columns].to_csv(
    path_join(
        filename='application_train_categorical.csv',
        path_to_folder=path_to_data,
    )
)

df_application_test[categorical_columns].to_csv(
    path_join(
        filename='application_test_categorical.csv',
        path_to_folder=path_to_data,
    )
)

### numerical features

In [12]:
numerical_columns = list(
    set(df_application_train.columns) - set(binary_columns + categorical_columns)
)

In [13]:
print(f"number of numerical features: {len(numerical_columns)}")

number of numerical features: 70


In [14]:
numerical_columns

['HOUR_APPR_PROCESS_START',
 'NONLIVINGAREA_MODE',
 'LIVINGAREA_MEDI',
 'DAYS_EMPLOYED',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'DAYS_LAST_PHONE_CHANGE',
 'NONLIVINGAPARTMENTS_MODE',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MODE',
 'FLOORSMIN_AVG',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'FLOORSMAX_MODE',
 'EXT_SOURCE_1',
 'NONLIVINGAPARTMENTS_MEDI',
 'ENTRANCES_MEDI',
 'APARTMENTS_MODE',
 'COMMONAREA_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MEDI',
 'ELEVATORS_MEDI',
 'ELEVATORS_MODE',
 'REGION_POPULATION_RELATIVE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'NONLIVINGAPARTMENTS_AVG',
 'COMMONAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'BASEMENTAREA_MEDI',
 'FLOORSMAX_AVG',
 'DAYS_BIRTH',
 'BASEMENTAREA_MODE',
 'DAYS_REGISTRATION',
 'OWN_CAR_AGE',
 'FLOORSMIN_MEDI',
 'YEARS_BUILD_MEDI',
 'APARTMENTS_MEDI',
 'ELEVATORS_AVG',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'LANDAREA_AVG',
 'AMT_GOODS_PRICE',
 'APARTMENTS_AVG',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'YEARS_BUILD_AVG',
 'AMT_CREDIT',
 'T

In [15]:
# add target column
df_application_train[['TARGET'] + numerical_columns].to_csv(
    path_join(
        filename='application_train_numerical.csv',
        path_to_folder=path_to_data,
    )
)

df_application_test[numerical_columns].to_csv(
    path_join(
        filename='application_test_numerical.csv',
        path_to_folder=path_to_data,
    )
)