In [1]:
import pandas as pd
import numpy as np
import os
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

##### Function to reduce memory usage

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * NumPy signed-integer columns are cast to the narrowest of
      int8/int16/int32 whose range contains the column's min and max
      (bounds are inclusive); otherwise they are left as they are.
    * NumPy float columns are cast to float16 or float32 when min and max
      lie inside that type's finite range, else to float64.
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes, ...) is left untouched.

    Note: only the value *range* is checked for floats, not precision.
    float16 keeps roughly three significant decimal digits, so values are
    rounded when a column is stored in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints ('i') and floats ('f') are downcast.
        # Anything else would either be made larger (bool -> float16) or fail
        # on min()/max() or on the numeric comparisons below (unordered
        # category, datetime).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # No smaller float fits (also the case when min/max are NaN
                # or infinite, since every comparison above is then False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

def import_data(file):
    """Read a CSV into a DataFrame and shrink it with ``reduce_mem_usage``.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after dtype downcasting.
    """
    # `keep_date_col` is not passed: it only matters when `parse_dates`
    # combines several columns, which is not the case here, and the keyword
    # is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

#### Function to find correlation

In [3]:
#Removing highly correlated field in TMP_APP_TRAIN & TEST
def cor_find(df,threshold):
    """Find columns whose absolute correlation with an earlier column reaches ``threshold``.

    Only numeric (including bool) columns take part in the correlation;
    other dtypes such as ``category`` are ignored instead of making
    ``DataFrame.corr`` raise on recent pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    threshold : float
        A column is flagged when ``abs(corr) >= threshold`` against any
        column that precedes it.

    Returns
    -------
    corr_matrix : pandas.DataFrame
        Absolute pairwise correlation matrix of the numeric columns.
    upper : pandas.DataFrame
        ``corr_matrix`` with the diagonal and lower triangle set to NaN.
    to_drop : list
        Names of the flagged columns, in column order.
    """
    numeric_mask = [pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes]
    corr_matrix = df.loc[:, numeric_mask].corr().abs()
    # Builtin `bool` replaces the `np.bool` alias, which newer NumPy no longer provides.
    strictly_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(strictly_upper)
    # upper[column] only holds correlations against columns that come before it.
    to_drop = [column for column in upper.columns if (upper[column] >= threshold).any()]
    return corr_matrix,upper,to_drop

#### Read all intermediate files prepared

In [4]:
# Load the intermediate CSVs from the current working directory.
# import_data() reads each file and downcasts its dtypes via reduce_mem_usage().
app_train = import_data('tmp_app_train.csv')
app_test = import_data('tmp_app_test.csv')
burea = import_data('tmp_burea.csv')
credit_card = import_data('tmp_credit_card.csv')
installment = import_data('tmp_installment.csv')
previous_application = import_data('tmp_previous_application.csv')
# NOTE(review): this frame is read with plain read_csv, so it is the only one
# that is not downcast — confirm whether that is intentional.
pos_cash_balance = pd.read_csv('tmp_pos_cash_balance.csv')

Memory usage of dataframe is 239.08 MB
Memory usage after optimization is: 52.45 MB
Decreased by 78.1%
Memory usage of dataframe is 37.56 MB
Memory usage after optimization is: 8.27 MB
Decreased by 78.0%
Memory usage of dataframe is 137.66 MB
Memory usage after optimization is: 37.62 MB
Decreased by 72.7%
Memory usage of dataframe is 119.29 MB
Memory usage after optimization is: 46.40 MB
Decreased by 61.1%
Memory usage of dataframe is 126.95 MB
Memory usage after optimization is: 54.73 MB
Decreased by 56.9%
Memory usage of dataframe is 537.74 MB
Memory usage after optimization is: 111.17 MB
Decreased by 79.3%


In [5]:
installment.shape

(339587, 49)

In [6]:
pos_cash_balance.shape

(337252, 33)

In [7]:
credit_card.shape

(102866, 152)

### Find features with correlation 1

In [8]:
# Threshold 1: flag installment columns whose absolute correlation with an
# earlier column is >= 1.
corr_matrix,upper,to_drop = cor_find(installment,1)

In [9]:
to_drop

['MAX_NUM_INSTALMENT_VERSION',
 'MAX_NUM_INSTALMENT_NUMBER',
 'MAX_DAYS_INSTALMENT',
 'MAX_DAYS_ENTRY_PAYMENT',
 'MAX_AMT_INSTALMENT',
 'MAX_AMT_PAYMENT',
 'MAX_DAYS_DIFF',
 'MAX_AMT_DIFF',
 'MAX_AMT_DAYS_DIFF']

In [10]:
# Uses whatever `to_drop` currently holds; must run right after the
# cor_find(installment, ...) cell.
installment = installment.drop(columns=to_drop)

In [11]:
# Same check for credit_card; overwrites corr_matrix, upper and to_drop.
corr_matrix,upper,to_drop = cor_find(credit_card,1)

In [12]:
to_drop

['LATEST_LATEST_MONTHS_BALANCE_y',
 'LATEST_LATEST_AMT_BALANCE_y',
 'LATEST_LATEST_AMT_CREDIT_LIMIT_ACTUAL_y',
 'LATEST_LATEST_AMT_DRAWINGS_ATM_CURRENT_y',
 'LATEST_LATEST_AMT_DRAWINGS_OTHER_CURRENT_y',
 'LATEST_LATEST_AMT_DRAWINGS_POS_CURRENT_y',
 'LATEST_LATEST_AMT_PAYMENT_CURRENT_y',
 'LATEST_LATEST_CNT_DRAWINGS_ATM_CURRENT_y',
 'LATEST_LATEST_CNT_DRAWINGS_CURRENT_y',
 'LATEST_LATEST_CNT_DRAWINGS_OTHER_CURRENT_y',
 'LATEST_LATEST_CNT_INSTALMENT_MATURE_CUM_y',
 'LATEST_LATEST_SK_DPD_y',
 'LATEST_LATEST_SK_DPD_DEF_y',
 'MEAN_AMT_PAYMENT_TOTAL_CURRENT.1',
 'MAX_MONTHS_BALANCE',
 'MAX_AMT_PAYMENT_TOTAL_CURRENT.1',
 'MIN_AMT_PAYMENT_TOTAL_CURRENT.1',
 'MIN_CNT_DRAWINGS_OTHER_CURRENT',
 'SUM_AMT_PAYMENT_TOTAL_CURRENT.1',
 'VAR_AMT_PAYMENT_TOTAL_CURRENT.1',
 'DEFAULT_RATE_CC_STATUS_y']

In [13]:
# Uses whatever `to_drop` currently holds; must run right after the
# cor_find(credit_card, ...) cell.
credit_card = credit_card.drop(columns=to_drop)

In [14]:
credit_card.shape

(102866, 131)

In [15]:
# # ## Removing dummy variable
# app_train = app_train.drop(columns=['NAME_CONTRACT_TYPE_Revolving loans'])
# app_test = app_test.drop(columns=['NAME_CONTRACT_TYPE_Revolving loans'])

In [16]:
# Same check for previous_application; overwrites corr_matrix, upper and to_drop.
corr_matrix,upper,to_drop = cor_find(previous_application,1)

In [17]:
to_drop

['PRODUCT_COMBINATION_MISSING', 'AMT_DOWN_PAY_NAN', 'AMT_ANNUITY_NAN']

In [18]:
upper['PRODUCT_COMBINATION_MISSING'].sort_values(ascending=False).head()

NAME_CONTRACT_TYPE_XNA               1.000000
NAME_SELLER_INDUSTRY_Connectivity    0.072663
CHANNEL_TYPE_Country-wide            0.050581
NAME_YIELD_GROUP_XNA                 0.048578
NAME_PORTFOLIO_XNA                   0.045192
Name: PRODUCT_COMBINATION_MISSING, dtype: float64

In [19]:
# Uses whatever `to_drop` currently holds; must run right after the
# cor_find(previous_application, ...) cell.
previous_application = previous_application.drop(columns=to_drop)

In [20]:
previous_application.shape

(338857, 205)

In [21]:
# Same check for pos_cash_balance; overwrites corr_matrix, upper and to_drop.
corr_matrix,upper,to_drop = cor_find(pos_cash_balance,1)

In [22]:
to_drop

['SUM_DEFAULT_RATE_POS_STATUS']

In [23]:
# Uses whatever `to_drop` currently holds; must run right after the
# cor_find(pos_cash_balance, ...) cell.
pos_cash_balance = pos_cash_balance.drop(columns=to_drop)

#### Joining all datasets based on SK_ID_CURR and adding NULL FLAGS

In [24]:
# Left-join the `burea` frame onto train and test by SK_ID_CURR.
# A left join keeps every application row; applications with no match get NaN
# in all `burea` columns.
app_train = pd.merge(app_train,burea,how='left',on=['SK_ID_CURR'])
app_test = pd.merge(app_test,burea,how='left',on=['SK_ID_CURR'])

In [25]:
burea.head()

Unnamed: 0,SK_ID_CURR,BUREAU_MONTHS_x,LATEST_STATUS_0,LATEST_STATUS_1,LATEST_STATUS_2,LATEST_STATUS_3,LATEST_STATUS_4,LATEST_STATUS_5,LATEST_STATUS_C,LATEST_STATUS_X,DAYS_CREDIT_UPDATE,TOTAL_SUM_AMT_CREDIT_SUM,TOTAL_SUM_AMT_CREDIT_SUM_DEBT,BUREAU_MAX_AMT_CREDIT_MAX_OVERDUE,BUREAU_MAX_AMT_CREDIT_SUM,BUREAU_MAX_AMT_CREDIT_SUM_DEBT,BUREAU_MAX_AMT_CREDIT_SUM_LIMIT,BUREAU_MAX_AMT_CREDIT_SUM_OVERDUE,BADDEBT_SUM_AMT_CREDIT_SUM,BADDEBT_SUM_AMT_CREDIT_SUM_DEBT,CLOSED_SUM_AMT_CREDIT_SUM,CLOSED_SUM_AMT_CREDIT_SUM_DEBT,ACTIVE_SUM_AMT_CREDIT_SUM,ACTIVE_SUM_AMT_CREDIT_SUM_DEBT,BUREAU_AMT_ANNUITY_SUM,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,BUREAU_SUM_CNT_CREDIT_PROLONG,BUREAU_CNT_CREDIT_TYPE_Another type of loan,BUREAU_CNT_CREDIT_TYPE_Car loan,BUREAU_CNT_CREDIT_TYPE_Cash loan (non-earmarked),BUREAU_CNT_CREDIT_TYPE_Consumer credit,BUREAU_CNT_CREDIT_TYPE_Credit card,BUREAU_CNT_CREDIT_TYPE_Interbank credit,BUREAU_CNT_CREDIT_TYPE_Loan for business development,BUREAU_CNT_CREDIT_TYPE_Loan for purchase of shares (margin lending),BUREAU_CNT_CREDIT_TYPE_Loan for the purchase of equipment,BUREAU_CNT_CREDIT_TYPE_Loan for working capital replenishment,BUREAU_CNT_CREDIT_TYPE_Microloan,BUREAU_CNT_CREDIT_TYPE_Mobile operator loan,BUREAU_CNT_CREDIT_TYPE_Mortgage,BUREAU_CNT_CREDIT_TYPE_Real estate loan,BUREAU_CNT_CREDIT_TYPE_Unknown type of loan,BUREAU_CNT_CREDIT_ACTIVE_Active,BUREAU_CNT_CREDIT_ACTIVE_Bad debt,BUREAU_CNT_CREDIT_ACTIVE_Closed,BUREAU_CNT_CREDIT_ACTIVE_Sold,BUREAU_TOTAL_COUNT,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,BUREAU_MONTHS_y
0,100001,19.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-6,1453365.0,596686.5,-1.0,378000.0,373239.0,0.0,0.0,0.0,0.0,569340.0,0.0,884025.0,596686.5,24817.5,-49,0,1778.0,-544.0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,3,0,4,0,7,31.0,1.0,0.0,0.0,0.0,0.0,110.0,30.0,172.0
1,100002,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-7,865055.6,245781.0,5043.64502,450000.0,245781.0,31988.564453,0.0,0.0,0.0,383067.0,0.0,481988.5625,245781.0,0.0,-103,0,780.0,-36.0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0,2,0,6,0,8,45.0,27.0,0.0,0.0,0.0,0.0,23.0,15.0,110.0
2,100003,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-43,1017400.0,0.0,0.0,810000.0,0.0,810000.0,0.0,0.0,0.0,207400.5,0.0,810000.0,0.0,0.0,-606,0,1216.0,-540.0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,1,0,3,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100004,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-382,189037.8,0.0,0.0,94537.796875,0.0,0.0,0.0,0.0,0.0,189037.796875,0.0,0.0,0.0,0.0,-408,0,-382.0,-382.0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100005,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11,657126.0,568408.5,0.0,568800.0,543087.0,0.0,0.0,0.0,0.0,58500.0,0.0,598626.0,568408.5,4261.5,-62,0,1324.0,-123.0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,3,14.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,21.0


In [26]:
# BUREAU_FLAG is -1 where BUREAU_MONTHS_x is null after the merge and 1 otherwise.
# Presumably a null means no `burea` row matched — this assumes the column has
# no NaN of its own in the source file; TODO confirm.
app_train['BUREAU_FLAG'] = np.where(app_train['BUREAU_MONTHS_x'].isnull(),-1,1)
app_test['BUREAU_FLAG'] = np.where(app_test['BUREAU_MONTHS_x'].isnull(),-1,1)
# Disabled: dropping SK_ID_BUREAU (kept for reference).
#app_train = app_train.drop(columns=['SK_ID_BUREAU'],axis = 1)
#app_test = app_test.drop(columns=['SK_ID_BUREAU'],axis = 1)

In [27]:
# Left-join the credit_card frame onto train and test by SK_ID_CURR.
app_train = pd.merge(app_train,credit_card,how='left',on=['SK_ID_CURR'])
app_test = pd.merge(app_test,credit_card,how='left',on=['SK_ID_CURR'])
# CC_FLAG is -1 where CC_SK_DPD_DEF_COUNT is null after the merge and 1 otherwise.
# Presumably CC_SK_DPD_DEF_COUNT comes from credit_card and is never NaN there —
# TODO confirm.
app_train['CC_FLAG'] = np.where(app_train['CC_SK_DPD_DEF_COUNT'].isnull(),-1,1)
app_test['CC_FLAG'] = np.where(app_test['CC_SK_DPD_DEF_COUNT'].isnull(),-1,1)

In [28]:
installment.shape

(339587, 40)

In [29]:
installment.head()

Unnamed: 0,SK_ID_CURR,MEAN_NUM_INSTALMENT_VERSION,MEAN_NUM_INSTALMENT_NUMBER,MEAN_DAYS_INSTALMENT,MEAN_DAYS_ENTRY_PAYMENT,MEAN_AMT_INSTALMENT,MEAN_AMT_PAYMENT,MEAN_DAYS_DIFF,MEAN_AMT_DIFF,MEAN_AMT_DAYS_DIFF,VAR_NUM_INSTALMENT_VERSION,VAR_NUM_INSTALMENT_NUMBER,VAR_DAYS_INSTALMENT,VAR_DAYS_ENTRY_PAYMENT,VAR_AMT_INSTALMENT,VAR_AMT_PAYMENT,VAR_DAYS_DIFF,VAR_AMT_DIFF,SUM_NUM_INSTALMENT_VERSION,SUM_NUM_INSTALMENT_NUMBER,SUM_DAYS_INSTALMENT,SUM_DAYS_ENTRY_PAYMENT,SUM_AMT_INSTALMENT,SUMAMT_PAYMENT,SUM_DAYS_DIFF,SUM_AMT_DIFF,SUM_AMT_DAYS_DIFF,MIN_NUM_INSTALMENT_VERSION,MIN_NUM_INSTALMENT_NUMBER,MIN_DAYS_INSTALMENT,MIN_DAYS_ENTRY_PAYMENT,MIN_AMT_INSTALMENT,MIN_AMT_PAYMENT,MIN_DAYS_DIFF,MIN_AMT_DIFF,MIN_AMT_DAYS_DIFF,INST_NUM_NULL_INSTS,INST_NUM_NULL_APPS,SUM_DPD,SUM_DBD
0,100001,2.0,4,2916.0,2916.0,17397.900391,17397.900391,11.0,0.0,-0.0,0.142822,1.238281,427702.5625,414612.65625,25772650.0,25772650.0,213.904755,0.0,8.0,19,15314.0,15365.0,41195.93,41195.93,-51.0,0.0,0.0,1.0,1,1619.0,1628.0,3951.0,3951.0,-36.0,0.0,-0.0,0.0,0.0,51.0,0.0
1,100002,2.0,19,565.0,587.0,53093.746094,53093.746094,-12.0,0.0,-0.0,0.052643,31.671875,28500.0,29604.257812,101164100.0,101164100.0,24.257311,0.0,20.0,190,5605.0,5993.0,219625.7,219625.7,-388.0,0.0,0.0,1.0,1,25.0,49.0,9251.775391,9251.775391,-31.0,0.0,-0.0,0.0,0.0,388.0,0.0
2,100003,2.0,12,2310.0,2324.0,560835.375,560835.375,-1.0,0.0,-0.0,0.040009,9.828125,573735.875,573541.8125,12219660000.0,12219660000.0,13.89,0.0,26.0,127,34454.0,34633.0,1618865.0,1618865.0,-179.0,0.0,0.0,1.0,1,536.0,544.0,6662.970215,6662.970215,-14.0,0.0,-0.0,0.0,0.0,179.0,0.0
3,100004,2.0,3,784.0,795.0,10573.964844,10573.964844,-3.0,0.0,-0.0,0.333252,1.0,900.0,1157.333374,9071372.0,9071372.0,17.333334,0.0,4.0,6,2262.0,2285.0,21288.46,21288.46,-23.0,0.0,0.0,1.0,1,724.0,727.0,5357.25,5357.25,-11.0,0.0,-0.0,0.0,0.0,23.0,0.0
4,100005,2.0,9,706.0,736.0,17656.244141,17656.244141,1.0,0.0,-0.0,0.111084,7.5,6750.0,8200.027344,18327090.0,18327090.0,182.527771,0.0,10.0,45,5274.0,5486.0,56161.84,56161.84,-212.0,0.0,0.0,1.0,1,466.0,470.0,4813.200195,4813.200195,-37.0,0.0,-0.0,0.0,0.0,212.0,0.0


In [30]:
# Left-join the installment frame onto train and test by SK_ID_CURR.
app_train = pd.merge(app_train,installment,how='left',on=['SK_ID_CURR'])
app_test = pd.merge(app_test,installment,how='left',on=['SK_ID_CURR'])
# INSTALLMENT_FLAG is -1 where SUM_NUM_INSTALMENT_NUMBER is null after the merge
# and 1 otherwise.
app_train['INSTALLMENT_FLAG'] = np.where(app_train['SUM_NUM_INSTALMENT_NUMBER'].isnull(),-1,1)
app_test['INSTALLMENT_FLAG'] = np.where(app_test['SUM_NUM_INSTALMENT_NUMBER'].isnull(),-1,1)

In [31]:
# Left-join the pos_cash_balance frame onto train and test by SK_ID_CURR.
app_train = pd.merge(app_train,pos_cash_balance,how='left',on=['SK_ID_CURR'])
app_test = pd.merge(app_test,pos_cash_balance,how='left',on=['SK_ID_CURR'])

In [32]:
# POS_FLAG is -1 where SUM_SK_DPD is null and 1 otherwise.
# Presumably SUM_SK_DPD comes from pos_cash_balance — TODO confirm.
app_train['POS_FLAG'] = np.where(app_train['SUM_SK_DPD'].isnull(),-1,1)
app_test['POS_FLAG'] = np.where(app_test['SUM_SK_DPD'].isnull(),-1,1)

In [33]:
# Left-join the previous_application frame onto train and test by SK_ID_CURR.
# Column names present on both sides (other than the key) receive pandas'
# default _x / _y suffixes.
app_train = pd.merge(app_train,previous_application,how='left',on=['SK_ID_CURR'])
app_test = pd.merge(app_test,previous_application,how='left',on=['SK_ID_CURR'])

In [34]:
# PREV_APP_FLAG is -1 where CNT_PREV_APP is null and 1 otherwise.
# Presumably CNT_PREV_APP comes from previous_application — TODO confirm.
app_train['PREV_APP_FLAG'] = np.where(app_train['CNT_PREV_APP'].isnull(),-1,1)
app_test['PREV_APP_FLAG'] = np.where(app_test['CNT_PREV_APP'].isnull(),-1,1)

### Functions for categorical encoding & finding missing values

In [35]:
#Encoding categorical variables
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def encoding_cat_vars(df,le):
    """Label-encode low-cardinality columns, then one-hot encode the rest.

    Every column whose dtype compares equal to 'object' or 'bool' is inspected.
    If it has at most two distinct values it is replaced in place (on the
    passed-in ``df``) by ``le.transform`` after ``le.fit``; otherwise it is
    only counted. Afterwards ``pd.get_dummies`` is applied to the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; its qualifying columns are overwritten.
    le : object
        Encoder exposing ``fit(values)`` and ``transform(values)``; it is
        re-fitted for every encoded column.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies`` on the partly label-encoded frame.
    """
    label_encoded = []
    left_for_one_hot = []

    for col in df:
        # Skip anything that is neither object- nor bool-typed.
        if not (df[col].dtype == 'object' or df[col].dtype =='bool'):
            continue

        # More than two distinct values: leave the column for get_dummies.
        if len(list(df[col].unique())) > 2:
            left_for_one_hot.append(col)
            continue

        # Fit and transform on this same column.
        le.fit(df[col])
        df[col] = le.transform(df[col])
        label_encoded.append(col)

    print ("no of columns label encoded : "+str(len(label_encoded)))
    df = pd.get_dummies(df)
    print ("no of columns one hot encoded encoded : "+str(len(left_for_one_hot)))
    return df

In [36]:
## Missing values###########
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints the total number of columns and how many of them contain missing
    values, then returns a table indexed by column name with two columns,
    'Missing Values' (count) and '% of Total Values' (share of rows, rounded
    to one decimal). Only columns with a non-zero share are listed, sorted by
    share in descending order.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Side-by-side table; the positional labels 0 and 1 are renamed below.
    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only affected columns, worst first.
    affected = summary.iloc[:, 1] != 0
    summary = summary[affected].sort_values('% of Total Values', ascending=False).round(1)

    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary
###############################

### Adding some features - copied from Kaggle kernel

In [37]:
# Pairwise products and the triple product of the three EXT_SOURCE columns (train).
# A product is NaN whenever any factor is NaN.
app_train['EXT_SOURCE_1  EXT_SOURCE_3'] = app_train['EXT_SOURCE_1'] * app_train['EXT_SOURCE_3']
app_train['EXT_SOURCE_2  EXT_SOURCE_3'] = app_train['EXT_SOURCE_2'] * app_train['EXT_SOURCE_3']
app_train['EXT_SOURCE_1  EXT_SOURCE_2']= app_train['EXT_SOURCE_1'] * app_train['EXT_SOURCE_2']
app_train['EXT_SOURCE prod'] = app_train['EXT_SOURCE_1'] * app_train['EXT_SOURCE_2'] * app_train['EXT_SOURCE_3']

In [38]:
# Same EXT_SOURCE products as built for app_train, applied to the test frame.
app_test['EXT_SOURCE_1  EXT_SOURCE_3'] = app_test['EXT_SOURCE_1'] * app_test['EXT_SOURCE_3']
app_test['EXT_SOURCE_2  EXT_SOURCE_3'] = app_test['EXT_SOURCE_2'] * app_test['EXT_SOURCE_3']
app_test['EXT_SOURCE_1  EXT_SOURCE_2']= app_test['EXT_SOURCE_1'] * app_test['EXT_SOURCE_2']
app_test['EXT_SOURCE prod'] = app_test['EXT_SOURCE_1'] * app_test['EXT_SOURCE_2'] * app_test['EXT_SOURCE_3']

In [39]:
# Each EXT_SOURCE column divided by DAYS_BIRTH (train).
app_train['EXT_SOURCE_1 / DAYS_BIRTH'] = app_train['EXT_SOURCE_1'] / app_train['DAYS_BIRTH']
app_train['EXT_SOURCE_2 / DAYS_BIRTH'] = app_train['EXT_SOURCE_2'] / app_train['DAYS_BIRTH']
app_train['EXT_SOURCE_3 / DAYS_BIRTH'] = app_train['EXT_SOURCE_3'] / app_train['DAYS_BIRTH']

In [40]:
# Each EXT_SOURCE column divided by DAYS_BIRTH (test).
app_test['EXT_SOURCE_1 / DAYS_BIRTH'] = app_test['EXT_SOURCE_1'] / app_test['DAYS_BIRTH']
app_test['EXT_SOURCE_2 / DAYS_BIRTH'] = app_test['EXT_SOURCE_2'] / app_test['DAYS_BIRTH']
app_test['EXT_SOURCE_3 / DAYS_BIRTH'] = app_test['EXT_SOURCE_3'] / app_test['DAYS_BIRTH']

In [41]:
# Ratio features on the training frame.
app_train['NEW_CREDIT_TO_ANNUITY_RATIO'] = app_train['AMT_CREDIT'] / app_train['AMT_ANNUITY']
# The +1 keeps the denominator non-zero when CNT_CHILDREN is 0.
app_train['NEW_INC_PER_CHLD'] = app_train['AMT_INCOME_TOTAL'] / (1 + app_train['CNT_CHILDREN'])
# Row-wise standard deviation across the three EXT_SOURCE columns ...
app_train['NEW_SCORES_STD'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
# ... with rows where it is NaN filled by the column's mean over app_train.
app_train['NEW_SCORES_STD'] = app_train['NEW_SCORES_STD'].fillna(app_train['NEW_SCORES_STD'].mean())
app_train['NEW_ANNUITY_TO_INCOME_RATIO'] = app_train['AMT_ANNUITY'] / ( app_train['AMT_INCOME_TOTAL'])
app_train['NEW_CREDIT_TO_INCOME_RATIO'] = app_train['AMT_CREDIT'] / app_train['AMT_INCOME_TOTAL']

In [42]:
# Ratio features on the test frame (same formulas as used for app_train).
app_test['NEW_CREDIT_TO_ANNUITY_RATIO'] = app_test['AMT_CREDIT'] / app_test['AMT_ANNUITY']
app_test['NEW_INC_PER_CHLD'] = app_test['AMT_INCOME_TOTAL'] / (1 + app_test['CNT_CHILDREN'])
app_test['NEW_SCORES_STD'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
# NOTE(review): the fill value is the mean of the test frame itself, not the
# training mean, so train and test are imputed with different constants —
# confirm this is intended.
app_test['NEW_SCORES_STD'] = app_test['NEW_SCORES_STD'].fillna(app_test['NEW_SCORES_STD'].mean())
app_test['NEW_ANNUITY_TO_INCOME_RATIO'] = app_test['AMT_ANNUITY'] / (app_test['AMT_INCOME_TOTAL'])
app_test['NEW_CREDIT_TO_INCOME_RATIO'] = app_test['AMT_CREDIT'] / app_test['AMT_INCOME_TOTAL']

In [43]:
# Sum of the three EXT_SOURCE columns. Built with `+`, so the result is NaN as
# soon as any one of the three is NaN.
app_train['EXT_SOURCE sum'] = (app_train['EXT_SOURCE_1'] + app_train['EXT_SOURCE_2'] + app_train['EXT_SOURCE_3'])
app_test['EXT_SOURCE sum'] = (app_test['EXT_SOURCE_1'] + app_test['EXT_SOURCE_2'] + app_test['EXT_SOURCE_3'])

In [44]:
# Row-wise mean of the three EXT_SOURCE columns. DataFrame.mean skips NaN by
# default, so it averages whichever values are present.
app_train['EXT_SOURCE_mean'] = app_train[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].mean(axis=1)
app_test['EXT_SOURCE_mean'] = app_test[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].mean(axis=1)

In [45]:
# EXT_SOURCE_1 + EXT_SOURCE_2 - EXT_SOURCE_3.
app_train['EXT_SOURCE - EXT_SOURCE_3'] = app_train['EXT_SOURCE_1'] + app_train['EXT_SOURCE_2'] - app_train['EXT_SOURCE_3']
app_test['EXT_SOURCE - EXT_SOURCE_3'] = app_test['EXT_SOURCE_1'] + app_test['EXT_SOURCE_2'] - app_test['EXT_SOURCE_3']

# Triple product divided by the row mean. Needs the 'EXT_SOURCE prod' and
# 'EXT_SOURCE_mean' columns created in earlier cells.
app_test['EXT_SOURCE / mean'] = app_test['EXT_SOURCE prod'] / app_test['EXT_SOURCE_mean']
app_train['EXT_SOURCE / mean'] = app_train['EXT_SOURCE prod'] / app_train['EXT_SOURCE_mean']

In [46]:
# Pairwise ratios of the EXT_SOURCE columns (train). A zero denominator gives
# +/-inf, which a later cell replaces with NaN.
app_train['EXT_SOURCE_1 / EXT_SOURCE_2'] = app_train['EXT_SOURCE_1'] / app_train['EXT_SOURCE_2']
app_train['EXT_SOURCE_2 / EXT_SOURCE_3'] = app_train['EXT_SOURCE_2'] / app_train['EXT_SOURCE_3']
app_train['EXT_SOURCE_1 / EXT_SOURCE_3'] = app_train['EXT_SOURCE_1'] / app_train['EXT_SOURCE_3']

# Same ratios for the test frame.
app_test['EXT_SOURCE_1 / EXT_SOURCE_2'] = app_test['EXT_SOURCE_1'] / app_test['EXT_SOURCE_2']
app_test['EXT_SOURCE_2 / EXT_SOURCE_3'] = app_test['EXT_SOURCE_2'] / app_test['EXT_SOURCE_3']
app_test['EXT_SOURCE_1 / EXT_SOURCE_3'] = app_test['EXT_SOURCE_1'] / app_test['EXT_SOURCE_3']



In [47]:
# AMT_CREDIT divided by AMT_GOODS_PRICE, for train and test.
app_train['NEW_CREDIT_TO_GOODS_RATIO'] = app_train['AMT_CREDIT'] / (app_train['AMT_GOODS_PRICE'])
app_test['NEW_CREDIT_TO_GOODS_RATIO'] = app_test['AMT_CREDIT'] / (app_test['AMT_GOODS_PRICE'])

In [48]:
# DAYS_EMPLOYED divided by DAYS_BIRTH, for train and test.
app_train['NEW_EMPLOY_TO_BIRTH_RATIO'] = app_train['DAYS_EMPLOYED'] / app_train['DAYS_BIRTH']
app_test['NEW_EMPLOY_TO_BIRTH_RATIO'] = app_test['DAYS_EMPLOYED'] / app_test['DAYS_BIRTH']

In [49]:
# DAYS_LAST_PHONE_CHANGE divided by DAYS_BIRTH, for train and test.
app_train['NEW_PHONE_TO_BIRTH_RATIO'] = app_train['DAYS_LAST_PHONE_CHANGE'] / (app_train['DAYS_BIRTH'])
app_test['NEW_PHONE_TO_BIRTH_RATIO'] = app_test['DAYS_LAST_PHONE_CHANGE'] / (app_test['DAYS_BIRTH'])


In [50]:
# NOTE(review): app_INCOME_CHILDREN uses exactly the same formula as the
# NEW_INC_PER_CHLD column created in an earlier cell, so the two columns are
# duplicates — confirm whether both are needed.
app_train['app_INCOME_CHILDREN'] = app_train['AMT_INCOME_TOTAL']/(1+app_train['CNT_CHILDREN'])
# Income divided by (1 + CNT_FAM_MEMBERS).
app_train['app_INCOME_FAM'] = app_train['AMT_INCOME_TOTAL']/(1+app_train['CNT_FAM_MEMBERS'])

In [51]:
# NOTE(review): app_INCOME_CHILDREN uses exactly the same formula as the
# NEW_INC_PER_CHLD column created in an earlier cell — confirm whether both
# are needed.
app_test['app_INCOME_CHILDREN']  = app_test['AMT_INCOME_TOTAL']/(1+app_test['CNT_CHILDREN'])
# Income divided by (1 + CNT_FAM_MEMBERS).
app_test['app_INCOME_FAM']  = app_test['AMT_INCOME_TOTAL']/(1+app_test['CNT_FAM_MEMBERS'])

#### Replacing infinity values

In [52]:
# Turn +inf / -inf (e.g. from the ratio features' zero denominators) into NaN
# so they are treated like any other missing value from here on.
app_train = app_train.replace([np.inf, -np.inf], np.nan)
app_test = app_test.replace([np.inf, -np.inf], np.nan)

In [53]:
# Summarise missing values in the training frame, then move the column names
# out of the index into a regular 'column_name' column with a fresh 0..n-1 index.
missing_values = missing_values_table(app_train)
missing_values = (
    missing_values
    .assign(column_name=missing_values.index)
    .reset_index(drop=True)
)
missing_values

Your selected dataframe has 593 columns.
There are 463 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,column_name
0,220961,71.9,MAX_AMT_RECIVABLE
1,220961,71.9,MEAN_CNT_DRAWINGS_POS_CURRENT
2,220961,71.9,MEAN_CNT_DRAWINGS_CURRENT
3,220961,71.9,MEAN_CNT_DRAWINGS_ATM_CURRENT
4,220961,71.9,MEAN_AMT_TOTAL_RECEIVABLE
5,220961,71.9,MEAN_AMT_RECIVABLE
6,220961,71.9,MEAN_AMT_RECEIVABLE_PRINCIPAL
7,220961,71.9,MEAN_AMT_PAYMENT_TOTAL_CURRENT
8,220961,71.9,MEAN_AMT_PAYMENT_CURRENT
9,220961,71.9,MEAN_AMT_INST_MIN_REGULARITY


###### Imputing missing values with -1

In [54]:
# Fill every remaining NaN in both frames with the same sentinel value.
_missing_fill = -1
app_train = app_train.fillna(value=_missing_fill)
app_test = app_test.fillna(value=_missing_fill)

In [55]:
# Display the (rows, columns) shape of the test frame.
app_test.shape

(48744, 592)

In [56]:
# Display the (rows, columns) shape of the training frame.
app_train.shape

(307228, 593)

In [57]:
# Preview the first two rows of the training frame.
app_train.head(2)

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_11,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOCUMENT_IND_MEAN,DOCUMENT_IND_STD,DOCUMENT_IND_KURT,DOCUMENT_IND_SUM,FLAG_IND_SUM,FLAG_IND_KURT,FLAG_IND_STD,FLAG_IND_MEAN,EXT_SOURCE_MISSING,ORG_CORRECT,ORG_DEFAULT,DEFAULT_RATE_ORG_TYPE,NAME_TYPE_CORRECT,NAME_TYPE_DEFAULT,DEFAULT_RATE_NAME_TYPE,OCCUPATION_TYPE_CORRECT,OCCUPATION_TYPE_DEFAULT,DEFAULT_RATE_OCC_TYPE,INCOME_TYPE_CORRECT,INCOME_TYPE_DEFAULT,DEFAULT_RATE_INCOME_TYPE,EDUCATION_TYPE_CORRECT,EDUCATION_TYPE_DEFAULT,DEFAULT_RATE_EDUCATION_TYPE,FAMILY_STATUS_CORRECT,FAMILY_STATUS_DEFAULT,DEFAULT_RATE_FAMILY_STATUS,HOUSING_TYPE_CORRECT,HOUSING_TYPE_DEFAULT,DEFAULT_RATE_HOUSING_TYPE,TOTALAREA_NAN,EXT_SOURCE_3_NAN,EMERGENCYSTATE_NAN,AMT_REQ_CREDIT_BUREAU_NAN,DEF_SOCIAL_CIRCLE_NAN,COMMONAREA_NAN,NONLIVINGAPARTMENTS_NAN,LIVINGAPARTMENTS_NAN,EXT_SOURCE_1_NAN,EXT_SOURCE_2_NAN,NONLIVINGAREA_NAN,YEARS_BEGINEXPLUATATION_NAN,AMT_ANNUITY_NAN,APARTMENT_FLAG,DAYS_EMPLOYED_ANOM,NAME_CONTRACT_TYPE_Cash loans_x,NAME_CONTRACT_TYPE_Revolving 
loans_x,TARGET,BUREAU_MONTHS_x,LATEST_STATUS_0,LATEST_STATUS_1,LATEST_STATUS_2,LATEST_STATUS_3,LATEST_STATUS_4,LATEST_STATUS_5,LATEST_STATUS_C,LATEST_STATUS_X,DAYS_CREDIT_UPDATE,TOTAL_SUM_AMT_CREDIT_SUM,TOTAL_SUM_AMT_CREDIT_SUM_DEBT,BUREAU_MAX_AMT_CREDIT_MAX_OVERDUE,BUREAU_MAX_AMT_CREDIT_SUM,BUREAU_MAX_AMT_CREDIT_SUM_DEBT,BUREAU_MAX_AMT_CREDIT_SUM_LIMIT,BUREAU_MAX_AMT_CREDIT_SUM_OVERDUE,BADDEBT_SUM_AMT_CREDIT_SUM,BADDEBT_SUM_AMT_CREDIT_SUM_DEBT,CLOSED_SUM_AMT_CREDIT_SUM,CLOSED_SUM_AMT_CREDIT_SUM_DEBT,ACTIVE_SUM_AMT_CREDIT_SUM,ACTIVE_SUM_AMT_CREDIT_SUM_DEBT,BUREAU_AMT_ANNUITY_SUM,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,BUREAU_SUM_CNT_CREDIT_PROLONG,BUREAU_CNT_CREDIT_TYPE_Another type of loan,BUREAU_CNT_CREDIT_TYPE_Car loan,BUREAU_CNT_CREDIT_TYPE_Cash loan (non-earmarked),BUREAU_CNT_CREDIT_TYPE_Consumer credit,BUREAU_CNT_CREDIT_TYPE_Credit card,BUREAU_CNT_CREDIT_TYPE_Interbank credit,BUREAU_CNT_CREDIT_TYPE_Loan for business development,BUREAU_CNT_CREDIT_TYPE_Loan for purchase of shares (margin lending),BUREAU_CNT_CREDIT_TYPE_Loan for the purchase of equipment,BUREAU_CNT_CREDIT_TYPE_Loan for working capital replenishment,BUREAU_CNT_CREDIT_TYPE_Microloan,BUREAU_CNT_CREDIT_TYPE_Mobile operator loan,BUREAU_CNT_CREDIT_TYPE_Mortgage,BUREAU_CNT_CREDIT_TYPE_Real estate loan,BUREAU_CNT_CREDIT_TYPE_Unknown type of loan,BUREAU_CNT_CREDIT_ACTIVE_Active,BUREAU_CNT_CREDIT_ACTIVE_Bad 
debt,BUREAU_CNT_CREDIT_ACTIVE_Closed,BUREAU_CNT_CREDIT_ACTIVE_Sold,BUREAU_TOTAL_COUNT,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,BUREAU_MONTHS_y,BUREAU_FLAG,LATEST_LATEST_MONTHS_BALANCE_x,LATEST_LATEST_AMT_BALANCE_x,LATEST_LATEST_AMT_CREDIT_LIMIT_ACTUAL_x,LATEST_LATEST_AMT_DRAWINGS_ATM_CURRENT_x,LATEST_LATEST_AMT_DRAWINGS_OTHER_CURRENT_x,LATEST_LATEST_AMT_DRAWINGS_POS_CURRENT_x,LATEST_LATEST_AMT_PAYMENT_CURRENT_x,LATEST_LATEST_CNT_DRAWINGS_ATM_CURRENT_x,LATEST_LATEST_CNT_DRAWINGS_CURRENT_x,LATEST_LATEST_CNT_DRAWINGS_OTHER_CURRENT_x,LATEST_LATEST_CNT_INSTALMENT_MATURE_CUM_x,LATEST_LATEST_SK_DPD_x,LATEST_LATEST_SK_DPD_DEF_x,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed,SK_DPD,SK_DPD_DEF,DEFAULT_RATE_CC_STATUS_x,CC_SK_DPD_DEF_COUNT,CC_SK_DPD_COUNT,CNT_CC_PREV_APP,MEAN_MONTHS_BALANCE,MEAN_AMT_BALANCE,MEAN_AMT_CREDIT_LIMIT_ACTUAL,MEAN_AMT_DRAWINGS_ATM_CURRENT,MEAN_AMT_DRAWINGS_CURRENT,MEAN_AMT_DRAWINGS_OTHER_CURRENT,MEAN_AMT_DRAWINGS_POS_CURRENT,MEAN_AMT_INST_MIN_REGULARITY,MEAN_AMT_PAYMENT_CURRENT,MEAN_AMT_PAYMENT_TOTAL_CURRENT,MEAN_AMT_RECEIVABLE_PRINCIPAL,MEAN_AMT_RECIVABLE,MEAN_AMT_TOTAL_RECEIVABLE,MEAN_CNT_DRAWINGS_ATM_CURRENT,MEAN_CNT_DRAWINGS_CURRENT,MEAN_CNT_DRAWINGS_OTHER_CURRENT,MEAN_CNT_DRAWINGS_POS_CURRENT,MEAN_CNT_INSTALMENT_MATURE_CUM,MAX_AMT_BALANCE,MAX_AMT_CREDIT_LIMIT_ACTUAL,MAX_AMT_DRAWINGS_ATM_CURRENT,MAX_AMT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_OTHER_CURRENT,MAX_AMT_DRAWINGS_POS_CURRENT,MAX_AMT_INST_MIN_REGULARITY,MAX_AMT_PAYMENT_CURRENT,MAX_AMT_PAYMENT_TOTAL_CURRENT,MAX_AMT_RECEIVABLE_PRINCIPAL,MAX_AMT_RECIVABLE,MAX_AMT_TOTAL_RECEIVABLE,MAX_CNT_DRAWINGS_ATM_CURRENT,MAX_CNT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_OTHER_CURRENT,MAX_CNT_DRAWINGS_POS_CURRENT,MAX_CNT_INSTALMENT_MATURE_CUM,MIN_MONTHS_BALANCE,MIN_AMT_BALANCE,MIN_AMT_CREDIT_LIMIT_ACTUAL,MIN_AMT_DRAWINGS_ATM_CURRENT,MIN_AMT_DRAWINGS_CURRENT,MIN_AMT_DRAWINGS_OTHER_CURRENT
,MIN_AMT_DRAWINGS_POS_CURRENT,MIN_AMT_INST_MIN_REGULARITY,MIN_AMT_PAYMENT_CURRENT,MIN_AMT_PAYMENT_TOTAL_CURRENT,MIN_AMT_RECEIVABLE_PRINCIPAL,MIN_AMT_RECIVABLE,MIN_AMT_TOTAL_RECEIVABLE,MIN_CNT_DRAWINGS_ATM_CURRENT,MIN_CNT_DRAWINGS_CURRENT,MIN_CNT_DRAWINGS_POS_CURRENT,MIN_CNT_INSTALMENT_MATURE_CUM,SUM_MONTHS_BALANCE,SUM_AMT_BALANCE,SUM_AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_DRAWINGS_ATM_CURRENT,SUM_AMT_DRAWINGS_CURRENT,SUM_AMT_DRAWINGS_OTHER_CURRENT,SUM_AMT_DRAWINGS_POS_CURRENT,SUM_AMT_INST_MIN_REGULARITY,SUM_AMT_PAYMENT_CURRENT,SUM_AMT_PAYMENT_TOTAL_CURRENT,SUM_AMT_RECEIVABLE_PRINCIPAL,SUM_AMT_RECIVABLE,SUM_AMT_TOTAL_RECEIVABLE,SUM_CNT_DRAWINGS_ATM_CURRENT,SUM_CNT_DRAWINGS_CURRENT,SUM_CNT_DRAWINGS_OTHER_CURRENT,SUM_CNT_DRAWINGS_POS_CURRENT,SUM_CNT_INSTALMENT_MATURE_CUM,VAR_MONTHS_BALANCE,VAR_AMT_BALANCE,VAR_AMT_CREDIT_LIMIT_ACTUAL,VAR_AMT_DRAWINGS_ATM_CURRENT,VAR_AMT_DRAWINGS_CURRENT,VAR_AMT_DRAWINGS_OTHER_CURRENT,VAR_AMT_DRAWINGS_POS_CURRENT,VAR_AMT_INST_MIN_REGULARITY,VAR_AMT_PAYMENT_CURRENT,VAR_AMT_PAYMENT_TOTAL_CURRENT,VAR_AMT_RECEIVABLE_PRINCIPAL,VAR_AMT_RECIVABLE,VAR_AMT_TOTAL_RECEIVABLE,VAR_CNT_DRAWINGS_ATM_CURRENT,VAR_CNT_DRAWINGS_CURRENT,VAR_CNT_DRAWINGS_OTHER_CURRENT,VAR_CNT_DRAWINGS_POS_CURRENT,VAR_CNT_INSTALMENT_MATURE_CUM,SUM_AMT_DRAWINGS_CURRENT / CNT_DRAWINGS_CURRENT,SUM_AMT_INST_MIN_REGULARITY / AMT_PAYMENT_TOTAL_CURRENT,SUM_AMT_DRAWINGS_CURRENT / AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL,SUM_AMT_DRAWINGS_ATM_CURRENT / AMT_DRAWINGS_CURRENT,SUM_CNT_DRAWINGS_ATM_CURRENT / CNT_DRAWINGS_CURRENT,MEAN_AMT_DRAWINGS_CURRENT / CNT_DRAWINGS_CURRENT,MEAN_AMT_INST_MIN_REGULARITY / AMT_PAYMENT_TOTAL_CURRENT,MEAN_AMT_DRAWINGS_CURRENT / AMT_CREDIT_LIMIT_ACTUAL,MEAN_AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL,MEAN_AMT_DRAWINGS_ATM_CURRENT / AMT_DRAWINGS_CURRENT,MEAN_CNT_DRAWINGS_ATM_CURRENT / CNT_DRAWINGS_CURRENT,MAX_AMT_DRAWINGS_CURRENT / CNT_DRAWINGS_CURRENT,MAX_AMT_INST_MIN_REGULARITY / AMT_PAYMENT_TOTAL_CURRENT,MAX_AMT_DRAWINGS_CURRENT / 
AMT_CREDIT_LIMIT_ACTUAL,MAX_AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL,MAX_AMT_DRAWINGS_ATM_CURRENT / AMT_DRAWINGS_CURRENT,MAX_CNT_DRAWINGS_ATM_CURRENT / CNT_DRAWINGS_CURRENT,CC_FLAG,MEAN_NUM_INSTALMENT_VERSION,MEAN_NUM_INSTALMENT_NUMBER,MEAN_DAYS_INSTALMENT,MEAN_DAYS_ENTRY_PAYMENT,MEAN_AMT_INSTALMENT,MEAN_AMT_PAYMENT,MEAN_DAYS_DIFF,MEAN_AMT_DIFF,MEAN_AMT_DAYS_DIFF,VAR_NUM_INSTALMENT_VERSION,VAR_NUM_INSTALMENT_NUMBER,VAR_DAYS_INSTALMENT,VAR_DAYS_ENTRY_PAYMENT,VAR_AMT_INSTALMENT,VAR_AMT_PAYMENT,VAR_DAYS_DIFF,VAR_AMT_DIFF,SUM_NUM_INSTALMENT_VERSION,SUM_NUM_INSTALMENT_NUMBER,SUM_DAYS_INSTALMENT,SUM_DAYS_ENTRY_PAYMENT,SUM_AMT_INSTALMENT,SUMAMT_PAYMENT,SUM_DAYS_DIFF,SUM_AMT_DIFF,SUM_AMT_DAYS_DIFF,MIN_NUM_INSTALMENT_VERSION,MIN_NUM_INSTALMENT_NUMBER,MIN_DAYS_INSTALMENT,MIN_DAYS_ENTRY_PAYMENT,MIN_AMT_INSTALMENT,MIN_AMT_PAYMENT,MIN_DAYS_DIFF,MIN_AMT_DIFF,MIN_AMT_DAYS_DIFF,INST_NUM_NULL_INSTS,INST_NUM_NULL_APPS,SUM_DPD,SUM_DBD,INSTALLMENT_FLAG,MEAN_DEFAULT_RATE_POS_STATUS,MAX_DEFAULT_RATE_POS_STATUS,MEAN_SK_DPD,MEAN_SK_DPD_DEF,MAX_CNT_INSTALMENT,MAX_SK_DPD,MAX_SK_DPD_DEF,SUM_SK_DPD,SUM_SK_DPD_DEF,MIN_CNT_INSTALMENT_FUTURE,MEAN_POS_STATUS_Active,MEAN_POS_STATUS_Amortized debt,MEAN_POS_STATUS_Approved,MEAN_POS_STATUS_Canceled,MEAN_POS_STATUS_Completed,MEAN_POS_STATUS_Demand,MEAN_POS_STATUS_Returned to the store,MEAN_POS_STATUS_Signed,SUM_POS_STATUS_Active,SUM_POS_STATUS_Amortized debt,SUM_POS_STATUS_Approved,SUM_POS_STATUS_Canceled,SUM_POS_STATUS_Completed,SUM_POS_STATUS_Demand,SUM_POS_STATUS_Returned to the store,SUM_POS_STATUS_Signed,SIZE_POS_MONTH_BAL,POS_CNT_DIFF_MEAN,POS_CNT_DIFF_MIN,POS_CNT_DIFF_MAX,POS_CNT_DIFF_VAR,POS_FLAG,NAME_CONTRACT_TYPE_Cash loans_y,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans_y,NAME_CONTRACT_TYPE_XNA,NAME_CASH_LOAN_PURPOSE_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_Business development,NAME_CASH_LOAN_PURPOSE_Buying a garage,NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land,NAME_CASH_LOAN_PURPOSE_Buying a 
home,NAME_CASH_LOAN_PURPOSE_Buying a new car,NAME_CASH_LOAN_PURPOSE_Buying a used car,NAME_CASH_LOAN_PURPOSE_Car repairs,NAME_CASH_LOAN_PURPOSE_Education,NAME_CASH_LOAN_PURPOSE_Everyday expenses,NAME_CASH_LOAN_PURPOSE_Furniture,NAME_CASH_LOAN_PURPOSE_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_Hobby,NAME_CASH_LOAN_PURPOSE_Journey,NAME_CASH_LOAN_PURPOSE_Medicine,NAME_CASH_LOAN_PURPOSE_Money for a third person,NAME_CASH_LOAN_PURPOSE_Other,NAME_CASH_LOAN_PURPOSE_Payments on other loans,NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_Repairs,NAME_CASH_LOAN_PURPOSE_Urgent needs,NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday,NAME_CASH_LOAN_PURPOSE_XAP,NAME_CASH_LOAN_PURPOSE_XNA,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Unused offer,NAME_PAYMENT_TYPE_Cash through the bank,NAME_PAYMENT_TYPE_Cashless from the account of the employer,NAME_PAYMENT_TYPE_Non-cash from your account,NAME_PAYMENT_TYPE_XNA,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_CLIENT_TYPE_XNA,NAME_GOODS_CATEGORY_Additional Service,NAME_GOODS_CATEGORY_Animals,NAME_GOODS_CATEGORY_Audio/Video,NAME_GOODS_CATEGORY_Auto Accessories,NAME_GOODS_CATEGORY_Clothing and Accessories,NAME_GOODS_CATEGORY_Computers,NAME_GOODS_CATEGORY_Construction Materials,NAME_GOODS_CATEGORY_Consumer Electronics,NAME_GOODS_CATEGORY_Direct Sales,NAME_GOODS_CATEGORY_Education,NAME_GOODS_CATEGORY_Fitness,NAME_GOODS_CATEGORY_Furniture,NAME_GOODS_CATEGORY_Gardening,NAME_GOODS_CATEGORY_Homewares,NAME_GOODS_CATEGORY_House Construction,NAME_GOODS_CATEGORY_Insurance,NAME_GOODS_CATEGORY_Jewelry,NAME_GOODS_CATEGORY_Medical 
Supplies,NAME_GOODS_CATEGORY_Medicine,NAME_GOODS_CATEGORY_Mobile,NAME_GOODS_CATEGORY_Office Appliances,NAME_GOODS_CATEGORY_Other,NAME_GOODS_CATEGORY_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_Sport and Leisure,NAME_GOODS_CATEGORY_Tourism,NAME_GOODS_CATEGORY_Vehicles,NAME_GOODS_CATEGORY_Weapon,NAME_GOODS_CATEGORY_XNA,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cars,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_XNA,CHANNEL_TYPE_AP+ (Cash loan),CHANNEL_TYPE_Car dealer,CHANNEL_TYPE_Channel of corporate sales,CHANNEL_TYPE_Contact center,CHANNEL_TYPE_Country-wide,CHANNEL_TYPE_Credit and cash offices,CHANNEL_TYPE_Regional / Local,CHANNEL_TYPE_Stone,NAME_PRODUCT_TYPE_XNA,NAME_PRODUCT_TYPE_walk-in,NAME_PRODUCT_TYPE_x-sell,NAME_SELLER_INDUSTRY_Auto technology,NAME_SELLER_INDUSTRY_Clothing,NAME_SELLER_INDUSTRY_Connectivity,NAME_SELLER_INDUSTRY_Construction,NAME_SELLER_INDUSTRY_Consumer electronics,NAME_SELLER_INDUSTRY_Furniture,NAME_SELLER_INDUSTRY_Industry,NAME_SELLER_INDUSTRY_Jewelry,NAME_SELLER_INDUSTRY_MLM partners,NAME_SELLER_INDUSTRY_Tourism,NAME_SELLER_INDUSTRY_XNA,NAME_YIELD_GROUP_XNA,NAME_YIELD_GROUP_high,NAME_YIELD_GROUP_low_action,NAME_YIELD_GROUP_low_normal,NAME_YIELD_GROUP_middle,PRODUCT_COMBINATION_Card Street,PRODUCT_COMBINATION_Card X-Sell,PRODUCT_COMBINATION_Cash,PRODUCT_COMBINATION_Cash Street: high,PRODUCT_COMBINATION_Cash Street: low,PRODUCT_COMBINATION_Cash Street: middle,PRODUCT_COMBINATION_Cash X-Sell: high,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without 
interest,SUM_AMT_ANNUITY,SUM_AMT_APPLICATION,SUM_AMT_CREDIT,SUM_AMT_DOWN_PAYMENT,SUM_SELLERPLACE_AREA,SUM_AMT_GOODS_PRICE,SUM_RATE_DOWN_PAYMENT,SUM_CNT_PAYMENT,SUM_NFLAG_INSURED_ON_APPROVAL,SUM_DEFAULT_RATE_PREV_STATUS,SUM_DEFAULT_RATE_PRODUCT_COMBINATION,SUM_DEFAULT_RATE_CLIENT_TYPE,SUM_DEFAULT_RATE_GOODS_CAT,SUM_DEFAULT_RATE_CHANNEL_TYPE,SUM_AMT_APPLICATION - AMT_CREDIT,SUM_AMT_ANNUITY*CNT_PAYMENT - AMT_CREDIT,MIN_AMT_ANNUITY,MIN_AMT_APPLICATION,MIN_AMT_CREDIT,MIN_AMT_DOWN_PAYMENT,MIN_SELLERPLACE_AREA,MIN_AMT_GOODS_PRICE,MIN_RATE_DOWN_PAYMENT,MIN_CNT_PAYMENT,MIN_NFLAG_INSURED_ON_APPROVAL,MIN_DEFAULT_RATE_PREV_STATUS,MIN_DEFAULT_RATE_PRODUCT_COMBINATION,MIN_DEFAULT_RATE_CLIENT_TYPE,MIN_DEFAULT_RATE_GOODS_CAT,MIN_DEFAULT_RATE_CHANNEL_TYPE,MIN_AMT_APPLICATION - AMT_CREDIT,MIN_AMT_ANNUITY*CNT_PAYMENT - AMT_CREDIT,MEAN_AMT_ANNUITY,MEAN_AMT_APPLICATION,MEAN_AMT_CREDIT,MEAN_AMT_DOWN_PAYMENT,MEAN_SELLERPLACE_AREA,MEAN_AMT_GOODS_PRICE,MEAN_RATE_DOWN_PAYMENT,MEAN_CNT_PAYMENT,MEAN_NFLAG_INSURED_ON_APPROVAL,MEAN_DEFAULT_RATE_PREV_STATUS,MEAN_DEFAULT_RATE_PRODUCT_COMBINATION,MEAN_DEFAULT_RATE_CLIENT_TYPE,MEAN_DEFAULT_RATE_GOODS_CAT,MEAN_DEFAULT_RATE_CHANNEL_TYPE,MEAN_AMT_APPLICATION - AMT_CREDIT,MEAN_AMT_ANNUITY*CNT_PAYMENT - AMT_CREDIT,MAX_AMT_ANNUITY,MAX_AMT_APPLICATION,MAX_AMT_CREDIT,MAX_AMT_DOWN_PAYMENT,MAX_SELLERPLACE_AREA,MAX_AMT_GOODS_PRICE,MAX_RATE_DOWN_PAYMENT,MAX_CNT_PAYMENT,MAX_NFLAG_INSURED_ON_APPROVAL,MAX_DEFAULT_RATE_PREV_STATUS,MAX_DEFAULT_RATE_PRODUCT_COMBINATION,MAX_DEFAULT_RATE_CLIENT_TYPE,MAX_DEFAULT_RATE_GOODS_CAT,MAX_DEFAULT_RATE_CHANNEL_TYPE,MAX_AMT_APPLICATION - AMT_CREDIT,MAX_AMT_ANNUITY*CNT_PAYMENT - AMT_CREDIT,CNT_PREV_APP,DAYS_FIRST_DUE,DAYS_FIRST_DRAWING,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,DAYS_TERMINATION_ANOM,DAYS_LAST_DUE_ANOM,DAYS_LAST_DUE_1ST_VERSION_ANOM,DAYS_FIRST_DUE_ANOM,DAYS_FIRST_DRAWING_ANOM,RATE_DOWN_PAY_NAN,NFLAG_INSURED_ON_APPROVAL_NAN,AMT_GOODS_PRICE_NAN,CNT_PAYMENT_NAN,PREV_APP_FLAG,EXT_SOURCE_1 
EXT_SOURCE_3,EXT_SOURCE_2 EXT_SOURCE_3,EXT_SOURCE_1 EXT_SOURCE_2,EXT_SOURCE prod,EXT_SOURCE_1 / DAYS_BIRTH,EXT_SOURCE_2 / DAYS_BIRTH,EXT_SOURCE_3 / DAYS_BIRTH,NEW_CREDIT_TO_ANNUITY_RATIO,NEW_INC_PER_CHLD,NEW_SCORES_STD,NEW_ANNUITY_TO_INCOME_RATIO,NEW_CREDIT_TO_INCOME_RATIO,EXT_SOURCE sum,EXT_SOURCE_mean,EXT_SOURCE - EXT_SOURCE_3,EXT_SOURCE / mean,EXT_SOURCE_1 / EXT_SOURCE_2,EXT_SOURCE_2 / EXT_SOURCE_3,EXT_SOURCE_1 / EXT_SOURCE_3,NEW_CREDIT_TO_GOODS_RATIO,NEW_EMPLOY_TO_BIRTH_RATIO,NEW_PHONE_TO_BIRTH_RATIO,app_INCOME_CHILDREN,app_INCOME_FAM
0,100002,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018799,-9461,-637,-3648.0,-2120,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083008,0.262939,0.139404,0.024704,2.0,2.0,2.0,2.0,-1134.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.049988,0.223633,20.0,1,4,-1.875,0.516602,0.666504,0,61615,6318,0.102539,228185,20337,0.089111,49302,5832,0.118286,143375.0,15208.0,0.106079,198682,19507,0.098206,40934.0,4452.0,0.108765,251375,21255,0.084534,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-7.0,865055.6,245781.0,5043.64502,450000.0,245781.0,31988.564453,0.0,0.0,0.0,383067.0,0.0,481988.5625,245781.0,0.0,-103.0,0.0,780.0,-36.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,6.0,0.0,8.0,45.0,27.0,0.0,0.0,0.0,0.0,23.0,15.0,110.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,2.0,19.0,565.0,587.0,53093.746094,53093.746094,-12.0,0.0,-0.0,0.052643,31.671875,28500.0,29604.257812,101164100.0,101164100.0,24.257311,0.0,20.0,190.0,5605.0,5993.0,219625.7,219625.7,-388.0,0.0,0.0,1.0,1.0,25.0,49.0,9251.775391,9251.775391,-31.0,0.0,-0.0,0.0,0.0,388.0,0.0,1,0.073424,0.073424,0.0,0.0,24.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,9.0,9.0,9.0,0.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9251.775391,179055.0,179055.0,0.0,500.0,179055.0,0.0,24.0,0.0,0.082092,0.088562,0.098816,0.111816,0.07959,0.0,42987.601562,9251.775391,179055.0,179055.0,0.0,500.0,179055.0,0.0,24.0,0.0,0.082092,0.088562,0.098816,0.111816,0.07959,0.0,42987.601562,9251.775391,179055.0,179055.0,0.0,500.0,179055.0,0.0,24.0,0.0,0.082092,0.088562,0.098816,0.111816,0.07959,0.0,42987.601562,9251.775391,179055.0,179055.0,0.0,500.0,179055.0,0.0,24.0,0.0,0.082092,0.088562,0.098816,0.111816,0.07959,0.0,42987.601562,1.0,-565.0,-1.0,125.0,-25.0,-17.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0.011574,0.036652,0.02182,0.003042,-9e-06,-2.8e-05,-1.5e-05,16.461103,202500.0,0.092041,0.121978,2.007889,0.485352,0.161743,0.206543,0.018814,0.315674,1.885742,0.595215,1.158397,0.067329,0.11986,202500.0,101250.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003542,-16765,-1188,-1186.0,-291,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311279,0.62207,0.535156,0.095886,1.0,0.0,1.0,0.0,-828.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.049988,0.223633,20.0,1,4,-1.875,0.516602,0.666504,1,8357,526,0.062927,37139,3009,0.080994,25795,1735,0.067261,20433.0,1247.0,0.061035,70791,4006,0.05658,181425.0,14832.0,0.081787,251375,21255,0.084534,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-43.0,1017400.0,0.0,0.0,810000.0,0.0,810000.0,0.0,0.0,0.0,207400.5,0.0,810000.0,0.0,0.0,-606.0,0.0,1216.0,-540.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,2.0,12.0,2310.0,2324.0,560835.375,560835.375,-1.0,0.0,-0.0,0.040009,9.828125,573735.875,573541.8125,12219660000.0,12219660000.0,13.89,0.0,26.0,127.0,34454.0,34633.0,1618865.0,1618865.0,-179.0,0.0,0.0,1.0,1.0,536.0,544.0,6662.970215,6662.970215,-14.0,0.0,-0.0,0.0,0.0,179.0,0.0,1,0.073462,0.073955,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.928571,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,28.0,4.125,3.375,5.5,1.421875,1,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,169661.96875,1306309.5,1452573.0,6885.0,1599.0,1306309.5,0.100037,30.0,2.0,0.246338,0.221558,0.253906,0.241699,0.268555,-146263.5,195964.65625,6737.310059,68809.5,68053.5,0.0,-1.0,68809.5,0.0,6.0,0.0,0.082092,0.067261,0.079163,0.062134,0.07959,-135882.0,12794.219727,56553.988281,435436.5,484191.0,3442.5,533.0,435436.5,0.050018,10.0,0.666504,0.082092,0.073853,0.084595,0.080566,0.089539,-48754.5,65321.550781,98356.992188,900000.0,1035882.0,6885.0,1400.0,900000.0,0.100037,12.0,1.0,0.082092,0.083801,0.095459,0.103577,0.09906,756.0,144401.9375,3.0,-716.0,-1.0,-386.0,-536.0,-527.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0.166626,0.333008,0.193604,0.103638,-1.9e-05,-3.7e-05,-3.2e-05,36.234085,270000.0,0.1604,0.132217,4.79075,1.46875,0.489502,0.398438,0.21167,0.500488,1.162109,0.581543,1.145199,0.070862,0.049389,270000.0,90000.0


In [58]:
# Write both frames to CSV in the working directory, without the index column.
_exports = (
    (app_train, 'home_credit_train.csv'),
    (app_test, 'home_credit_test.csv'),
)
for _frame, _csv_path in _exports:
    _frame.to_csv(_csv_path, index=False)