###### Importing dependecies

In [2]:
import pandas as pd
import numpy as np
import os
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

#### Function to reduce memory usage:

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

###### Reading source train and test data set

In [3]:
# personal computer
app_train = import_data('application_train.csv')
app_test = import_data('application_test.csv')

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


###### Select all fields starting with FLAG and take sum across column.

In [4]:
flag_doc = [_f for _f in app_train.columns if 'FLAG_DOCUMENT_' in _f]
flag_cols = [_f for _f in app_train.columns if ('FLAG_' in _f) & ('FLAG_DOCUMENT_' not in _f) & ('_FLAG_' not in _f)]

###### Synthensize new fields

In [5]:
app_train['DOCUMENT_IND_MEAN'] = app_train[flag_doc].mean(axis=1)
app_train['DOCUMENT_IND_STD'] = app_train[flag_doc].std(axis=1)
app_train['DOCUMENT_IND_KURT'] = app_train[flag_doc].kurtosis(axis=1)
app_train['DOCUMENT_IND_SUM'] = app_train[flag_doc].sum(axis=1)

app_train['FLAG_IND_SUM'] = app_train[flag_cols].sum(axis=1)
app_train['FLAG_IND_KURT'] = app_train[flag_cols].kurtosis(axis=1)
app_train['FLAG_IND_STD'] = app_train[flag_cols].std(axis=1)
app_train['FLAG_IND_MEAN'] = app_train[flag_cols].mean(axis=1)

In [6]:
app_test['DOCUMENT_IND_MEAN'] = app_test[flag_doc].mean(axis=1)
app_test['DOCUMENT_IND_STD'] = app_test[flag_doc].std(axis=1)
app_test['DOCUMENT_IND_KURT'] = app_test[flag_doc].kurtosis(axis=1)
app_test['DOCUMENT_IND_SUM'] = app_test[flag_doc].sum(axis=1)

app_test['FLAG_IND_SUM'] = app_test[flag_cols].sum(axis=1)
app_test['FLAG_IND_KURT'] = app_test[flag_cols].kurtosis(axis=1)
app_test['FLAG_IND_STD'] = app_test[flag_cols].std(axis=1)
app_test['FLAG_IND_MEAN'] = app_test[flag_cols].mean(axis=1)

In [7]:
app_test.shape

(48744, 129)

In [8]:
app_train.shape

(307511, 130)

#### Drop features which has only one unique value in Test and Train data set

In [9]:
#Drop columns having only one unique value
remove_cols = []
for col in app_test.columns:
    if len(app_test[col].unique()) == 1:
        remove_cols.append(col)
remove_cols

['FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

In [10]:
app_train = app_train.drop(columns=remove_cols)
app_test = app_test.drop(columns=remove_cols)

###### Funciton to find missing values

In [11]:
## Missing values###########
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(4)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
###############################

#### Function to encode categorical variables. Label encoding for features having 2 values and One hot for rest

In [12]:
#Encoding categorical variables
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
#le.fit(df[col])
def encoding_cat_vars(df_train,df_test,le):
    le_count = 0
    one_hot_count = 0
    # Iterate through the columns
    for col in df_train:
        if df_train[col].dtype == 'object' or df_train[col].dtype =='bool' :
            # If 2 or fewer unique categories
            if len(list(df_train[col].unique())) <= 2:
                le.fit(df_train[col])
                # Train on the training data
                # Transform both training and testing data
                df_train[col] = le.transform(df_train[col])   
                df_test[col] = le.transform(df_test[col]) 
                # Keep track of how many columns were label encoded
                le_count += 1
            else:
                one_hot_count +=1
    print ("no of columns label encoded : "+str(le_count))
    #One Hot encoding
    df_train = pd.get_dummies(df_train)
    df_test = pd.get_dummies(df_test)
    print ("no of columns one hot encoded encoded : "+str(one_hot_count))
    return df_train,df_test

###### Function to align test and train data set after encoding

In [4]:
#Aligning Training and Testing Data
def align_train_test(tmp_app_train,tmp_app_test):
    train_labels = tmp_app_train['TARGET']
    # Align the training and testing data, keep only columns present in both dataframes
    tmp_app_train, tmp_app_test = tmp_app_train.align(tmp_app_test, join = 'inner', axis = 1)
    # Add the target back in
    tmp_app_train['TARGET'] = train_labels
    print('Training Features shape: ', tmp_app_train.shape)
    print('Testing Features shape: ', tmp_app_test.shape)
    return tmp_app_train,tmp_app_test

##### Adding a function to bin the numerical features to some bins and calculate the default rates

In [14]:
def default_rate_num(feat, application,app_train):
    def_feat_name = 'APP_' + feat +'_' + 'DEFAULT'
    crct_feat_name = 'APP_' + feat +'_' +'CORRECT'
    steps = (application[feat].max()-application[feat].min())/50
    bins = np.arange(application[feat].min(),application[feat].max(),steps)
    app_train[feat+'_binned'] = pd.cut(application[feat], bins)
    application[feat+'_binned'] = pd.cut(application[feat], bins)
    feat_type = app_train.groupby([feat+'_binned','TARGET'])['SK_ID_CURR'].count().reset_index()
    feat_type = feat_type.pivot(index=feat+'_binned', columns='TARGET', values='SK_ID_CURR').reset_index()
    feat_type = feat_type.rename(columns={0:crct_feat_name, 1:def_feat_name})
    application = pd.merge(application,feat_type,on=[feat+'_binned'],how='left')
    application['APP_'+feat+'_DEFAULT_RATE'] = application[def_feat_name]/(1+application[def_feat_name]+application[crct_feat_name])
    application = application.drop([feat+'_binned',crct_feat_name,def_feat_name], axis=1)
    return application

##### Taking a backup of source data sets as we are going to modify the datasets

In [15]:
tmp_app_train = app_train.copy()
tmp_app_test = app_test.copy()

In [16]:
tmp_app_train.shape

(307511, 119)

In [17]:
tmp_app_test.shape

(48744, 118)

##### Find missing values

In [18]:
#Check for missing values
missing_values = missing_values_table(tmp_app_train)
missing_values['column_name'] = missing_values.index
missing_values = missing_values.reset_index(drop = True)
missing_values

Your selected dataframe has 119 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,column_name
0,214865,69.8723,COMMONAREA_MEDI
1,214865,69.8723,COMMONAREA_AVG
2,214865,69.8723,COMMONAREA_MODE
3,213514,69.433,NONLIVINGAPARTMENTS_MEDI
4,213514,69.433,NONLIVINGAPARTMENTS_MODE
5,213514,69.433,NONLIVINGAPARTMENTS_AVG
6,210295,68.3862,FONDKAPREMONT_MODE
7,210199,68.355,LIVINGAPARTMENTS_MODE
8,210199,68.355,LIVINGAPARTMENTS_MEDI
9,210199,68.355,LIVINGAPARTMENTS_AVG


##### Impute AMT_ANNUITY as 1/10 of the INCOME

In [19]:
tmp_app_train['AMT_ANNUITY_NAN'] = tmp_app_train['AMT_ANNUITY'].isnull()
tmp_app_test['AMT_ANNUITY_NAN'] = tmp_app_test['AMT_ANNUITY'].isnull()

In [93]:
tmp_app_train.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_11,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOCUMENT_IND_MEAN,DOCUMENT_IND_STD,DOCUMENT_IND_KURT,DOCUMENT_IND_SUM,FLAG_IND_SUM,FLAG_IND_KURT,FLAG_IND_STD,FLAG_IND_MEAN,AMT_ANNUITY_NAN,AMT_ANNUITY_TO_INCOME_RATIO,AMT_CREDIT_TO_INCOME_RATIO,AMT_CREDIT_TO_ANNUITY_RATIO,APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE,EXT_SOURCE_MISSING,ORG_CORRECT,ORG_DEFAULT,DEFAULT_RATE_ORG_TYPE,NAME_TYPE_CORRECT,NAME_TYPE_DEFAULT,DEFAULT_RATE_NAME_TYPE,OCCUPATION_TYPE_CORRECT,OCCUPATION_TYPE_DEFAULT,DEFAULT_RATE_OCC_TYPE,INCOME_TYPE_CORRECT,INCOME_TYPE_DEFAULT,DEFAULT_RATE_INCOME_TYPE,EDUCATION_TYPE_CORRECT,EDUCATION_TYPE_DEFAULT,DEFAULT_RATE_EDUCATION_TYPE,FAMILY_STATUS_CORRECT,FAMILY_STATUS_DEFAULT,DEFAULT_RATE_FAMILY_STATUS,HOUSING_TYPE_CORRECT,HOUSING_TYPE_DEFAULT,DEFAULT_RATE_HOUSING_TYPE,TOTALAREA_NAN,EXT_SOURCE_3_NAN,EMERGENCYSTATE_NAN,AMT_REQ_CREDIT_BUREAU_NAN,DEF_SOCIAL_CIRCLE_NAN,COMMONAREA_NAN,NONLIVINGAPARTMENTS_NAN,LIVINGAPARTMENTS_NAN,EXT_SOURCE_1_NAN,EXT_SOURCE_2_NAN,NONLIVINGAREA_NAN,YEARS_BEGINEXPLUATATION_NAN,APARTMENT_FLAG,DAYS_EMPLOYED_ANOM,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,TARGET
0,100002,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018799,-9461,-637,-3648.0,-2120,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083008,0.262939,0.139404,0.024704,2.0,2.0,2.0,2.0,-1134.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.05,0.223607,20.0,1,4,-1.875,0.516398,0.666667,0,0.121978,2.007889,16.461104,0.077338,0.085951,0.133397,0,61615,6318,0.10254,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003542,-16765,-1188,-1186.0,-291,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311279,0.62207,0.535156,0.095886,1.0,0.0,1.0,0.0,-828.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.223607,20.0,1,4,-1.875,0.516398,0.666667,0,0.132217,4.79075,36.234085,0.077338,0.086806,0.052561,1,8357,526,0.062941,37139,3009,0.08102,25795,1735,0.067261,20433.0,1247.0,0.061029,70791,4006,0.056589,181425.0,14836.0,0.081775,251375,21255,0.084555,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010033,-19046,-225,-4260.0,-2531,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,0.505859,0.556152,0.729492,0.060181,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,5,6.0,0.408248,0.833333,0,0.1,2.0,20.0,0.074597,0.085951,0.089602,1,9665,726,0.075116,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,1,0,1,0,0,1,1,1,1,0,1,1,0,0,0,1,0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008018,-19005,-3039,-9832.0,-2437,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,0.505859,0.650391,0.535156,0.048096,2.0,0.0,2.0,0.0,-617.0,1,0,0,0,0,0,0,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.05,0.223607,20.0,1,3,-3.333333,0.547723,0.5,0,0.2199,2.316167,10.532818,0.085659,0.085951,0.08128,2,61615,6318,0.10254,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,26791.0,2959.0,0.110448,251375,21255,0.084555,1,1,1,1,0,1,1,1,1,0,1,1,0,0,1,0,0
4,100007,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028656,-19932,-3038,-4312.0,-3458,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,0.505859,0.322754,0.535156,0.171875,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.223607,20.0,1,3,-3.333333,0.547723,0.5,0,0.179963,4.222222,23.461618,0.08352,0.086806,0.048171,2,80,5,0.0625,228185,20337,0.089125,25795,1735,0.067261,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,1,1,1,0,0,1,1,1,1,0,1,1,0,0,1,0,0


In [20]:
tmp_app_train['AMT_ANNUITY'] = np.where(tmp_app_train['AMT_ANNUITY'].isnull()
                                        ,np.where(tmp_app_train['NAME_CONTRACT_TYPE']=='Cash loans',
                                                  0.16*tmp_app_train['AMT_INCOME_TOTAL'],
                                                  0.1*tmp_app_train['AMT_INCOME_TOTAL']),
                                        tmp_app_train['AMT_ANNUITY'])
    
tmp_app_test['AMT_ANNUITY'] = np.where(tmp_app_test['AMT_ANNUITY'].isnull()
                                        ,np.where(tmp_app_test['NAME_CONTRACT_TYPE']=='Cash loans',
                                                  0.16*tmp_app_test['AMT_INCOME_TOTAL'],
                                                  0.1*tmp_app_test['AMT_INCOME_TOTAL']),
                                        tmp_app_test['AMT_ANNUITY'])

In [21]:
tmp_app_train['AMT_ANNUITY_TO_INCOME_RATIO'] = tmp_app_train['AMT_ANNUITY']/tmp_app_train['AMT_INCOME_TOTAL']
tmp_app_train['AMT_CREDIT_TO_INCOME_RATIO'] = tmp_app_train['AMT_CREDIT'] / tmp_app_train['AMT_INCOME_TOTAL']
tmp_app_train['AMT_CREDIT_TO_ANNUITY_RATIO'] = tmp_app_train['AMT_CREDIT'] / tmp_app_train['AMT_ANNUITY']


In [22]:
tmp_app_test['AMT_ANNUITY_TO_INCOME_RATIO'] = tmp_app_test['AMT_ANNUITY']/tmp_app_test['AMT_INCOME_TOTAL']
tmp_app_test['AMT_CREDIT_TO_INCOME_RATIO'] = tmp_app_test['AMT_CREDIT'] / tmp_app_test['AMT_INCOME_TOTAL']
tmp_app_test['AMT_CREDIT_TO_ANNUITY_RATIO'] = tmp_app_test['AMT_CREDIT'] / tmp_app_test['AMT_ANNUITY']


In [23]:
tmp_app_train = default_rate_num('AMT_ANNUITY_TO_INCOME_RATIO', tmp_app_train,app_train)
tmp_app_train = default_rate_num('AMT_CREDIT_TO_INCOME_RATIO', tmp_app_train,app_train)
tmp_app_train = default_rate_num('AMT_CREDIT_TO_ANNUITY_RATIO', tmp_app_train,app_train)

tmp_app_test = default_rate_num('AMT_ANNUITY_TO_INCOME_RATIO', tmp_app_test,app_train)
tmp_app_test = default_rate_num('AMT_CREDIT_TO_INCOME_RATIO', tmp_app_test,app_train)
tmp_app_test = default_rate_num('AMT_CREDIT_TO_ANNUITY_RATIO', tmp_app_test,app_train)

In [24]:
tmp_app_train.shape

(307511, 126)

In [26]:
tmp_app_test.shape

(48744, 125)

In [27]:
tmp_app_test.head(2)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_11,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOCUMENT_IND_MEAN,DOCUMENT_IND_STD,DOCUMENT_IND_KURT,DOCUMENT_IND_SUM,FLAG_IND_SUM,FLAG_IND_KURT,FLAG_IND_STD,FLAG_IND_MEAN,AMT_ANNUITY_NAN,AMT_ANNUITY_TO_INCOME_RATIO,AMT_CREDIT_TO_INCOME_RATIO,AMT_CREDIT_TO_ANNUITY_RATIO,APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.018845,-19241,-2329,-5168.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752441,0.789551,0.159546,0.065979,0.05899,0.973145,,,,0.137939,0.125,,,,0.050507,,,0.0672,0.061188,0.973145,,,,0.137939,0.125,,,,0.052612,,,0.066589,0.05899,0.973145,,,,0.137939,0.125,,,,0.051392,,,,block of flats,0.039215,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.223607,20.0,1,4,-1.875,0.516398,0.666667,False,0.1523,4.213333,27.664697,0.081895,0.077298,0.075918
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035797,-18064,-4469,-9120.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.564941,0.291748,0.432861,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.05,0.223607,20.0,1,3,-3.333333,0.547723,0.5,False,0.175455,2.250182,12.82487,0.077403,0.079487,0.085996


In [28]:
tmp_app_train['APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE'] = tmp_app_train['APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE'].fillna(0)
tmp_app_train['APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE'] = tmp_app_train['APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE'].fillna(0)
tmp_app_train['APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE'] = tmp_app_train['APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE'].fillna(0)

tmp_app_test['APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE'] = tmp_app_test['APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE'].fillna(0)
tmp_app_test['APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE'] = tmp_app_test['APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE'].fillna(0)
tmp_app_test['APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE'] = tmp_app_test['APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE'].fillna(0)

In [29]:
#Check for missing values
missing_values = missing_values_table(tmp_app_train)
missing_values['column_name'] = missing_values.index
missing_values = missing_values.reset_index(drop = True)
missing_values

Your selected dataframe has 126 columns.
There are 66 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,column_name
0,214865,69.8723,COMMONAREA_MODE
1,214865,69.8723,COMMONAREA_MEDI
2,214865,69.8723,COMMONAREA_AVG
3,213514,69.433,NONLIVINGAPARTMENTS_MEDI
4,213514,69.433,NONLIVINGAPARTMENTS_AVG
5,213514,69.433,NONLIVINGAPARTMENTS_MODE
6,210295,68.3862,FONDKAPREMONT_MODE
7,210199,68.355,LIVINGAPARTMENTS_MODE
8,210199,68.355,LIVINGAPARTMENTS_MEDI
9,210199,68.355,LIVINGAPARTMENTS_AVG


#### Function to find correlation between features

In [100]:
def cor_find(df,threshold):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return corr_matrix,upper,to_drop

In [59]:
# corr_matrix,upper,to_drop = cor_find(app_train,0.98)

##### Drop records from train data set where AMT_GOODS_PRICE, CNT_FAM_MEMBERS & DAYS_LAST_PHONE_CHANGE is null as there are no records with test with null AMT_GOODS_PRICE

In [30]:
tmp_app_train = tmp_app_train.dropna(subset=['AMT_GOODS_PRICE','CNT_FAM_MEMBERS','DAYS_LAST_PHONE_CHANGE'])

##### Drop records having code gender XNA ; 4 records in Train and 0 records in TEST

In [31]:
tmp_app_train = tmp_app_train.drop(tmp_app_train[tmp_app_train['CODE_GENDER']=='XNA'].index)
tmp_app_train['CODE_GENDER'] = np.where(tmp_app_train['CODE_GENDER']=='M',1,0)
tmp_app_test['CODE_GENDER'] = np.where(tmp_app_test['CODE_GENDER']=='M',1,0)

### Add an additional column to find count of missing EXT_SOURCES

In [32]:
tmp_app_train['EXT_SOURCE_MISSING'] = 3 - tmp_app_train[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].count(axis=1)
tmp_app_test['EXT_SOURCE_MISSING'] = 3 - tmp_app_test[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].count(axis=1)

### Organization type - Find rate of default by each organization type

In [33]:
org = tmp_app_train.groupby(['ORGANIZATION_TYPE','TARGET'])[['SK_ID_CURR']].count().reset_index()
org = org.pivot(index='ORGANIZATION_TYPE', columns='TARGET', values='SK_ID_CURR').reset_index()
org = org.rename(columns={0:'ORG_CORRECT',1:'ORG_DEFAULT'})

In [34]:
tmp_app_train = pd.merge(tmp_app_train,org,on=['ORGANIZATION_TYPE'],how='left')
tmp_app_train['DEFAULT_RATE_ORG_TYPE'] = tmp_app_train['ORG_DEFAULT']/tmp_app_train['ORG_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['ORGANIZATION_TYPE'])

In [35]:
tmp_app_test = pd.merge(tmp_app_test,org,on=['ORGANIZATION_TYPE'],how='left')
tmp_app_test['DEFAULT_RATE_ORG_TYPE'] = tmp_app_test['ORG_DEFAULT']/tmp_app_test['ORG_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['ORGANIZATION_TYPE'])

###### Add missing flags for NAN Records ; Impute 'MISSING' values to OCCUPATION TYPE and NAME TYPE SUITE

In [36]:
tmp_app_train['OCCUPATION_TYPE'] = np.where(tmp_app_train['OCCUPATION_TYPE'].isnull(),
                                            'MISSING',tmp_app_train['OCCUPATION_TYPE'])
tmp_app_test['OCCUPATION_TYPE'] = np.where(tmp_app_test['OCCUPATION_TYPE'].isnull(),
                                            'MISSING',tmp_app_test['OCCUPATION_TYPE'])

In [37]:
tmp_app_train['NAME_TYPE_SUITE'] = np.where(tmp_app_train['NAME_TYPE_SUITE'].isnull(),
                                            'MISSING',tmp_app_train['NAME_TYPE_SUITE'])
tmp_app_test['NAME_TYPE_SUITE'] = np.where(tmp_app_test['NAME_TYPE_SUITE'].isnull(),
                                            'MISSING',tmp_app_test['NAME_TYPE_SUITE'])


### NAME_TYPE_SUITE,OCCUPATION_TYPE :- Find rate of default

In [38]:
name_type = tmp_app_train.groupby(['NAME_TYPE_SUITE','TARGET'])[['SK_ID_CURR']].count().reset_index()
name_type = name_type.pivot(index='NAME_TYPE_SUITE', columns='TARGET', values='SK_ID_CURR').reset_index()
name_type = name_type.rename(columns={0:'NAME_TYPE_CORRECT',1:'NAME_TYPE_DEFAULT'})

In [39]:
tmp_app_train = pd.merge(tmp_app_train,name_type,on=['NAME_TYPE_SUITE'],how='left')
tmp_app_train['DEFAULT_RATE_NAME_TYPE'] = tmp_app_train['NAME_TYPE_DEFAULT']/tmp_app_train['NAME_TYPE_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['NAME_TYPE_SUITE'])

In [40]:
tmp_app_test = pd.merge(tmp_app_test,name_type,on=['NAME_TYPE_SUITE'],how='left')
tmp_app_test['DEFAULT_RATE_NAME_TYPE'] = tmp_app_test['NAME_TYPE_DEFAULT']/tmp_app_test['NAME_TYPE_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['NAME_TYPE_SUITE'])

In [41]:
occ_type = tmp_app_train.groupby(['OCCUPATION_TYPE','TARGET'])[['SK_ID_CURR']].count().reset_index()
occ_type = occ_type.pivot(index='OCCUPATION_TYPE', columns='TARGET', values='SK_ID_CURR').reset_index()
occ_type = occ_type.rename(columns={0:'OCCUPATION_TYPE_CORRECT',1:'OCCUPATION_TYPE_DEFAULT'})

In [42]:
tmp_app_train = pd.merge(tmp_app_train,occ_type,on=['OCCUPATION_TYPE'],how='left')
tmp_app_train['DEFAULT_RATE_OCC_TYPE'] = tmp_app_train['OCCUPATION_TYPE_DEFAULT']/tmp_app_train['OCCUPATION_TYPE_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['OCCUPATION_TYPE'])

In [43]:
tmp_app_test = pd.merge(tmp_app_test,occ_type,on=['OCCUPATION_TYPE'],how='left')
tmp_app_test['DEFAULT_RATE_OCC_TYPE'] = tmp_app_test['OCCUPATION_TYPE_DEFAULT']/tmp_app_test['OCCUPATION_TYPE_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['OCCUPATION_TYPE'])

In [44]:
tmp_app_test.shape

(48744, 132)

#### Remove WEEKDAY APP PROCESS START as it should not be a factor in loan default

In [45]:
tmp_app_train = tmp_app_train.drop(columns=['WEEKDAY_APPR_PROCESS_START'])
tmp_app_test = tmp_app_test.drop(columns=['WEEKDAY_APPR_PROCESS_START'])

##### NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE :- Find rate of default

In [46]:
income_type = tmp_app_train.groupby(['NAME_INCOME_TYPE','TARGET'])[['SK_ID_CURR']].count().reset_index()
income_type = income_type.pivot(index='NAME_INCOME_TYPE', columns='TARGET', values='SK_ID_CURR').reset_index()
income_type = income_type.rename(columns={0:'INCOME_TYPE_CORRECT',1:'INCOME_TYPE_DEFAULT'})

In [47]:
tmp_app_train = pd.merge(tmp_app_train,income_type,on=['NAME_INCOME_TYPE'],how='left')
tmp_app_train['DEFAULT_RATE_INCOME_TYPE'] = tmp_app_train['INCOME_TYPE_DEFAULT']/tmp_app_train['INCOME_TYPE_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['NAME_INCOME_TYPE'])

In [48]:
tmp_app_test = pd.merge(tmp_app_test,income_type,on=['NAME_INCOME_TYPE'],how='left')
tmp_app_test['DEFAULT_RATE_INCOME_TYPE'] = tmp_app_test['INCOME_TYPE_DEFAULT']/tmp_app_test['INCOME_TYPE_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['NAME_INCOME_TYPE'])

In [49]:
edu_type = tmp_app_train.groupby(['NAME_EDUCATION_TYPE','TARGET'])[['SK_ID_CURR']].count().reset_index()
edu_type = edu_type.pivot(index='NAME_EDUCATION_TYPE', columns='TARGET', values='SK_ID_CURR').reset_index()
edu_type = edu_type.rename(columns={0:'EDUCATION_TYPE_CORRECT',1:'EDUCATION_TYPE_DEFAULT'})

In [50]:
tmp_app_train = pd.merge(tmp_app_train,edu_type,on=['NAME_EDUCATION_TYPE'],how='left')
tmp_app_train['DEFAULT_RATE_EDUCATION_TYPE'] = tmp_app_train['EDUCATION_TYPE_DEFAULT']/tmp_app_train['EDUCATION_TYPE_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['NAME_EDUCATION_TYPE'])

In [51]:
tmp_app_test = pd.merge(tmp_app_test,edu_type,on=['NAME_EDUCATION_TYPE'],how='left')
tmp_app_test['DEFAULT_RATE_EDUCATION_TYPE'] = tmp_app_test['EDUCATION_TYPE_DEFAULT']/tmp_app_test['EDUCATION_TYPE_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['NAME_EDUCATION_TYPE'])

In [52]:
tmp_app_test.shape

(48744, 135)

In [53]:
tmp_app_train.shape

(307228, 136)

In [54]:
family_type = tmp_app_train.groupby(['NAME_FAMILY_STATUS','TARGET'])[['SK_ID_CURR']].count().reset_index()
family_type = family_type.pivot(index='NAME_FAMILY_STATUS', columns='TARGET', values='SK_ID_CURR').reset_index()
family_type = family_type.rename(columns={0:'FAMILY_STATUS_CORRECT',1:'FAMILY_STATUS_DEFAULT'})

In [55]:
tmp_app_train = pd.merge(tmp_app_train,family_type,on=['NAME_FAMILY_STATUS'],how='left')
tmp_app_train['DEFAULT_RATE_FAMILY_STATUS'] = tmp_app_train['FAMILY_STATUS_DEFAULT']/tmp_app_train['FAMILY_STATUS_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['NAME_FAMILY_STATUS'])

In [56]:
tmp_app_test = pd.merge(tmp_app_test,family_type,on=['NAME_FAMILY_STATUS'],how='left')
tmp_app_test['DEFAULT_RATE_FAMILY_STATUS'] = tmp_app_test['FAMILY_STATUS_DEFAULT']/tmp_app_test['FAMILY_STATUS_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['NAME_FAMILY_STATUS'])

In [57]:
housing_type = tmp_app_train.groupby(['NAME_HOUSING_TYPE','TARGET'])[['SK_ID_CURR']].count().reset_index()
housing_type = housing_type.pivot(index='NAME_HOUSING_TYPE', columns='TARGET', values='SK_ID_CURR').reset_index()
housing_type = housing_type.rename(columns={0:'HOUSING_TYPE_CORRECT',1:'HOUSING_TYPE_DEFAULT'})

In [58]:
tmp_app_train = pd.merge(tmp_app_train,housing_type,on=['NAME_HOUSING_TYPE'],how='left')
tmp_app_train['DEFAULT_RATE_HOUSING_TYPE'] = tmp_app_train['HOUSING_TYPE_DEFAULT']/tmp_app_train['HOUSING_TYPE_CORRECT']
tmp_app_train = tmp_app_train.drop(columns=['NAME_HOUSING_TYPE'])

In [59]:
tmp_app_test = pd.merge(tmp_app_test,housing_type,on=['NAME_HOUSING_TYPE'],how='left')
tmp_app_test['DEFAULT_RATE_HOUSING_TYPE'] = tmp_app_test['HOUSING_TYPE_DEFAULT']/tmp_app_test['HOUSING_TYPE_CORRECT']
tmp_app_test = tmp_app_test.drop(columns=['NAME_HOUSING_TYPE'])

#### Add is_NAN FLAGS

In [60]:
#Add is_NAN FLAGS
tmp_app_train['TOTALAREA_NAN'] = tmp_app_train['TOTALAREA_MODE'].isnull()
tmp_app_test['TOTALAREA_NAN'] = tmp_app_test['TOTALAREA_MODE'].isnull()

tmp_app_train['EXT_SOURCE_3_NAN'] = tmp_app_train['EXT_SOURCE_3'].isnull()
tmp_app_test['EXT_SOURCE_3_NAN'] = tmp_app_test['EXT_SOURCE_3'].isnull()

tmp_app_train['EMERGENCYSTATE_NAN'] = tmp_app_train['EMERGENCYSTATE_MODE'].isnull()
tmp_app_test['EMERGENCYSTATE_NAN'] = tmp_app_test['EMERGENCYSTATE_MODE'].isnull()

tmp_app_train['AMT_REQ_CREDIT_BUREAU_NAN']= tmp_app_train['AMT_REQ_CREDIT_BUREAU_HOUR'].isnull()
tmp_app_test['AMT_REQ_CREDIT_BUREAU_NAN'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_HOUR'].isnull()
              
tmp_app_train['DEF_SOCIAL_CIRCLE_NAN'] = tmp_app_train['DEF_60_CNT_SOCIAL_CIRCLE'].isnull()
tmp_app_test['DEF_SOCIAL_CIRCLE_NAN'] = tmp_app_test['DEF_60_CNT_SOCIAL_CIRCLE'].isnull()
             

In [61]:
#Add is_NAN FLAGS
tmp_app_train['COMMONAREA_NAN'] = tmp_app_train['COMMONAREA_MODE'].isnull()
tmp_app_test['COMMONAREA_NAN'] = tmp_app_test['COMMONAREA_MODE'].isnull()
tmp_app_train['NONLIVINGAPARTMENTS_NAN'] = tmp_app_train['NONLIVINGAPARTMENTS_MODE'].isnull()
tmp_app_test['NONLIVINGAPARTMENTS_NAN'] = tmp_app_test['NONLIVINGAPARTMENTS_MODE'].isnull()

tmp_app_train['LIVINGAPARTMENTS_NAN'] = tmp_app_train['LIVINGAPARTMENTS_MODE'].isnull()
tmp_app_test['LIVINGAPARTMENTS_NAN'] = tmp_app_test['LIVINGAPARTMENTS_MODE'].isnull()

tmp_app_train['FLOORSMIN_AVG'] = tmp_app_train['FLOORSMIN_MODE'].isnull()
tmp_app_test['FLOORSMIN_AVG'] = tmp_app_test['FLOORSMIN_MODE'].isnull()

tmp_app_train['EXT_SOURCE_1_NAN']= tmp_app_train['EXT_SOURCE_1'].isnull()
tmp_app_test['EXT_SOURCE_1_NAN']= tmp_app_test['EXT_SOURCE_1'].isnull()

tmp_app_train['EXT_SOURCE_2_NAN']= tmp_app_train['EXT_SOURCE_2'].isnull()
tmp_app_test['EXT_SOURCE_2_NAN']= tmp_app_test['EXT_SOURCE_2'].isnull()

tmp_app_train['EXT_SOURCE_3_NAN']= tmp_app_train['EXT_SOURCE_3'].isnull()
tmp_app_test['EXT_SOURCE_3_NAN']= tmp_app_test['EXT_SOURCE_3'].isnull()

tmp_app_train['NONLIVINGAREA_NAN']=tmp_app_train['NONLIVINGAREA_AVG'].isnull()
tmp_app_test['NONLIVINGAREA_NAN']= tmp_app_test['NONLIVINGAREA_AVG'].isnull()

tmp_app_test['YEARS_BEGINEXPLUATATION_NAN']= tmp_app_test['YEARS_BEGINEXPLUATATION_MODE'].isnull()
tmp_app_train['YEARS_BEGINEXPLUATATION_NAN']=tmp_app_train['YEARS_BEGINEXPLUATATION_MODE'].isnull()

In [62]:
tmp_app_train.shape

(307228, 152)

#### Remove features with more than 60 % missing values

In [63]:
#Remove missing features above 60%
tmp_app_train = tmp_app_train.drop(columns=list(missing_values[missing_values['% of Total Values']>60]['column_name']))
tmp_app_test = tmp_app_test.drop(columns=list(missing_values[missing_values['% of Total Values']>60]['column_name']))


In [64]:
tmp_app_train.shape

(307228, 135)

###### Impute missing features based on analysis

In [65]:
tmp_app_train['APARTMENT_FLAG'] = np.where(tmp_app_train['APARTMENTS_AVG'].isnull(), 0, 1)
tmp_app_train['APARTMENTS_AVG'] = tmp_app_train['APARTMENTS_AVG'].fillna(6*tmp_app_train['REGION_POPULATION_RELATIVE'])

In [66]:
tmp_app_train['EXT_SOURCE_1'] = tmp_app_train['EXT_SOURCE_1'].fillna(tmp_app_train['EXT_SOURCE_1'].median())
tmp_app_train['EXT_SOURCE_2'] = tmp_app_train['EXT_SOURCE_2'].fillna(tmp_app_train['EXT_SOURCE_2'].median())
tmp_app_train['EXT_SOURCE_3'] = tmp_app_train['EXT_SOURCE_3'].fillna(tmp_app_train['EXT_SOURCE_3'].median())

In [67]:
tmp_app_test['APARTMENT_FLAG'] = np.where(tmp_app_test['APARTMENTS_AVG'].isnull(), 0, 1)
tmp_app_test['APARTMENTS_AVG'] = tmp_app_test['APARTMENTS_AVG'].fillna(6*tmp_app_train['REGION_POPULATION_RELATIVE'])

In [68]:
tmp_app_test['EXT_SOURCE_1'] = tmp_app_test['EXT_SOURCE_1'].fillna(tmp_app_test['EXT_SOURCE_1'].median())
tmp_app_test['EXT_SOURCE_2'] = tmp_app_test['EXT_SOURCE_2'].fillna(tmp_app_test['EXT_SOURCE_2'].median())
tmp_app_test['EXT_SOURCE_3'] = tmp_app_test['EXT_SOURCE_3'].fillna(tmp_app_test['EXT_SOURCE_3'].median())

In [69]:
tmp_app_test.shape

(48744, 135)

In [70]:
tmp_app_train.shape

(307228, 136)

##### Check for missing values again

In [71]:
#Check for missing values
missing_values = missing_values_table(tmp_app_train)
missing_values['column_name'] = missing_values.index
missing_values = missing_values.reset_index(drop = True)
missing_values

Your selected dataframe has 136 columns.
There are 42 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,column_name
0,182408,59.3722,LANDAREA_AVG
1,182408,59.3722,LANDAREA_MEDI
2,182408,59.3722,LANDAREA_MODE
3,179770,58.5135,BASEMENTAREA_AVG
4,179770,58.5135,BASEMENTAREA_MODE
5,179770,58.5135,BASEMENTAREA_MEDI
6,169511,55.1743,NONLIVINGAREA_MODE
7,169511,55.1743,NONLIVINGAREA_MEDI
8,169511,55.1743,NONLIVINGAREA_AVG
9,163728,53.292,ELEVATORS_AVG


In [72]:
#Remove features having missing values more than 40% from both train and test
del_cols = missing_values[missing_values['% of Total Values']>40]['column_name'].values
#itemindex = np.where(del_cols=='EXT_SOURCE_1')
#del_cols = np.delete(del_cols, itemindex)
tmp_app_train = tmp_app_train.drop(del_cols,axis = 1) #columns having more than 40 % missing values removed from dataset
tmp_app_test = tmp_app_test.drop(del_cols,axis = 1) #columns having more than 40 % missing values removed from dataset

In [73]:
tmp_app_train.shape

(307228, 106)

In [74]:
#Check for missing values
missing_values = missing_values_table(tmp_app_train)
missing_values['column_name'] = missing_values.index
missing_values = missing_values.reset_index(drop = True)
missing_values

Your selected dataframe has 106 columns.
There are 12 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,column_name
0,41472,13.4988,AMT_REQ_CREDIT_BUREAU_HOUR
1,41472,13.4988,AMT_REQ_CREDIT_BUREAU_DAY
2,41472,13.4988,AMT_REQ_CREDIT_BUREAU_WEEK
3,41472,13.4988,AMT_REQ_CREDIT_BUREAU_MON
4,41472,13.4988,AMT_REQ_CREDIT_BUREAU_QRT
5,41472,13.4988,AMT_REQ_CREDIT_BUREAU_YEAR
6,1021,0.3323,OBS_30_CNT_SOCIAL_CIRCLE
7,1021,0.3323,DEF_30_CNT_SOCIAL_CIRCLE
8,1021,0.3323,OBS_60_CNT_SOCIAL_CIRCLE
9,1021,0.3323,DEF_60_CNT_SOCIAL_CIRCLE


In [75]:
tmp_app_train['FLAG_OWN_CAR'] = np.where(tmp_app_train['FLAG_OWN_CAR']=='Y',1,0)
tmp_app_train['FLAG_OWN_REALTY'] = np.where(tmp_app_train['FLAG_OWN_REALTY']=='Y',1,0)

tmp_app_test['FLAG_OWN_CAR'] = np.where(tmp_app_test['FLAG_OWN_CAR']=='Y',1,0)
tmp_app_test['FLAG_OWN_REALTY'] = np.where(tmp_app_test['FLAG_OWN_REALTY']=='Y',1,0)

### Check for Days Anomalies

In [77]:
# Create an anomalous flag column
tmp_app_train['DAYS_EMPLOYED_ANOM'] = tmp_app_train["DAYS_EMPLOYED"] == 365243
tmp_app_train['DAYS_EMPLOYED'].replace({365243: 1}, inplace = True)
# any column created in training has to be created in test data
tmp_app_test['DAYS_EMPLOYED_ANOM'] = tmp_app_test["DAYS_EMPLOYED"]  == 365243
tmp_app_test["DAYS_EMPLOYED"].replace({365243: 1}, inplace = True)

In [80]:
tmp_app_train['INCOME_TYPE_DEFAULT'] = tmp_app_train['INCOME_TYPE_DEFAULT'].fillna(0)
tmp_app_test['INCOME_TYPE_DEFAULT'] = tmp_app_test['INCOME_TYPE_DEFAULT'].fillna(0)

tmp_app_train['DEFAULT_RATE_INCOME_TYPE'] = tmp_app_train['DEFAULT_RATE_INCOME_TYPE'].fillna(0)
tmp_app_test['DEFAULT_RATE_INCOME_TYPE'] = tmp_app_test['DEFAULT_RATE_INCOME_TYPE'].fillna(0)

#### Impute missing values with -1 ; A field _isNAN was previously added 

In [81]:
tmp_app_train['OBS_30_CNT_SOCIAL_CIRCLE'] = tmp_app_train['OBS_30_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_train['DEF_30_CNT_SOCIAL_CIRCLE'] = tmp_app_train['DEF_30_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_train['OBS_60_CNT_SOCIAL_CIRCLE'] = tmp_app_train['OBS_60_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_train['DEF_60_CNT_SOCIAL_CIRCLE'] = tmp_app_train['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(-1)

tmp_app_test['OBS_30_CNT_SOCIAL_CIRCLE'] = tmp_app_test['OBS_30_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_test['DEF_30_CNT_SOCIAL_CIRCLE'] = tmp_app_test['DEF_30_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_test['OBS_60_CNT_SOCIAL_CIRCLE'] = tmp_app_test['OBS_60_CNT_SOCIAL_CIRCLE'].fillna(-1)
tmp_app_test['DEF_60_CNT_SOCIAL_CIRCLE'] = tmp_app_test['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(-1)


In [82]:
tmp_app_train['AMT_REQ_CREDIT_BUREAU_HOUR'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(-1)
tmp_app_train['AMT_REQ_CREDIT_BUREAU_DAY'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(-1)
tmp_app_train['AMT_REQ_CREDIT_BUREAU_WEEK'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(-1)
tmp_app_train['AMT_REQ_CREDIT_BUREAU_MON'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_MON'].fillna(-1)
tmp_app_train['AMT_REQ_CREDIT_BUREAU_YEAR'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(-1)
tmp_app_train['AMT_REQ_CREDIT_BUREAU_QRT'] = tmp_app_train['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(-1)

In [83]:
tmp_app_test['AMT_REQ_CREDIT_BUREAU_HOUR'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(-1)
tmp_app_test['AMT_REQ_CREDIT_BUREAU_DAY'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(-1)
tmp_app_test['AMT_REQ_CREDIT_BUREAU_WEEK'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(-1)
tmp_app_test['AMT_REQ_CREDIT_BUREAU_MON'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_MON'].fillna(-1)
tmp_app_test['AMT_REQ_CREDIT_BUREAU_YEAR'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(-1)
tmp_app_test['AMT_REQ_CREDIT_BUREAU_QRT'] = tmp_app_test['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(-1)

#### Encoding Categorical features and aligninig train and test data set

In [84]:
#Encoding
tmp_app_train,tmp_app_test = encoding_cat_vars(tmp_app_train,tmp_app_test,le)

no of columns label encoded : 14
no of columns one hot encoded encoded : 0


In [85]:
list1 = list(tmp_app_train.columns)

In [86]:
list2 = list(tmp_app_test.columns)

In [87]:
set(list1).difference(list2)
# Below are missing features in TEST data set

{'TARGET'}

##### Aligning Train and test data sets

In [88]:
tmp_app_train,tmp_app_test = align_train_test(tmp_app_train,tmp_app_test)

Training Features shape:  (307228, 108)
Testing Features shape:  (48744, 107)


In [89]:
tmp_app_test.shape

(48744, 107)

In [90]:
tmp_app_train.shape

(307228, 108)

In [92]:
tmp_app_train.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_11,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOCUMENT_IND_MEAN,DOCUMENT_IND_STD,DOCUMENT_IND_KURT,DOCUMENT_IND_SUM,FLAG_IND_SUM,FLAG_IND_KURT,FLAG_IND_STD,FLAG_IND_MEAN,AMT_ANNUITY_NAN,AMT_ANNUITY_TO_INCOME_RATIO,AMT_CREDIT_TO_INCOME_RATIO,AMT_CREDIT_TO_ANNUITY_RATIO,APP_AMT_ANNUITY_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_INCOME_RATIO_DEFAULT_RATE,APP_AMT_CREDIT_TO_ANNUITY_RATIO_DEFAULT_RATE,EXT_SOURCE_MISSING,ORG_CORRECT,ORG_DEFAULT,DEFAULT_RATE_ORG_TYPE,NAME_TYPE_CORRECT,NAME_TYPE_DEFAULT,DEFAULT_RATE_NAME_TYPE,OCCUPATION_TYPE_CORRECT,OCCUPATION_TYPE_DEFAULT,DEFAULT_RATE_OCC_TYPE,INCOME_TYPE_CORRECT,INCOME_TYPE_DEFAULT,DEFAULT_RATE_INCOME_TYPE,EDUCATION_TYPE_CORRECT,EDUCATION_TYPE_DEFAULT,DEFAULT_RATE_EDUCATION_TYPE,FAMILY_STATUS_CORRECT,FAMILY_STATUS_DEFAULT,DEFAULT_RATE_FAMILY_STATUS,HOUSING_TYPE_CORRECT,HOUSING_TYPE_DEFAULT,DEFAULT_RATE_HOUSING_TYPE,TOTALAREA_NAN,EXT_SOURCE_3_NAN,EMERGENCYSTATE_NAN,AMT_REQ_CREDIT_BUREAU_NAN,DEF_SOCIAL_CIRCLE_NAN,COMMONAREA_NAN,NONLIVINGAPARTMENTS_NAN,LIVINGAPARTMENTS_NAN,EXT_SOURCE_1_NAN,EXT_SOURCE_2_NAN,NONLIVINGAREA_NAN,YEARS_BEGINEXPLUATATION_NAN,APARTMENT_FLAG,DAYS_EMPLOYED_ANOM,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,TARGET
0,100002,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018799,-9461,-637,-3648.0,-2120,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083008,0.262939,0.139404,0.024704,2.0,2.0,2.0,2.0,-1134.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.05,0.223607,20.0,1,4,-1.875,0.516398,0.666667,0,0.121978,2.007889,16.461104,0.077338,0.085951,0.133397,0,61615,6318,0.10254,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003542,-16765,-1188,-1186.0,-291,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311279,0.62207,0.535156,0.095886,1.0,0.0,1.0,0.0,-828.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.223607,20.0,1,4,-1.875,0.516398,0.666667,0,0.132217,4.79075,36.234085,0.077338,0.086806,0.052561,1,8357,526,0.062941,37139,3009,0.08102,25795,1735,0.067261,20433.0,1247.0,0.061029,70791,4006,0.056589,181425.0,14836.0,0.081775,251375,21255,0.084555,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010033,-19046,-225,-4260.0,-2531,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,0.505859,0.556152,0.729492,0.060181,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,5,6.0,0.408248,0.833333,0,0.1,2.0,20.0,0.074597,0.085951,0.089602,1,9665,726,0.075116,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,1,0,1,0,0,1,1,1,1,0,1,1,0,0,0,1,0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008018,-19005,-3039,-9832.0,-2437,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,0.505859,0.650391,0.535156,0.048096,2.0,0.0,2.0,0.0,-617.0,1,0,0,0,0,0,0,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.05,0.223607,20.0,1,3,-3.333333,0.547723,0.5,0,0.2199,2.316167,10.532818,0.085659,0.085951,0.08128,2,61615,6318,0.10254,228185,20337,0.089125,49302,5832,0.118291,143375.0,15210.0,0.106085,198682,19507,0.098182,26791.0,2959.0,0.110448,251375,21255,0.084555,1,1,1,1,0,1,1,1,1,0,1,1,0,0,1,0,0
4,100007,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028656,-19932,-3038,-4312.0,-3458,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,0.505859,0.322754,0.535156,0.171875,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.223607,20.0,1,3,-3.333333,0.547723,0.5,0,0.179963,4.222222,23.461618,0.08352,0.086806,0.048171,2,80,5,0.0625,228185,20337,0.089125,25795,1735,0.067261,143375.0,15210.0,0.106085,198682,19507,0.098182,40934.0,4452.0,0.10876,251375,21255,0.084555,1,1,1,0,0,1,1,1,1,0,1,1,0,0,1,0,0


##### Results to a csv file

In [91]:
tmp_app_train.to_csv('tmp_app_train.csv',index=False)
tmp_app_test.to_csv('tmp_app_test.csv',index=False)