##  Data Preprocessing

Step 2: 

Perform various transformations on our data as required for efficient and effective processing by Machine Learning algorithms based on the type and needs of individual features, identified through the data explorations performed in Step 1 - Data Exploration - Application Data.

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import LabelBinarizer

# Show all of the data in a dataframe
pd.set_option('display.max_columns', None)

In [2]:
# Training Set
application_train = pd.read_csv('data/application_train.csv')
print("Loaded Training Set: {0} rows".format(application_train.shape[0]))

application_test = pd.read_csv('data/application_test.csv')
print("Loaded Training Set: {0} rows".format(application_test.shape[0]))

columns = pd.read_csv('data/HomeCredit_columns_description.csv')

Loaded Training Set: 307511 rows
Loaded Training Set: 48744 rows


In [3]:
# Load the list of features indentified in the exploration step which need preprocessing
non_numeric_features = pd.read_csv('data/tmp/non_numeric_features.csv', header=0, index_col=0, names=["feature"])
numeric_features = pd.read_csv('data/tmp/numeric_features.csv', header=0, index_col=0, names=["feature"])
string_to_bool_features = pd.read_csv('data/tmp/string_to_bool.csv', header=0, index_col=0, names=["feature"])
log_transform_features = pd.read_csv('data/tmp/log_transform.csv', header=0, index_col=0, names=["feature"])

In [4]:
print("Non-Numeric {0}, Numeric {1}, Bool {2}, Log Transform {3}".format(\
        len(non_numeric_features), len(numeric_features), \
        len(string_to_bool_features), len(log_transform_features)))

Non-Numeric 15, Numeric 61, Bool 6, Log Transform 11


In [5]:
'''
Scales a list of numeric features to a range of [0 .. 1] without changing the distribution of the data.  
Accepts a List of column names.  Returns a new dataFrame.
''' 
def scale_features(dataFrame, featureList): 
    dataFrame_transform = pd.DataFrame(data = dataFrame)
    dataFrame_transform[featureList] = scaler.fit_transform(dataFrame_transform[featureList])
    return dataFrame_transform

In [6]:
''' 
Applies an in-place transformation that converts a Y/N field to binary 1/0
'''
def make_bool(dataFrame, featureName): 
    
    lb = LabelBinarizer()
    bool_name = "BOOL_{0}".format(featureName)    
    dataFrame[bool_name] = lb.fit_transform(dataFrame[featureName])
    
    # This seems to work more consistently than df.drop
    # From: https://stackoverflow.com/questions/43838198/df-drop-is-not-working
    del(dataFrame[featureName])    
    #result = dataFrame.drop(featureName, axis=1) 
    
    return dataFrame

In [7]:
''' 
Applies an in-place transformation that converts NaN, Inf and -Inf to numeric values
'''
def make_numeric(dataFrame, featureName): 
        
    return np.nan_to_num(dataFrame[featureName])

In [8]:
'''
Applies an in-place log transformation to numeric features
'''
def log_transform(dataFrame, featureName):     
    
    transformed = dataFrame[featureName].apply(lambda x: np.log(x + 1))
    
    transformed_name = "LOG_{0}".format(featureName)

    print("Transformed Name: {0}".format(transformed_name))
    
    dataFrame[transformed_name] = transformed

    # This seems to work more consistently than df.drop
    # From: https://stackoverflow.com/questions/43838198/df-drop-is-not-working
    del(dataFrame[featureName])
    #result = dataFrame.drop(featureName, axis=1)
    
    return dataFrame

In [9]:
'''
Applies an in-place transformation that offsets all values in a column by the minimum value in the column
'''

def find_offset(dataFrames, featureName): 
    
    offset = 0
    
    for df in dataFrames: 
        if (np.min(df[featureName]) < offset): 
            offset = np.min(df[featureName])

    return abs(offset) + 1
            
            
def offset_negative_values(dataFrame, featureName, offset): 
    print("Offsetting Feature {0} by {1}".format(featureName, offset))
    return dataFrame[featureName].apply(lambda x: (x + offset)), offset

## Preprocessing

### Individual Problematic Features

In [10]:
# DAYS_EMPLOYED

# A bunch of records indicate that the person has worked for ~100 years.
# I'm just going to zero them out, because they're invalid
# NOTE: NaN might ultimately be a better choice

mean_days_employed = np.mean(application_train['DAYS_EMPLOYED'])
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].replace({365243: mean_days_employed})
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].replace({365243: mean_days_employed})

application_train['FONDKAPREMONT_MODE'] = application_train['FONDKAPREMONT_MODE'].replace({np.nan: "not specified"})
application_test['FONDKAPREMONT_MODE'] = application_test['FONDKAPREMONT_MODE'].replace({np.nan: "not specified"})

application_train['HOUSETYPE_MODE'] = application_train['FONDKAPREMONT_MODE'].replace({np.nan: "not specified"})
application_test['HOUSETYPE_MODE'] = application_test['FONDKAPREMONT_MODE'].replace({np.nan: "not specified"})

application_train['WALLSMATERIAL_MODE'] = application_train['WALLSMATERIAL_MODE'].replace({np.nan: "not specified"})
application_test['WALLSMATERIAL_MODE'] = application_test['WALLSMATERIAL_MODE'].replace({np.nan: "not specified"})

### Convert Y/N String Fields to Boolean

In [11]:
# Convert any Y/N string fields to boolean
for feature in string_to_bool_features['feature']: 
    print("Making Boolean: {0}".format(feature))
    application_test = make_bool(application_test, feature)
    application_train = make_bool(application_train, feature)

Making Boolean: FLAG_OWN_CAR
Making Boolean: FLAG_OWN_REALTY
Making Boolean: FLAG_EMP_PHONE
Making Boolean: FLAG_WORK_PHONE
Making Boolean: FLAG_PHONE
Making Boolean: FLAG_EMAIL


### One-Hot Encode Non-Numeric Features

In [12]:
nnf = non_numeric_features['feature']
print(nnf)

# One-Hot Encode all of our non-numeric features
application_test = pd.get_dummies(application_test, columns=nnf)
application_train = pd.get_dummies(application_train, columns=nnf)

print("Training Set Columns: {0}".format(application_train.shape[1]))
print("Testing Set Columns: {0}".format(application_test.shape[1]))

# Ensure that train and test sets have the same number of columns
# from https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data/41339045
application_train,application_test = application_train.align(application_test, join='outer', axis=1, fill_value=0)

print("Aligned Training Set Columns: {0}".format(application_train.shape[1]))
print("Aligned Testing Set Columns: {0}".format(application_test.shape[1]))

0                    CODE_GENDER
1             NAME_CONTRACT_TYPE
2                NAME_TYPE_SUITE
3               NAME_INCOME_TYPE
4            NAME_EDUCATION_TYPE
5             NAME_FAMILY_STATUS
6              NAME_HOUSING_TYPE
7                OCCUPATION_TYPE
8     WEEKDAY_APPR_PROCESS_START
9        HOUR_APPR_PROCESS_START
10             ORGANIZATION_TYPE
11            FONDKAPREMONT_MODE
12           EMERGENCYSTATE_MODE
13                HOUSETYPE_MODE
14            WALLSMATERIAL_MODE
Name: feature, dtype: object
Training Set Columns: 269
Testing Set Columns: 265
Aligned Training Set Columns: 269
Aligned Testing Set Columns: 269


### Log Transform Skewed Numeric Features

In [13]:
# These features need log transformations but have negative values, and log() of negative values is undefined.
# We need to offset all the values in the feature such that they're positive. 
# We'll want to find the minimum value across the test and train dataset so that we offset consistently

offset = find_offset([application_train, application_test], 'DAYS_BIRTH')
application_train['DAYS_BIRTH'], offset = offset_negative_values(application_train, "DAYS_BIRTH", offset)
application_test['DAYS_BIRTH'], offset = offset_negative_values(application_train, "DAYS_BIRTH", offset)

offset = find_offset([application_train, application_test], 'DAYS_LAST_PHONE_CHANGE')
application_train['DAYS_LAST_PHONE_CHANGE'], offset = offset_negative_values(application_train, "DAYS_LAST_PHONE_CHANGE", offset)
application_test['DAYS_LAST_PHONE_CHANGE'], offset = offset_negative_values(application_train, "DAYS_LAST_PHONE_CHANGE", offset)

offset = find_offset([application_train, application_test], 'DAYS_EMPLOYED')
application_train['DAYS_EMPLOYED'], offset = offset_negative_values(application_train, "DAYS_EMPLOYED", offset)
application_test['DAYS_EMPLOYED'], offset = offset_negative_values(application_train, "DAYS_EMPLOYED", offset)

offset = find_offset([application_train, application_test], 'DAYS_ID_PUBLISH')
application_train['DAYS_ID_PUBLISH'], offset = offset_negative_values(application_train, "DAYS_ID_PUBLISH", offset)
application_test['DAYS_ID_PUBLISH'], offset = offset_negative_values(application_train, "DAYS_ID_PUBLISH", offset)

offset = find_offset([application_train, application_test], 'DAYS_REGISTRATION')
application_train['DAYS_REGISTRATION'], offset = offset_negative_values(application_train, "DAYS_REGISTRATION", offset)
application_test['DAYS_REGISTRATION'], offset = offset_negative_values(application_train, "DAYS_REGISTRATION", offset)

offset = find_offset([application_train, application_test], 'DAYS_EMPLOYED')
application_train['DAYS_EMPLOYED'], offset = offset_negative_values(application_train, "DAYS_EMPLOYED", offset)
application_test['DAYS_EMPLOYED'], offset = offset_negative_values(application_train, "DAYS_EMPLOYED", offset)


Offsetting Feature DAYS_BIRTH by 25230
Offsetting Feature DAYS_BIRTH by 25230
Offsetting Feature DAYS_LAST_PHONE_CHANGE by 4362.0
Offsetting Feature DAYS_LAST_PHONE_CHANGE by 4362.0
Offsetting Feature DAYS_EMPLOYED by 17913.0
Offsetting Feature DAYS_EMPLOYED by 17913.0
Offsetting Feature DAYS_ID_PUBLISH by 7198
Offsetting Feature DAYS_ID_PUBLISH by 7198
Offsetting Feature DAYS_REGISTRATION by 24673.0
Offsetting Feature DAYS_REGISTRATION by 24673.0
Offsetting Feature DAYS_EMPLOYED by 1
Offsetting Feature DAYS_EMPLOYED by 1


In [14]:
# Ensure all values are numeric, then log transform them
for feature in log_transform_features['feature']: 
        application_train[feature] = make_numeric(application_train, feature)
        application_test[feature] = make_numeric(application_test, feature)
    
        print("Performing Log Transformations on: {0}".format(feature))
        application_test = log_transform(application_test, feature)
        application_train = log_transform(application_train, feature)


Performing Log Transformations on: CNT_CHILDREN
Performing Log Transformations on: AMT_INCOME_TOTAL
Performing Log Transformations on: AMT_CREDIT
Performing Log Transformations on: AMT_ANNUITY
Performing Log Transformations on: AMT_GOODS_PRICE
Performing Log Transformations on: OWN_CAR_AGE
Performing Log Transformations on: OBS_30_CNT_SOCIAL_CIRCLE
Performing Log Transformations on: DEF_30_CNT_SOCIAL_CIRCLE
Performing Log Transformations on: OBS_60_CNT_SOCIAL_CIRCLE
Performing Log Transformations on: DEF_60_CNT_SOCIAL_CIRCLE
Performing Log Transformations on: DAYS_LAST_PHONE_CHANGE


### Remove Non-Numeric Values from Numeric Fields

In [15]:
for feature in numeric_features['feature']:
    print("Removing Non-Numeric Values from Feature {0}".format(feature))
    application_train[feature] = make_numeric(application_train, feature)
    application_test[feature] = make_numeric(application_test, feature)

Removing Non-Numeric Values from Feature REGION_POPULATION_RELATIVE
Removing Non-Numeric Values from Feature DAYS_BIRTH
Removing Non-Numeric Values from Feature DAYS_EMPLOYED
Removing Non-Numeric Values from Feature DAYS_REGISTRATION
Removing Non-Numeric Values from Feature DAYS_ID_PUBLISH
Removing Non-Numeric Values from Feature CNT_FAM_MEMBERS
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_HOUR
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_DAY
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_WEEK
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_MON
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_QRT
Removing Non-Numeric Values from Feature AMT_REQ_CREDIT_BUREAU_YEAR
Removing Non-Numeric Values from Feature APARTMENTS_AVG
Removing Non-Numeric Values from Feature APARTMENTS_MEDI
Removing Non-Numeric Values from Feature APARTMENTS_MODE
Removing Non-Numeric Values from Feature BASEMENTAREA_AVG
Removing Non-Numeri

### Scale Numeric Fields

In [16]:
'''
TODO: I think we'll have to: 
 - put the numeric features in their own DataFrame
 - scale them
 - drop the original features from the original data frames
 - add the dataframes with the scaled features back to the original dataframes
'''

 
numeric = [] # We need a list of all numeric features
ltf = log_transform_features['feature']
nf = numeric_features['feature']

# Construct a DataFrame with just the numeric features
#application_train_numeric = pd.DataFrame()
#application_test_numeric = pd.DataFrame()

# Initialize a scaler, then apply it to the features
# RobustScaler has superior support for data with outliers
scaler = StandardScaler()

for feature in ltf:
    #numeric.append("LOG_{0}".format(feature))
    numeric.append("{0}".format(feature))

for feature in nf: 
    numeric.append("{0}".format(feature))
    
for feature in numeric: 
       
    scaler = scaler.fit(application_train[numeric])
    application_train[numeric] = scaler.transform(application_train[numeric])
    application_test[numeric] = scaler.transform(application_test[numeric])

### Preview
Show our transformed dataset

In [17]:
# Show an example of a record with scaling applied
display(application_train.head(n = 5))

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,APARTMENTS_MEDI,APARTMENTS_MODE,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,BASEMENTAREA_MODE,BOOL_FLAG_EMAIL,BOOL_FLAG_EMP_PHONE,BOOL_FLAG_OWN_CAR,BOOL_FLAG_OWN_REALTY,BOOL_FLAG_PHONE,BOOL_FLAG_WORK_PHONE,CNT_CHILDREN,CNT_FAM_MEMBERS,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,COMMONAREA_AVG,COMMONAREA_MEDI,COMMONAREA_MODE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,DAYS_REGISTRATION,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,ELEVATORS_AVG,ELEVATORS_MEDI,ELEVATORS_MODE,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,ENTRANCES_AVG,ENTRANCES_MEDI,ENTRANCES_MODE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,FLAG_CONT_MOBILE,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_2,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_MOBIL,FLOORSMAX_AVG,FLOORSMAX_MEDI,FLOORSMAX_MODE,FLOORSMIN_AVG,FLOORSMIN_MEDI,FLOORSMIN_MODE,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUR_APPR_PROCESS_START_0,HOUR_APPR_PROCESS_START_1,HOUR_APPR_PROCESS_START_10,HOUR_APPR_PROCESS_START_11,HOUR_APPR_PROCESS_START_12,HOUR_APPR_PROCESS_START_13,HOUR_APPR_PROCESS_START_14,HOUR_APPR_PROCESS_START_15,HOUR_APPR_PROCESS_START_16,HOUR_APPR_PROCESS_START_17,HOUR_APPR_PROCESS_START_18,HOUR_APPR_PROCESS_START_19,HOUR_APPR_PROCESS_START_2,HOUR_APPR_PROCESS_START_20,HOUR_APPR_PROCESS_START_21,HOUR_APPR_PROCESS_START_22,HOUR_APPR_PROCESS_START_23,HOUR_APPR_PROCESS_START_3,HOUR_APPR_PROCESS_START_4,HOUR_APPR_PROCESS_START_5,HOUR_APPR_PROCESS_START_6,HOUR_APPR_PROCESS_START_7,HOUR_APPR_PROCESS_START_8,HOUR_APPR_PROCESS_START_9,HOUSETYPE_MODE_not specified,HOUSETYPE_MODE_org spec account,HOUSETYPE_MODE_reg oper account,HOUSETYPE_MODE_reg oper spec account,LANDAREA_AVG,LANDAREA_MEDI,LANDAREA_MODE,LIVE_CITY_NOT_WORK_CITY,LIVE_REGION_NOT_WORK_REGION,LIVINGAPARTMENTS_AVG,LIVINGAPARTMENTS_MEDI,LIVINGAPARTMENTS_MODE,LIVINGAREA_AVG,LIVINGAREA_MEDI,LIVINGAREA_MODE,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NONLIVINGAPARTMENTS_AVG,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_AVG,NONLIVINGAREA_MEDI,NONLIVINGAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,OWN_CAR_AGE,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,SK_ID_CURR,TARGET,TOTALAREA_MODE,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_not specified,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE
0,-0.166065,-0.478095,-0.505662,0.142129,-0.058766,-0.070987,-0.269947,-0.30862,-0.155837,-0.346719,-0.34518,-0.342055,-0.327407,0.003066,0.006034,0.028603,0,1,0,1,1,0,-0.577538,-1.265685,0,1,0,0.018432,0.020727,0.034883,1.50688,-0.39857,0.579154,-0.206972,0.379837,4.163504,5.25326,-0.368513,-0.365348,-0.355991,1,0,-0.051932,-0.049475,-0.030398,-0.476522,-1.301593,-1.007542,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.198993,-0.197647,-0.188297,0.355764,0.356104,0.368213,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.162782,0.165022,0.185154,0,0,-0.166853,-0.165974,-0.154904,-0.36414,-0.362243,-0.346934,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.100846,-0.099729,-0.095487,-0.261343,-0.258124,-0.247602,0.242861,0.252132,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.455284,-0.149452,2,2,0,0,0,0,100002,1,-0.411375,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0.960978,0.960861,0.961652,1.016443,1.023576,1.042389
1,0.59271,1.72545,1.60048,0.426792,-0.058766,-0.070987,-0.269947,-0.30862,-0.155837,-0.885565,0.396431,0.401242,0.380977,0.236003,0.23993,0.252119,0,1,0,0,1,0,-0.577538,-0.167621,1,0,0,1.012286,1.017847,0.81437,-0.166821,-0.420157,1.790855,0.163117,1.078697,-0.32048,-0.275663,0.431106,0.436172,0.468762,1,0,-0.387436,-0.385082,-0.369801,0.323239,0.566501,-1.527258,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1.165958,1.166763,1.193952,1.824859,1.824092,1.852433,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,-0.228092,-0.227585,-0.222405,0,0,0.648129,0.655182,0.617489,0.014866,0.017768,0.027531,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0.045223,0.047355,-0.095487,-0.059838,-0.054163,-0.247602,-0.174085,-0.168527,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.455284,-1.25275,1,1,0,0,0,0,100003,0,0.197887,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.987275,0.987155,0.987948,1.505968,1.504599,1.508933
2,-1.404507,-1.152888,-1.090025,-0.427196,-0.058766,-0.070987,-0.269947,-0.30862,-0.155837,-0.885565,-0.602452,-0.600864,-0.59305,-0.534146,-0.533388,-0.523697,0,1,1,1,1,1,-0.577538,-1.265685,0,1,0,-0.28919,-0.288724,-0.283095,-0.689509,-0.382429,0.306869,0.17884,0.206116,-0.32048,-0.275663,-0.368513,-0.365348,-0.355991,0,0,-0.72294,-0.72069,-0.709204,-0.767499,0.221612,1.193214,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.744581,-0.743018,-0.740798,-0.525834,-0.524829,-0.522462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,-0.440702,-0.440853,-0.431913,0,0,-0.455166,-0.455213,-0.45302,-0.564727,-0.56318,-0.555204,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.100846,-0.099729,-0.095487,-0.261343,-0.258124,-0.247602,-0.591031,-0.589187,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.430559,-0.783451,2,2,0,0,0,0,100004,0,-0.572048,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,-1.020852,-1.020765,-1.020127,-0.698001,-0.698343,-0.698841
3,0.177929,-0.71143,-0.651753,-0.142533,-0.058766,-0.070987,-0.269947,-0.30862,-0.155837,-0.885565,-0.602452,-0.600864,-0.59305,-0.534146,-0.533388,-0.523697,0,1,0,1,0,0,-0.577538,-0.167621,1,0,0,-0.28919,-0.288724,-0.283095,-0.680114,-0.492674,0.369143,0.418309,-1.375829,-0.32048,-0.275663,-0.368513,-0.365348,-0.355991,0,0,-0.72294,-0.72069,-0.709204,-0.767499,0.7131,-1.527258,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.744581,-0.743018,-0.740798,-0.525834,-0.524829,-0.522462,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-0.440702,-0.440853,-0.431913,0,0,-0.455166,-0.455213,-0.45302,-0.564727,-0.56318,-0.555204,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.100846,-0.099729,-0.095487,-0.261343,-0.258124,-0.247602,0.242861,0.252132,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.455284,-0.928991,2,2,0,0,0,0,100006,0,-0.572048,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-1.020852,-1.020765,-1.020127,-0.698001,-0.698343,-0.698841
4,-0.361658,-0.213734,-0.06739,-0.199466,-0.058766,-0.070987,-0.269947,-0.30862,-0.155837,-0.885565,-0.602452,-0.600864,-0.59305,-0.534146,-0.533388,-0.523697,0,1,0,1,0,0,-0.577538,-1.265685,0,1,0,-0.28919,-0.288724,-0.283095,-0.892535,-0.492635,-0.307263,-0.173107,0.191639,-0.32048,-0.275663,-0.368513,-0.365348,-0.355991,0,0,-0.72294,-0.72069,-0.709204,-0.767499,-0.990729,-1.527258,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,-0.744581,-0.743018,-0.740798,-0.525834,-0.524829,-0.522462,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-0.440702,-0.440853,-0.431913,1,0,-0.455166,-0.455213,-0.45302,-0.564727,-0.56318,-0.555204,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,-0.100846,-0.099729,-0.095487,-0.261343,-0.258124,-0.247602,-0.591031,-0.589187,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.455284,0.56357,2,2,0,1,0,0,100007,0,-0.572048,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,-1.020852,-1.020765,-1.020127,-0.698001,-0.698343,-0.698841


### Output
Save out our preprocessed data to temporary intermediate files

In [18]:
application_train.to_csv('data/tmp/application_train_standard_scaler_no_log_transformation_preprocessed.csv')
application_test.to_csv('data/tmp/application_test_standard_scaler_no_log_transformation_preprocessed.csv')