In [3]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix

warnings.filterwarnings('ignore')

# Data Load

In [4]:
# load data

DATA_DIR =  "/../Data/"

ds_names = (
    #"application_train", "application_test", 
    #"bureau", "bureau_balance", 
    #"credit_card_balance", "installments_payments",
    "previous_application","POS_CASH_balance", "credit_card_balance", 
    "application_train"
)

datasets = {}
datasets_transformed = {}

for ds_name in ds_names:
    datasets[ds_name] = pd.read_csv(os.getcwd() + DATA_DIR + f'{ds_name}.csv')

# EDA and Transformation Functions

In [5]:
# Class to summarize the features specified into min, max, mean, count, sum, median, and var
class FeatureSummarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features=None): # no *args or **kargs
        self.features = features
        self.agg_ops = ["min", "max", "count", "sum", "median", "mean", "var"]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        keys = list(set(X.columns) - set(self.features))
        
        result = X.groupby(keys, as_index=False) \
                  .agg({ft:self.agg_ops for ft in self.features}) 
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))
        result.reset_index()
        return result 
    

def runFeatureSummarizer(df, features):
    print(f"df.shape: {df.shape}\n")
    print(f"Aggregated Features:\ndf[{features}][0:5]: \n{df[features][0:5]}")
    pipeline = make_pipeline(FeatureSummarizer(features))
    return(pipeline.fit_transform(df))


def id_num_cat_feature(df):
    numerical = df.select_dtypes(include=['int64', 'float64']).columns
    categorical = df.select_dtypes(include=['object', 'bool']).columns
    feat_num = list(numerical)
    feat_cat = list(categorical)
    
    id_cols = ['SK_ID_CURR','SK_ID_BUREAU']
    
    id_cols = [cols for cols in  list(df.columns.intersection(id_cols))] 
    features = list(set(df.columns) - set(id_cols))
    
    print('--------')
    print(f"# of ID's: {len(id_cols)}")
    print(f" ID's: {id_cols}")
    print('--------')
    print(f"# of numerical   features: {len(feat_num)}")
    print(f"Numerical   features: {feat_num}")
    print('--------')
    print(f"# of categorical features: {len(feat_cat)}")
    print(f"Categorical features: {feat_cat}")
    print('--------')
    print(f"# All features: {len(features)}")
    print(f"All features: {features}")
    return id_cols,feat_num,feat_cat,features

def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) 


def eda_transformation(df,n):

    id_cols, feat_num, feat_cat, features =  id_num_cat_feature(df)

    # One-Hot-Encode categorical variables
    df = pd.get_dummies(data=df, columns=feat_cat)

    features = list(set(df.columns) - set(id_cols))
    feat_ohe = list(set(features) - set(feat_num))

    print(f"# of OHE categorical features: {len(feat_ohe)}")
    print(f"OHE Categorical features: {feat_ohe}")
    print('--------')

    # aggregate bureau variables
    df = runFeatureSummarizer(df, features)

    if n ==3:
        
        feature_selection = [
            df[[column for column in df.columns if not column.startswith('SK_ID_PREV') and column.startswith(tuple(feat_num))]],
            df[[column for column in df.columns if column.startswith('DAYS') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('SK_ID_PREV') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]
     
    if n ==4:
        
        feature_selection = [
            df[[column for column in df.columns if not column.startswith('SK_ID_PREV') and column.startswith(tuple(feat_num))]],
            df[[column for column in df.columns if column.startswith('SK_ID_PREV') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]

    df = pd.concat(feature_selection, axis=1)
    features = list(set(df.columns) - set(id_cols))

    print('--------')
    print('Aggregated Features:')
    print('\n'.join(map(str, sorted(features))))
    print('')
    print('Aggregated Data:')
    print('')
    print(df[features].describe().T)
    return df


def feature_selection(df,num):
    pass
    if feat_method == num:
        # bureau_balance
        feature_selection = [
            df[[column for column in df.columns if not column.startswith(tuple(feat_cat)) and not column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('DAYS_CREDIT') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]
        
def drop_null_columns(df, threshold=0.7):
    # Dropping columns with missing value rate higher than threshold
    df = df[df.columns[df.isnull().mean() < threshold]]

    # Dropping rows with missing value rate higher than threshold
    df = df.loc[df.isnull().mean(axis=1) < threshold]

    return df   

# POS

In [6]:
def pos_eda(df):
    pos = df
    drop_list_pos = []
    
    #Adding new features

    pos['POS_PERC_INSTL_PNDNG']=pos['CNT_INSTALMENT_FUTURE']/pos['CNT_INSTALMENT']
    pos['POS_CNT_INSTAL_PNDNG']=pos['CNT_INSTALMENT']-pos['CNT_INSTALMENT_FUTURE']
    pos['POS_DAYS_WTHT_TOLRNC']=pos['SK_DPD']-pos['SK_DPD_DEF']
    pos['MONTHS_BALANCE'] = pos['MONTHS_BALANCE'].abs()
    
    #replacing " " with _ for OHE cols.
    pos['NAME_CONTRACT_STATUS']=pos['NAME_CONTRACT_STATUS'].apply(lambda x: str(x).replace(" ","_")) 
    
    # Drop elements ijndrop list
    threshold = 0.7

    #Dropping rows with missing value rate higher than threshold
    pos = pos.loc[pos.isnull().mean(axis=1) < threshold]
    
    return (eda_transformation(pos,4))

In [7]:
pos = datasets['POS_CASH_balance']
pos = pos_eda(pos)

#this is used for correlations, but below trasformation removes curr_id
#datasets_transformed['POS_CASH_balance'] = pos.reset_index()


--------
# of ID's: 1
 ID's: ['SK_ID_CURR']
--------
# of numerical   features: 4
Numerical   features: ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'POS_PERC_INSTL_PNDNG', 'POS_CNT_INSTAL_PNDNG']
--------
# of categorical features: 1
Categorical features: ['NAME_CONTRACT_STATUS']
--------
# All features: 10
All features: ['MONTHS_BALANCE', 'POS_PERC_INSTL_PNDNG', 'CNT_INSTALMENT_FUTURE', 'SK_ID_PREV', 'POS_DAYS_WTHT_TOLRNC', 'NAME_CONTRACT_STATUS', 'SK_DPD_DEF', 'CNT_INSTALMENT', 'SK_DPD', 'POS_CNT_INSTAL_PNDNG']
# of OHE categorical features: 14
OHE Categorical features: ['MONTHS_BALANCE', 'NAME_CONTRACT_STATUS_XNA', 'NAME_CONTRACT_STATUS_Signed', 'SK_ID_PREV', 'POS_DAYS_WTHT_TOLRNC', 'NAME_CONTRACT_STATUS_Amortized_debt', 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Demand', 'NAME_CONTRACT_STATUS_Canceled', 'SK_DPD_DEF', 'SK_DPD', 'NAME_CONTRACT_STATUS_Active', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Returned_to_the_store']
--------
df.shape: (10001358, 19)


# PREVAPP

In [8]:
def prevapp_eda(df):
    
    prevapp = df
    drop_list_pa = []
    #Day and Amount columns
    day_cols = [col for col in prevapp.columns if 'DAY' in col]
    amt_cols = [col for col in prevapp.columns if 'AMT' in col]
    #Adding new features

    prevapp['PREV_APRV_CNT'] = prevapp['NAME_CONTRACT_STATUS'].map(lambda x: 1 if (x == 'Approved') else 0)
    prevapp['PREV_REJ_CNT'] = prevapp['NAME_CONTRACT_STATUS'].map(lambda x: 1 if (x == 'Refused') else 0)
    prevapp['PREV_APCTN_CRDT_DIFF'] = prevapp['AMT_APPLICATION'] - prevapp['AMT_CREDIT']
    prevapp['PREV_APCTN_CRDT_RATIO'] = prevapp['AMT_APPLICATION'] / prevapp['AMT_CREDIT']
    prevapp['PREV_CRDT_ANNUTY_RATIO'] = prevapp['AMT_CREDIT']/prevapp['AMT_ANNUITY']
    prevapp['PREV_DWN_PYMNT_CRDT_RATIO'] = prevapp['AMT_DOWN_PAYMENT'] / prevapp['AMT_CREDIT']
    
    
    for c in [co for co in prevapp.columns if 'DAYS' in co]:
        prevapp[c] = prevapp[c].replace({365243.0: np.nan})
        prevapp[c] = prevapp[c].abs()
    
    drop_list_pa.append('WEEKDAY_APPR_PROCESS_START') ## weekday data is normally distributed, so wont make any sense to add that
    drop_list_pa.append('HOUR_APPR_PROCESS_START') ## Hour application started.
    
    
    
    # Drop elements ijndrop list

    drop_list_pa.append('WEEKDAY_APPR_PROCESS_START') ## weekday data is normally distributed, so wont make any sense to add that
    drop_list_pa.append('HOUR_APPR_PROCESS_START') ## Hour application started.

    threshold = 0.7
    drop_list_pa = list(prevapp.columns[prevapp.isnull().mean() > threshold])

    prevapp = prevapp.drop(columns=drop_list_pa, axis=1)

    #drop cols and rows which are more than 70% null
    #Dropping columns with missing value rate higher than threshold
    prevapp = prevapp[prevapp.columns[prevapp.isnull().mean() < threshold]]

    #Dropping rows with missing value rate higher than threshold
    prevapp = prevapp.loc[prevapp.isnull().mean(axis=1) < threshold]
    
    prevapp= eda_transformation(prevapp,3)
    return prevapp



In [9]:
prevapp = datasets['previous_application']
prevapp = prevapp_eda(prevapp)
datasets_transformed['previous_application'] = prevapp.reset_index()

#prevapp.to_csv(os.getcwd() + DATA_DIR + 'agg_prevapp.csv')

--------
# of ID's: 1
 ID's: ['SK_ID_CURR']
--------
# of numerical   features: 24
Numerical   features: ['SK_ID_PREV', 'SK_ID_CURR', 'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'NFLAG_LAST_APPL_IN_DAY', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION', 'SELLERPLACE_AREA', 'CNT_PAYMENT', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL', 'PREV_APRV_CNT', 'PREV_REJ_CNT', 'PREV_APCTN_CRDT_DIFF', 'PREV_APCTN_CRDT_RATIO', 'PREV_CRDT_ANNUTY_RATIO', 'PREV_DWN_PYMNT_CRDT_RATIO']
--------
# of categorical features: 16
Categorical features: ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRO

In [10]:
#prevapp.SK_ID_PREV_count

## PREV CREDIT BALANCE

### 1) A data dictionary of the raw features (test description; data type: numerical, list, etc.)

In [11]:
credit_card_balance_csv = pd.read_csv("./CreditCardBalance.csv")
credit_card_balance_csv

Unnamed: 0,Attribute,DataType,Description
0,SK_ID_PREV,NUMBER,ID of previous credit in Home credit related t...
1,SK_ID_CURR,NUMBER,ID of loan in main sample
2,MONTHS_BALANCE,NUMBER,Month of balance relative to application date ...
3,AMT_BALANCE,NUMBER,Balance during the month of previous credit
4,AMT_CREDIT_LIMIT_ACTUAL,NUMBER,Credit card limit during the month of the prev...
5,AMT_DRAWINGS_ATM_CURRENT,NUMBER,Amount drawing at ATM during the month of the ...
6,AMT_DRAWINGS_CURRENT,NUMBER,Amount drawing during the month of the previou...
7,AMT_DRAWINGS_OTHER_CURRENT,NUMBER,Amount of other drawings during the month of t...
8,AMT_DRAWINGS_POS_CURRENT,NUMBER,Amount drawing or buying goods during the mont...
9,AMT_INST_MIN_REGULARITY,NUMBER,Minimal installment for this month of the prev...


In [12]:
def prev_credit_eda(df):
    df = pd.get_dummies(data=df,
                        columns=['NAME_CONTRACT_STATUS'])
    df = drop_null_columns(df, .7)
    df = eda_transformation(df, 4)
    return df
    #function incase more transformations are done 

In [13]:
prevcreditbalance = datasets['credit_card_balance']
prevcreditbalance = prev_credit_eda(prevcreditbalance)
datasets_transformed['credit_card_balance'] = prevcreditbalance.reset_index()

#prev_credit_balance.to_csv(os.getcwd() + DATA_DIR + 'agg_prevcreditbalance.csv')

--------
# of ID's: 1
 ID's: ['SK_ID_CURR']
--------
# of numerical   features: 22
Numerical   features: ['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF']
--------
# of categorical features: 0
Categorical features: []
--------
# All features: 28
All features: ['AMT_DRAWINGS_ATM_CURRENT', 'SK_DPD', 'NAME_CONTRACT_STATUS_Sent proposal', 'SK_ID_PREV', 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Demand', 'AMT_DRAWINGS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_CREDIT_LIMIT_ACTUAL', 'NAME_CONTRACT_STATUS_Ac

# DF Summary

In [13]:
def feature_summary(df_fa):
    print('DataFrame shape')
    print('Rows:',df_fa.shape[0])
    print('Cols:',df_fa.shape[1])
    print("------------------------------------------------------------------------")
    
    col_list=['Null','%_Null','Unique_Count','Data_type','Max/Min','Mean','Std','Skewness','Sample_values']
    df=pd.DataFrame(index=df_fa.columns,columns=col_list)
    df['Null']=list([len(df_fa[col][df_fa[col].isnull()]) for i,col in enumerate(df_fa.columns)])
    df['%_Null']=round((df_fa.isnull().sum()/df_fa.isnull().count()*100),3).sort_values(ascending = False)
    df['Unique_Count']=list([len(df_fa[col].unique()) for i,col in enumerate(df_fa.columns)])
    df['Data_type']=list([df_fa[col].dtype for i,col in enumerate(df_fa.columns)])
    for i,col in enumerate(df_fa.columns):
        if 'float' in str(df_fa[col].dtype) or 'int' in str(df_fa[col].dtype):
            df.at[col,'Max/Min']=str(round(df_fa[col].max(),2))+'/'+str(round(df_fa[col].min(),2))
            df.at[col,'Mean']=df_fa[col].mean()
            df.at[col,'Std']=df_fa[col].std()
            df.at[col,'Skewness']=df_fa[col].skew()
        df.at[col,'Sample_values']=list(df_fa[col].unique())
    
    print("Table Statistics")
    print("------------------------------------------------------------------------")
    display(df.fillna('-'))


In [14]:
for i,ds_name in enumerate(datasets.keys()):
    print("Table under consideration:",ds_name.upper())
    print("------------------------------------------------------------------------")
    ds = feature_summary(datasets[ds_name])
    print("------------------------------------------------------------------------")
    

Table under consideration: PREVIOUS_APPLICATION
------------------------------------------------------------------------
DataFrame shape
Rows: 1670214
Cols: 43
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_PREV,0,0.0,1670214,int64,2845382/1000001,1923089.135331,532597.958696,-0.000573,"[2030495, 2802425, 2523466, 2819243, 1784265, ..."
SK_ID_CURR,0,0.0,338857,int64,456255/100001,278357.174099,102814.823849,-0.003303,"[271877, 108129, 122040, 176158, 202054, 19938..."
NAME_CONTRACT_TYPE,0,0.0,4,object,-,-,-,-,"[Consumer loans, Cash loans, Revolving loans, ..."
AMT_ANNUITY,372235,22.287,357960,float64,418058.15/0.0,15955.120659,14782.137335,2.692572,"[1730.43, 25188.615, 15060.735, 47041.335, 319..."
AMT_APPLICATION,0,0.0,93885,float64,6905160.0/0.0,175233.86036,292779.762387,3.391442,"[17145.0, 607500.0, 112500.0, 450000.0, 337500..."
AMT_CREDIT,1,0.0,86804,float64,6905160.0/0.0,196114.021218,318574.616546,3.245815,"[17145.0, 679671.0, 136444.5, 470790.0, 404055..."
AMT_DOWN_PAYMENT,895844,53.636,29279,float64,3060045.0/-0.9,6697.402139,20921.49541,36.476576,"[0.0, nan, 12649.5, 1350.0, 9000.0, 13500.0, 4..."
AMT_GOODS_PRICE,385515,23.082,93886,float64,6905160.0/0.0,227847.279283,315396.557937,3.07369,"[17145.0, 607500.0, 112500.0, 450000.0, 337500..."
WEEKDAY_APPR_PROCESS_START,0,0.0,7,object,-,-,-,-,"[SATURDAY, THURSDAY, TUESDAY, MONDAY, FRIDAY, ..."
HOUR_APPR_PROCESS_START,0,0.0,24,int64,23/0,12.484182,3.334028,-0.025629,"[15, 11, 7, 9, 8, 10, 12, 13, 14, 16, 6, 4, 5,..."


------------------------------------------------------------------------
Table under consideration: POS_CASH_BALANCE
------------------------------------------------------------------------
DataFrame shape
Rows: 10001358
Cols: 11
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_PREV,0,0.0,936325,int64,2843499/1000001,1903216.598957,535846.530722,0.044229,"[1803195, 1715348, 1784872, 1903291, 2341044, ..."
SK_ID_CURR,0,0.0,337252,int64,456255/100001,278403.863306,102763.74509,-0.003128,"[182943, 367990, 397406, 269225, 334279, 34216..."
MONTHS_BALANCE,0,0.0,96,int64,96/1,35.012588,26.06657,0.672777,"[31, 33, 32, 35, 38, 39, 34, 41, 37, 40, 43, 3..."
CNT_INSTALMENT,26071,0.261,74,float64,92.0/1.0,17.08965,11.995056,1.601734,"[48.0, 36.0, 12.0, 24.0, 60.0, 18.0, 4.0, 42.0..."
CNT_INSTALMENT_FUTURE,26087,0.261,80,float64,85.0/0.0,10.48384,11.109058,1.846746,"[45.0, 35.0, 9.0, 42.0, 12.0, 43.0, 36.0, 16.0..."
NAME_CONTRACT_STATUS,0,0.0,9,object,-,-,-,-,"[Active, Completed, Signed, Approved, Returned..."
SK_DPD,0,0.0,3400,int64,4231/0,11.606928,132.714043,14.899126,"[0, 1, 2, 4, 3, 18, 7, 5, 12, 6, 8, 13, 16, 10..."
SK_DPD_DEF,0,0.0,2307,int64,3595/0,0.654468,32.762491,66.339906,"[0, 1, 2, 4, 3, 18, 7, 5, 12, 8, 13, 10, 15, 6..."
POS_PERC_INSTL_PNDNG,26184,0.262,1280,float64,6.67/0.0,0.546994,0.330309,-0.24752,"[0.9375, 0.9722222222222222, 0.75, 0.875, 1.0,..."
POS_CNT_INSTAL_PNDNG,26184,0.262,120,float64,72.0/-51.0,6.605944,5.923767,1.667505,"[3.0, 1.0, 6.0, 0.0, 5.0, 8.0, 7.0, 9.0, 11.0,..."


------------------------------------------------------------------------
Table under consideration: CREDIT_CARD_BALANCE
------------------------------------------------------------------------
DataFrame shape
Rows: 3840312
Cols: 23
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_PREV,0,0.0,104307,int64,2843496/1000018,1904503.5899,536469.470563,0.038385,"[2562384, 2582071, 1740877, 1389973, 1891521, ..."
SK_ID_CURR,0,0.0,103558,int64,456250/100006,278324.207289,102704.475133,-0.001834,"[378907, 363914, 371185, 337855, 126868, 38001..."
MONTHS_BALANCE,0,0.0,96,int64,-1/-96,-34.521921,26.667751,-0.59804,"[-6, -1, -7, -4, -5, -3, -2, -19, -13, -18, -1..."
AMT_BALANCE,0,0.0,1347904,float64,1505902.19/-420250.18,58300.155262,106307.031025,2.920173,"[56.97, 63975.555, 31815.225, 236572.11, 45391..."
AMT_CREDIT_LIMIT_ACTUAL,0,0.0,181,int64,1350000/0,153807.9574,165145.699523,2.059732,"[135000, 45000, 450000, 225000, 270000, 585000..."
AMT_DRAWINGS_ATM_CURRENT,749816,19.525,2268,float64,2115000.0/-6827.31,5961.324822,28225.688579,9.664842,"[0.0, 2250.0, 67500.0, 45000.0, 90000.0, 76500..."
AMT_DRAWINGS_CURRENT,0,0.0,187005,float64,2287098.31/-6211.62,7433.388179,33846.077334,10.065626,"[877.5, 2250.0, 0.0, 11547.0, 67500.0, 45000.0..."
AMT_DRAWINGS_OTHER_CURRENT,749816,19.525,1833,float64,1529847.0/0.0,288.169582,8201.989345,50.57035,"[0.0, 137700.0, nan, 177840.0, 46800.0, 187200..."
AMT_DRAWINGS_POS_CURRENT,749816,19.525,168749,float64,2239274.16/0.0,2968.804848,20796.887047,19.421081,"[877.5, 0.0, 11547.0, 199339.425, 34526.7, 968..."
AMT_INST_MIN_REGULARITY,305236,7.948,312267,float64,202882.01/0.0,3540.204129,5600.154122,2.494431,"[1700.325, 2250.0, 11795.76, 22924.89, 4449.10..."


------------------------------------------------------------------------
Table under consideration: APPLICATION_TRAIN
------------------------------------------------------------------------
DataFrame shape
Rows: 307511
Cols: 122
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_CURR,0,0.000,307511,int64,456255/100002,278180.518577,102790.175348,-0.0012,"[100002, 100003, 100004, 100006, 100007, 10000..."
TARGET,0,0.000,2,int64,1/0,0.080729,0.272419,3.078159,"[1, 0]"
NAME_CONTRACT_TYPE,0,0.000,2,object,-,-,-,-,"[Cash loans, Revolving loans]"
CODE_GENDER,0,0.000,3,object,-,-,-,-,"[M, F, XNA]"
FLAG_OWN_CAR,0,0.000,2,object,-,-,-,-,"[N, Y]"
...,...,...,...,...,...,...,...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,41519,13.502,10,float64,9.0/0.0,0.007,0.110757,27.043505,"[0.0, nan, 1.0, 3.0, 2.0, 4.0, 5.0, 6.0, 9.0, ..."
AMT_REQ_CREDIT_BUREAU_WEEK,41519,13.502,10,float64,8.0/0.0,0.034362,0.204685,9.293573,"[0.0, nan, 1.0, 3.0, 2.0, 4.0, 5.0, 6.0, 8.0, ..."
AMT_REQ_CREDIT_BUREAU_MON,41519,13.502,25,float64,27.0/0.0,0.267395,0.916002,7.804848,"[0.0, nan, 1.0, 2.0, 6.0, 5.0, 3.0, 7.0, 9.0, ..."
AMT_REQ_CREDIT_BUREAU_QRT,41519,13.502,12,float64,261.0/0.0,0.265474,0.794056,134.365776,"[0.0, nan, 1.0, 2.0, 4.0, 3.0, 8.0, 5.0, 6.0, ..."


------------------------------------------------------------------------


## Correlation Against Target

To determine the correlation, the train set was indivdiaully joined to the respect tables and then the correlation caculated. A left join was 
done against the trainset. <b>On second thought a right join maybe the right decision agaist the child tables??<b>

In [14]:
def correlation_against_target(df):
    df_joined = {} 
    df_joined =  datasets['application_train'].merge(df, on='SK_ID_CURR', how='left')
    return df_joined[df_joined.columns[:]].corr()['TARGET'].sort_values(ascending=False)

In [15]:
original_corr = {}
for i,ds_name in enumerate(datasets.keys()):
    if( ds_name.upper() != "APPLICATION_TRAIN"):
        print("------------------------------------------------------------------------")
        print("Correlation for Orignal Table:", ds_name.upper())
        print("------------------------------------------------------------------------")
        ds = correlation_against_target(datasets[ds_name])
        print("------------------------------------------------------------------------")
        original_corr[ds_name] = ds
        print(ds)
        print("------------------------------------------------------------------------")
        print("------------------------------------------------------------------------")

------------------------------------------------------------------------
Correlation for Orignal Table: PREVIOUS_APPLICATION
------------------------------------------------------------------------
------------------------------------------------------------------------
TARGET                         1.000000
DAYS_BIRTH                     0.074314
REGION_RATING_CLIENT_W_CITY    0.059832
DAYS_LAST_PHONE_CHANGE         0.058110
REGION_RATING_CLIENT           0.057135
                                 ...   
PREV_APRV_CNT                 -0.049161
DAYS_FIRST_DRAWING            -0.095723
EXT_SOURCE_1                  -0.153187
EXT_SOURCE_2                  -0.155211
EXT_SOURCE_3                  -0.188381
Name: TARGET, Length: 132, dtype: float64
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
Correlation for Orignal Tabl

In [16]:
transformed_corr = {}
for i,ds_name in enumerate(datasets_transformed.keys()):
    if( ds_name.upper() != "APPLICATION_TRAIN"):
        print("------------------------------------------------------------------------")
        print("Correlation for Transformed Table:", ds_name.upper())
        print("------------------------------------------------------------------------")
        ds = correlation_against_target(datasets_transformed[ds_name])
        print("------------------------------------------------------------------------")
        transformed_corr[ds_name] = ds
        print(ds)
        print("------------------------------------------------------------------------")
        print("------------------------------------------------------------------------")

------------------------------------------------------------------------
Correlation for Transformed Table: PREVIOUS_APPLICATION
------------------------------------------------------------------------
------------------------------------------------------------------------
TARGET                                           1.000000
DAYS_BIRTH                                       0.078239
NAME_CONTRACT_STATUS_Refused_mean                0.077671
PREV_REJ_CNT_mean                                0.077671
PREV_REJ_CNT_var                                 0.075867
                                                   ...   
EXT_SOURCE_3                                    -0.178919
NAME_GOODS_CATEGORY_Animals_median                    NaN
NAME_GOODS_CATEGORY_House Construction_median         NaN
NAME_GOODS_CATEGORY_House Construction_mean           NaN
NAME_GOODS_CATEGORY_House Construction_var            NaN
Name: TARGET, Length: 706, dtype: float64
---------------------------------------------