In [1]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix

warnings.filterwarnings('ignore')



# Data Load

In [2]:
# load data

DATA_DIR =  "/../Data/"

ds_names = (
    #"application_train", "application_test", 
    #"bureau", "bureau_balance", 
    #"credit_card_balance", "installments_payments",
    "previous_application","POS_CASH_balance"
)

datasets = {}

for ds_name in ds_names:
    datasets[ds_name] = pd.read_csv(os.getcwd() + DATA_DIR + f'{ds_name}.csv')

# EDA and Transformation Functions

In [3]:


# Class to summarize the features specified into min, max, mean, count, sum, median, and var
class FeatureSummarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features=None): # no *args or **kargs
        self.features = features
        self.agg_ops = ["min", "max", "count", "sum", "median", "mean", "var"]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        keys = list(set(X.columns) - set(self.features))
        
        result = X.groupby(keys, as_index=False) \
                  .agg({ft:self.agg_ops for ft in self.features}) 
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))
        
        return result 
    

def runFeatureSummarizer(df, features):
    print(f"df.shape: {df.shape}\n")
    print(f"Aggregated Features:\ndf[{features}][0:5]: \n{df[features][0:5]}")
    pipeline = make_pipeline(FeatureSummarizer(features))
    return(pipeline.fit_transform(df))


def id_num_cat_feature(df):
    numerical = df.select_dtypes(include=['int64', 'float64']).columns
    categorical = df.select_dtypes(include=['object', 'bool']).columns
    feat_num = list(numerical)
    feat_cat = list(categorical)
    
    id_cols = ['SK_ID_CURR','SK_ID_BUREAU']
    
    id_cols = [cols for cols in  list(df.columns.intersection(id_cols))] 
    features = list(set(df.columns) - set(id_cols))
    
    print('--------')
    print(f"# of ID's: {len(id_cols)}")
    print(f" ID's: {id_cols}")
    print('--------')
    print(f"# of numerical   features: {len(feat_num)}")
    print(f"Numerical   features: {feat_num}")
    print('--------')
    print(f"# of categorical features: {len(feat_cat)}")
    print(f"Categorical features: {feat_cat}")
    print('--------')
    print(f"# All features: {len(features)}")
    print(f"All features: {features}")
    return id_cols,feat_num,feat_cat,features

def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) 


def eda_transformation(df,n):

    id_cols, feat_num, feat_cat, features =  id_num_cat_feature(df)

    # One-Hot-Encode categorical variables
    df = pd.get_dummies(data=df, columns=feat_cat)

    features = list(set(df.columns) - set(id_cols))
    feat_ohe = list(set(features) - set(feat_num))

    print(f"# of OHE categorical features: {len(feat_ohe)}")
    print(f"OHE Categorical features: {feat_ohe}")
    print('--------')

    # aggregate bureau variables
    df = runFeatureSummarizer(df, features)

    if n ==3:
        
        feature_selection = [
            df[[column for column in df.columns if not column.startswith('SK_ID_PREV') and column.startswith(tuple(feat_num))]],
            df[[column for column in df.columns if column.startswith('DAYS') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('SK_ID_PREV') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]
     
    if n ==4:
        
        feature_selection = [
            df[[column for column in df.columns if not column.startswith('SK_ID_PREV') and column.startswith(tuple(feat_num))]],
            df[[column for column in df.columns if column.startswith('SK_ID_PREV') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]

    df = pd.concat(feature_selection, axis=1)
    features = list(set(df.columns) - set(id_cols))

    print('--------')
    print('Aggregated Features:')
    print('\n'.join(map(str, sorted(features))))
    print('')
    print('Aggregated Data:')
    print('')
    print(df[features].describe().T)
    return df


def feature_selection(df,num):
    pass
    if feat_method == num:
        # bureau_balance
        feature_selection = [
            df[[column for column in df.columns if not column.startswith(tuple(feat_cat)) and not column.endswith('count')]],
            df[[column for column in df.columns if column.startswith('DAYS_CREDIT') and column.endswith('count')]],
            df[[column for column in df.columns if column.startswith(tuple(feat_cat)) and column.endswith(('mean', 'median', 'var'))]]
        ]
    



# POS

In [4]:
def pos_eda(df):
    pos = df
    drop_list_pos = []
    
    #Adding new features

    pos['POS_PERC_INSTL_PNDNG']=pos['CNT_INSTALMENT_FUTURE']/pos['CNT_INSTALMENT']
    pos['POS_CNT_INSTAL_PNDNG']=pos['CNT_INSTALMENT']-pos['CNT_INSTALMENT_FUTURE']
    pos['POS_DAYS_WTHT_TOLRNC']=pos['SK_DPD']-pos['SK_DPD_DEF']
    pos['MONTHS_BALANCE'] = pos['MONTHS_BALANCE'].abs()
    
    #replacing " " with _ for OHE cols.
    pos['NAME_CONTRACT_STATUS']=pos['NAME_CONTRACT_STATUS'].apply(lambda x: str(x).replace(" ","_")) 
    
    # Drop elements ijndrop list
    threshold = 0.7

    #Dropping rows with missing value rate higher than threshold
    pos = pos.loc[pos.isnull().mean(axis=1) < threshold]
    
    return (eda_transformation(pos,4))

In [146]:
pos = datasets['POS_CASH_balance']
pos = pos_eda(pos)
#pos.to_csv(os.getcwd() + DATA_DIR + 'agg_pos.csv')

--------
# of ID's: 1
 ID's: ['SK_ID_CURR']
--------
# of numerical   features: 10
Numerical   features: ['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'SK_DPD', 'SK_DPD_DEF', 'POS_PERC_INSTL_PNDNG', 'POS_CNT_INSTAL_PNDNG', 'POS_DAYS_WTHT_TOLRNC']
--------
# of categorical features: 1
Categorical features: ['NAME_CONTRACT_STATUS']
--------
# All features: 10
All features: ['POS_PERC_INSTL_PNDNG', 'SK_DPD', 'CNT_INSTALMENT_FUTURE', 'SK_ID_PREV', 'SK_DPD_DEF', 'MONTHS_BALANCE', 'POS_CNT_INSTAL_PNDNG', 'NAME_CONTRACT_STATUS', 'CNT_INSTALMENT', 'POS_DAYS_WTHT_TOLRNC']
# of OHE categorical features: 9
OHE Categorical features: ['NAME_CONTRACT_STATUS_Signed', 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Amortized_debt', 'NAME_CONTRACT_STATUS_Returned_to_the_store', 'NAME_CONTRACT_STATUS_Canceled', 'NAME_CONTRACT_STATUS_Demand', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Active', 'NAME_CONTRACT_STATUS_XNA']
--------
df.shape: (

# PREVAPP

In [5]:
def prevapp_eda(df):
    
    prevapp = df
    drop_list_pa = []
    #Day and Amount columns
    day_cols = [col for col in prevapp.columns if 'DAY' in col]
    amt_cols = [col for col in prevapp.columns if 'AMT' in col]
    #Adding new features

    prevapp['PREV_APRV_CNT'] = prevapp['NAME_CONTRACT_STATUS'].map(lambda x: 1 if (x == 'Approved') else 0)
    prevapp['PREV_REJ_CNT'] = prevapp['NAME_CONTRACT_STATUS'].map(lambda x: 1 if (x == 'Refused') else 0)
    prevapp['PREV_APCTN_CRDT_DIFF'] = prevapp['AMT_APPLICATION'] - prevapp['AMT_CREDIT']
    prevapp['PREV_APCTN_CRDT_RATIO'] = prevapp['AMT_APPLICATION'] / prevapp['AMT_CREDIT']
    prevapp['PREV_CRDT_ANNUTY_RATIO'] = prevapp['AMT_CREDIT']/prevapp['AMT_ANNUITY']
    prevapp['PREV_DWN_PYMNT_CRDT_RATIO'] = prevapp['AMT_DOWN_PAYMENT'] / prevapp['AMT_CREDIT']
    
    
    for c in [co for co in prevapp.columns if 'DAYS' in co]:
        prevapp[c] = prevapp[c].replace({365243.0: np.nan})
        prevapp[c] = prevapp[c].abs()
    
    drop_list_pa.append('WEEKDAY_APPR_PROCESS_START') ## weekday data is normally distributed, so wont make any sense to add that
    drop_list_pa.append('HOUR_APPR_PROCESS_START') ## Hour application started.
    
    
    
    # Drop elements ijndrop list

    drop_list_pa.append('WEEKDAY_APPR_PROCESS_START') ## weekday data is normally distributed, so wont make any sense to add that
    drop_list_pa.append('HOUR_APPR_PROCESS_START') ## Hour application started.

    threshold = 0.7
    drop_list_pa = list(prevapp.columns[prevapp.isnull().mean() > threshold])

    prevapp = prevapp.drop(columns=drop_list_pa, axis=1)

    #drop cols and rows which are more than 70% null
    #Dropping columns with missing value rate higher than threshold
    prevapp = prevapp[prevapp.columns[prevapp.isnull().mean() < threshold]]

    #Dropping rows with missing value rate higher than threshold
    prevapp = prevapp.loc[prevapp.isnull().mean(axis=1) < threshold]
    
    prevapp= eda_transformation(prevapp,3)
    return prevapp



In [6]:
prevapp = datasets['previous_application']
prevapp = prevapp_eda(prevapp)
#prevapp.to_csv(os.getcwd() + DATA_DIR + 'agg_prevapp.csv')

--------
# of ID's: 1
 ID's: ['SK_ID_CURR']
--------
# of numerical   features: 24
Numerical   features: ['SK_ID_PREV', 'SK_ID_CURR', 'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'HOUR_APPR_PROCESS_START', 'NFLAG_LAST_APPL_IN_DAY', 'RATE_DOWN_PAYMENT', 'DAYS_DECISION', 'SELLERPLACE_AREA', 'CNT_PAYMENT', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL', 'PREV_APRV_CNT', 'PREV_REJ_CNT', 'PREV_APCTN_CRDT_DIFF', 'PREV_APCTN_CRDT_RATIO', 'PREV_CRDT_ANNUTY_RATIO', 'PREV_DWN_PYMNT_CRDT_RATIO']
--------
# of categorical features: 16
Categorical features: ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRO

--------
Aggregated Features:
AMT_ANNUITY_count
AMT_ANNUITY_max
AMT_ANNUITY_mean
AMT_ANNUITY_median
AMT_ANNUITY_min
AMT_ANNUITY_sum
AMT_ANNUITY_var
AMT_APPLICATION_count
AMT_APPLICATION_max
AMT_APPLICATION_mean
AMT_APPLICATION_median
AMT_APPLICATION_min
AMT_APPLICATION_sum
AMT_APPLICATION_var
AMT_CREDIT_count
AMT_CREDIT_max
AMT_CREDIT_mean
AMT_CREDIT_median
AMT_CREDIT_min
AMT_CREDIT_sum
AMT_CREDIT_var
AMT_DOWN_PAYMENT_count
AMT_DOWN_PAYMENT_max
AMT_DOWN_PAYMENT_mean
AMT_DOWN_PAYMENT_median
AMT_DOWN_PAYMENT_min
AMT_DOWN_PAYMENT_sum
AMT_DOWN_PAYMENT_var
AMT_GOODS_PRICE_count
AMT_GOODS_PRICE_max
AMT_GOODS_PRICE_mean
AMT_GOODS_PRICE_median
AMT_GOODS_PRICE_min
AMT_GOODS_PRICE_sum
AMT_GOODS_PRICE_var
CHANNEL_TYPE_AP+ (Cash loan)_mean
CHANNEL_TYPE_AP+ (Cash loan)_median
CHANNEL_TYPE_AP+ (Cash loan)_var
CHANNEL_TYPE_Car dealer_mean
CHANNEL_TYPE_Car dealer_median
CHANNEL_TYPE_Car dealer_var
CHANNEL_TYPE_Channel of corporate sales_mean
CHANNEL_TYPE_Channel of corporate sales_median
CHANNEL_TYPE_

                                                count          mean  \
NAME_CASH_LOAN_PURPOSE_Education_median      338857.0      0.000276   
PREV_APCTN_CRDT_DIFF_mean                    338857.0 -15863.511408   
NAME_PRODUCT_TYPE_XNA_var                    278399.0      0.196262   
NAME_CASH_LOAN_PURPOSE_Repairs_median        338857.0      0.004797   
DAYS_FIRST_DUE_median                        336738.0    998.443628   
...                                               ...           ...   
CODE_REJECT_REASON_XAP_var                   278399.0      0.113143   
PREV_APRV_CNT_min                            338857.0      0.426773   
NAME_GOODS_CATEGORY_Weapon_var               278399.0      0.000057   
NAME_CASH_LOAN_PURPOSE_Buying a garage_mean  338857.0      0.000059   
DAYS_TERMINATION_sum                         338857.0   2226.774696   

                                                      std        min      25%  \
NAME_CASH_LOAN_PURPOSE_Education_median          0.014549        0

In [47]:
#prevapp.SK_ID_PREV_count

0         1
1         1
2         3
3         1
4         2
         ..
338852    1
338853    1
338854    2
338855    2
338856    8
Name: SK_ID_PREV_count, Length: 338857, dtype: int64

# DF Summary

In [138]:
def feature_summary(df_fa):
    print('DataFrame shape')
    print('Rows:',df_fa.shape[0])
    print('Cols:',df_fa.shape[1])
    print("------------------------------------------------------------------------")
    
    col_list=['Null','%_Null','Unique_Count','Data_type','Max/Min','Mean','Std','Skewness','Sample_values']
    df=pd.DataFrame(index=df_fa.columns,columns=col_list)
    df['Null']=list([len(df_fa[col][df_fa[col].isnull()]) for i,col in enumerate(df_fa.columns)])
    df['%_Null']=round((df_fa.isnull().sum()/df_fa.isnull().count()*100),3).sort_values(ascending = False)
    df['Unique_Count']=list([len(df_fa[col].unique()) for i,col in enumerate(df_fa.columns)])
    df['Data_type']=list([df_fa[col].dtype for i,col in enumerate(df_fa.columns)])
    for i,col in enumerate(df_fa.columns):
        if 'float' in str(df_fa[col].dtype) or 'int' in str(df_fa[col].dtype):
            df.at[col,'Max/Min']=str(round(df_fa[col].max(),2))+'/'+str(round(df_fa[col].min(),2))
            df.at[col,'Mean']=df_fa[col].mean()
            df.at[col,'Std']=df_fa[col].std()
            df.at[col,'Skewness']=df_fa[col].skew()
        df.at[col,'Sample_values']=list(df_fa[col].unique())
    print("Table Statistics")
    print("------------------------------------------------------------------------")
    display(df.fillna('-'))
    #return(df.fillna('-'))


In [139]:
for i,ds_name in enumerate(datasets.keys()):
    print("Table under consideration:",ds_name.upper())
    print("------------------------------------------------------------------------")
    ds = feature_summary(datasets[ds_name])
    print("------------------------------------------------------------------------")
    

Table under consideration: PREVIOUS_APPLICATION
------------------------------------------------------------------------
DataFrame shape
rows: 1670214
cols: 43
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_PREV,0,0.0,1670214,int64,2845382/1000001,1923089.135331,532597.958696,-0.000573,"[2030495, 2802425, 2523466, 2819243, 1784265, ..."
SK_ID_CURR,0,0.0,338857,int64,456255/100001,278357.174099,102814.823849,-0.003303,"[271877, 108129, 122040, 176158, 202054, 19938..."
NAME_CONTRACT_TYPE,0,0.0,4,object,-,-,-,-,"[Consumer loans, Cash loans, Revolving loans, ..."
AMT_ANNUITY,372235,22.287,357960,float64,418058.14/0.0,15955.120659,14782.137335,2.692572,"[1730.43, 25188.615, 15060.735, 47041.335, 319..."
AMT_APPLICATION,0,0.0,93885,float64,6905160.0/0.0,175233.86036,292779.762386,3.391442,"[17145.0, 607500.0, 112500.0, 450000.0, 337500..."
AMT_CREDIT,1,0.0,86804,float64,6905160.0/0.0,196114.021218,318574.616547,3.245815,"[17145.0, 679671.0, 136444.5, 470790.0, 404055..."
AMT_DOWN_PAYMENT,895844,53.636,29279,float64,3060045.0/-0.9,6697.402139,20921.49541,36.476576,"[0.0, nan, 12649.5, 1350.0, 9000.0, 13500.0, 4..."
AMT_GOODS_PRICE,385515,23.082,93886,float64,6905160.0/0.0,227847.279283,315396.557937,3.07369,"[17145.0, 607500.0, 112500.0, 450000.0, 337500..."
WEEKDAY_APPR_PROCESS_START,0,0.0,7,object,-,-,-,-,"[SATURDAY, THURSDAY, TUESDAY, MONDAY, FRIDAY, ..."
HOUR_APPR_PROCESS_START,0,0.0,24,int64,23/0,12.484182,3.334028,-0.025629,"[15, 11, 7, 9, 8, 10, 12, 13, 14, 16, 6, 4, 5,..."


------------------------------------------------------------------------
Table under consideration: POS_CASH_BALANCE
------------------------------------------------------------------------
DataFrame shape
rows: 10001358
cols: 11
------------------------------------------------------------------------
Table Statistics
------------------------------------------------------------------------


Unnamed: 0,Null,%_Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
SK_ID_PREV,0,0.0,936325,int64,2843499/1000001,1903216.598957,535846.530722,0.044229,"[1803195, 1715348, 1784872, 1903291, 2341044, ..."
SK_ID_CURR,0,0.0,337252,int64,456255/100001,278403.863306,102763.74509,-0.003128,"[182943, 367990, 397406, 269225, 334279, 34216..."
MONTHS_BALANCE,0,0.0,96,int64,96/1,35.012588,26.06657,0.672777,"[31, 33, 32, 35, 38, 39, 34, 41, 37, 40, 43, 3..."
CNT_INSTALMENT,26071,0.261,74,float64,92.0/1.0,17.08965,11.995056,1.601734,"[48.0, 36.0, 12.0, 24.0, 60.0, 18.0, 4.0, 42.0..."
CNT_INSTALMENT_FUTURE,26087,0.261,80,float64,85.0/0.0,10.48384,11.109058,1.846746,"[45.0, 35.0, 9.0, 42.0, 12.0, 43.0, 36.0, 16.0..."
NAME_CONTRACT_STATUS,0,0.0,9,object,-,-,-,-,"[Active, Completed, Signed, Approved, Returned..."
SK_DPD,0,0.0,3400,int64,4231/0,11.606928,132.714043,14.899126,"[0, 1, 2, 4, 3, 18, 7, 5, 12, 6, 8, 13, 16, 10..."
SK_DPD_DEF,0,0.0,2307,int64,3595/0,0.654468,32.762491,66.339906,"[0, 1, 2, 4, 3, 18, 7, 5, 12, 8, 13, 10, 15, 6..."
POS_PERC_INSTL_PNDNG,26184,0.262,1280,float64,6.67/0.0,0.546994,0.330309,-0.24752,"[0.9375, 0.9722222222222222, 0.75, 0.875, 1.0,..."
POS_CNT_INSTAL_PNDNG,26184,0.262,120,float64,72.0/-51.0,6.605944,5.923767,1.667505,"[3.0, 1.0, 6.0, 0.0, 5.0, 8.0, 7.0, 9.0, 11.0,..."


------------------------------------------------------------------------
