In [1]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix

# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the pandas / scikit-learn calls in later cells - consider narrowing the
# filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [2]:
# load data
#
# Read every CSV listed in `ds_names` from the data directory (located
# relative to the current working directory) into the `datasets` dict,
# keyed by the file name without its extension.

DATA_DIR =  "/../Data/"

ds_names = (
    "application_train",
    "application_test",
    "bureau",
    "bureau_balance",
    "credit_card_balance",
    "installments_payments",
    "previous_application",
    "POS_CASH_balance",
)

datasets = {}

# Directory prefix shared by all files: <cwd>/../Data/
data_root = os.getcwd() + DATA_DIR

for ds_name in ds_names:
    csv_path = data_root + f'{ds_name}.csv'
    datasets[ds_name] = pd.read_csv(csv_path)

In [3]:
# bureau_balance transformation rollup
#
# Collapses bureau_balance to one row per SK_ID_BUREAU:
#   MONTHS_BALANCE_max - maximum of the sign-flipped MONTHS_BALANCE
#   STATUS_sum         - sum of STATUS cast to int, with 'C' and 'X' counted as 0
#
# The derived columns are produced with .assign(), which works on a new frame,
# so datasets['bureau_balance'] is left unmodified and re-running this cell
# yields the same result (an in-place `* -1` would flip the sign back on every
# second execution).

bureau_bal = (
    datasets['bureau_balance']
    .assign(
        # 'C' and 'X' become 0; the remaining values are cast to int.
        # NOTE(review): assumes every other STATUS value is a digit string - confirm.
        STATUS=lambda df: df['STATUS'].replace(to_replace=['C', 'X'],
                                               value=[0, 0])
                                      .astype(int),
        MONTHS_BALANCE=lambda df: df['MONTHS_BALANCE'] * -1,
    )
    .groupby('SK_ID_BUREAU')
    .agg(MONTHS_BALANCE_max=('MONTHS_BALANCE', 'max'),
         STATUS_sum=('STATUS', 'sum'))
    .reset_index()
)

bureau_bal

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE_max,STATUS_sum
0,5001709,96,0
1,5001710,82,0
2,5001711,3,0
3,5001712,18,0
4,5001713,21,0
...,...,...,...
817390,6842884,47,0
817391,6842885,23,60
817392,6842886,32,0
817393,6842887,36,0


In [7]:
# bureau + bureau_balance transformation rollup
#
# Builds one row per SK_ID_CURR from the bureau table:
#   1. left-join the per-SK_ID_BUREAU rollup held in `bureau_bal`,
#   2. normalise string values (whitespace runs and '-' become '_',
#      parentheses are removed) so the one-hot column names are plain
#      identifiers,
#   3. flip the sign of DAYS_CREDIT and DAYS_ENDDATE_FACT,
#   4. one-hot encode CREDIT_ACTIVE, CREDIT_CURRENCY and CREDIT_TYPE,
#   5. aggregate everything per SK_ID_CURR.

# The patterns are raw strings: '\s', '\-', '\(' and '\)' inside a normal
# string literal are invalid escape sequences. The four patterns match
# disjoint characters and neither replacement text ('_' or '') matches any of
# them, so one replace() call gives the same result as four chained calls.
bureau = (
    datasets['bureau']
    .merge(bureau_bal, on='SK_ID_BUREAU', how='left')
    .replace(regex={r'\s+': '_', r'\-': '_', r'\(': '', r'\)': ''})
    .assign(DAYS_CREDIT=lambda df: df['DAYS_CREDIT'] * -1,
            DAYS_ENDDATE_FACT=lambda df: df['DAYS_ENDDATE_FACT'] * -1)
)

# Remember the pre-encoding columns so the indicator columns created by
# get_dummies can be identified without listing every category by hand.
cols_before_encoding = set(bureau.columns)

bureau = pd.get_dummies(data=bureau,
                        columns=['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'])

# Indicator columns, in the order get_dummies appended them.
dummy_cols = [col for col in bureau.columns if col not in cols_before_encoding]

bureau = (
    bureau
    .groupby('SK_ID_CURR')
    .agg(SK_ID_BUREAU_count=('SK_ID_BUREAU', 'count'),
         DAYS_CREDIT_min=('DAYS_CREDIT', 'min'),
         CREDIT_DAY_OVERDUE_sum=('CREDIT_DAY_OVERDUE', 'sum'),
         DAYS_CREDIT_ENDDATE_mean=('DAYS_CREDIT_ENDDATE', 'mean'),
         DAYS_ENDDATE_FACT_mean=('DAYS_ENDDATE_FACT', 'mean'),
         AMT_CREDIT_MAX_OVERDUE_sum=('AMT_CREDIT_MAX_OVERDUE', 'sum'),
         CNT_CREDIT_PROLONG_sum=('CNT_CREDIT_PROLONG', 'sum'),
         AMT_CREDIT_SUM_sum=('AMT_CREDIT_SUM', 'sum'),
         AMT_CREDIT_SUM_DEBT_sum=('AMT_CREDIT_SUM_DEBT', 'sum'),
         AMT_CREDIT_SUM_LIMIT_sum=('AMT_CREDIT_SUM_LIMIT', 'sum'),
         AMT_CREDIT_SUM_OVERDUE_sum=('AMT_CREDIT_SUM_OVERDUE', 'sum'),
         MONTHS_BALANCE_max_mean=('MONTHS_BALANCE_max', 'mean'),
         STATUS_sum_sum=('STATUS_sum', 'sum'),
         # One '<indicator>_sum' column per one-hot indicator, e.g.
         # CREDIT_ACTIVE_Active_sum=('CREDIT_ACTIVE_Active', 'sum').
         **{f'{col}_sum': (col, 'sum') for col in dummy_cols})
    .reset_index()
)

bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU_count,DAYS_CREDIT_min,CREDIT_DAY_OVERDUE_sum,DAYS_CREDIT_ENDDATE_mean,DAYS_ENDDATE_FACT_mean,AMT_CREDIT_MAX_OVERDUE_sum,CNT_CREDIT_PROLONG_sum,AMT_CREDIT_SUM_sum,AMT_CREDIT_SUM_DEBT_sum,...,CREDIT_TYPE_Interbank_credit_sum,CREDIT_TYPE_Loan_for_business_development_sum,CREDIT_TYPE_Loan_for_purchase_of_shares_margin_lending_sum,CREDIT_TYPE_Loan_for_the_purchase_of_equipment_sum,CREDIT_TYPE_Loan_for_working_capital_replenishment_sum,CREDIT_TYPE_Microloan_sum,CREDIT_TYPE_Mobile_operator_loan_sum,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Real_estate_loan_sum,CREDIT_TYPE_Unknown_type_of_loan_sum
0,100001,7,49,0,82.428571,825.500000,0.000,0,1453365.000,596686.500,...,0,0,0,0,0,0,0,0,0,0
1,100002,8,103,0,-349.000000,697.500000,8405.145,0,865055.565,245781.000,...,0,0,0,0,0,0,0,0,0,0
2,100003,4,606,0,-544.500000,1097.333333,0.000,0,1017400.500,0.000,...,0,0,0,0,0,0,0,0,0,0
3,100004,2,408,0,-488.500000,532.500000,0.000,0,189037.800,0.000,...,0,0,0,0,0,0,0,0,0,0
4,100005,3,62,0,439.333333,123.000000,0.000,0,657126.000,568408.500,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305806,456249,13,483,0,-1232.333333,1364.750000,30735.000,0,3693858.660,163071.000,...,0,0,0,0,0,0,0,0,0,0
305807,456250,3,760,0,1288.333333,760.000000,0.000,0,3086459.550,2232040.095,...,0,0,0,0,0,0,0,0,0,0
305808,456253,4,713,0,280.500000,794.000000,0.000,0,3960000.000,1795833.000,...,0,0,0,0,0,0,0,0,0,0
305809,456254,1,1104,0,-859.000000,859.000000,0.000,0,45000.000,0.000,...,0,0,0,0,0,0,0,0,0,0
