In [5]:
# import packages

import os
import time
import warnings
import zipfile
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix

warnings.filterwarnings('ignore')

In [12]:
# load data

DATA_DIR =  "/../Data/"

ds_names = (
    "application_train", "application_test", 
    "bureau", "bureau_balance", 
    "credit_card_balance", "installments_payments",
    "previous_application","POS_CASH_balance"
)

datasets = {}

for ds_name in ds_names:
    datasets[ds_name] = pd.read_csv(os.getcwd() + DATA_DIR + f'{ds_name}.csv')

In [13]:
# bureau_balance transformation rollup

bureau_bal = datasets['bureau_balance']

bureau_bal['STATUS'] = bureau_bal['STATUS'].replace(to_replace=['C','X'],
                                                    value=[0,0]) \
                                           .astype(int)

bureau_bal['MONTHS_BALANCE'] = bureau_bal['MONTHS_BALANCE'] * -1

bureau_bal = bureau_bal.groupby('SK_ID_BUREAU') \
                       .agg(MONTHS_BALANCE_max=('MONTHS_BALANCE','max'),
                            STATUS_sum=('STATUS','sum')) \
                       .reset_index()

bureau_bal

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE_max,STATUS_sum
0,5001709,96,0
1,5001710,82,0
2,5001711,3,0
3,5001712,18,0
4,5001713,21,0
...,...,...,...
817390,6842884,47,0
817391,6842885,23,60
817392,6842886,32,0
817393,6842887,36,0


In [7]:
# bureau + bureau_balance transformation rollup

bureau = datasets['bureau']

bureau = bureau.merge(bureau_bal, on='SK_ID_BUREAU', how='left') \
               .replace(to_replace='\s+', value='_', regex=True) \
               .replace(to_replace='\-', value='_', regex=True) \
               .replace(to_replace='\(', value='', regex=True) \
               .replace(to_replace='\)', value='', regex=True) 

bureau['DAYS_CREDIT'] = bureau['DAYS_CREDIT'] * -1
bureau['DAYS_ENDDATE_FACT'] = bureau['DAYS_ENDDATE_FACT'] * -1

bureau = pd.get_dummies(data=bureau,
                        columns=['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE'])

bureau = bureau.groupby('SK_ID_CURR') \
               .agg(SK_ID_BUREAU_count=('SK_ID_BUREAU','count'),
                    DAYS_CREDIT_min=('DAYS_CREDIT','min'),
                    CREDIT_DAY_OVERDUE_sum=('CREDIT_DAY_OVERDUE','sum'),
                    DAYS_CREDIT_ENDDATE_mean=('DAYS_CREDIT_ENDDATE','mean'),
                    DAYS_ENDDATE_FACT_mean=('DAYS_ENDDATE_FACT','mean'),
                    AMT_CREDIT_MAX_OVERDUE_sum=('AMT_CREDIT_MAX_OVERDUE','sum'),
                    CNT_CREDIT_PROLONG_sum=('CNT_CREDIT_PROLONG','sum'),
                    AMT_CREDIT_SUM_sum=('AMT_CREDIT_SUM','sum'),
                    AMT_CREDIT_SUM_DEBT_sum=('AMT_CREDIT_SUM_DEBT','sum'),
                    AMT_CREDIT_SUM_LIMIT_sum=('AMT_CREDIT_SUM_LIMIT','sum'),
                    AMT_CREDIT_SUM_OVERDUE_sum=('AMT_CREDIT_SUM_OVERDUE','sum'),
                    MONTHS_BALANCE_max_mean=('MONTHS_BALANCE_max','mean'),
                    STATUS_sum_sum=('STATUS_sum','sum'),
                    CREDIT_ACTIVE_Active_sum=('CREDIT_ACTIVE_Active','sum'),
                    CREDIT_ACTIVE_Bad_debt_sum=('CREDIT_ACTIVE_Bad_debt','sum'),
                    CREDIT_ACTIVE_Closed_sum=('CREDIT_ACTIVE_Closed','sum'),
                    CREDIT_ACTIVE_Sold_sum=('CREDIT_ACTIVE_Sold','sum'),
                    CREDIT_CURRENCY_currency_1_sum=('CREDIT_CURRENCY_currency_1','sum'),
                    CREDIT_CURRENCY_currency_2_sum=('CREDIT_CURRENCY_currency_2','sum'),
                    CREDIT_CURRENCY_currency_3_sum=('CREDIT_CURRENCY_currency_3','sum'),
                    CREDIT_CURRENCY_currency_4_sum=('CREDIT_CURRENCY_currency_4','sum'),
                    CREDIT_TYPE_Another_type_of_loan_sum=('CREDIT_TYPE_Another_type_of_loan','sum'),
                    CREDIT_TYPE_Car_loan_sum=('CREDIT_TYPE_Car_loan','sum'),
                    CREDIT_TYPE_Cash_loan_non_earmarked_sum=('CREDIT_TYPE_Cash_loan_non_earmarked','sum'),
                    CREDIT_TYPE_Consumer_credit_sum=('CREDIT_TYPE_Consumer_credit','sum'),
                    CREDIT_TYPE_Credit_card_sum=('CREDIT_TYPE_Credit_card','sum'),
                    CREDIT_TYPE_Interbank_credit_sum=('CREDIT_TYPE_Interbank_credit','sum'),
                    CREDIT_TYPE_Loan_for_business_development_sum=('CREDIT_TYPE_Loan_for_business_development','sum'),
                    CREDIT_TYPE_Loan_for_purchase_of_shares_margin_lending_sum=('CREDIT_TYPE_Loan_for_purchase_of_shares_margin_lending','sum'),
                    CREDIT_TYPE_Loan_for_the_purchase_of_equipment_sum=('CREDIT_TYPE_Loan_for_the_purchase_of_equipment','sum'),
                    CREDIT_TYPE_Loan_for_working_capital_replenishment_sum=('CREDIT_TYPE_Loan_for_working_capital_replenishment','sum'),
                    CREDIT_TYPE_Microloan_sum=('CREDIT_TYPE_Microloan','sum'),
                    CREDIT_TYPE_Mobile_operator_loan_sum=('CREDIT_TYPE_Mobile_operator_loan','sum'),
                    CREDIT_TYPE_Mortgage_sum=('CREDIT_TYPE_Mortgage','sum'),
                    CREDIT_TYPE_Real_estate_loan_sum=('CREDIT_TYPE_Real_estate_loan','sum'),
                    CREDIT_TYPE_Unknown_type_of_loan_sum=('CREDIT_TYPE_Unknown_type_of_loan','sum')
                   ) \
               .reset_index()

bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU_count,DAYS_CREDIT_min,CREDIT_DAY_OVERDUE_sum,DAYS_CREDIT_ENDDATE_mean,DAYS_ENDDATE_FACT_mean,AMT_CREDIT_MAX_OVERDUE_sum,CNT_CREDIT_PROLONG_sum,AMT_CREDIT_SUM_sum,AMT_CREDIT_SUM_DEBT_sum,...,CREDIT_TYPE_Interbank_credit_sum,CREDIT_TYPE_Loan_for_business_development_sum,CREDIT_TYPE_Loan_for_purchase_of_shares_margin_lending_sum,CREDIT_TYPE_Loan_for_the_purchase_of_equipment_sum,CREDIT_TYPE_Loan_for_working_capital_replenishment_sum,CREDIT_TYPE_Microloan_sum,CREDIT_TYPE_Mobile_operator_loan_sum,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Real_estate_loan_sum,CREDIT_TYPE_Unknown_type_of_loan_sum
0,100001,7,49,0,82.428571,825.500000,0.000,0,1453365.000,596686.500,...,0,0,0,0,0,0,0,0,0,0
1,100002,8,103,0,-349.000000,697.500000,8405.145,0,865055.565,245781.000,...,0,0,0,0,0,0,0,0,0,0
2,100003,4,606,0,-544.500000,1097.333333,0.000,0,1017400.500,0.000,...,0,0,0,0,0,0,0,0,0,0
3,100004,2,408,0,-488.500000,532.500000,0.000,0,189037.800,0.000,...,0,0,0,0,0,0,0,0,0,0
4,100005,3,62,0,439.333333,123.000000,0.000,0,657126.000,568408.500,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305806,456249,13,483,0,-1232.333333,1364.750000,30735.000,0,3693858.660,163071.000,...,0,0,0,0,0,0,0,0,0,0
305807,456250,3,760,0,1288.333333,760.000000,0.000,0,3086459.550,2232040.095,...,0,0,0,0,0,0,0,0,0,0
305808,456253,4,713,0,280.500000,794.000000,0.000,0,3960000.000,1795833.000,...,0,0,0,0,0,0,0,0,0,0
305809,456254,1,1104,0,-859.000000,859.000000,0.000,0,45000.000,0.000,...,0,0,0,0,0,0,0,0,0,0


In [47]:

prevapp = datasets['previous_application']

In [48]:
prevapp

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,2300464,352015,Consumer loans,14704.290,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,...,Furniture,30.0,low_normal,POS industry with interest,365243.0,-508.0,362.0,-358.0,-351.0,0.0
1670210,2357031,334635,Consumer loans,6622.020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,...,Furniture,12.0,middle,POS industry with interest,365243.0,-1604.0,-1274.0,-1304.0,-1297.0,0.0
1670211,2659632,249544,Consumer loans,11520.855,105237.0,102523.5,10525.5,105237.0,MONDAY,12,...,Consumer electronics,10.0,low_normal,POS household with interest,365243.0,-1457.0,-1187.0,-1187.0,-1181.0,0.0
1670212,2785582,400317,Cash loans,18821.520,180000.0,191880.0,,180000.0,WEDNESDAY,9,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-1155.0,-825.0,-825.0,-817.0,1.0


In [49]:
#previous application
numerical_ix = prevapp.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = prevapp.select_dtypes(include=['object', 'bool']).columns
num_features = list(numerical_ix)
cat_features = list(categorical_ix)
print(f"# of numerical   features: {len(numerical_ix)}")
print(f"Numerical   features: {numerical_ix}")
print('--------')
print(f"# of categorical features: {len(categorical_ix)}")
print(f"Categorical features: {categorical_ix}")

# of numerical   features: 21
Numerical   features: Index(['SK_ID_PREV', 'SK_ID_CURR', 'AMT_ANNUITY', 'AMT_APPLICATION',
       'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE',
       'HOUR_APPR_PROCESS_START', 'NFLAG_LAST_APPL_IN_DAY',
       'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY',
       'RATE_INTEREST_PRIVILEGED', 'DAYS_DECISION', 'SELLERPLACE_AREA',
       'CNT_PAYMENT', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
       'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION',
       'NFLAG_INSURED_ON_APPROVAL'],
      dtype='object')
--------
# of categorical features: 16
Categorical features: Index(['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE',
       'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
       'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
       'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
       'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 

In [62]:
pos = datasets['POS_CASH_balance']

In [63]:
pos
pos_num = pos.select_dtypes(include=['int64', 'float64']).columns
pos_cat = pos.select_dtypes(include=['object', 'bool']).columns
num_pos = list(pos_num)
cat_pos = list(pos_cat)
print(f"# of numerical   features: {len(pos_num)}")
print(f"Numerical   features: {pos_num}")
print('--------')
print(f"# of categorical features: {len(cat_pos)}")
print(f"Categorical features: {pos_cat}")

# of numerical   features: 7
Numerical   features: Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'CNT_INSTALMENT',
       'CNT_INSTALMENT_FUTURE', 'SK_DPD', 'SK_DPD_DEF'],
      dtype='object')
--------
# of categorical features: 1
Categorical features: Index(['NAME_CONTRACT_STATUS'], dtype='object')


In [64]:
pos

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,31,48.0,45.0,Active,0,0
1,1715348,367990,33,36.0,35.0,Active,0,0
2,1784872,397406,32,12.0,9.0,Active,0,0
3,1903291,269225,35,48.0,42.0,Active,0,0
4,2341044,334279,35,36.0,35.0,Active,0,0
...,...,...,...,...,...,...,...,...
10001353,2448283,226558,20,6.0,0.0,Active,843,0
10001354,1717234,141565,19,12.0,0.0,Active,602,0
10001355,1283126,315695,21,10.0,0.0,Active,609,0
10001356,1082516,450255,22,12.0,0.0,Active,614,0


In [65]:
pos['MONTHS_BALANCE'] = pos['MONTHS_BALANCE'].abs()

In [66]:
pos['NAME_CONTRACT_STATUS'].value_counts()

Active                   9151119
Completed                 744883
Signed                     87260
Demand                      7065
Returned to the store       5461
Approved                    4917
Amortized debt               636
Canceled                      15
XNA                            2
Name: NAME_CONTRACT_STATUS, dtype: int64

In [67]:
pos = pd.get_dummies(data=pos,
                        columns=['NAME_CONTRACT_STATUS'])

In [68]:
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [69]:
missing_data(pos)

Unnamed: 0,Total,Percent
CNT_INSTALMENT_FUTURE,26087,0.260835
CNT_INSTALMENT,26071,0.260675
SK_ID_PREV,0,0.0
SK_ID_CURR,0,0.0
MONTHS_BALANCE,0,0.0
SK_DPD,0,0.0
SK_DPD_DEF,0,0.0
NAME_CONTRACT_STATUS_Active,0,0.0
NAME_CONTRACT_STATUS_Amortized debt,0,0.0
NAME_CONTRACT_STATUS_Approved,0,0.0


In [78]:
list(pos.columns)
id_cols = ['SK_ID_PREV',
           'SK_ID_CURR']
pos_col_list = list(set(pos.columns) - set(id_cols))
pos_col_list

['CNT_INSTALMENT_FUTURE',
 'NAME_CONTRACT_STATUS_Demand',
 'MONTHS_BALANCE',
 'NAME_CONTRACT_STATUS_XNA',
 'SK_DPD_DEF',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'CNT_INSTALMENT',
 'NAME_CONTRACT_STATUS_Active',
 'SK_DPD']

In [79]:
class prevAppsFeaturesAggregater(BaseEstimator, TransformerMixin):
    def __init__(self, features=None): # no *args or **kargs
    
        self.features = features
        self.agg_ops = ["min", "max", "mean", "count","sum"]

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):         
        result = X.groupby(['SK_ID_CURR','SK_ID_PREV'], as_index=False).agg({ft:self.agg_ops for ft in self.features})
        result.columns = result.columns.map(lambda ct: '_'.join([x for x in ct if x != '']))
        
        #if 'AMT_APPLICATION' in features:
         #   result['range_AMT_APPLICATION'] = result['AMT_APPLICATION_max'] - result['AMT_APPLICATION_min']
        return result 
    
from sklearn.pipeline import make_pipeline 
def test_driver_prevAppsFeaturesAggregater(df, features):
    print(f"df.shape: {df.shape}\n")
    print(f"df[{features}][0:5]: \n{df[features][0:5]}")
    test_pipeline = make_pipeline(prevAppsFeaturesAggregater(features))
    return(test_pipeline.fit_transform(df))

features = pos_col_list
res = test_driver_prevAppsFeaturesAggregater(pos, pos_col_list)
print(f"HELLO")
print(f"Test driver: \n{res[0:10]}")
print(f"input[features][0:10]: \n{pos[0:10]}")

df.shape: (10001358, 16)

df[['CNT_INSTALMENT_FUTURE', 'NAME_CONTRACT_STATUS_Demand', 'MONTHS_BALANCE', 'NAME_CONTRACT_STATUS_XNA', 'SK_DPD_DEF', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Signed', 'NAME_CONTRACT_STATUS_Returned to the store', 'NAME_CONTRACT_STATUS_Amortized debt', 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Canceled', 'CNT_INSTALMENT', 'NAME_CONTRACT_STATUS_Active', 'SK_DPD']][0:5]: 
   CNT_INSTALMENT_FUTURE  NAME_CONTRACT_STATUS_Demand  MONTHS_BALANCE  \
0                   45.0                            0              31   
1                   35.0                            0              33   
2                    9.0                            0              32   
3                   42.0                            0              35   
4                   35.0                            0              35   

   NAME_CONTRACT_STATUS_XNA  SK_DPD_DEF  NAME_CONTRACT_STATUS_Completed  \
0                         0           0                  

In [84]:
res.MONTHS_BALANCE_max[res['SK_ID_CURR']==100001]

0    57
1    96
Name: MONTHS_BALANCE_max, dtype: int64

In [85]:
res

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,CNT_INSTALMENT_FUTURE_min,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_count,CNT_INSTALMENT_FUTURE_sum,NAME_CONTRACT_STATUS_Demand_min,NAME_CONTRACT_STATUS_Demand_max,NAME_CONTRACT_STATUS_Demand_mean,...,NAME_CONTRACT_STATUS_Active_min,NAME_CONTRACT_STATUS_Active_max,NAME_CONTRACT_STATUS_Active_mean,NAME_CONTRACT_STATUS_Active_count,NAME_CONTRACT_STATUS_Active_sum,SK_DPD_min,SK_DPD_max,SK_DPD_mean,SK_DPD_count,SK_DPD_sum
0,100001,1369693,0.0,4.0,2.000000,5,10.0,0,0,0.0,...,0,1,0.800000,5,4,0,0,0.000000,5,0
1,100001,1851984,0.0,2.0,0.750000,4,3.0,0,0,0.0,...,0,1,0.750000,4,3,0,7,1.750000,4,7
2,100002,1038818,6.0,24.0,15.000000,19,285.0,0,0,0.0,...,1,1,1.000000,19,19,0,0,0.000000,19,0
3,100003,1810518,0.0,12.0,7.875000,8,63.0,0,0,0.0,...,0,1,0.875000,8,7,0,0,0.000000,8,0
4,100003,2396755,1.0,12.0,6.500000,12,78.0,0,0,0.0,...,1,1,1.000000,12,12,0,0,0.000000,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936320,456255,1359084,0.0,12.0,7.555556,9,68.0,0,0,0.0,...,0,1,0.888889,9,8,0,0,0.000000,9,0
936321,456255,1743609,0.0,12.0,6.818182,11,75.0,0,0,0.0,...,0,1,0.909091,11,10,0,5,0.454545,11,5
936322,456255,2073384,0.0,24.0,13.800000,5,69.0,0,0,0.0,...,0,1,0.800000,5,4,0,0,0.000000,5,0
936323,456255,2631384,0.0,36.0,23.520000,25,588.0,0,0,0.0,...,0,1,0.960000,25,24,0,0,0.000000,25,0


In [119]:
res.columns

Index(['SK_ID_CURR', 'SK_ID_PREV', 'CNT_INSTALMENT_FUTURE_min',
       'CNT_INSTALMENT_FUTURE_max', 'CNT_INSTALMENT_FUTURE_mean',
       'CNT_INSTALMENT_FUTURE_count', 'CNT_INSTALMENT_FUTURE_sum',
       'NAME_CONTRACT_STATUS_Demand_min', 'NAME_CONTRACT_STATUS_Demand_max',
       'NAME_CONTRACT_STATUS_Demand_mean', 'NAME_CONTRACT_STATUS_Demand_count',
       'NAME_CONTRACT_STATUS_Demand_sum', 'MONTHS_BALANCE_min',
       'MONTHS_BALANCE_max', 'MONTHS_BALANCE_mean', 'MONTHS_BALANCE_count',
       'MONTHS_BALANCE_sum', 'NAME_CONTRACT_STATUS_XNA_min',
       'NAME_CONTRACT_STATUS_XNA_max', 'NAME_CONTRACT_STATUS_XNA_mean',
       'NAME_CONTRACT_STATUS_XNA_count', 'NAME_CONTRACT_STATUS_XNA_sum',
       'SK_DPD_DEF_min', 'SK_DPD_DEF_max', 'SK_DPD_DEF_mean',
       'SK_DPD_DEF_count', 'SK_DPD_DEF_sum',
       'NAME_CONTRACT_STATUS_Completed_min',
       'NAME_CONTRACT_STATUS_Completed_max',
       'NAME_CONTRACT_STATUS_Completed_mean',
       'NAME_CONTRACT_STATUS_Completed_count',
       'NA

In [None]:
# define function for below. 

In [125]:
## more to follow, have to define function for this
res1 = res.groupby('SK_ID_CURR') \
               .agg(SSK_ID_PREV_count=('SK_ID_PREV','count'),
                    CNT_INSTALMENT_FUTURE_min=('CNT_INSTALMENT_FUTURE_min','min'),
                    CNT_INSTALMENT_FUTURE_max=('CNT_INSTALMENT_FUTURE_max','min'),
                    CNT_INSTALMENT_FUTURE_mean=('CNT_INSTALMENT_FUTURE_mean','min'),
                    CNT_INSTALMENT_FUTURE_count=('CNT_INSTALMENT_FUTURE_count','min'),
                    CNT_INSTALMENT_FUTURE_sum=('CNT_INSTALMENT_FUTURE_sum','min'),
                    NAME_CONTRACT_STATUS_Demand_min=('NAME_CONTRACT_STATUS_Demand_min','min'),
                    NAME_CONTRACT_STATUS_Demand_max=('NAME_CONTRACT_STATUS_Demand_max','max'),
                    NAME_CONTRACT_STATUS_Demand_mean=('NAME_CONTRACT_STATUS_Demand_mean','mean'),
                    NAME_CONTRACT_STATUS_Demand_count=('NAME_CONTRACT_STATUS_Demand_count','count'),
                    NAME_CONTRACT_STATUS_Demand_sum=('NAME_CONTRACT_STATUS_Demand_sum','sum')
                    ) \
               .reset_index()
   

In [126]:
res1

Unnamed: 0,SK_ID_CURR,SSK_ID_PREV_count,CNT_INSTALMENT_FUTURE_min,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_count,CNT_INSTALMENT_FUTURE_sum,NAME_CONTRACT_STATUS_Demand_min,NAME_CONTRACT_STATUS_Demand_max,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Demand_count,NAME_CONTRACT_STATUS_Demand_sum
0,100001,2,0.0,2.0,0.750,4,3.0,0,0,0.0,2,0
1,100002,1,6.0,24.0,15.000,19,285.0,0,0,0.0,1,0
2,100003,3,0.0,6.0,2.625,8,21.0,0,0,0.0,3,0
3,100004,1,0.0,4.0,2.250,4,9.0,0,0,0.0,1,0
4,100005,1,0.0,12.0,7.200,10,72.0,0,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
337247,456251,1,0.0,8.0,4.375,8,35.0,0,0,0.0,1,0
337248,456252,1,0.0,6.0,3.000,7,21.0,0,0,0.0,1,0
337249,456253,3,0.0,2.0,0.750,4,3.0,0,0,0.0,3,0
337250,456254,2,4.0,14.0,9.000,9,99.0,0,0,0.0,2,0
