# Aggregations of Partial History
People change over time, perhaps using only a subset of the history supplied in our aggregations can boost our model score.

In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

%matplotlib inline

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


### Setup Data 
First, I will load the data for the main application as well as the credit card history into two pandas dataframes.  Pandas will be used to do the heavy lifting of the aggregations.  For this work, I will select a subset of the entire dataset to make things easier to work with.

In [2]:
app_data = pd.read_csv('../data/raw/application_train.csv')
cc_data = pd.read_csv('../data/raw/credit_card_balance.csv')

In [3]:
N_TEST_USERS = 100000
test_user_ids = np.random.choice(app_data['SK_ID_CURR'], N_TEST_USERS)
app_data = app_data.loc[app_data['SK_ID_CURR'].isin(test_user_ids)]
cc_data = cc_data.loc[cc_data['SK_ID_CURR'].isin(test_user_ids)]

In [4]:
app_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
9,100012,0,Revolving loans,M,N,Y,0,135000.0,405000.0,20250.0,...,0,0,0,0,,,,,,
11,100015,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
15,100019,0,Cash loans,M,Y,Y,0,157500.0,299772.0,20160.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
cc_data.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
9,1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,...,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0
13,1441883,171537,-5,0.0,270000,0.0,0.0,0.0,0.0,12218.49,...,0.0,0.0,0.0,0,0.0,0.0,2.0,Active,0,0
14,1864742,303581,-1,0.0,45000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,59.0,Active,0,0
18,2016842,302450,-3,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,2.0,Completed,0,0


In [9]:
def last_k(data, agg_func, k):
    return agg_func(data[:k])

def last_6_mean(data):
    return last_k(data, np.mean, 6)

def last_12_mean(data):
    return last_k(data, np.mean, 12)

def last_18_mean(data):
    return last_k(data, np.mean, 18)

def last_24_mean(data):
    return last_k(data, np.mean, 24)

In [10]:
import time 
aggs = {
    'AMT_BALANCE':[last_6_mean, last_12_mean, last_18_mean, last_24_mean]
}

t_start = time.time()
cc_agg = cc_data.groupby('SK_ID_CURR').agg(aggs)
print('Finshined: ', time.time()-t_start)

('Finshined: ', 13.81001591682434)


In [11]:
colname = lambda x, y: 'CC_' + x + '_' + y.upper() 
new_cols = [colname(c[0],c[1]) for c in list(cc_agg.columns)]
cc_agg.columns = new_cols
cc_agg.head(4)

Unnamed: 0_level_0,CC_AMT_BALANCE_LAST_6_MEAN,CC_AMT_BALANCE_LAST_12_MEAN,CC_AMT_BALANCE_LAST_18_MEAN,CC_AMT_BALANCE_LAST_24_MEAN
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100048,43183.4475,50084.79375,53802.08,54142.44375
100050,0.0,0.0,0.0,0.0
100059,0.0,0.0,0.0,0.0
100082,2606.7825,22280.415,14871.005,16599.135


In [13]:
data = app_data.join(cc_agg)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CC_AMT_BALANCE_LAST_6_MEAN,CC_AMT_BALANCE_LAST_12_MEAN,CC_AMT_BALANCE_LAST_18_MEAN,CC_AMT_BALANCE_LAST_24_MEAN
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
9,100012,0,Revolving loans,M,N,Y,0,135000.0,405000.0,20250.0,...,,,,,,,,,,
11,100015,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,...,0.0,0.0,0.0,0.0,0.0,2.0,,,,
15,100019,0,Cash loans,M,Y,Y,0,157500.0,299772.0,20160.0,...,0.0,0.0,0.0,0.0,0.0,1.0,,,,


In [18]:
data.corr()['TARGET'].sort_values(ascending=True)

EXT_SOURCE_3                  -0.180616
EXT_SOURCE_2                  -0.160529
EXT_SOURCE_1                  -0.155786
FLOORSMAX_AVG                 -0.048150
DAYS_EMPLOYED                 -0.047948
FLOORSMAX_MEDI                -0.047586
FLOORSMAX_MODE                -0.047193
AMT_GOODS_PRICE               -0.038642
REGION_POPULATION_RELATIVE    -0.036330
TOTALAREA_MODE                -0.035249
ELEVATORS_AVG                 -0.035099
ELEVATORS_MEDI                -0.034905
LIVINGAREA_AVG                -0.034661
LIVINGAREA_MEDI               -0.034366
ELEVATORS_MODE                -0.033079
APARTMENTS_AVG                -0.032645
LIVINGAREA_MODE               -0.032048
APARTMENTS_MEDI               -0.031950
FLOORSMIN_AVG                 -0.030414
APARTMENTS_MODE               -0.030223
FLOORSMIN_MEDI                -0.030053
FLOORSMIN_MODE                -0.029398
AMT_CREDIT                    -0.029143
FLAG_DOCUMENT_6               -0.029140
YEARS_BUILD_MEDI              -0.026768
