# Aggregations of Partial History
People change over time, perhaps using only a subset of the history supplied in our aggregations can boost our model score.

In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

%matplotlib inline

### Setup Data 
First, I will load the data for the main application as well as the credit card history into two pandas dataframes.  Pandas will be used to do the heavy lifting of the aggregations.  For this work, I will select a subset of the entire dataset to make things easier to work with.

In [2]:
app_data = pd.read_csv('../data/raw/application_train.csv')
cc_data = pd.read_csv('../data/raw/credit_card_balance.csv')

In [3]:
N_TEST_USERS = 1000
test_user_ids = np.random.choice(app_data['SK_ID_CURR'], N_TEST_USERS)
app_data = app_data.loc[app_data['SK_ID_CURR'].isin(test_user_ids)]
cc_data = cc_data.loc[cc_data['SK_ID_CURR'].isin(test_user_ids)]

In [4]:
app_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
1294,101523,0,Cash loans,F,N,Y,0,135000.0,505642.5,25947.0,...,0,0,0,0,,,,,,
1594,101866,0,Cash loans,M,N,N,0,180000.0,1609272.0,46251.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
1857,102178,0,Cash loans,F,N,N,0,117000.0,239850.0,25830.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
2214,102598,0,Cash loans,M,Y,Y,0,157500.0,404325.0,22063.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0
2454,102861,0,Cash loans,F,Y,Y,0,202500.0,1800000.0,49500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0


In [5]:
cc_data.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
468,1840380,402158,-5,430273.755,450000,0.0,22666.545,0.0,22666.545,22083.075,...,430273.755,430273.755,0.0,7,0.0,7.0,22.0,Active,0,0
522,2386359,147939,-1,33966.585,135000,0.0,54050.13,0.0,54050.13,188.595,...,33966.585,33966.585,0.0,33,0.0,33.0,3.0,Active,0,0
821,2309885,251971,-1,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,5.0,Completed,0,0
1442,1163691,444478,-4,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,3.0,Completed,0,0
1527,1678063,147939,-1,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,9.0,Completed,0,0


In [6]:
def last_k(data, agg_func, k):
    return agg_func(data[:k])

def last_6_mean(data):
    return last_k(data, np.mean, 6)

def last_12_mean(data):
    return last_k(data, np.mean, 12)

def last_18_mean(data):
    return last_k(data, np.mean, 18)

def last_24_mean(data):
    return last_k(data, np.mean, 24)

In [7]:
import time 
aggs = {
    'AMT_BALANCE':[last_6_mean, last_12_mean, last_18_mean, last_24_mean]
}

t_start = time.time()
cc_agg = cc_data.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']).groupby('SK_ID_CURR').agg(aggs)
print('Finshined: ', time.time()-t_start)

('Finshined: ', 0.25685787200927734)


In [8]:
colname = lambda x, y: 'CC_' + x + '_' + y.upper() 
new_cols = [colname(c[0],c[1]) for c in list(cc_agg.columns)]
cc_agg.columns = new_cols
cc_agg.head(4)

Unnamed: 0_level_0,CC_AMT_BALANCE_LAST_6_MEAN,CC_AMT_BALANCE_LAST_12_MEAN,CC_AMT_BALANCE_LAST_18_MEAN,CC_AMT_BALANCE_LAST_24_MEAN
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101866,339419.16,391110.2325,399228.7,390295.033125
102981,0.0,0.0,0.0,0.0
103528,471165.525,422742.195,425819.295,425819.295
104476,85174.26,61496.505,43191.0075,34125.5775


In [9]:
data = app_data.join(cc_agg)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CC_AMT_BALANCE_LAST_6_MEAN,CC_AMT_BALANCE_LAST_12_MEAN,CC_AMT_BALANCE_LAST_18_MEAN,CC_AMT_BALANCE_LAST_24_MEAN
1294,101523,0,Cash loans,F,N,Y,0,135000.0,505642.5,25947.0,...,,,,,,,,,,
1594,101866,0,Cash loans,M,N,N,0,180000.0,1609272.0,46251.0,...,0.0,0.0,0.0,0.0,0.0,3.0,,,,
1857,102178,0,Cash loans,F,N,N,0,117000.0,239850.0,25830.0,...,0.0,0.0,0.0,0.0,0.0,2.0,,,,
2214,102598,0,Cash loans,M,Y,Y,0,157500.0,404325.0,22063.5,...,0.0,0.0,0.0,0.0,1.0,3.0,,,,
2454,102861,0,Cash loans,F,Y,Y,0,202500.0,1800000.0,49500.0,...,0.0,0.0,0.0,0.0,0.0,3.0,,,,


In [10]:
data.corr()['TARGET'].sort_values(ascending=True)

EXT_SOURCE_2                  -0.199689
EXT_SOURCE_3                  -0.198864
EXT_SOURCE_1                  -0.191741
REGION_POPULATION_RELATIVE    -0.074213
YEARS_BUILD_MODE              -0.066560
NONLIVINGAPARTMENTS_MODE      -0.064251
NONLIVINGAPARTMENTS_AVG       -0.060545
NONLIVINGAPARTMENTS_MEDI      -0.058809
YEARS_BUILD_MEDI              -0.057087
YEARS_BUILD_AVG               -0.056849
FLAG_DOCUMENT_6               -0.049479
FLAG_EMAIL                    -0.049376
FLOORSMAX_AVG                 -0.048811
FLOORSMAX_MEDI                -0.047792
LIVINGAREA_MEDI               -0.047409
LIVINGAREA_AVG                -0.046811
AMT_GOODS_PRICE               -0.046088
FLOORSMAX_MODE                -0.044949
DAYS_EMPLOYED                 -0.043866
AMT_INCOME_TOTAL              -0.042843
AMT_CREDIT                    -0.040092
FLAG_PHONE                    -0.040031
FLAG_DOCUMENT_5               -0.039694
LIVINGAREA_MODE               -0.039170
AMT_REQ_CREDIT_BUREAU_WEEK    -0.037926
