# 0. Set up

In [1]:
## Import packages
import os
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
## Set paths
user = os.getenv('USERNAME')
shared_workspace = '/home/mdawkins/modelling_club'
user_dir = os.path.join(shared_workspace, user)
data_dir = os.path.join(shared_workspace, 'raw_data')
data_output_dir = os.path.join(shared_workspace, 'raw_data_lfs/engineered/bureau_balance/')

# 1. Import data

In [6]:
home_loan_train = pd.read_csv(data_dir + '/raw/application_train.csv',index_col='SK_ID_CURR')
train_response = home_loan_train['TARGET']

home_loan_test = pd.read_csv(data_dir + '/raw/application_test_noTarget.csv',index_col='SK_ID_CURR')
home_loan_test.loc[:, 'TARGET'] = None

bureau = pd.read_csv(data_dir + '/raw/bureau.csv')
bureauBalance = pd.read_csv(data_dir + '/raw/bureau_balance.csv')

# 2. Feature engineering

In [15]:
bureau_balance_features = bureauBalance

# BB_DPD_FLAG - flag for if they have ever been overdue
bureau_balance_features.loc[:, 'BB_DPD_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['1', '2', '3', '4', '5']), 'BB_DPD_FLAG'] = '1'

# BB_DPD_1_30_FLAG - flag for if they have ever paid 1 - 30 days late
bureau_balance_features.loc[:, 'BB_DPD_1_30_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['1']), 'BB_DPD_1_30_FLAG'] = '1'

# BB_DPD_31_60_FLAG - flag for if they have ever paid 31 - 60 days late
bureau_balance_features.loc[:, 'BB_DPD_31_60_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['2']), 'BB_DPD_31_60_FLAG'] = '1'

# BB_DPD_61_90_FLAG - flag for if they have ever paid 61 - 90 days late
bureau_balance_features.loc[:, 'BB_DPD_61_90_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['3']), 'BB_DPD_61_90_FLAG'] = '1'

# BB_DPD_91_120_FLAG - flag for if they have ever paid 91 - 120 days late
bureau_balance_features.loc[:, 'BB_DPD_91_120_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['4']), 'BB_DPD_91_120_FLAG'] = '1'

# BB_DPD_120_FLAG - flag for if they have ever paid 120+ days late
bureau_balance_features.loc[:, 'BB_DPD_120_FLAG'] = '0'
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['5']), 'BB_DPD_120_FLAG'] = '1'

# bureau_balance_features[['BB_DPD_FLAG', 'BB_DPD_1_30_FLAG', 'BB_DPD_31_60_FLAG', 'BB_DPD_61_90_FLAG', 'BB_DPD_91_120_FLAG', 'BB_DPD_120_FLAG']].drop_duplicates()
# bureau_balance_features

# BB_DPD_MAX - flag for max days late they have ever been
bureau_balance_features.loc[:, 'BB_STATUS_NUM'] = None
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['1']), 'BB_STATUS_NUM'] = 1
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['2']), 'BB_STATUS_NUM'] = 2
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['3']), 'BB_STATUS_NUM'] = 3
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['4']), 'BB_STATUS_NUM'] = 4
bureau_balance_features.loc[bureauBalance['STATUS'].isin(['5']), 'BB_STATUS_NUM'] = 5
# bureau_balance_features[['STATUS', 'BB_STATUS_NUM']].drop_duplicates()
bureau_balance_features['BB_DPD_MAX'] = bureau_balance_features.groupby("SK_ID_BUREAU")['BB_STATUS_NUM'].transform('max')

# aggregate to SK_ID_BUREAU level
bureau_balance_features = bureau_balance_features[['SK_ID_BUREAU', 'BB_DPD_FLAG', 'BB_DPD_1_30_FLAG', 'BB_DPD_31_60_FLAG', 'BB_DPD_61_90_FLAG', 'BB_DPD_91_120_FLAG', 'BB_DPD_120_FLAG', 'BB_DPD_MAX']]
bureau_balance_features = bureau_balance_features.merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], on="SK_ID_BUREAU")
bureau_balance_features = bureau_balance_features.drop("SK_ID_BUREAU", 1).groupby('SK_ID_CURR').max()
# bureau_balance_features = bureau_balance_features.groupby("SK_ID_BUREAU")['BB_DPD_FLAG', 'BB_DPD_1_30_FLAG', 'BB_DPD_31_60_FLAG', 'BB_DPD_61_90_FLAG', 'BB_DPD_91_120_FLAG', 'BB_DPD_120_FLAG', 'BB_DPD_MAX'].transform('max')   # no index/ID remains
bureau_balance_features.head()

Unnamed: 0_level_0,BB_DPD_FLAG,BB_DPD_1_30_FLAG,BB_DPD_31_60_FLAG,BB_DPD_61_90_FLAG,BB_DPD_91_120_FLAG,BB_DPD_120_FLAG,BB_DPD_MAX
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,1,1,0,0,0,0,1.0
100010,0,0,0,0,0,0,
100019,0,0,0,0,0,0,
100032,0,0,0,0,0,0,
100033,0,0,0,0,0,0,


In [16]:
bureau_balance_features['BB_DPD_MAX'] = bureau_balance_features['BB_DPD_MAX'].astype('category')
bureau_balance_features.dtypes

BB_DPD_FLAG             object
BB_DPD_1_30_FLAG        object
BB_DPD_31_60_FLAG       object
BB_DPD_61_90_FLAG       object
BB_DPD_91_120_FLAG      object
BB_DPD_120_FLAG         object
BB_DPD_MAX            category
dtype: object

In [17]:
bureau_balance_features.to_pickle(data_output_dir + "bureau_balance_features.pkl")

In [7]:
# check the proportion defaults for loans with DPD>0
test = bureau_balance_features.merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], on="SK_ID_BUREAU")
test = test.merge(home_loan_train[['TARGET']], on='SK_ID_CURR')
test[['TARGET', 'BB_DPD_FLAG']].groupby('BB_DPD_FLAG').mean()

Unnamed: 0_level_0,TARGET
BB_DPD_FLAG,Unnamed: 1_level_1
0,0.068359
1,0.106632
