# 0. Set-up

In [1]:
## Import packages
import gc
import getpass
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
# Resolve the current user's name. The 'USERNAME' environment variable is not
# guaranteed to be set (it is primarily a Windows convention); when it is
# missing os.getenv returns None and os.path.join below fails with a TypeError.
# Fall back to getpass.getuser(), which consults LOGNAME/USER/LNAME/USERNAME
# and then the system password database.
user = os.getenv('USERNAME') or getpass.getuser()

# Shared workspace root and this user's directory inside it.
shared_workspace = '/home/mdawkins/modelling_club'
user_dir = os.path.join(shared_workspace, user)

# Data directories under the shared workspace.
raw_data_dir = os.path.join(shared_workspace, 'raw_data')
engineered_data_dir = os.path.join(shared_workspace, 'raw_data_lfs')

# Per-user output directory for the assembled model file.
model_file_output_dir = os.path.join(user_dir, 'pipeline/input')

# 1. Initialise model file

In [3]:
# Start the model file from the id/split table, indexed by SK_ID_CURR.
model_file = pd.read_csv(
    engineered_data_dir + '/ids2split.csv',
    index_col='SK_ID_CURR',
)

# 2. Application data

In [5]:
# Load the engineered application table, indexed by SK_ID_CURR.
application = pd.read_csv(
    engineered_data_dir + '/engineered/application/application.csv',
    index_col='SK_ID_CURR',
)

In [6]:
# Left-join the application columns onto the model file by SK_ID_CURR,
# keeping every model-file row.
model_file = pd.merge(model_file, application, on='SK_ID_CURR', how='left')

# 3. Previous payments data

In [7]:
# Load the engineered previous-payments table and index it by SK_ID_CURR.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
previous = (
    pd.read_pickle(engineered_data_dir + '/engineered/previous/previous_payments.pkl')
    .set_index("SK_ID_CURR")
)

In [8]:
# Left-join the previous-payments columns onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, previous, on='SK_ID_CURR', how='left')

# 4. Bureau data

## Mortgage data

- Active

In [11]:
# Load the active-mortgage bureau table and prefix every column with 'ACT_MORTG_'.
# NOTE(review): presumably the pickle is already indexed by SK_ID_CURR (the
# merge that follows joins on that name) — confirm.
Active_Mortgage_Clean = (
    pd.read_pickle(engineered_data_dir + '/engineered/bureau/Active_Mortgage_Clean.pkl')
    .add_prefix('ACT_MORTG_')
)

In [12]:
# Left-join the active-mortgage columns onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Active_Mortgage_Clean, on='SK_ID_CURR', how='left')

- Closed

In [8]:
# Load the closed-mortgage bureau table (indexed by SK_ID_CURR) and prefix
# every column with 'CLD_MORTG_'.
Closed_Mortgage_Clean = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_Mortgage_Clean.csv',
        index_col='SK_ID_CURR',
    )
    .add_prefix('CLD_MORTG_')
)

In [9]:
# Left-join the closed-mortgage columns onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Closed_Mortgage_Clean, on='SK_ID_CURR', how='left')

## Consumer Credit data

- Active 

In [20]:
# Load the active consumer-credit aggregates (indexed by SK_ID_CURR) and
# prefix every column with 'ACT_CONCRED_'.
Active_ConsumerCredit_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_ConsumerCredit_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('ACT_CONCRED_')
)

In [11]:
# Left-join the active consumer-credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Active_ConsumerCredit_Agg, on='SK_ID_CURR', how='left')

In [10]:
# Load the per-date-range active consumer-credit aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'ACT_CONCRED_'.
Active_ConsumerCredit_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_ConsumerCredit_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('ACT_CONCRED_')
)
# Distinct DATE_RANGE values present in the index.
Active_ConsumerCredit_Dates = (
    Active_ConsumerCredit_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [11]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Active_ConsumerCredit_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Active_ConsumerCredit_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

- Closed

In [14]:
# Load the closed consumer-credit aggregates (indexed by SK_ID_CURR) and
# prefix every column with 'CLD_CONCRED_'.
Closed_ConsumerCredit_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_ConsumerCredit_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('CLD_CONCRED_')
)

In [15]:
# Left-join the closed consumer-credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Closed_ConsumerCredit_Agg, on='SK_ID_CURR', how='left')

In [16]:
# Load the per-date-range closed consumer-credit aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'CLD_CONCRED_'.
Closed_ConsumerCredit_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_ConsumerCredit_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('CLD_CONCRED_')
)
# Distinct DATE_RANGE values present in the index.
Closed_ConsumerCredit_Dates = (
    Closed_ConsumerCredit_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [17]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Closed_ConsumerCredit_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Closed_ConsumerCredit_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

## Credit Card data

- Active

In [18]:
# Load the active credit-card aggregates (indexed by SK_ID_CURR) and prefix
# every column with 'ACT_CREDCARD_'.
Active_CreditCard_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_CreditCard_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('ACT_CREDCARD_')
)

In [19]:
# Left-join the active credit-card aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Active_CreditCard_Agg, on='SK_ID_CURR', how='left')

In [20]:
# Load the per-date-range active credit-card aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'ACT_CREDCARD_'.
Active_CreditCard_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_CreditCard_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('ACT_CREDCARD_')
)
# Distinct DATE_RANGE values present in the index.
Active_CreditCard_Dates = (
    Active_CreditCard_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [21]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Active_CreditCard_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Active_CreditCard_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

- Closed

In [22]:
# Load the closed credit-card aggregates (indexed by SK_ID_CURR) and prefix
# every column with 'CLD_CREDCARD_'.
Closed_CreditCard_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_CreditCard_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('CLD_CREDCARD_')
)

In [23]:
# Left-join the closed credit-card aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Closed_CreditCard_Agg, on='SK_ID_CURR', how='left')

In [24]:
# Load the per-date-range closed credit-card aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'CLD_CREDCARD_'.
Closed_CreditCard_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_CreditCard_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('CLD_CREDCARD_')
)
# Distinct DATE_RANGE values present in the index.
Closed_CreditCard_Dates = (
    Closed_CreditCard_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [25]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Closed_CreditCard_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Closed_CreditCard_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

## Other data

- Active

In [26]:
# Load the active "other" credit aggregates (indexed by SK_ID_CURR) and
# prefix every column with 'ACT_OTHER_'.
Active_Other_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_Other_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('ACT_OTHER_')
)

In [27]:
# Left-join the active "other" credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Active_Other_Agg, on='SK_ID_CURR', how='left')

In [28]:
# Load the per-date-range active "other" credit aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'ACT_OTHER_'.
Active_Other_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Active_Other_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('ACT_OTHER_')
)
# Distinct DATE_RANGE values present in the index.
Active_Other_Dates = (
    Active_Other_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [29]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Active_Other_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Active_Other_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

- Closed

In [30]:
# Load the closed "other" credit aggregates (indexed by SK_ID_CURR) and
# prefix every column with 'CLD_OTHER_'.
Closed_Other_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_Other_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('CLD_OTHER_')
)

In [31]:
# Left-join the closed "other" credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(model_file, Closed_Other_Agg, on='SK_ID_CURR', how='left')

In [32]:
# Load the per-date-range closed "other" credit aggregates, indexed by
# (DATE_RANGE, SK_ID_CURR), and prefix every column with 'CLD_OTHER_'.
Closed_Other_Agg_Date = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Closed_Other_Agg_Date.csv',
        index_col=['DATE_RANGE','SK_ID_CURR'],
    )
    .add_prefix('CLD_OTHER_')
)
# Distinct DATE_RANGE values present in the index.
Closed_Other_Dates = (
    Closed_Other_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [33]:
# For each DATE_RANGE value, take that slice of the dated aggregates, prefix
# its columns with 'D<value>_', and left-join it onto the model file.
for date_range in Closed_Other_Dates:
    date_prefix = 'D'+str(date_range)+'_'
    dated_features = Closed_Other_Agg_Date.loc[date_range].add_prefix(date_prefix)
    model_file = pd.merge(model_file, dated_features, on='SK_ID_CURR', how='left')

## Sold Debt data

In [34]:
# Load the sold-debt aggregates (indexed by SK_ID_CURR) and prefix every
# column with 'SOLD_'.
Sold_Agg = (
    pd.read_csv(
        engineered_data_dir + '/engineered/bureau/Sold_Agg.csv',
        index_col=['SK_ID_CURR'],
    )
    .add_prefix('SOLD_')
)

In [35]:
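# NOTE(review): the Sold Debt merge is disabled, so the SOLD_* columns loaded above are not added to the model file — confirm this is intentional.
# model_file = model_file.merge(Sold_Agg, on='SK_ID_CURR', how='left')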

# 5. Final model file

In [36]:
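# NOTE(review): this column drop is disabled, so DAYS_EMPLOYED, DAYS_BIRTH and NAME_INCOME_TYPE are kept if present — confirm this is intentional.
# model_file = model_file.drop(["DAYS_EMPLOYED","DAYS_BIRTH","NAME_INCOME_TYPE"],axis=1)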

In [37]:
# Display the assembled model file as the cell output.
model_file

Unnamed: 0_level_0,SPLIT,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,D2_CLD_OTHER_AMT_ANNUITY_DEBT_MAX,D2_CLD_OTHER_COUNT,D3_CLD_OTHER_AMT_CREDIT_MAX_OVERDUE,D3_CLD_OTHER_AMT_CREDIT_SUM,D3_CLD_OTHER_AMT_CREDIT_SUM_MAX,D3_CLD_OTHER_AMT_CREDIT_SUM_DEBT,D3_CLD_OTHER_AMT_CREDIT_SUM_DEBT_MAX,D3_CLD_OTHER_AMT_ANNUITY_DEBT,D3_CLD_OTHER_AMT_ANNUITY_DEBT_MAX,D3_CLD_OTHER_COUNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
113072,validation,0.0,Revolving loans,M,Y,Y,0,283500.0,180000.0,9000.0,...,,,,,,,,,,
225465,train,0.0,Cash loans,F,N,Y,0,67500.0,182016.0,10291.5,...,,,,,,,,,,
260656,train,0.0,Cash loans,F,N,Y,0,112500.0,315000.0,17716.5,...,,,,,,,,,,
445761,validation,0.0,Cash loans,F,N,Y,0,675000.0,2000000.0,59301.0,...,,,,,,,,,,
421144,train,0.0,Cash loans,F,N,Y,0,180000.0,679500.0,19867.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256859,train,0.0,Cash loans,M,Y,Y,1,180000.0,313438.5,22842.0,...,,,,,,,,,,
246902,train,0.0,Cash loans,F,N,Y,0,90000.0,528633.0,25560.0,...,,,,,,,,,,
287398,validation,0.0,Cash loans,F,N,N,0,67500.0,315000.0,13342.5,...,,,,,,,,,,
282658,train,0.0,Revolving loans,M,Y,N,0,247500.0,675000.0,33750.0,...,,,,,,,,,,


In [38]:
# Write the assembled model file as 'model_file.csv' to the user's pipeline
# input directory and then to the engineered-data directory (same order as
# before). DataFrame.to_csv does not create missing parent directories, so
# each target directory is created first; exist_ok=True makes this a no-op
# when the directory is already there.
for output_dir in (model_file_output_dir, engineered_data_dir):
    os.makedirs(output_dir, exist_ok=True)
    model_file.to_csv(os.path.join(output_dir, 'model_file.csv'))