# 0. Set-up

In [1]:
## Import packages
import gc
import getpass
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [48]:
# Resolve the current user's name. The USERNAME variable is not guaranteed to
# be set (on Linux the usual variable is USER); when it is missing os.getenv()
# returns None and the os.path.join() below raises TypeError. Fall back to
# getpass.getuser(), which checks LOGNAME/USER/LNAME/USERNAME and then the
# password database.
user = os.getenv('USERNAME') or getpass.getuser()
shared_workspace = '/home/mdawkins/modelling_club'
user_dir = os.path.join(shared_workspace, user)

raw_data_dir = os.path.join(shared_workspace, 'raw_data')
# NOTE(review): engineered features are read from the 'raw_data_lfs' folder --
# confirm this is the intended location for engineered data.
engineered_data_dir = os.path.join(shared_workspace, 'raw_data_lfs')

# The model file is written next to the engineered data. This is the same path
# that was previously spelled out as a second hard-coded literal.
model_file_output_dir = engineered_data_dir

# 1. Initialise model file

In [3]:
# Start the model file from the id -> split table, indexed by SK_ID_CURR.
# Every later cell left-joins a feature block onto this frame.
model_file = pd.read_csv(
    engineered_data_dir + '/ids2split.csv',
    index_col='SK_ID_CURR',
)

# 2. Application data

In [4]:
# Engineered application features, indexed by SK_ID_CURR.
application = pd.read_csv(
    engineered_data_dir + '/engineered/application/application.csv',
    index_col='SK_ID_CURR',
)

In [5]:
# Left-join the application features onto the model file by SK_ID_CURR;
# ids with no match get NaN in the new columns.
# NOTE(review): a left join duplicates rows if the right-hand frame has more
# than one row per SK_ID_CURR -- confirm the key is unique there.
model_file = pd.merge(
    model_file,
    application,
    how='left',
    on='SK_ID_CURR',
)

# 3. Previous data

In [6]:
# Load the engineered previous-payments features and move SK_ID_CURR from a
# column to the index so the frame can be joined like the other blocks.
# (read_pickle executes pickled code -- only load files produced by this project.)
previous = pd.read_pickle(
    engineered_data_dir + '/engineered/previous/previous_payments.pkl'
).set_index("SK_ID_CURR")

In [7]:
# Left-join the previous-payments features onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    previous,
    how='left',
    on='SK_ID_CURR',
)

# 4. POS CASH balance data

In [8]:
# Engineered POS CASH balance features. The pickled frame already uses
# SK_ID_CURR as its index, so no set_index() is needed here.
pos_cash_balance = pd.read_pickle(
    engineered_data_dir + '/engineered/POS_CASH_balance/pos_features.pkl'
)

In [9]:
# Left-join the POS CASH balance features onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    pos_cash_balance,
    how='left',
    on='SK_ID_CURR',
)

# 5. Bureau balance data

In [10]:
# Engineered bureau balance features. The pickled frame already uses
# SK_ID_CURR as its index, so no set_index() is needed here.
bureau_balance = pd.read_pickle(
    engineered_data_dir + '/engineered/bureau_balance/bureau_balance_features.pkl'
)

In [11]:
# Left-join the bureau balance features onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    bureau_balance,
    how='left',
    on='SK_ID_CURR',
)

# 6. Credit card data

None currently available

# 7. Bureau data

## Mortgage data

- Active

In [12]:
# Active-mortgage bureau features; every column is prefixed with 'ACT_MORTG_'.
# NOTE(review): the later merge on SK_ID_CURR presumably relies on SK_ID_CURR
# being the index of the pickled frame (add_prefix only renames columns).
Active_Mortgage_Clean = pd.read_pickle(
    engineered_data_dir + '/engineered/bureau/Active_Mortgage_Clean.pkl'
).add_prefix('ACT_MORTG_')

In [13]:
# Left-join the active-mortgage features onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Active_Mortgage_Clean,
    how='left',
    on='SK_ID_CURR',
)

- Closed

In [14]:
# Closed-mortgage bureau features, indexed by SK_ID_CURR; every column is
# prefixed with 'CLD_MORTG_'.
Closed_Mortgage_Clean = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_Mortgage_Clean.csv',
    index_col='SK_ID_CURR',
).add_prefix('CLD_MORTG_')

In [15]:
# Left-join the closed-mortgage features onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Closed_Mortgage_Clean,
    how='left',
    on='SK_ID_CURR',
)

## Consumer Credit data

- Active 

In [16]:
# Active consumer-credit aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'ACT_CONCRED_'.
Active_ConsumerCredit_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Active_ConsumerCredit_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('ACT_CONCRED_')

In [17]:
# Left-join the active consumer-credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Active_ConsumerCredit_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [18]:
# Active consumer-credit aggregates split by date range. The frame is indexed
# by (DATE_RANGE, SK_ID_CURR) and every column is prefixed with 'ACT_CONCRED_'.
Active_ConsumerCredit_Agg_Date = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Active_ConsumerCredit_Agg_Date.csv',
    index_col=['DATE_RANGE', 'SK_ID_CURR'],
).add_prefix('ACT_CONCRED_')

# Distinct DATE_RANGE values; the next cell merges one column block per value.
Active_ConsumerCredit_Dates = (
    Active_ConsumerCredit_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [19]:
# For each DATE_RANGE value, take that slice of the date-split aggregates
# (leaving SK_ID_CURR as the index), tag its columns with a 'D<date>_' prefix
# and left-join it onto the model file.
for date_range in Active_ConsumerCredit_Dates:
    date_block = Active_ConsumerCredit_Agg_Date.loc[date_range]
    date_block = date_block.add_prefix('D' + str(date_range) + '_')
    model_file = pd.merge(model_file, date_block, how='left', on='SK_ID_CURR')

- Closed

In [20]:
# Closed consumer-credit aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'CLD_CONCRED_'.
Closed_ConsumerCredit_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_ConsumerCredit_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('CLD_CONCRED_')

In [21]:
# Left-join the closed consumer-credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Closed_ConsumerCredit_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [22]:
# Closed consumer-credit aggregates split by date range. The frame is indexed
# by (DATE_RANGE, SK_ID_CURR) and every column is prefixed with 'CLD_CONCRED_'.
Closed_ConsumerCredit_Agg_Date = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_ConsumerCredit_Agg_Date.csv',
    index_col=['DATE_RANGE', 'SK_ID_CURR'],
).add_prefix('CLD_CONCRED_')

# Distinct DATE_RANGE values; the next cell merges one column block per value.
Closed_ConsumerCredit_Dates = (
    Closed_ConsumerCredit_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [23]:
# For each DATE_RANGE value, take that slice of the date-split aggregates
# (leaving SK_ID_CURR as the index), tag its columns with a 'D<date>_' prefix
# and left-join it onto the model file.
for date_range in Closed_ConsumerCredit_Dates:
    date_block = Closed_ConsumerCredit_Agg_Date.loc[date_range]
    date_block = date_block.add_prefix('D' + str(date_range) + '_')
    model_file = pd.merge(model_file, date_block, how='left', on='SK_ID_CURR')

## Credit Card data

- Active

In [24]:
# Active credit-card aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'ACT_CREDCARD_'.
Active_CreditCard_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Active_CreditCard_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('ACT_CREDCARD_')

In [25]:
# Left-join the active credit-card aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Active_CreditCard_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [26]:
# Active credit-card aggregates split by date range. The frame is indexed by
# (DATE_RANGE, SK_ID_CURR) and every column is prefixed with 'ACT_CREDCARD_'.
Active_CreditCard_Agg_Date = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Active_CreditCard_Agg_Date.csv',
    index_col=['DATE_RANGE', 'SK_ID_CURR'],
).add_prefix('ACT_CREDCARD_')

# Distinct DATE_RANGE values; the next cell merges one column block per value.
Active_CreditCard_Dates = (
    Active_CreditCard_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [27]:
# For each DATE_RANGE value, take that slice of the date-split aggregates
# (leaving SK_ID_CURR as the index), tag its columns with a 'D<date>_' prefix
# and left-join it onto the model file.
for date_range in Active_CreditCard_Dates:
    date_block = Active_CreditCard_Agg_Date.loc[date_range]
    date_block = date_block.add_prefix('D' + str(date_range) + '_')
    model_file = pd.merge(model_file, date_block, how='left', on='SK_ID_CURR')

- Closed

In [28]:
# Closed credit-card aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'CLD_CREDCARD_'.
Closed_CreditCard_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_CreditCard_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('CLD_CREDCARD_')

In [29]:
# Left-join the closed credit-card aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Closed_CreditCard_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [30]:
# Closed credit-card aggregates split by date range. The frame is indexed by
# (DATE_RANGE, SK_ID_CURR) and every column is prefixed with 'CLD_CREDCARD_'.
Closed_CreditCard_Agg_Date = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_CreditCard_Agg_Date.csv',
    index_col=['DATE_RANGE', 'SK_ID_CURR'],
).add_prefix('CLD_CREDCARD_')

# Distinct DATE_RANGE values; the next cell merges one column block per value.
Closed_CreditCard_Dates = (
    Closed_CreditCard_Agg_Date.index
    .get_level_values("DATE_RANGE")
    .unique()
)

In [31]:
# For each DATE_RANGE value, take that slice of the date-split aggregates
# (leaving SK_ID_CURR as the index), tag its columns with a 'D<date>_' prefix
# and left-join it onto the model file.
for date_range in Closed_CreditCard_Dates:
    date_block = Closed_CreditCard_Agg_Date.loc[date_range]
    date_block = date_block.add_prefix('D' + str(date_range) + '_')
    model_file = pd.merge(model_file, date_block, how='left', on='SK_ID_CURR')

## Other data

- Active

In [32]:
# Active "other" credit aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'ACT_OTHER_'.
Active_Other_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Active_Other_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('ACT_OTHER_')

In [33]:
# Left-join the active "other" credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Active_Other_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [34]:
# Active_Other_Agg_Date = pd.read_csv(engineered_data_dir + '/engineered/bureau/Active_Other_Agg_Date.csv',index_col=['DATE_RANGE','SK_ID_CURR'])
# Active_Other_Agg_Date = Active_Other_Agg_Date.add_prefix('ACT_OTHER_')
# Active_Other_Dates = Active_Other_Agg_Date.index.get_level_values("DATE_RANGE").unique()

In [35]:
# for d in Active_Other_Dates:
#     model_file = model_file.merge(Active_Other_Agg_Date.loc[d].add_prefix('D'+str(d)+'_'), on='SK_ID_CURR', how='left')

- Closed

In [36]:
# Closed "other" credit aggregates, indexed by SK_ID_CURR; every column is
# prefixed with 'CLD_OTHER_'.
Closed_Other_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Closed_Other_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('CLD_OTHER_')

In [37]:
# Left-join the closed "other" credit aggregates onto the model file by SK_ID_CURR.
model_file = pd.merge(
    model_file,
    Closed_Other_Agg,
    how='left',
    on='SK_ID_CURR',
)

In [38]:
# Closed_Other_Agg_Date = pd.read_csv(engineered_data_dir + '/engineered/bureau/Closed_Other_Agg_Date.csv',index_col=['DATE_RANGE','SK_ID_CURR'])
# Closed_Other_Agg_Date = Closed_Other_Agg_Date.add_prefix('CLD_OTHER_')
# Closed_Other_Dates = Closed_Other_Agg_Date.index.get_level_values("DATE_RANGE").unique()

In [39]:
# for d in Closed_Other_Dates:
#     model_file = model_file.merge(Closed_Other_Agg_Date.loc[d].add_prefix('D'+str(d)+'_'), on='SK_ID_CURR', how='left')

## Sold Debt data

In [40]:
# Sold-debt bureau aggregates, indexed by SK_ID_CURR; every column is prefixed
# with 'SOLD_'.
# NOTE(review): the cell that would merge Sold_Agg into model_file is commented
# out, so these columns are loaded but never reach the model file -- confirm
# whether that is intentional.
Sold_Agg = pd.read_csv(
    engineered_data_dir + '/engineered/bureau/Sold_Agg.csv',
    index_col=['SK_ID_CURR'],
).add_prefix('SOLD_')

In [41]:
# model_file = model_file.merge(Sold_Agg, on='SK_ID_CURR', how='left')

# 8. Final model file

In [42]:
# model_file = model_file.drop(["DAYS_EMPLOYED","DAYS_BIRTH","NAME_INCOME_TYPE"],axis=1)

In [43]:
# Show the assembled model file; pandas abbreviates the rendering of a large
# frame (the '...' rows and columns in the output below).
model_file

Unnamed: 0_level_0,SPLIT,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,ACT_OTHER_AMT_ANNUITY_MAX,ACT_OTHER_COUNT,CLD_OTHER_AMT_CREDIT_MAX_OVERDUE,CLD_OTHER_AMT_CREDIT_SUM,CLD_OTHER_AMT_CREDIT_SUM_MAX,CLD_OTHER_AMT_CREDIT_SUM_DEBT,CLD_OTHER_AMT_CREDIT_SUM_DEBT_MAX,CLD_OTHER_AMT_ANNUITY_DEBT,CLD_OTHER_AMT_ANNUITY_DEBT_MAX,CLD_OTHER_COUNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
113072,validation,0.0,Revolving loans,M,Y,Y,0,283500.0,180000.0,9000.0,...,,,,,,,,,,
225465,train,0.0,Cash loans,F,N,Y,0,67500.0,182016.0,10291.5,...,,,,,,,,,,
260656,train,0.0,Cash loans,F,N,Y,0,112500.0,315000.0,17716.5,...,,,,,,,,,,
445761,validation,0.0,Cash loans,F,N,Y,0,675000.0,2000000.0,59301.0,...,,,,,,,,,,
421144,train,0.0,Cash loans,F,N,Y,0,180000.0,679500.0,19867.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256859,train,0.0,Cash loans,M,Y,Y,1,180000.0,313438.5,22842.0,...,,,,,,,,,,
246902,train,0.0,Cash loans,F,N,Y,0,90000.0,528633.0,25560.0,...,,,,,,,,,,
287398,validation,0.0,Cash loans,F,N,N,0,67500.0,315000.0,13342.5,...,,,,,,,,,,
282658,train,0.0,Revolving loans,M,Y,N,0,247500.0,675000.0,33750.0,...,,,,,,,,,,


In [45]:
# Count missing values per column and show them as a dict ordered from the
# fewest to the most missing.
missing_per_column = model_file.isna().sum()
missing_per_column.sort_values().to_dict()

{'SPLIT': 0,
 'REG_REGION_NOT_WORK_REGION': 0,
 'LIVE_REGION_NOT_WORK_REGION': 0,
 'REG_CITY_NOT_LIVE_CITY': 0,
 'REG_CITY_NOT_WORK_CITY': 0,
 'LIVE_CITY_NOT_WORK_CITY': 0,
 'REGION_ID': 0,
 'HIGHER_ED': 0,
 'YEARS_BIRTH': 0,
 'CREDIT_IN_YEARS_INCOME': 0,
 'FLAG_DOCUMENT_21': 0,
 'FLAG_DOCUMENT_20': 0,
 'FLAG_DOCUMENT_19': 0,
 'FLAG_DOCUMENT_18': 0,
 'FLAG_DOCUMENT_17': 0,
 'FLAG_DOCUMENT_16': 0,
 'FLAG_DOCUMENT_15': 0,
 'FLAG_DOCUMENT_14': 0,
 'FLAG_DOCUMENT_13': 0,
 'FLAG_DOCUMENT_12': 0,
 'FLAG_DOCUMENT_11': 0,
 'FLAG_DOCUMENT_10': 0,
 'FLAG_DOCUMENT_9': 0,
 'FLAG_DOCUMENT_8': 0,
 'FLAG_DOCUMENT_7': 0,
 'FLAG_DOCUMENT_6': 0,
 'FLAG_DOCUMENT_5': 0,
 'FLAG_DOCUMENT_4': 0,
 'FLAG_DOCUMENT_3': 0,
 'FLAG_DOCUMENT_2': 0,
 'REG_REGION_NOT_LIVE_REGION': 0,
 'HOUR_APPR_PROCESS_START': 0,
 'ORGANIZATION_TYPE': 0,
 'REGION_RATING_CLIENT_W_CITY': 0,
 'DAYS_BIRTH': 0,
 'REGION_POPULATION_RELATIVE': 0,
 'NAME_HOUSING_TYPE': 0,
 'NAME_FAMILY_STATUS': 0,
 'NAME_EDUCATION_TYPE': 0,
 'NAME_INCOME_TYP

In [49]:
# Persist the assembled model file as both CSV and pickle.
# model_file_output_dir and engineered_data_dir can resolve to the same folder
# (they do with the paths in the set-up cell), in which case the original code
# wrote every file twice. De-duplicate the targets, keeping their order.
output_dirs = dict.fromkeys(
    os.path.normpath(target_dir)
    for target_dir in (model_file_output_dir, engineered_data_dir)
)
for output_dir in output_dirs:
    model_file.to_csv(os.path.join(output_dir, 'model_file.csv'))
    model_file.to_pickle(os.path.join(output_dir, 'model_file.pkl'))