# 0. Set up

In [1]:
## Import packages
import os
import pandas as pd

In [2]:
## Set paths
user = os.getenv('USERNAME')
shared_workspace = '/home/mdawkins/modelling_club'
user_dir = os.path.join(shared_workspace, user)
data_dir = os.path.join(shared_workspace, 'raw_data')
data_output_dir = os.path.join(shared_workspace, 'raw_data_lfs/engineered/bureau/')

# 1. Import data

In [3]:
home_loan_train = pd.read_csv(data_dir + '/raw/application_train.csv',index_col='SK_ID_CURR')
train_response = home_loan_train['TARGET']

home_loan_test = pd.read_csv(data_dir + '/raw/application_test_noTarget.csv',index_col='SK_ID_CURR')
home_loan_test.loc[:, 'TARGET'] = None

bureau = pd.read_csv(data_dir + '/raw/bureau.csv')
bureauBalance = pd.read_csv(data_dir + '/raw/bureau_balance.csv')

# 2. Cleaning

In [4]:
bureau = bureau[bureau["CREDIT_CURRENCY"]=='currency 1']
bureau = bureau.drop("CREDIT_CURRENCY",axis=1)

# 3. Bad Debt

In [5]:
Bad_Debt = bureau[["SK_ID_CURR","CREDIT_ACTIVE"]][bureau["CREDIT_ACTIVE"]=="Bad debt"].groupby("SK_ID_CURR").count().rename(columns={"CREDIT_ACTIVE":"BAD_DEBT"})
Bad_Debt.to_pickle(data_output_dir + 'Bad_Debt.pkl')

# 4. Active and Closed Debt

In [7]:
## Consumer Credit
Active_ConsumerCredit = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Active') & (bureau["CREDIT_TYPE"] == "Consumer credit")]
Closed_ConsumerCredit = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Closed') & (bureau["CREDIT_TYPE"] == "Consumer credit")]
## Credit Card
Active_CreditCard = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Active') & (bureau["CREDIT_TYPE"] == "Credit card")]
Closed_CreditCard = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Closed') & (bureau["CREDIT_TYPE"] == "Credit card")]
## Mortgage
Active_Mortgage = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Active') & (bureau["CREDIT_TYPE"] == "Mortgage")]
Closed_Mortgage = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Closed') & (bureau["CREDIT_TYPE"] == "Mortgage")]
## Other
Active_Other = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Active') & ~(bureau["CREDIT_TYPE"].isin(["Consumer credit","Credit card","Mortgage"]))]
Closed_Other = bureau.loc[(bureau["CREDIT_ACTIVE"] == 'Closed') & ~(bureau["CREDIT_TYPE"].isin(["Consumer credit","Credit card","Mortgage"]))]

### Active Consumer Credit

In [8]:
def conCred_year_brackets(year):
    if (year >= -1): return 1
    elif (year >= -2): return 2
    else: return 3

In [9]:
Active_ConsumerCredit = Active_ConsumerCredit.drop(["DAYS_ENDDATE_FACT","CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Active_ConsumerCredit = Active_ConsumerCredit.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Active_ConsumerCredit = Active_ConsumerCredit.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Active_ConsumerCredit["YEARS_CREDIT"] = Active_ConsumerCredit["DAYS_CREDIT"]/365
Active_ConsumerCredit["DATE_RANGE"] = Active_ConsumerCredit["YEARS_CREDIT"].apply(conCred_year_brackets)

In [10]:
Active_ConsumerCredit_Agg = Active_ConsumerCredit.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [11]:
Active_ConsumerCredit_Agg.to_pickle(data_output_dir + "Active_ConsumerCredit_Agg.pkl")

In [13]:
Active_ConsumerCredit_Agg_Date = Active_ConsumerCredit.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [14]:
Active_ConsumerCredit_Agg_Date.to_pickle(data_output_dir + "Active_ConsumerCredit_Agg_Date.pkl")

### Closed Consumer Credit

In [16]:
def closedConCred_year_brackets(year):
    if (year >= -2): return 1
    elif (year >= -4): return 2
    else: return 3

In [17]:
Closed_ConsumerCredit = Closed_ConsumerCredit.drop(["CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Closed_ConsumerCredit = Closed_ConsumerCredit.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Closed_ConsumerCredit = Closed_ConsumerCredit.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Closed_ConsumerCredit["YEARS_ENDDATE_FACT"] = Closed_ConsumerCredit["DAYS_ENDDATE_FACT"]/365
Closed_ConsumerCredit["DATE_RANGE"] = Closed_ConsumerCredit["YEARS_ENDDATE_FACT"].apply(closedConCred_year_brackets)

In [18]:
Closed_ConsumerCredit_Agg = Closed_ConsumerCredit.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [19]:
Closed_ConsumerCredit_Agg.to_pickle(data_output_dir + "Closed_ConsumerCredit_Agg.pkl")

In [21]:
Closed_ConsumerCredit_Agg_Date = Closed_ConsumerCredit.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [22]:
Closed_ConsumerCredit_Agg_Date.to_pickle(data_output_dir + "Closed_ConsumerCredit_Agg_Date.pkl")

### Active Credit Card

In [24]:
def credCard_year_brackets(year):
    if (year >= -1): return 1
    elif (year >= -2): return 2
    elif (year >= -4): return 3
    else: return 4

In [25]:
Active_CreditCard = Active_CreditCard.drop(["DAYS_ENDDATE_FACT","CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Active_CreditCard = Active_CreditCard.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Active_CreditCard = Active_CreditCard.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Active_CreditCard["YEARS_CREDIT"] = Active_CreditCard["DAYS_CREDIT"]/365
Active_CreditCard["DATE_RANGE"] = Active_CreditCard["YEARS_CREDIT"].apply(credCard_year_brackets)

In [26]:
Active_CreditCard_Agg = Active_CreditCard.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_CREDIT_SUM_LIMIT=('AMT_CREDIT_SUM_LIMIT', sum),
         AMT_CREDIT_SUM_LIMIT_MAX=('AMT_CREDIT_SUM_LIMIT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [27]:
Active_CreditCard_Agg.to_pickle(data_output_dir + "Active_CreditCard_Agg.pkl")

In [29]:
Active_CreditCard_Agg_Date = Active_CreditCard.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_CREDIT_SUM_LIMIT=('AMT_CREDIT_SUM_LIMIT', sum),
         AMT_CREDIT_SUM_LIMIT_MAX=('AMT_CREDIT_SUM_LIMIT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [30]:
Active_CreditCard_Agg_Date.to_pickle(data_output_dir + "Active_CreditCard_Agg_Date.pkl")

### Closed Credit Card

In [32]:
def closedCredCard_year_brackets(year):
    if (year >= -1): return 1
    elif (year >= -2): return 2
    elif (year >= -4): return 3
    else: return 4

In [33]:
Closed_CreditCard = Closed_CreditCard.drop(["CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Closed_CreditCard = Closed_CreditCard.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Closed_CreditCard = Closed_CreditCard.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Closed_CreditCard["YEARS_ENDDATE_FACT"] = Closed_CreditCard["DAYS_ENDDATE_FACT"]/365
Closed_CreditCard["DATE_RANGE"] = Closed_CreditCard["YEARS_ENDDATE_FACT"].apply(closedCredCard_year_brackets)

In [34]:
Closed_CreditCard_Agg = Closed_CreditCard.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_LIMIT=('AMT_CREDIT_SUM_LIMIT', sum),
         AMT_CREDIT_SUM_LIMIT_MAX=('AMT_CREDIT_SUM_LIMIT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [35]:
Closed_CreditCard_Agg.to_pickle(data_output_dir + "Closed_CreditCard_Agg.pkl")

In [37]:
Closed_CreditCard_Agg_Date = Closed_CreditCard.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         CNT_CREDIT_PROLONG=('CNT_CREDIT_PROLONG', sum),
         CNT_CREDIT_PROLONG_MAX=('CNT_CREDIT_PROLONG', max),
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_LIMIT=('AMT_CREDIT_SUM_LIMIT', sum),
         AMT_CREDIT_SUM_LIMIT_MAX=('AMT_CREDIT_SUM_LIMIT', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [38]:
Closed_CreditCard_Agg_Date.to_pickle(data_output_dir + "Closed_CreditCard_Agg_Date.pkl")

### Active_Mortgage

In [40]:
Active_Mortgage = Active_Mortgage.drop(["DAYS_ENDDATE_FACT","CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Active_Mortgage["COUNT"] = Active_Mortgage[["SK_ID_CURR","SK_ID_BUREAU"]].groupby("SK_ID_CURR").transform('count')

In [47]:
Active_Mortgage_Singl = Active_Mortgage[Active_Mortgage["COUNT"]<=1]
Active_Mortgage_Dupes = Active_Mortgage[Active_Mortgage["COUNT"]>1]
Active_Mortgage_Dupes["COUNT"] = Active_Mortgage_Dupes[["SK_ID_CURR","SK_ID_BUREAU"]].groupby('SK_ID_CURR').transform('count') 
Active_Mortgage_Dupes = Active_Mortgage_Dupes[Active_Mortgage_Dupes['DAYS_CREDIT_UPDATE'] == Active_Mortgage_Dupes.groupby('SK_ID_CURR')['DAYS_CREDIT_UPDATE'].transform('min')]
Active_Mortgage_Dupes = Active_Mortgage_Dupes[Active_Mortgage_Dupes['DAYS_CREDIT'] == Active_Mortgage_Dupes.groupby('SK_ID_CURR')['DAYS_CREDIT'].transform('min')]
Active_Mortgage_Dupes = Active_Mortgage_Dupes[Active_Mortgage_Dupes['AMT_CREDIT_SUM'] == Active_Mortgage_Dupes.groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].transform('max')]
Active_Mortgage_Dupes = Active_Mortgage_Dupes[Active_Mortgage_Dupes["DAYS_CREDIT_ENDDATE"].notnull()]
Active_Mortgage_Dupes = Active_Mortgage_Dupes.drop_duplicates(subset=['SK_ID_CURR'])
Active_Mortgage_Clean = pd.concat([Active_Mortgage_Singl, Active_Mortgage_Dupes])
Active_Mortgage_Clean = Active_Mortgage_Clean.set_index("SK_ID_CURR")
Active_Mortgage_Clean = Active_Mortgage_Clean.drop(["SK_ID_BUREAU"],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
Active_Mortgage_Clean.to_pickle(data_output_dir + "Active_Mortgage_Clean.pkl")

### Closed_Mortgage

In [50]:
Closed_Mortgage = Closed_Mortgage.drop(["CREDIT_ACTIVE","CREDIT_TYPE"],axis=1)
Closed_Mortgage["COUNT"] = Closed_Mortgage[["SK_ID_CURR","SK_ID_BUREAU"]].groupby("SK_ID_CURR").transform('count')

In [54]:
Closed_Mortgage_Singl = Closed_Mortgage[Closed_Mortgage["COUNT"]<=1]
Closed_Mortgage_Dupes = Closed_Mortgage[Closed_Mortgage["COUNT"]>1]
Closed_Mortgage_Dupes["COUNT"] = Closed_Mortgage_Dupes[["SK_ID_CURR","SK_ID_BUREAU"]].groupby('SK_ID_CURR').transform('count') 
Closed_Mortgage_Dupes = Closed_Mortgage_Dupes[Closed_Mortgage_Dupes['DAYS_CREDIT_UPDATE'] == Closed_Mortgage_Dupes.groupby('SK_ID_CURR')['DAYS_CREDIT_UPDATE'].transform('min')]
Closed_Mortgage_Dupes = Closed_Mortgage_Dupes[Closed_Mortgage_Dupes['DAYS_ENDDATE_FACT'] == Closed_Mortgage_Dupes.groupby('SK_ID_CURR')['DAYS_ENDDATE_FACT'].transform('min')]
Closed_Mortgage_Dupes = Closed_Mortgage_Dupes[Closed_Mortgage_Dupes['AMT_CREDIT_SUM'] == Closed_Mortgage_Dupes.groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].transform('max')]
Closed_Mortgage_Dupes = Closed_Mortgage_Dupes.drop_duplicates(subset=['SK_ID_CURR'])
Closed_Mortgage_Clean = pd.concat([Closed_Mortgage_Singl, Closed_Mortgage_Dupes])
Closed_Mortgage_Clean = Closed_Mortgage_Clean.set_index("SK_ID_CURR")
Closed_Mortgage_Clean = Closed_Mortgage_Clean.drop(["SK_ID_BUREAU"],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [55]:
Closed_Mortgage_Clean.to_pickle(data_output_dir + "Closed_Mortgage_Clean.pkl")

### Active Other

In [57]:
def other_year_brackets(year):
    if (year >= -1): return 1
    elif (year >= -2): return 2
    else: return 3

In [58]:
Active_Other = Active_Other.drop(["DAYS_ENDDATE_FACT","CREDIT_ACTIVE"],axis=1)
Active_Other = Active_Other.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Active_Other = Active_Other.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Active_Other["YEARS_CREDIT"] = Active_Other["DAYS_CREDIT"]/365
Active_Other["DATE_RANGE"] = Active_Other["YEARS_CREDIT"].apply(other_year_brackets)

In [59]:
Active_Other_Agg = Active_Other.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_ANNUITY=('AMT_ANNUITY', sum),
         AMT_ANNUITY_MAX=('AMT_ANNUITY', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [60]:
Active_Other_Agg.to_pickle(data_output_dir + "Active_Other_Agg.pkl")

In [62]:
Active_Other_Agg_Date = Active_Other.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_ANNUITY_DEBT=('AMT_ANNUITY', sum),
         AMT_ANNUITY_DEBT_MAX=('AMT_ANNUITY', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [63]:
Active_Other_Agg_Date.to_pickle(data_output_dir + "Active_Other_Agg_Date.pkl")

### Closed Other

In [65]:
def closedOther_year_brackets(year):
    if (year >= -2): return 1
    elif (year >= -4): return 2
    else: return 3

In [66]:
Closed_Other = Closed_Other.drop(["CREDIT_ACTIVE"],axis=1)
Closed_Other = Closed_Other.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
Closed_Other = Closed_Other.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
Closed_Other["YEARS_CREDIT"] = Closed_Other["DAYS_ENDDATE_FACT"]/365
Closed_Other["DATE_RANGE"] = Closed_Other["YEARS_CREDIT"].apply(closedOther_year_brackets)

In [67]:
Closed_Other_Agg = Closed_Other.groupby(["SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_ANNUITY_DEBT=('AMT_ANNUITY', sum),
         AMT_ANNUITY_DEBT_MAX=('AMT_ANNUITY', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [68]:
Closed_Other_Agg.to_pickle(data_output_dir + "Closed_Other_Agg.pkl")

In [70]:
Closed_Other_Agg_Date = Closed_Other.groupby(["DATE_RANGE","SK_ID_CURR"]).agg(
         AMT_CREDIT_MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', max), 
         AMT_CREDIT_SUM=('AMT_CREDIT_SUM', sum),
         AMT_CREDIT_SUM_MAX=('AMT_CREDIT_SUM', max),
         AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', sum),
         AMT_CREDIT_SUM_DEBT_MAX=('AMT_CREDIT_SUM_DEBT', max),
         AMT_ANNUITY_DEBT=('AMT_ANNUITY', sum),
         AMT_ANNUITY_DEBT_MAX=('AMT_ANNUITY', max),
         COUNT=('SK_ID_BUREAU',len)
        )

In [71]:
Closed_Other_Agg_Date.to_pickle(data_output_dir + "Closed_Other_Agg_Date.pkl")

# 5. Sold debt

In [73]:
bureau_sold_debt = bureau[bureau["CREDIT_ACTIVE"]=='Sold']
bureau_sold_debt = bureau_sold_debt.sort_values(["SK_ID_CURR","DAYS_CREDIT","DAYS_CREDIT_UPDATE"])
bureau_sold_debt = bureau_sold_debt.drop_duplicates(subset=["SK_ID_CURR", "DAYS_CREDIT"], keep='last')
bureau_sold_debt = bureau_sold_debt.drop(["SK_ID_BUREAU"],axis=1)

In [77]:
bureau_sold_debt_agg = bureau_sold_debt.groupby('SK_ID_CURR', as_index = False).agg(['mean', 'max', 'min', 'sum', 'count'])

In [78]:
bureau_sold_debt_agg.columns = [''.join(col).strip() for col in bureau_sold_debt_agg.columns.values]

In [80]:
bureau_sold_debt_agg.to_pickle(data_output_dir + "Sold_Agg.pkl")