In [1]:
## Import packages
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
## Set paths
user = os.getenv('USERNAME')
shared_workspace = '/home/mdawkins/modelling_club'
user_dir = os.path.join(shared_workspace, user)
data_dir = os.path.join(shared_workspace, 'raw_data')
data_output_dir = os.path.join(shared_workspace, 'raw_data_lfs/engineered/previous/')

In [3]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, categorical_columns=None, nan_as_category=True):
    """Create a new column for each categorical value in categorical columns. """
    original_columns = list(df.columns)
    if not categorical_columns:
        categorical_columns = [col for col in original_columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    categorical_columns = [c for c in list(df.columns) if c not in original_columns]
    return df, categorical_columns

def group(df_to_agg, prefix, aggregations, aggregate_by='SK_ID_CURR'):
    agg_df = df_to_agg.groupby(aggregate_by).agg(aggregations)
    agg_df.columns = pd.Index([f'{prefix}{e[0]}_{e[1].upper()}' for e in agg_df.columns.tolist()])
    return agg_df.reset_index()


def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    agg_df = group(df_to_agg, prefix, aggregations, aggregate_by= aggregate_by)
    return df_to_merge.merge(agg_df, how='left', on= aggregate_by)


def do_agg(df, group_cols, counted, agg_feature_name, agg_type):
    if agg_type=='min':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].min().reset_index().rename(columns={counted: agg_feature_name})
    elif agg_type=='max':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].max().reset_index().rename(columns={counted: agg_feature_name})
    elif agg_type=='mean':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted: agg_feature_name})
    elif agg_type=='median':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].median().reset_index().rename(columns={counted: agg_feature_name})
    elif agg_type=='std':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].std().reset_index().rename(columns={counted: agg_feature_name})
    elif agg_type=='var':
        gp = df[group_cols + [counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted: agg_feature_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    gc.collect()
    return df

def get_age_range(days_birth):
    """ Return the age range label (int). """
    age_years = -days_birth / 365.25
    if age_years < 20: return 1
    elif age_years < 30: return 2
    elif age_years < 40: return 3
    elif age_years < 50: return 4
    elif age_years < 60: return 5
    elif age_years < 70: return 6
    elif age_years < 80: return 7
    elif age_years < 99: return 8
    else: return 0

In [4]:
""" Process previous_application.csv and return a pandas dataframe. """
prev = pd.read_csv(data_dir + '/raw/previous_application.csv')
pay = pd.read_csv(data_dir + '/raw/installments_payments.csv')

In [5]:
PREVIOUS_AGG = {
    'SK_ID_PREV': ['nunique'],
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_DOWN_PAYMENT': ['max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['max', 'mean'],
    'DAYS_TERMINATION': ['max'],
    
    # Engineered features
    'CREDIT_TO_ANNUITY_RATIO': ['mean', 'max'],
    'APPLICATION_CREDIT_DIFF': ['min', 'max', 'mean'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean', 'var'],
    'DOWN_PAYMENT_TO_CREDIT': ['mean'],
}

PREVIOUS_ACTIVE_AGG = {
    'SK_ID_PREV': ['nunique'],
    'SIMPLE_INTERESTS': ['mean'],
    'AMT_ANNUITY': ['max', 'sum'],
    'AMT_APPLICATION': ['max', 'mean'],
    'AMT_CREDIT': ['sum'],
    'AMT_DOWN_PAYMENT': ['max', 'mean'],
    'DAYS_DECISION': ['min', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'AMT_PAYMENT': ['sum'],
    'INSTALMENT_PAYMENT_DIFF': ['mean', 'max'],
    'REMAINING_DEBT': ['max', 'mean', 'sum'],
    'REPAYMENT_RATIO': ['mean'],
}

PREVIOUS_APPROVED_AGG = {
    'SK_ID_PREV': ['nunique'],
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'AMT_DOWN_PAYMENT': ['max'],
    'AMT_GOODS_PRICE': ['max'],
    'HOUR_APPR_PROCESS_START': ['min', 'max'],
    'DAYS_DECISION': ['min', 'mean'],
    'CNT_PAYMENT': ['max', 'mean'],
    'DAYS_TERMINATION': ['mean'],
    # Engineered features
    'CREDIT_TO_ANNUITY_RATIO': ['mean', 'max'],
    'APPLICATION_CREDIT_DIFF': ['max'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    # The following features are only for approved applications
    'DAYS_FIRST_DRAWING': ['max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['max', 'mean'],
    'DAYS_LAST_DUE_DIFF': ['min', 'max', 'mean'],
    'SIMPLE_INTERESTS': ['min', 'max', 'mean'],
}

PREVIOUS_REFUSED_AGG = {
    'AMT_APPLICATION': ['max', 'mean'],
    'AMT_CREDIT': ['min', 'max'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min', 'max', 'mean', 'var'],
    'APPLICATION_CREDIT_RATIO': ['min', 'mean'],
    'NAME_CONTRACT_TYPE_Consumer loans': ['mean'],
    'NAME_CONTRACT_TYPE_Cash loans': ['mean'],
    'NAME_CONTRACT_TYPE_Revolving loans': ['mean'],
}

PREVIOUS_LATE_PAYMENTS_AGG = {
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min'],
    'NAME_CONTRACT_TYPE_Consumer loans': ['mean'],
    'NAME_CONTRACT_TYPE_Cash loans': ['mean'],
    'NAME_CONTRACT_TYPE_Revolving loans': ['mean'],
}

PREVIOUS_LOAN_TYPE_AGG = {
    'AMT_CREDIT': ['sum'],
    'AMT_ANNUITY': ['mean', 'max'],
    'SIMPLE_INTERESTS': ['min', 'mean', 'max', 'var'],
    'APPLICATION_CREDIT_DIFF': ['min', 'var'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['max'],
    'DAYS_LAST_DUE_1ST_VERSION': ['max', 'mean'],
    'CNT_PAYMENT': ['mean'],
}

PREVIOUS_TIME_AGG = {
    'AMT_CREDIT': ['sum'],
    'AMT_ANNUITY': ['mean', 'max'],
    'SIMPLE_INTERESTS': ['mean', 'max'],
    'DAYS_DECISION': ['min', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    # Engineered features
    'APPLICATION_CREDIT_DIFF': ['min'],
    'APPLICATION_CREDIT_RATIO': ['min', 'max', 'mean'],
    'NAME_CONTRACT_TYPE_Consumer loans': ['mean'],
    'NAME_CONTRACT_TYPE_Cash loans': ['mean'],
    'NAME_CONTRACT_TYPE_Revolving loans': ['mean'],
}



# One-hot encode most important categorical features
ohe_columns = [
    'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE', 'CHANNEL_TYPE',
    'NAME_TYPE_SUITE', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
    'NAME_PRODUCT_TYPE', 'NAME_CLIENT_TYPE']
prev, categorical_cols = one_hot_encoder(prev, ohe_columns, nan_as_category= False)

# Feature engineering: ratios and difference
prev['APPLICATION_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
prev['APPLICATION_CREDIT_RATIO'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
prev['CREDIT_TO_ANNUITY_RATIO'] = prev['AMT_CREDIT']/prev['AMT_ANNUITY']
prev['DOWN_PAYMENT_TO_CREDIT'] = prev['AMT_DOWN_PAYMENT'] / prev['AMT_CREDIT']

# Interest ratio on previous application (simplified)
total_payment = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
prev['SIMPLE_INTERESTS'] = (total_payment/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']

# Active loans - approved and not complete yet (last_due 365243)
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
active = approved[approved['DAYS_LAST_DUE'] == 365243]

# Find how much was already payed in active loans (using installments csv)
active_pay = pay[pay['SK_ID_PREV'].isin(active['SK_ID_PREV'])]
active_pay_agg = active_pay.groupby('SK_ID_PREV')[['AMT_INSTALMENT', 'AMT_PAYMENT']].sum()
active_pay_agg.reset_index(inplace= True)

# Active loans: difference of what was payed and installments
active_pay_agg['INSTALMENT_PAYMENT_DIFF'] = active_pay_agg['AMT_INSTALMENT'] - active_pay_agg['AMT_PAYMENT']

# Merge with active
active = active.merge(active_pay_agg, on= 'SK_ID_PREV', how= 'left')
active['REMAINING_DEBT'] = active['AMT_CREDIT'] - active['AMT_PAYMENT']
active['REPAYMENT_RATIO'] = active['AMT_PAYMENT'] / active['AMT_CREDIT']

# Perform aggregations for active applications
active_agg_df = group(active, 'PREV_ACTIVE_', PREVIOUS_ACTIVE_AGG)
active_agg_df['TOTAL_REPAYMENT_RATIO'] = active_agg_df['PREV_ACTIVE_AMT_PAYMENT_SUM']/active_agg_df['PREV_ACTIVE_AMT_CREDIT_SUM']

del active_pay, active_pay_agg, active; gc.collect()

# Change 365243 values to nan (missing)
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

# Days last due difference (scheduled x done)
prev['DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
approved['DAYS_LAST_DUE_DIFF'] = approved['DAYS_LAST_DUE_1ST_VERSION'] - approved['DAYS_LAST_DUE']

# # Categorical features
# categorical_agg = {key: ['mean'] for key in categorical_cols}

# Perform general aggregations
agg_prev = group(prev, 'PREV_', {**PREVIOUS_AGG}) #, **categorical_agg})

# Merge active loans dataframe on agg_prev
agg_prev = agg_prev.merge(active_agg_df, how='left', on='SK_ID_CURR')

del active_agg_df; gc.collect()

# Aggregations for approved and refused loans
agg_prev = group_and_merge(approved, agg_prev, 'PREV_APPROVED_', PREVIOUS_APPROVED_AGG)
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
agg_prev = group_and_merge(refused, agg_prev, 'PREV_REFUSED_', PREVIOUS_REFUSED_AGG)

del approved, refused; gc.collect()

# Aggregations for Consumer loans and Cash loans
for loan_type in ['Consumer loans', 'Cash loans']:
    type_df = prev[prev[f'NAME_CONTRACT_TYPE_{loan_type}'] == 1]
    prefix = 'PREV_' + loan_type.split(" ")[0] + '_'
    agg_prev = group_and_merge(type_df, agg_prev, prefix, PREVIOUS_LOAN_TYPE_AGG)
    del type_df; gc.collect()

# Get the SK_ID_PREV for loans with late payments (days past due)
pay['LATE_PAYMENT'] = pay['DAYS_ENTRY_PAYMENT'] - pay['DAYS_INSTALMENT']
pay['LATE_PAYMENT'] = pay['LATE_PAYMENT'].apply(lambda x: 1 if x > 0 else 0)
dpd_id = pay[pay['LATE_PAYMENT'] > 0]['SK_ID_PREV'].unique()

# Aggregations for loans with late payments
agg_dpd = group_and_merge(prev[prev['SK_ID_PREV'].isin(dpd_id)], agg_prev, 'PREV_LATE_', PREVIOUS_LATE_PAYMENTS_AGG)

del agg_dpd, dpd_id; gc.collect()

# Aggregations for loans in the last x months
for time_frame in [12, 24]:
    time_frame_df = prev[prev['DAYS_DECISION'] >= -30*time_frame]
    prefix = 'PREV_LAST{}M_'.format(time_frame)
    agg_prev = group_and_merge(time_frame_df, agg_prev, prefix, PREVIOUS_TIME_AGG)
    
    del time_frame_df; gc.collect()

del prev; gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0

In [6]:
agg_prev

Unnamed: 0,SK_ID_CURR,PREV_SK_ID_PREV_NUNIQUE,PREV_AMT_ANNUITY_MIN,PREV_AMT_ANNUITY_MAX,PREV_AMT_ANNUITY_MEAN,PREV_AMT_DOWN_PAYMENT_MAX,PREV_AMT_DOWN_PAYMENT_MEAN,PREV_HOUR_APPR_PROCESS_START_MIN,PREV_HOUR_APPR_PROCESS_START_MAX,PREV_HOUR_APPR_PROCESS_START_MEAN,...,PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MIN,PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MAX,PREV_LAST24M_DAYS_LAST_DUE_1ST_VERSION_MEAN,PREV_LAST24M_APPLICATION_CREDIT_DIFF_MIN,PREV_LAST24M_APPLICATION_CREDIT_RATIO_MIN,PREV_LAST24M_APPLICATION_CREDIT_RATIO_MAX,PREV_LAST24M_APPLICATION_CREDIT_RATIO_MEAN,PREV_LAST24M_NAME_CONTRACT_TYPE_Consumer loans_MEAN,PREV_LAST24M_NAME_CONTRACT_TYPE_Cash loans_MEAN,PREV_LAST24M_NAME_CONTRACT_TYPE_Revolving loans_MEAN
0,100002,1,9251.775,9251.775,9251.775000,0.0,0.00,9,9,9.000000,...,125.0,125.0,125.000000,0.0,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000
1,100003,3,6737.310,98356.995,56553.990000,6885.0,3442.50,12,17,14.666667,...,,,,,,,,,,
2,100004,1,5357.250,5357.250,5357.250000,4860.0,4860.00,5,5,5.000000,...,,,,,,,,,,
3,100006,9,2482.920,39954.510,23651.175000,66987.0,34840.17,12,15,14.666667,...,-215.0,1259.0,364.333333,-218115.0,0.759418,1.250017,1.010763,0.222222,0.555556,0.222222
4,100007,6,1834.290,22678.785,12278.805000,3676.5,3390.75,8,15,12.333333,...,346.0,346.0,346.000000,-26788.5,0.902335,0.902335,0.902335,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291052,456251,1,6605.910,6605.910,6605.910000,0.0,0.00,17,17,17.000000,...,0.0,0.0,0.000000,0.0,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000
291053,456252,1,10074.465,10074.465,10074.465000,3456.0,3456.00,10,10,10.000000,...,,,,,,,,,,
291054,456253,2,3973.095,5567.715,4770.405000,5913.0,4403.25,11,12,11.500000,...,,,,,,,,,,
291055,456254,2,2296.440,19065.825,10681.132500,0.0,0.00,12,18,15.000000,...,99.0,203.0,151.000000,-23634.0,0.878356,0.904480,0.891418,1.000000,0.000000,0.000000


In [9]:
agg_prev.to_csv(data_output_dir + "previous_payments.csv")

In [10]:
agg_prev.to_pickle(path=data_output_dir + "previous_payments.pkl")