In [8]:
#import packages
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

## Read Data

In [9]:
# Read in bureau
buro = pd.read_csv('data/bureau.csv')
buro.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [10]:
buro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
SK_ID_CURR                int64
SK_ID_BUREAU              int64
CREDIT_ACTIVE             object
CREDIT_CURRENCY           object
DAYS_CREDIT               int64
CREDIT_DAY_OVERDUE        int64
DAYS_CREDIT_ENDDATE       float64
DAYS_ENDDATE_FACT         float64
AMT_CREDIT_MAX_OVERDUE    float64
CNT_CREDIT_PROLONG        int64
AMT_CREDIT_SUM            float64
AMT_CREDIT_SUM_DEBT       float64
AMT_CREDIT_SUM_LIMIT      float64
AMT_CREDIT_SUM_OVERDUE    float64
CREDIT_TYPE               object
DAYS_CREDIT_UPDATE        int64
AMT_ANNUITY               float64
dtypes: float64(8), int64(6), object(3)
memory usage: 222.6+ MB


**check missing values**

In [11]:
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
missing_values_table(buro)

Your selected dataframe has 17 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
AMT_ANNUITY,1226791,71.5
AMT_CREDIT_MAX_OVERDUE,1124488,65.5
DAYS_ENDDATE_FACT,633653,36.9
AMT_CREDIT_SUM_LIMIT,591780,34.5
AMT_CREDIT_SUM_DEBT,257669,15.0
DAYS_CREDIT_ENDDATE,105553,6.1
AMT_CREDIT_SUM,13,0.0


In [12]:
buro_bal = pd.read_csv('data/bureau_balance.csv')
buro_bal.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [13]:
buro_bal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
SK_ID_BUREAU      int64
MONTHS_BALANCE    int64
STATUS            object
dtypes: int64(2), object(1)
memory usage: 624.8+ MB


In [14]:
missing_values_table(buro_bal)

Your selected dataframe has 3 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


## Drop Missing Value
---
drop columns with missing value >= 65%

In [15]:
def drop_cols_missing(df, percent):
    thresh = len(df) * percent
    df.dropna(thresh = thresh, axis = 1, inplace = True)
    return df

In [16]:
buro = drop_cols_missing(buro,0.65)

In [17]:
#double check
missing_values_table(buro)

Your selected dataframe has 14 columns.
There are 4 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
AMT_CREDIT_SUM_LIMIT,591780,34.5
AMT_CREDIT_SUM_DEBT,257669,15.0
DAYS_CREDIT_ENDDATE,105553,6.1
AMT_CREDIT_SUM,13,0.0


## Aggregation Functions

In [18]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values stats(count, mean, max, min and sum)
        groupby group_var in a dataframe.
    
    Args:
        df(dataframe):  
            the input dataframe
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
        agg (dataframe): 
            a dataframe with the stats aggregated for each group indexed by group_var
    
    """
    # Remove id variables other than grouping variable
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    #select numeric columns + group_var
    numeric_df = df.select_dtypes('number')
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

    # Need to create new column names
    columns = [group_var]

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))
                
    agg.columns = columns
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [19]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts of 
    each unique category for each observation groupby 'group_var'
    
    Args:
    df(dataframe):
        input dataframe
        
    group_var(string):
        The variable by which to group the dataframe. 
        
    df_name(string)
        Variable added to the front of column names to keep track of columns

    
    Returns:
    categorical(dataframe)
        A dataframe with counts and normalized counts of each unique category
        groupby the `group_var`.
        
    """
    
    # Select the categorical columns and do one-hot encoding
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # add the group_var as index
    categorical[group_var] = df[group_var]

    # calculate the sum and mean(count and normalized count) groupby group_var
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    #rename columns to df_name+column_name+stat_name
    for var in categorical.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stats in level 1
            for stat in ['count', 'count_norm']:
                column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    _, idx = np.unique(categorical, axis = 1, return_index=True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [20]:
def remove_duplicate_columns(df):
    _, idx = np.unique(df, axis = 1, return_index=True)
    df = df.iloc[:, idx]
    return df

## Preprocess

### 1. bureau.csv

In [21]:
#aggregate numerical features
buro_agg = agg_numeric(buro.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
buro_agg.head()

Unnamed: 0,bureau_DAYS_CREDIT_sum,bureau_DAYS_CREDIT_min,bureau_DAYS_CREDIT_ENDDATE_min,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_UPDATE_sum,bureau_DAYS_CREDIT_UPDATE_min,bureau_DAYS_CREDIT_UPDATE_mean,bureau_DAYS_CREDIT_max,bureau_DAYS_CREDIT_UPDATE_max,bureau_CNT_CREDIT_PROLONG_min,...,bureau_DAYS_CREDIT_ENDDATE_sum,bureau_DAYS_CREDIT_ENDDATE_max,bureau_AMT_CREDIT_SUM_DEBT_mean,bureau_AMT_CREDIT_SUM_min,SK_ID_CURR,bureau_AMT_CREDIT_SUM_mean,bureau_AMT_CREDIT_SUM_DEBT_max,bureau_AMT_CREDIT_SUM_max,bureau_AMT_CREDIT_SUM_DEBT_sum,bureau_AMT_CREDIT_SUM_sum
0,-5145,-1572,-1329.0,-735.0,-652,-155,-93.142857,-49,-6,0,...,577.0,1778.0,85240.928571,85500.0,100001,207623.571429,373239.0,378000.0,596686.5,1453365.0
1,-6992,-1437,-1072.0,-874.0,-3999,-1185,-499.875,-103,-7,0,...,-2094.0,780.0,49156.2,0.0,100002,108131.945625,245781.0,450000.0,245781.0,865055.565
2,-5603,-2586,-2434.0,-1400.75,-3264,-2131,-816.0,-606,-43,0,...,-2178.0,1216.0,0.0,22248.0,100003,254350.125,0.0,810000.0,0.0,1017400.5
3,-1734,-1326,-595.0,-867.0,-1064,-682,-532.0,-408,-382,0,...,-977.0,-382.0,0.0,94500.0,100004,94518.9,0.0,94537.8,0.0,189037.8
4,-572,-373,-128.0,-190.666667,-163,-121,-54.333333,-62,-11,0,...,1318.0,1324.0,189469.5,29826.0,100005,219042.0,543087.0,568800.0,568408.5,657126.0


In [22]:
missing_values_table(buro_agg)

Your selected dataframe has 42 columns.
There are 12 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
bureau_AMT_CREDIT_SUM_LIMIT_min,25308,8.3
bureau_AMT_CREDIT_SUM_LIMIT_mean,25308,8.3
bureau_AMT_CREDIT_SUM_LIMIT_max,25308,8.3
bureau_AMT_CREDIT_SUM_DEBT_min,8372,2.7
bureau_AMT_CREDIT_SUM_DEBT_mean,8372,2.7
bureau_AMT_CREDIT_SUM_DEBT_max,8372,2.7
bureau_DAYS_CREDIT_ENDDATE_min,2585,0.8
bureau_DAYS_CREDIT_ENDDATE_mean,2585,0.8
bureau_DAYS_CREDIT_ENDDATE_max,2585,0.8
bureau_AMT_CREDIT_SUM_min,2,0.0


In [23]:
#should be < 12*5+1 columns 
buro_agg.shape

(305811, 42)

In [24]:
#count categorical features
buro_counts = count_categorical(buro, group_var = 'SK_ID_CURR', df_name = 'bureau')
buro_counts.head()

Unnamed: 0_level_0,bureau_CREDIT_TYPE_Mobile operator loan_count_norm,bureau_CREDIT_TYPE_Mobile operator loan_count,bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count_norm,bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count,bureau_CREDIT_ACTIVE_Bad debt_count_norm,bureau_CREDIT_ACTIVE_Bad debt_count,bureau_CREDIT_TYPE_Interbank credit_count_norm,bureau_CREDIT_TYPE_Interbank credit_count,bureau_CREDIT_TYPE_Real estate loan_count_norm,bureau_CREDIT_TYPE_Real estate loan_count,...,bureau_CREDIT_TYPE_Credit card_count_norm,bureau_CREDIT_TYPE_Credit card_count,bureau_CREDIT_ACTIVE_Active_count_norm,bureau_CREDIT_ACTIVE_Closed_count_norm,bureau_CREDIT_TYPE_Consumer credit_count_norm,bureau_CREDIT_CURRENCY_currency 1_count_norm,bureau_CREDIT_ACTIVE_Active_count,bureau_CREDIT_ACTIVE_Closed_count,bureau_CREDIT_TYPE_Consumer credit_count,bureau_CREDIT_CURRENCY_currency 1_count
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.428571,0.571429,1.0,1.0,3,4,7,7
100002,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.5,4,0.25,0.75,0.5,1.0,2,6,4,8
100003,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.5,2,0.25,0.75,0.5,1.0,1,3,2,4
100004,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,1.0,1.0,1.0,0,2,2,2
100005,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.333333,1,0.666667,0.333333,0.666667,1.0,2,1,2,3


In [25]:
buro_counts.shape

(305811, 46)

### 2. bureau_balance.csv

#### groupby SK_ID_BUREAU

In [26]:
#aggregate numerical values
buro_bal_agg = agg_numeric(buro_bal, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
buro_bal_agg.head()

Unnamed: 0,bureau_balance_MONTHS_BALANCE_sum,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_count,SK_ID_BUREAU
0,-4656,-96,-48.0,0,97,5001709
1,-3403,-82,-41.0,0,83,5001710
2,-6,-3,-1.5,0,4,5001711
3,-171,-18,-9.0,0,19,5001712
4,-231,-21,-10.5,0,22,5001713


In [27]:
# only one numerical col: MONTHS_BALANCE
# should be 1*5+1 columns
buro_bal_agg.shape

(817395, 6)

In [28]:
buro_bal_counts = count_categorical(buro_bal, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
buro_bal_counts.head()

Unnamed: 0_level_0,bureau_balance_STATUS_4_count_norm,bureau_balance_STATUS_3_count_norm,bureau_balance_STATUS_2_count_norm,bureau_balance_STATUS_5_count_norm,bureau_balance_STATUS_4_count,bureau_balance_STATUS_3_count,bureau_balance_STATUS_2_count,bureau_balance_STATUS_5_count,bureau_balance_STATUS_1_count_norm,bureau_balance_STATUS_1_count,bureau_balance_STATUS_0_count_norm,bureau_balance_STATUS_0_count,bureau_balance_STATUS_X_count_norm,bureau_balance_STATUS_C_count_norm,bureau_balance_STATUS_X_count,bureau_balance_STATUS_C_count
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5001709,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0.113402,0.886598,11,86
5001710,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.060241,5,0.361446,0.578313,30,48
5001711,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.75,3,0.25,0.0,1,0
5001712,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.526316,10,0.0,0.473684,0,9
5001713,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,1.0,0.0,22,0


In [29]:
buro_bal_counts.shape

(817395, 16)

#### then do aggregation again groupby SK_ID_CURR

In [30]:
# merge numeric stats and categorical count features generated above
bureau_by_loan = buro_bal_agg.merge(buro_bal_counts, right_index = True, left_on = 'SK_ID_BUREAU', how = 'outer')

# Merge with bureau.csv to include the SK_ID_CURR
bureau_by_loan = buro[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(bureau_by_loan, on = 'SK_ID_BUREAU', how = 'left')

bureau_by_loan.shape

(1716428, 23)

In [31]:
bureau_by_loan.columns

Index(['SK_ID_BUREAU', 'SK_ID_CURR', 'bureau_balance_MONTHS_BALANCE_sum',
       'bureau_balance_MONTHS_BALANCE_min',
       'bureau_balance_MONTHS_BALANCE_mean',
       'bureau_balance_MONTHS_BALANCE_max',
       'bureau_balance_MONTHS_BALANCE_count',
       'bureau_balance_STATUS_4_count_norm',
       'bureau_balance_STATUS_3_count_norm',
       'bureau_balance_STATUS_2_count_norm',
       'bureau_balance_STATUS_5_count_norm', 'bureau_balance_STATUS_4_count',
       'bureau_balance_STATUS_3_count', 'bureau_balance_STATUS_2_count',
       'bureau_balance_STATUS_5_count', 'bureau_balance_STATUS_1_count_norm',
       'bureau_balance_STATUS_1_count', 'bureau_balance_STATUS_0_count_norm',
       'bureau_balance_STATUS_0_count', 'bureau_balance_STATUS_X_count_norm',
       'bureau_balance_STATUS_C_count_norm', 'bureau_balance_STATUS_X_count',
       'bureau_balance_STATUS_C_count'],
      dtype='object')

In [32]:
# Aggregate the stats for each client
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'client')
bureau_balance_by_client.shape

(305811, 86)

In [33]:
bureau_balance_by_client.columns

Index(['client_bureau_balance_MONTHS_BALANCE_sum_sum',
       'client_bureau_balance_MONTHS_BALANCE_sum_min',
       'client_bureau_balance_MONTHS_BALANCE_sum_mean',
       'client_bureau_balance_MONTHS_BALANCE_min_sum',
       'client_bureau_balance_MONTHS_BALANCE_mean_sum',
       'client_bureau_balance_MONTHS_BALANCE_min_min',
       'client_bureau_balance_MONTHS_BALANCE_mean_min',
       'client_bureau_balance_MONTHS_BALANCE_min_mean',
       'client_bureau_balance_MONTHS_BALANCE_mean_mean',
       'client_bureau_balance_MONTHS_BALANCE_sum_max',
       'client_bureau_balance_MONTHS_BALANCE_min_max',
       'client_bureau_balance_MONTHS_BALANCE_mean_max',
       'client_bureau_balance_MONTHS_BALANCE_max_sum',
       'client_bureau_balance_MONTHS_BALANCE_max_min',
       'client_bureau_balance_MONTHS_BALANCE_max_mean',
       'client_bureau_balance_STATUS_4_count_norm_sum',
       'client_bureau_balance_STATUS_5_count_norm_sum',
       'client_bureau_balance_STATUS_4_count_sum',
    

# Merge with training data

In [34]:
train = pd.read_csv('data/application_train.csv')
test = pd.read_csv('data/application_test.csv')

In [35]:
original_features = list(train.columns)
print('Original Number of Features: ', len(original_features))

# Merge with the value counts of bureau
train = train.merge(buro_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
train = train.merge(buro_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the monthly information grouped by client
train = train.merge(bureau_balance_by_client, on = 'SK_ID_CURR', how = 'left')

new_features = list(train.columns)
print('Number of features using previous loans from other institutions data: ', len(new_features))

Original Number of Features:  122
Number of features using previous loans from other institutions data:  294


In [36]:
buro_features = train.drop(columns = original_features)
buro_features["SK_ID_CURR"] = train["SK_ID_CURR"]
buro_features.columns

Index(['bureau_CREDIT_TYPE_Mobile operator loan_count_norm',
       'bureau_CREDIT_TYPE_Mobile operator loan_count',
       'bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count_norm',
       'bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count',
       'bureau_CREDIT_ACTIVE_Bad debt_count_norm',
       'bureau_CREDIT_ACTIVE_Bad debt_count',
       'bureau_CREDIT_TYPE_Interbank credit_count_norm',
       'bureau_CREDIT_TYPE_Interbank credit_count',
       'bureau_CREDIT_TYPE_Real estate loan_count_norm',
       'bureau_CREDIT_TYPE_Real estate loan_count',
       ...
       'client_bureau_balance_STATUS_0_count_max',
       'client_bureau_balance_STATUS_C_count_mean',
       'client_bureau_balance_MONTHS_BALANCE_count_mean',
       'client_bureau_balance_STATUS_X_count_sum',
       'client_bureau_balance_STATUS_0_count_sum',
       'client_bureau_balance_STATUS_C_count_max',
       'client_bureau_balance_MONTHS_BALANCE_count_max',
       'client_bureau

In [26]:
len(buro_features.columns)

188

## Imput missing value with median

In [37]:
buro_features.fillna(buro_features.median(),inplace=True)
#double check
missing_values_table(buro_features).head(10)

Your selected dataframe has 173 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


## saving new features to file

In [38]:
buro_features.to_csv('bureau_features.csv', index = False)

In [39]:
buro_features.head()

Unnamed: 0,bureau_CREDIT_TYPE_Mobile operator loan_count_norm,bureau_CREDIT_TYPE_Mobile operator loan_count,bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count_norm,bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_count,bureau_CREDIT_ACTIVE_Bad debt_count_norm,bureau_CREDIT_ACTIVE_Bad debt_count,bureau_CREDIT_TYPE_Interbank credit_count_norm,bureau_CREDIT_TYPE_Interbank credit_count,bureau_CREDIT_TYPE_Real estate loan_count_norm,bureau_CREDIT_TYPE_Real estate loan_count,...,client_bureau_balance_STATUS_0_count_max,client_bureau_balance_STATUS_C_count_mean,client_bureau_balance_MONTHS_BALANCE_count_mean,client_bureau_balance_STATUS_X_count_sum,client_bureau_balance_STATUS_0_count_sum,client_bureau_balance_STATUS_C_count_max,client_bureau_balance_MONTHS_BALANCE_count_max,client_bureau_balance_STATUS_C_count_sum,client_bureau_balance_MONTHS_BALANCE_count_sum,SK_ID_CURR
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,2.875,13.75,15.0,45.0,13.0,22.0,23.0,110.0,100002
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,9.0,24.75,0.0,0.0,24.0,41.0,0.0,0.0,100003
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,9.0,24.75,0.0,0.0,24.0,41.0,0.0,0.0,100004
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,9.0,24.75,0.0,0.0,24.0,41.0,0.0,0.0,100006
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,9.0,24.75,0.0,0.0,24.0,41.0,0.0,0.0,100007


In [24]:
print('Original Number of Features: ', len(list(test.columns)))

# Merge with the value counts of bureau
test = test.merge(buro_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
test = test.merge(buro_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the value counts of bureau balance
test = test.merge(bureau_balance_by_client, on = 'SK_ID_CURR', how = 'left')

print('Shape of Testing Data: ', test.shape)

Original Number of Features:  121
Shape of Testing Data:  (48744, 308)


# Align train and test set

In [25]:
train_labels = train['TARGET']

# Align the dataframes, this will remove the 'TARGET' column
train, test = train.align(test, join = 'inner', axis = 1)

train['TARGET'] = train_labels

print('Training Data Shape: ', train.shape)
print('Testing Data Shape: ', test.shape)

Training Data Shape:  (307511, 309)
Testing Data Shape:  (48744, 308)


# Save processed datafile

In [26]:
train.to_csv('train_bureau.csv', index = False)
test.to_csv('test_bureau.csv', index = False)