In [3]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

# Memory management
import gc 

__Function to Aggregate Numeric Data__<br>

This groups data by the group_var and calculates mean, max, min, and sum. It will only be applied to numeric data by default in pandas.

In [4]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated by the `parent_var` for 
            all numeric columns. Each observation of the parent variable will have 
            one row in the dataframe with the parent variable as the index. 
            The columns are also renamed using the `df_name`. Columns with all duplicate
            values are removed. 
    
    """
    
    # Remove id variables other than grouping variable
    for col in df:
        if col != parent_var and 'SK_ID' in col:
            df = df.drop(columns = col)
    # Only want the numeric variables
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))
    
    agg.columns = columns
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

__Function to Calculate Categorical Counts__<br>

This function calculates the occurrences (counts) of each category in a categorical variable for each client. It also calculates the normed count, which is the count for a category divided by the total counts for all categories in a categorical variable.

In [5]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with aggregated statistics for each observation of the parent_var
        The columns are also renamed and columns with duplicate values are removed.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('category'))
    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['sum', 'count', 'mean']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

optimize aggregation function for categorical features

In [90]:
def agg(df):
    return df.groupby('SK_ID_CURR').agg(['sum', 'count', 'mean'])


def opt_agg_categorical(df, parent_var, df_name):
    
    import multiprocessing as mp
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('category'))
    
    
    p = mp.Pool(processes=8)
    
    split_categoricals = np.array_split(categorical, 8, axis=1)
    
    # Make sure to put the identifying id on the split columns
    for split in split_categoricals:
        split['SK_ID_CURR'] = previous['SK_ID_CURR']
 
 
    df_pool_results = p.map(agg, split_categoricals)
    
    p.close()
    
    # combine together
    categorical = pd.concat(df_pool_results, axis=1)

    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['sum', 'count', 'mean']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

__Function for KDE Plots of Variable__

We also made a function that plots the distribution of variable colored by the value of TARGET (either 1 for did not repay the loan or 0 for did repay the loan). We can use this function to visually examine any new variables we create. This also calculates the correlation cofficient of the variable with the target which can be used as an approximation of whether or not the created variable will be useful.

In [7]:
# Plots the disribution of a variable colored by value of the target
def kde_target(var_name, df):
    
    # Calculate the correlation coefficient between the new variable and the target
    corr = df['TARGET'].corr(df[var_name])
    
    # Calculate medians for repaid vs not repaid
    avg_repaid = df.ix[df['TARGET'] == 0, var_name].median()
    avg_not_repaid = df.ix[df['TARGET'] == 1, var_name].median()
    
    plt.figure(figsize = (12, 6))
    
    # Plot the distribution for target == 0 and target == 1
    sns.kdeplot(df.ix[df['TARGET'] == 0, var_name], label = 'TARGET == 0')
    sns.kdeplot(df.ix[df['TARGET'] == 1, var_name], label = 'TARGET == 1')
    
    # label the plot
    plt.xlabel(var_name); plt.ylabel('Density'); plt.title('%s Distribution' % var_name)
    plt.legend();
    
    # print out the correlation
    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    # Print out average values
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid =     %0.4f' % avg_repaid)

__Function to Convert Data Types__

This will help reduce memory usage by using more efficient types for the variables. For example category is often a better type than object (unless the number of unique categories is close to the number of rows in the dataframe).

In [8]:
import sys

def return_size(df):
    """Return size of dataframe in gigabytes"""
    return round(sys.getsizeof(df) / 1e9, 2)

def convert_types(df, print_info = False):
    
    original_memory = df.memory_usage().sum()
    
    # Iterate through each column
    for c in df:
        
        # Convert ids and booleans to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            
        # Convert objects to category
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        
        # Booleans mapped to integers
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
        
    return df

One client can have multiple previous loan records. Thus, we need to do the aggregation.

load data

In [75]:
previous = pd.read_csv('data/previous_application.csv')
previous = convert_types(previous, print_info=True)
previous.head()

Original Memory Usage: 0.49 gb.
New Memory Usage: 0.16 gb.


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615234,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735352,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335938,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.394531,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


Calculate aggregate statistics for each numeric column

In [10]:
previous_agg = agg_numeric(previous, 'SK_ID_CURR', 'previous')
print('Previous aggregation shape: ', previous_agg.shape)
previous_agg.head()

Previous aggregation shape:  (338857, 80)


Unnamed: 0_level_0,previous_DAYS_DECISION_sum,previous_DAYS_DECISION_min,previous_DAYS_DECISION_mean,previous_DAYS_DECISION_max,previous_DAYS_FIRST_DUE_sum,previous_DAYS_FIRST_DUE_min,previous_DAYS_FIRST_DUE_mean,previous_DAYS_FIRST_DUE_max,previous_DAYS_LAST_DUE_sum,previous_DAYS_LAST_DUE_min,...,previous_DAYS_FIRST_DRAWING_min,previous_DAYS_FIRST_DRAWING_mean,previous_DAYS_FIRST_DRAWING_max,previous_DAYS_FIRST_DRAWING_sum,previous_RATE_INTEREST_PRIMARY_min,previous_RATE_INTEREST_PRIMARY_mean,previous_RATE_INTEREST_PRIMARY_max,previous_RATE_INTEREST_PRIVILEGED_min,previous_RATE_INTEREST_PRIVILEGED_mean,previous_RATE_INTEREST_PRIVILEGED_max
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-1740,-1740,-1740.0,-1740,-1709.0,-1709.0,-1709.0,-1709.0,-1619.0,-1619.0,...,365243.0,365243.0,365243.0,365243.0,,,,,,
100002,-606,-606,-606.0,-606,-565.0,-565.0,-565.0,-565.0,-25.0,-25.0,...,365243.0,365243.0,365243.0,365243.0,,,,,,
100003,-3915,-2341,-1305.0,-746,-3823.0,-2310.0,-1274.333374,-716.0,-3163.0,-1980.0,...,365243.0,365243.0,365243.0,1095729.0,,,,,,
100004,-815,-815,-815.0,-815,-784.0,-784.0,-784.0,-784.0,-724.0,-724.0,...,365243.0,365243.0,365243.0,365243.0,,,,,,
100005,-1072,-757,-536.0,-315,-706.0,-706.0,-706.0,-706.0,-466.0,-466.0,...,365243.0,365243.0,365243.0,365243.0,,,,,,


Calculate value counts for each categorical column

In [58]:
%%timeit -n 1 -r 1
previous_counts = agg_categorical(previous, 'SK_ID_CURR', 'previous')
print('Previous counts shape: ', previous_counts.shape)
previous_counts.head()

Previous counts shape:  (338857, 285)
1min 9s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Use line profiler to detect which lines should be optimized

In [12]:
%load_ext line_profiler

In [13]:
%lprun -f agg_categorical agg_categorical(previous, 'SK_ID_CURR', 'previous')

Timer unit: 1e-06 s

Total time: 69.8841 s
File: <ipython-input-5-d38cd81d2347>
Function: agg_categorical at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def agg_categorical(df, parent_var, df_name):
     2                                               """
     3                                               Aggregates the categorical features in a child dataframe
     4                                               for each observation of the parent variable.
     5                                               
     6                                               Parameters
     7                                               --------
     8                                               df : dataframe 
     9                                                   The dataframe to calculate the value counts for.
    10                                                   
    11                                               parent_var : string
    12                                                   The variable by which to group and aggregate the dataframe. For each unique
    13                                                   value of this variable, the final dataframe will have one row
    14                                                   
    15                                               df_name : string
    16                                                   Variable added to the front of column names to keep track of columns
    17                                           
    18                                               
    19                                               Return
    20                                               --------
    21                                               categorical : dataframe
    22                                                   A dataframe with aggregated statistics for each observation of the parent_var
    23                                                   The columns are also renamed and columns with duplicate values are removed.
    24                                                   
    25                                               """
    26                                               
    27                                               # Select the categorical columns
    28         1     828740.0 828740.0      1.2      categorical = pd.get_dummies(df.select_dtypes('category'))
    29                                               # Make sure to put the identifying id on the column
    30         1       4975.0   4975.0      0.0      categorical[parent_var] = df[parent_var]
    31                                           
    32                                               # Groupby the group var and calculate the sum and mean
    33         1   33943675.0 33943675.0     48.6      categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    34                                               
    35         1          1.0      1.0      0.0      column_names = []
    36                                               
    37                                               # Iterate through the columns in level 0
    38       144        166.0      1.2      0.0      for var in categorical.columns.levels[0]:
    39                                                   # Iterate through the stats in level 1
    40       572        261.0      0.5      0.0          for stat in ['sum', 'count', 'mean']:
    41                                                       # Make a new column name
    42       429        354.0      0.8      0.0              column_names.append('%s_%s_%s' % (df_name, var, stat))
    43                                               
    44         1        528.0    528.0      0.0      categorical.columns = column_names
    45                                               
    46                                               # Remove duplicate columns by values
    47         1   34743896.0 34743896.0     49.7      _, idx = np.unique(categorical, axis = 1, return_index = True)
    48         1     361457.0 361457.0      0.5      categorical = categorical.iloc[:, idx]
    49                                               
    50         1          3.0      3.0      0.0      return categorical

In [92]:
%%timeit -n 1 -r 1
opt_previous_counts = opt_agg_categorical(previous, 'SK_ID_CURR', 'previous')
print('opt Previous counts shape: ', opt_previous_counts.shape)
opt_previous_counts.head()

opt Previous counts shape:  (338857, 285)
49.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [18]:
previous_counts = agg_categorical(previous, 'SK_ID_CURR', 'previous')

merge previous records to application

In [19]:
train = pd.read_csv('data/application_train.csv')
train = convert_types(train)
test = pd.read_csv('data/application_test.csv')
test = convert_types(test)
application = pd.read_csv('data/application_train.csv')

# Merge in the previous information
train = train.merge(previous_counts, on ='SK_ID_CURR', how = 'left')
train = train.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')

test = test.merge(previous_counts, on ='SK_ID_CURR', how = 'left')
test = test.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')

# Remove variables to free memory
# gc.enable()
# del previous, previous_agg, previous_counts
# gc.collect()

In [23]:
application = pd.read_csv('data/application_train.csv')

__Calculate Missing Values__

In [20]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df, print_info = False):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        if print_info:
            # Print some summary information
            print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
                "There are " + str(mis_val_table_ren_columns.shape[0]) +
                  " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [None]:
def remove_missing_columns(train, test, threshold = 65):
    # Calculate missing stats for train and test (remember to calculate a percent!)
    train_miss = pd.DataFrame(train.isnull().sum())
    train_miss['percent'] = 100 * train_miss[0] / len(train)
    
    test_miss = pd.DataFrame(test.isnull().sum())
    test_miss['percent'] = 100 * test_miss[0] / len(test)
    
    # list of missing columns for train and test
    missing_train_columns = list(train_miss.index[train_miss['percent'] > threshold])
    missing_test_columns = list(test_miss.index[test_miss['percent'] > threshold])
    
    # Combine the two lists together
    missing_columns = list(set(missing_train_columns + missing_test_columns))
    
    # Print information
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))
    
    # Drop the missing columns and return
    train = train.drop(columns = missing_columns)
    test = test.drop(columns = missing_columns)
    
    return train, test

In [None]:
train, test = remove_missing_columns(train, test)

In [24]:
application.shape

(307511, 122)

In [22]:
train.shape

(307511, 487)

In [28]:
list(previous_agg.columns)

['previous_DAYS_DECISION_sum',
 'previous_DAYS_DECISION_min',
 'previous_DAYS_DECISION_mean',
 'previous_DAYS_DECISION_max',
 'previous_DAYS_FIRST_DUE_sum',
 'previous_DAYS_FIRST_DUE_min',
 'previous_DAYS_FIRST_DUE_mean',
 'previous_DAYS_FIRST_DUE_max',
 'previous_DAYS_LAST_DUE_sum',
 'previous_DAYS_LAST_DUE_min',
 'previous_DAYS_LAST_DUE_mean',
 'previous_DAYS_LAST_DUE_max',
 'previous_DAYS_TERMINATION_sum',
 'previous_DAYS_TERMINATION_min',
 'previous_DAYS_TERMINATION_mean',
 'previous_DAYS_TERMINATION_max',
 'previous_DAYS_LAST_DUE_1ST_VERSION_sum',
 'previous_DAYS_LAST_DUE_1ST_VERSION_min',
 'previous_DAYS_LAST_DUE_1ST_VERSION_mean',
 'previous_DAYS_LAST_DUE_1ST_VERSION_max',
 'previous_RATE_INTEREST_PRIMARY_sum',
 'previous_RATE_INTEREST_PRIVILEGED_sum',
 'previous_RATE_INTEREST_PRIMARY_count',
 'previous_NFLAG_INSURED_ON_APPROVAL_min',
 'previous_NFLAG_INSURED_ON_APPROVAL_mean',
 'previous_NFLAG_INSURED_ON_APPROVAL_max',
 'previous_NFLAG_INSURED_ON_APPROVAL_sum',
 'previous_RATE_

In [29]:
list(previous_counts.columns)

['previous_NAME_GOODS_CATEGORY_Animals_mean',
 'previous_NAME_GOODS_CATEGORY_Animals_sum',
 'previous_NAME_GOODS_CATEGORY_House Construction_mean',
 'previous_NAME_GOODS_CATEGORY_House Construction_sum',
 'previous_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_mean',
 'previous_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_sum',
 'previous_NAME_CASH_LOAN_PURPOSE_Money for a third person_mean',
 'previous_NAME_CASH_LOAN_PURPOSE_Money for a third person_sum',
 'previous_NAME_CASH_LOAN_PURPOSE_Hobby_mean',
 'previous_NAME_CASH_LOAN_PURPOSE_Hobby_sum',
 'previous_NAME_GOODS_CATEGORY_Education_mean',
 'previous_NAME_GOODS_CATEGORY_Education_sum',
 'previous_NAME_GOODS_CATEGORY_Additional Service_mean',
 'previous_NAME_GOODS_CATEGORY_Additional Service_sum',
 'previous_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_mean',
 'previous_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_sum',
 'previous_NAME_CASH_LOAN_PURPOSE_Buying a garage_mean',
 'previous_NAME_CASH_LOAN_PURPOSE_Buyin

In [31]:
# after merging previous loan records to the application table
missing = missing_values_table(train)
missing

Unnamed: 0,Missing Values,% of Total Values
previous_RATE_INTEREST_PRIVILEGED_max,302902,98.5
previous_RATE_INTEREST_PRIVILEGED_mean,302902,98.5
previous_RATE_INTEREST_PRIMARY_min,302902,98.5
previous_RATE_INTEREST_PRIMARY_mean,302902,98.5
previous_RATE_INTEREST_PRIVILEGED_min,302902,98.5
previous_RATE_INTEREST_PRIMARY_max,302902,98.5
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
COMMONAREA_MEDI,214865,69.9
NONLIVINGAPARTMENTS_MODE,213514,69.4


In [32]:
# missing value from previous application
pre_missing = missing[['previous' in s for s in missing.index]]

In [34]:
pre_missing

Unnamed: 0,Missing Values,% of Total Values
previous_RATE_INTEREST_PRIVILEGED_max,302902,98.5
previous_RATE_INTEREST_PRIVILEGED_mean,302902,98.5
previous_RATE_INTEREST_PRIMARY_min,302902,98.5
previous_RATE_INTEREST_PRIMARY_mean,302902,98.5
previous_RATE_INTEREST_PRIVILEGED_min,302902,98.5
previous_RATE_INTEREST_PRIMARY_max,302902,98.5
previous_RATE_DOWN_PAYMENT_min,33906,11.0
previous_RATE_DOWN_PAYMENT_max,33906,11.0
previous_RATE_DOWN_PAYMENT_mean,33906,11.0
previous_AMT_DOWN_PAYMENT_min,33906,11.0
