In [1]:
import pandas as pd
import numpy as np
import gc 
import warnings
warnings.filterwarnings('ignore')

In [2]:
def convert_types(df):
    """Shrink the dtypes of ``df`` column by column to reduce memory use.

    The frame is modified in place and also returned. For each column the
    first matching rule below is applied; columns matching none are left
    untouched.

    * name contains ``'SK_ID'``: missing values are filled with 0 and the
      column is cast to ``np.int32``
    * ``object`` dtype with fewer distinct non-null values than rows: cast
      to ``category``
    * numeric column whose distinct values are exactly 0 and 1: cast to
      ``bool``
    * dtype equal to ``float``: cast to ``np.float32``
    * dtype equal to ``int``: cast to ``np.int32``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be converted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    for col in list(df.columns):
        series = df[col]

        if 'SK_ID' in col:
            df[col] = series.fillna(0).astype(np.int32)

        elif series.dtype == 'object' and series.nunique() < df.shape[0]:
            df[col] = series.astype('category')

        # Compare as a set: unique() yields values in order of appearance, so
        # comparing the list against [1, 0] would miss flag columns whose
        # first value is 0. A column holding NaN produces a set that differs
        # from {0, 1} and is therefore not turned into a boolean.
        elif pd.api.types.is_numeric_dtype(series) and set(series.unique()) == {0, 1}:
            df[col] = series.astype(bool)

        elif series.dtype == float:
            df[col] = series.astype(np.float32)

        elif series.dtype == int:
            df[col] = series.astype(np.int32)

    return df

In [3]:
# aggregate numerical variables

def agg_numeric(df, key, df_name):
    """Aggregate the numeric columns of ``df`` for each value of ``key``.

    Every column other than ``key`` whose name contains ``'SK_ID'`` is
    dropped first. The remaining numeric columns are grouped by ``key`` and
    summarised with count, mean, max, min and sum. Result columns are named
    ``<df_name>_<column>_<statistic>``, and columns whose values duplicate
    another column are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to aggregate.
    key : str
        Name of the column to group by.
    df_name : str
        Prefix placed in front of every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``key`` value (``key`` is the index). Column
        order follows the ordering produced by ``np.unique``, not the order
        of the input columns.
    """
    # Drop every other identifier-like column so it is not aggregated.
    other_ids = [col for col in df.columns if col != key and 'SK_ID' in col]
    df = df.drop(columns=other_ids)

    numeric_df = df.select_dtypes('number').copy()
    # Re-attach the key in case it is not numeric and was filtered out.
    numeric_df[key] = df[key]

    agg = numeric_df.groupby(key).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Name each column from its own (variable, statistic) pair. Building the
    # names from columns.levels relies on the level order matching the
    # column order, which would silently mislabel statistics if it did not.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    # Keep only the first occurrence of each distinct column of values.
    _, idx = np.unique(agg, axis=1, return_index=True)
    agg = agg.iloc[:, idx]

    return agg

In [4]:
def agg_categorical(df, key, df_name):
    """Aggregate the categorical columns of ``df`` for each value of ``key``.

    The ``category``-dtype columns are one-hot encoded, grouped by ``key``
    and summarised with sum and mean. Result columns are named
    ``<df_name>_<dummy column>_<statistic>``, and columns whose values
    duplicate another column are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to aggregate.
    key : str
        Name of the column to group by.
    df_name : str
        Prefix placed in front of every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``key`` value (``key`` is the index). Column
        order follows the ordering produced by ``np.unique``, not the order
        of the dummy columns.
    """
    categorical = pd.get_dummies(df.select_dtypes('category'))
    # Attach the grouping column next to the dummy indicators.
    categorical[key] = df[key]

    categorical = categorical.groupby(key).agg(['sum', 'mean'])

    # Name each column from its own (variable, statistic) pair instead of
    # columns.levels, whose order is not guaranteed to match column order.
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat) for var, stat in categorical.columns
    ]

    # Keep only the first occurrence of each distinct column of values.
    _, idx = np.unique(categorical, axis=1, return_index=True)
    categorical = categorical.iloc[:, idx]

    return categorical

Optimize the aggregation function for categorical variables

In [5]:
def agg(df, key='SK_ID_CURR'):
    """Group ``df`` by ``key`` and return the sum and mean of the other columns.

    Kept as a module-level function so it can be handed to
    ``multiprocessing.Pool.map`` by ``opt_agg_categorical``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``key`` plus the columns to aggregate.
    key : str, default 'SK_ID_CURR'
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``key`` value, with ``(column, statistic)``
        MultiIndex columns.
    """
    return df.groupby(key).agg(['sum', 'mean'])


def opt_agg_categorical(df, key, df_name, n_jobs=8):
    """Multiprocessing variant of the categorical aggregation.

    The ``category``-dtype columns of ``df`` are one-hot encoded, the dummy
    columns are split into contiguous chunks, and each chunk is grouped by
    ``key`` and summarised with sum and mean (via ``agg``) in a worker
    process. The partial results are concatenated, renamed to
    ``<df_name>_<dummy column>_<statistic>``, and columns whose values
    duplicate another column are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to aggregate.
    key : str
        Name of the column to group by.
    df_name : str
        Prefix placed in front of every generated column name.
    n_jobs : int, default 8
        Upper bound on the number of chunks and worker processes. With a
        single chunk the work runs in the current process and no pool is
        created.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``key`` value (``key`` is the index). Column
        order follows the ordering produced by ``np.unique``.

    Notes
    -----
    NOTE(review): worker processes must be able to import ``agg``; with the
    ``spawn`` start method this may not hold for functions defined in a
    notebook cell — confirm on the target platform.
    """
    import multiprocessing as mp
    from functools import partial

    dummies = pd.get_dummies(df.select_dtypes('category'))
    n_columns = dummies.shape[1]

    # Never create more chunks than there are dummy columns, so no worker
    # receives a frame that holds nothing but the key.
    n_chunks = max(1, min(n_jobs, n_columns))

    chunks = []
    for positions in np.array_split(np.arange(n_columns), n_chunks):
        # Copy so that adding the key column does not write into `dummies`.
        chunk = dummies.iloc[:, positions].copy()
        # Take the key from the frame that was passed in, not from a global.
        chunk[key] = df[key]
        chunks.append(chunk)

    worker = partial(agg, key=key)
    if n_chunks == 1:
        results = [worker(chunk) for chunk in chunks]
    else:
        # The context manager releases the pool even if a worker raises.
        with mp.Pool(processes=n_chunks) as pool:
            results = pool.map(worker, chunks)

    categorical = pd.concat(results, axis=1)

    # Name each column from its own (variable, statistic) pair instead of
    # columns.levels, whose order is not guaranteed to match column order.
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat) for var, stat in categorical.columns
    ]

    # Keep only the first occurrence of each distinct column of values.
    _, idx = np.unique(categorical, axis=1, return_index=True)
    categorical = categorical.iloc[:, idx]

    return categorical

In [6]:
# Read the previous-application table, shrink its dtypes, then discard the
# two interest-rate columns (noted as 99% missing).
previous = (
    convert_types(pd.read_csv('data/previous_application.csv'))
    .drop(columns=['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'])
)
print('Previous shape:', previous.shape)
previous.head()

Previous shape: (1670214, 35)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615234,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735352,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335938,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.394531,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [7]:
# Summarise the numeric columns of `previous` for each SK_ID_CURR value.
previous_agg = agg_numeric(df=previous, key='SK_ID_CURR', df_name='previous')
print('Previous aggregation shape: ', previous_agg.shape)
previous_agg.head()

Previous aggregation shape:  (338857, 71)


Unnamed: 0_level_0,previous_DAYS_DECISION_sum,previous_DAYS_DECISION_min,previous_DAYS_DECISION_mean,previous_DAYS_DECISION_max,previous_DAYS_FIRST_DUE_sum,previous_DAYS_FIRST_DUE_min,previous_DAYS_FIRST_DUE_mean,previous_DAYS_FIRST_DUE_max,previous_DAYS_LAST_DUE_sum,previous_DAYS_LAST_DUE_min,...,previous_AMT_APPLICATION_mean,previous_AMT_GOODS_PRICE_mean,previous_AMT_APPLICATION_max,previous_AMT_GOODS_PRICE_max,previous_AMT_GOODS_PRICE_sum,previous_AMT_APPLICATION_sum,previous_DAYS_FIRST_DRAWING_min,previous_DAYS_FIRST_DRAWING_mean,previous_DAYS_FIRST_DRAWING_max,previous_DAYS_FIRST_DRAWING_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-1740,-1740,-1740.0,-1740,-1709.0,-1709.0,-1709.0,-1709.0,-1619.0,-1619.0,...,24835.5,24835.5,24835.5,24835.5,24835.5,24835.5,365243.0,365243.0,365243.0,365243.0
100002,-606,-606,-606.0,-606,-565.0,-565.0,-565.0,-565.0,-25.0,-25.0,...,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,365243.0,365243.0,365243.0,365243.0
100003,-3915,-2341,-1305.0,-746,-3823.0,-2310.0,-1274.333374,-716.0,-3163.0,-1980.0,...,435436.5,435436.5,900000.0,900000.0,1306309.5,1306309.5,365243.0,365243.0,365243.0,1095729.0
100004,-815,-815,-815.0,-815,-784.0,-784.0,-784.0,-784.0,-724.0,-724.0,...,24282.0,24282.0,24282.0,24282.0,24282.0,24282.0,365243.0,365243.0,365243.0,365243.0
100005,-1072,-757,-536.0,-315,-706.0,-706.0,-706.0,-706.0,-466.0,-466.0,...,22308.75,44617.5,44617.5,44617.5,44617.5,44617.5,365243.0,365243.0,365243.0,365243.0


In [16]:
# Summarise the categorical columns of `previous` for each SK_ID_CURR value.
previous_counts = agg_categorical(df=previous, key='SK_ID_CURR', df_name='previous')

In [14]:
%%timeit -n 1 -r 1
previous_counts = agg_categorical(previous, 'SK_ID_CURR', 'previous') # aggregate categorical features

34.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Use line profiler to detect which lines should be optimized

In [9]:
# Load the line_profiler IPython extension used by the %lprun cell below.
%load_ext line_profiler

In [10]:
# Profile agg_categorical line by line while it runs on `previous`.
%lprun -f agg_categorical agg_categorical(previous, 'SK_ID_CURR', 'previous')

Timer unit: 1e-06 s

Total time: 34.1753 s
File: <ipython-input-6-0fe9c451d92f>
Function: agg_categorical at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def agg_categorical(df, parent_var, df_name):
     2                                               """
     3                                               Aggregates the categorical features in a child dataframe
     4                                               for each observation of the parent variable.
     5                                               
     6                                               Parameters
     7                                               --------
     8                                               df : dataframe 
     9                                                   The dataframe to calculate the value counts for.
    10                                                   
    11                                               parent_var : string
    12                                                   The variable by which to group and aggregate the dataframe. For each unique
    13                                                   value of this variable, the final dataframe will have one row
    14                                                   
    15                                               df_name : string
    16                                                   Variable added to the front of column names to keep track of columns
    17                                           
    18                                               
    19                                               Return
    20                                               --------
    21                                               categorical : dataframe
    22                                                   A dataframe with aggregated statistics for each observation of the parent_var
    23                                                   The columns are also renamed and columns with duplicate values are removed.
    24                                                   
    25                                               """
    26                                               
    27                                               # Select the categorical columns
    28         1     804637.0 804637.0      2.4      categorical = pd.get_dummies(df.select_dtypes('category'))
    29                                               # Make sure to put the identifying id on the column
    30         1       9659.0   9659.0      0.0      categorical[parent_var] = df[parent_var]
    31                                           
    32                                               # Groupby the group var and calculate the sum and mean
    33         1   26241780.0 26241780.0     76.8      categorical = categorical.groupby(parent_var).agg(['sum', 'mean'])
    34                                               
    35         1          3.0      3.0      0.0      column_names = []
    36                                               
    37                                               # Iterate through the columns in level 0
    38       144        171.0      1.2      0.0      for var in categorical.columns.levels[0]:
    39                                                   # Iterate through the stats in level 1
    40       429        208.0      0.5      0.0          for stat in ['sum', 'mean']:
    41                                                       # Make a new column name
    42       286        248.0      0.9      0.0              column_names.append('%s_%s_%s' % (df_name, var, stat))
    43                                               
    44         1        552.0    552.0      0.0      categorical.columns = column_names
    45                                               
    46                                               # Remove duplicate columns by values
    47         1    6782055.0 6782055.0     19.8      _, idx = np.unique(categorical, axis = 1, return_index = True)
    48         1     335942.0 335942.0      1.0      categorical = categorical.iloc[:, idx]
    49                                               
    50         1          2.0      2.0      0.0      return categorical

In [11]:
%%timeit -n 1 -r 1
opt_previous_counts = opt_agg_categorical(previous, 'SK_ID_CURR', 'previous')

21.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [19]:
# Collect the names of the engineered features, attach those features to the
# training rows, keep only the engineered columns, and replace missing values
# with each column's median.
previous_merge = previous_agg.merge(previous_counts, how='left', on='SK_ID_CURR')
cols = list(previous_merge.columns.values)
train = pd.read_csv('data/application_train.csv')
train_new = (
    train
    .merge(previous_agg, how='left', on='SK_ID_CURR')
    .merge(previous_counts, how='left', on='SK_ID_CURR')
)[cols]
train_new = train_new.fillna(train_new.median())

In [20]:
# Write the engineered features to disk, omitting the row index.
train_new.to_csv('previous_new_features', index=False, encoding='utf-8')