In [28]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder


# FUNCTIONS

In [29]:
# Examine missing values
# Function to calculate missing values by column 
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the total number of columns in ``df`` and how many of them have
    at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns 'Missing Values' (count)
        and '% of Total Values' (percentage rounded to 1 decimal), sorted
        by percentage in descending order.
    """
    # Count the missing entries once and reuse the result below
    mis_val = df.isnull().sum()

    # Percentage of missing values (NaN for every column when df has no rows)
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results, naming the columns directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'])

    # Keep only columns that really have missing entries. Filtering on the
    # count (not the percentage) avoids listing every column of a zero-row
    # frame, where the percentage is NaN and NaN != 0 evaluates to True.
    mis_val_table_ren_columns = mis_val_table[mis_val_table['Missing Values'] > 0]

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(mis_val_table_ren_columns.shape[0]) +
           " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [30]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded_frame, new_column_names)`` where ``new_column_names`` lists
        the columns of the encoded frame that were not present in ``df``.
    """
    # Remember which columns existed before encoding
    columns_before = set(df.columns)

    # Collect the columns whose dtype is object
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    # Anything not seen before encoding was created by get_dummies
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

# LOAD DATA

In [31]:
# installments_payment
# Read the installments payments table from its CSV file.
# NOTE(review): the printed label says "Training data" although the frame
# loaded here is the installments payments table.
installments_csv = '/home/convidado/Denise/installments_payments.csv'
installments = pd.read_csv(installments_csv)
print('Training data shape: ', installments.shape)
installments.head()

Training data shape:  (13605401, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [32]:
# Report which columns of the installments table contain missing values
print("-----------------------Missing Values------------------------------")
missing_values = missing_values_table(df=installments)
missing_values.head(n=20)

-----------------------Missing Values------------------------------
Your selected dataframe has 8 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
DAYS_ENTRY_PAYMENT,2905,0.0
AMT_PAYMENT,2905,0.0


In [33]:
# Count how many columns there are of each dtype
column_dtypes = installments.dtypes
column_dtypes.value_counts()

float64    5
int64      3
dtype: int64

# FEATURE ENGINEERING

In [34]:
# Aggregations
# Group by the client id and compute count/mean/max/min/sum for every
# remaining column (SK_ID_PREV is dropped first so it is not aggregated).
# The group key is deliberately left as the index and moved back into a
# column with reset_index(): combining as_index=False with a list of
# aggregations behaves differently across pandas versions and can leave a
# spurious extra 'index' column after reset_index().
installments_agg = (
    installments
    .drop(columns=['SK_ID_PREV'])
    .groupby('SK_ID_CURR')
    .agg(['count', 'mean', 'max', 'min', 'sum'])
    .reset_index()
)

installments_agg.head()

Unnamed: 0_level_0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_NUMBER,...,AMT_INSTALMENT,AMT_INSTALMENT,AMT_INSTALMENT,AMT_INSTALMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_PAYMENT,AMT_PAYMENT,AMT_PAYMENT,AMT_PAYMENT
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,min,sum,count,mean,max,min,...,count,mean,max,min,sum,count,mean,max,min,sum
0,100001,7,1.142857,2.0,1.0,8.0,7,2.714286,4,1,...,7,5885.132143,17397.9,3951.0,41195.925,7,5885.132143,17397.9,3951.0,41195.925
1,100002,19,1.052632,2.0,1.0,20.0,19,10.0,19,1,...,19,11559.247105,53093.745,9251.775,219625.695,19,11559.247105,53093.745,9251.775,219625.695
2,100003,25,1.04,2.0,1.0,26.0,25,5.08,12,1,...,25,64754.586,560835.36,6662.97,1618864.65,25,64754.586,560835.36,6662.97,1618864.65
3,100004,3,1.333333,2.0,1.0,4.0,3,2.0,3,1,...,3,7096.155,10573.965,5357.25,21288.465,3,7096.155,10573.965,5357.25,21288.465
4,100005,9,1.111111,2.0,1.0,10.0,9,5.0,9,1,...,9,6240.205,17656.245,4813.2,56161.845,9,6240.205,17656.245,4813.2,56161.845


In [35]:
# Flatten the two-level (variable, statistic) column index into single names.
# Each name is built from the column's own (variable, statistic) pair, so the
# result always matches the actual column order; the order of
# `columns.levels` is not guaranteed to match it.
# The MultiIndex check makes the cell safe to re-run once the columns are flat.
if isinstance(installments_agg.columns, pd.MultiIndex):
    # List of column names: keep the id as is, prefix everything else
    columns = [
        'SK_ID_CURR' if var == 'SK_ID_CURR'
        else 'installments_payment_%s_%s' % (var, stat)
        for var, stat in installments_agg.columns
    ]

    # Assign the list of columns names as the dataframe column names
    installments_agg.columns = columns

installments_agg.head()

Unnamed: 0,SK_ID_CURR,installments_payment_NUM_INSTALMENT_VERSION_count,installments_payment_NUM_INSTALMENT_VERSION_mean,installments_payment_NUM_INSTALMENT_VERSION_max,installments_payment_NUM_INSTALMENT_VERSION_min,installments_payment_NUM_INSTALMENT_VERSION_sum,installments_payment_NUM_INSTALMENT_NUMBER_count,installments_payment_NUM_INSTALMENT_NUMBER_mean,installments_payment_NUM_INSTALMENT_NUMBER_max,installments_payment_NUM_INSTALMENT_NUMBER_min,...,installments_payment_AMT_INSTALMENT_count,installments_payment_AMT_INSTALMENT_mean,installments_payment_AMT_INSTALMENT_max,installments_payment_AMT_INSTALMENT_min,installments_payment_AMT_INSTALMENT_sum,installments_payment_AMT_PAYMENT_count,installments_payment_AMT_PAYMENT_mean,installments_payment_AMT_PAYMENT_max,installments_payment_AMT_PAYMENT_min,installments_payment_AMT_PAYMENT_sum
0,100001,7,1.142857,2.0,1.0,8.0,7,2.714286,4,1,...,7,5885.132143,17397.9,3951.0,41195.925,7,5885.132143,17397.9,3951.0,41195.925
1,100002,19,1.052632,2.0,1.0,20.0,19,10.0,19,1,...,19,11559.247105,53093.745,9251.775,219625.695,19,11559.247105,53093.745,9251.775,219625.695
2,100003,25,1.04,2.0,1.0,26.0,25,5.08,12,1,...,25,64754.586,560835.36,6662.97,1618864.65,25,64754.586,560835.36,6662.97,1618864.65
3,100004,3,1.333333,2.0,1.0,4.0,3,2.0,3,1,...,3,7096.155,10573.965,5357.25,21288.465,3,7096.155,10573.965,5357.25,21288.465
4,100005,9,1.111111,2.0,1.0,10.0,9,5.0,9,1,...,9,6240.205,17656.245,4813.2,56161.845,9,6240.205,17656.245,4813.2,56161.845


# Median ratio between the amount paid and the installment amount, per client

In [36]:
# Name of the feature created in this cell
ratio_col = "installments_payments_PAYMENT_INSTALMENT_RATIO"

# Ratio of the amount paid to the installment amount, for every row.
# .assign() returns a new frame, so no column is set on a slice of
# `installments` (the original assignment triggered SettingWithCopyWarning,
# which is hidden by the global warnings filter).
# NOTE(review): rows with AMT_INSTALMENT == 0 yield inf/NaN ratios; they are
# not treated specially here.
grp = installments[["SK_ID_CURR", "AMT_INSTALMENT", "AMT_PAYMENT"]].assign(
    **{ratio_col: lambda frame: frame["AMT_PAYMENT"] / frame["AMT_INSTALMENT"]}
)

# Get the median per client
median_percentage = (
    grp[["SK_ID_CURR", ratio_col]]
    .groupby("SK_ID_CURR", as_index = False)
    .median()
)

# Merge dataframes. A copy of the feature left by an earlier run of this cell
# is dropped first, so re-running does not create duplicated _x/_y columns.
installments_agg = (
    installments_agg
    .drop(columns = [ratio_col], errors = 'ignore')
    .merge(median_percentage, on = ['SK_ID_CURR'], how = 'left')
)

del median_percentage

installments_agg.head()


Unnamed: 0,SK_ID_CURR,installments_payment_NUM_INSTALMENT_VERSION_count,installments_payment_NUM_INSTALMENT_VERSION_mean,installments_payment_NUM_INSTALMENT_VERSION_max,installments_payment_NUM_INSTALMENT_VERSION_min,installments_payment_NUM_INSTALMENT_VERSION_sum,installments_payment_NUM_INSTALMENT_NUMBER_count,installments_payment_NUM_INSTALMENT_NUMBER_mean,installments_payment_NUM_INSTALMENT_NUMBER_max,installments_payment_NUM_INSTALMENT_NUMBER_min,...,installments_payment_AMT_INSTALMENT_mean,installments_payment_AMT_INSTALMENT_max,installments_payment_AMT_INSTALMENT_min,installments_payment_AMT_INSTALMENT_sum,installments_payment_AMT_PAYMENT_count,installments_payment_AMT_PAYMENT_mean,installments_payment_AMT_PAYMENT_max,installments_payment_AMT_PAYMENT_min,installments_payment_AMT_PAYMENT_sum,installments_payments_PAYMENT_INSTALMENT_RATIO
0,100001,7,1.142857,2.0,1.0,8.0,7,2.714286,4,1,...,5885.132143,17397.9,3951.0,41195.925,7,5885.132143,17397.9,3951.0,41195.925,1.0
1,100002,19,1.052632,2.0,1.0,20.0,19,10.0,19,1,...,11559.247105,53093.745,9251.775,219625.695,19,11559.247105,53093.745,9251.775,219625.695,1.0
2,100003,25,1.04,2.0,1.0,26.0,25,5.08,12,1,...,64754.586,560835.36,6662.97,1618864.65,25,64754.586,560835.36,6662.97,1618864.65,1.0
3,100004,3,1.333333,2.0,1.0,4.0,3,2.0,3,1,...,7096.155,10573.965,5357.25,21288.465,3,7096.155,10573.965,5357.25,21288.465,1.0
4,100005,9,1.111111,2.0,1.0,10.0,9,5.0,9,1,...,6240.205,17656.245,4813.2,56161.845,9,6240.205,17656.245,4813.2,56161.845,1.0


# Median difference between when the installment was supposed to be paid and when it was actually paid

In [37]:
# Name of the feature created in this cell
diff_col = "installments_payments_MEDIAN_DIFF_PAYMENTS_INSTALLMENT"

# Difference DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT for every row.
# .assign() returns a new frame, so no column is set on a slice of
# `installments` (the original assignment triggered SettingWithCopyWarning,
# which is hidden by the global warnings filter).
grp = installments[["SK_ID_CURR", "DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT"]].assign(
    **{diff_col: lambda frame: frame["DAYS_ENTRY_PAYMENT"] - frame["DAYS_INSTALMENT"]}
)

# Get the median per client
median_percentage = (
    grp[["SK_ID_CURR", diff_col]]
    .groupby("SK_ID_CURR", as_index = False)
    .median()
)

# Merge dataframes. A copy of the feature left by an earlier run of this cell
# is dropped first, so re-running does not create duplicated _x/_y columns.
installments_agg = (
    installments_agg
    .drop(columns = [diff_col], errors = 'ignore')
    .merge(median_percentage, on = ['SK_ID_CURR'], how = 'left')
)

del median_percentage

installments_agg.head()

Unnamed: 0,SK_ID_CURR,installments_payment_NUM_INSTALMENT_VERSION_count,installments_payment_NUM_INSTALMENT_VERSION_mean,installments_payment_NUM_INSTALMENT_VERSION_max,installments_payment_NUM_INSTALMENT_VERSION_min,installments_payment_NUM_INSTALMENT_VERSION_sum,installments_payment_NUM_INSTALMENT_NUMBER_count,installments_payment_NUM_INSTALMENT_NUMBER_mean,installments_payment_NUM_INSTALMENT_NUMBER_max,installments_payment_NUM_INSTALMENT_NUMBER_min,...,installments_payment_AMT_INSTALMENT_max,installments_payment_AMT_INSTALMENT_min,installments_payment_AMT_INSTALMENT_sum,installments_payment_AMT_PAYMENT_count,installments_payment_AMT_PAYMENT_mean,installments_payment_AMT_PAYMENT_max,installments_payment_AMT_PAYMENT_min,installments_payment_AMT_PAYMENT_sum,installments_payments_PAYMENT_INSTALMENT_RATIO,installments_payments_MEDIAN_DIFF_PAYMENTS_INSTALLMENT
0,100001,7,1.142857,2.0,1.0,8.0,7,2.714286,4,1,...,17397.9,3951.0,41195.925,7,5885.132143,17397.9,3951.0,41195.925,1.0,-6.0
1,100002,19,1.052632,2.0,1.0,20.0,19,10.0,19,1,...,53093.745,9251.775,219625.695,19,11559.247105,53093.745,9251.775,219625.695,1.0,-19.0
2,100003,25,1.04,2.0,1.0,26.0,25,5.08,12,1,...,560835.36,6662.97,1618864.65,25,64754.586,560835.36,6662.97,1618864.65,1.0,-6.0
3,100004,3,1.333333,2.0,1.0,4.0,3,2.0,3,1,...,10573.965,5357.25,21288.465,3,7096.155,10573.965,5357.25,21288.465,1.0,-9.0
4,100005,9,1.111111,2.0,1.0,10.0,9,5.0,9,1,...,17656.245,4813.2,56161.845,9,6240.205,17656.245,4813.2,56161.845,1.0,-29.0


# GET THE CSV

In [39]:
# Save the CSV
# The aggregated per-client features are written with to_csv's default
# settings, so the frame's index is written as the first column of the file.
output_csv = '/home/convidado/Denise/14_07_bureau_balance_previous_app_pos_cash_credit_card (copy)/installments_agg.csv'
installments_agg.to_csv(output_csv)
