In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder


# FUNCTIONS

In [2]:
# Examine missing values
# Function to calculate missing values by column 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [3]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# LOAD DATA

In [4]:
# Previous_application
previous_app = pd.read_csv('/home/convidado/Denise/previous_application.csv')
print('Training data shape: ', previous_app.shape)
previous_app.head()

Training data shape:  (1670214, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [5]:
print("-----------------------Missing Values------------------------------")
missing_values = missing_values_table(previous_app)
missing_values.head(20)

-----------------------Missing Values------------------------------
Your selected dataframe has 37 columns.
There are 16 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
RATE_INTEREST_PRIMARY,1664263,99.6
RATE_INTEREST_PRIVILEGED,1664263,99.6
AMT_DOWN_PAYMENT,895844,53.6
RATE_DOWN_PAYMENT,895844,53.6
NAME_TYPE_SUITE,820405,49.1
DAYS_FIRST_DRAWING,673065,40.3
DAYS_FIRST_DUE,673065,40.3
DAYS_LAST_DUE_1ST_VERSION,673065,40.3
DAYS_LAST_DUE,673065,40.3
DAYS_TERMINATION,673065,40.3


In [6]:
# Number of each type of column
previous_app.dtypes.value_counts()

object     16
float64    15
int64       6
dtype: int64

# Drop features with missing values >= 99%

In [7]:
previous_app = previous_app.drop(columns = ["RATE_INTEREST_PRIMARY", "RATE_INTEREST_PRIVILEGED"])

previous_app.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


# FEATURE ENGINEERING

In [8]:
# Number of unique classes in each object column
previous_app.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_TYPE              4
WEEKDAY_APPR_PROCESS_START      7
FLAG_LAST_APPL_PER_CONTRACT     2
NAME_CASH_LOAN_PURPOSE         25
NAME_CONTRACT_STATUS            4
NAME_PAYMENT_TYPE               4
CODE_REJECT_REASON              9
NAME_TYPE_SUITE                 7
NAME_CLIENT_TYPE                4
NAME_GOODS_CATEGORY            28
NAME_PORTFOLIO                  5
NAME_PRODUCT_TYPE               3
CHANNEL_TYPE                    8
NAME_SELLER_INDUSTRY           11
NAME_YIELD_GROUP                5
PRODUCT_COMBINATION            17
dtype: int64

In [9]:
previous_app.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
       'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
       'RATE_DOWN_PAYMENT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS',
       'DAYS_DECISION', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
       'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
       'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
       'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY', 'CNT_PAYMENT',
       'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'DAYS_FIRST_DRAWING',
       'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
       'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL'],
      dtype='object')

In [10]:
# Drop some categorical features with too much categories

previous_app_drop = previous_app.drop(columns = ["WEEKDAY_APPR_PROCESS_START",'HOUR_APPR_PROCESS_START',\
                                                 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',\
                                                "NAME_CASH_LOAN_PURPOSE","NAME_TYPE_SUITE", 'NAME_GOODS_CATEGORY',\
                                                'NAME_SELLER_INDUSTRY','PRODUCT_COMBINATION', "CHANNEL_TYPE",\
                                                "NAME_PORTFOLIO", "NAME_YIELD_GROUP", "SELLERPLACE_AREA",\
                                                "FLAG_LAST_APPL_PER_CONTRACT","NAME_PAYMENT_TYPE", \
                                                "NAME_PRODUCT_TYPE"])

previous_app_drop.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,RATE_DOWN_PAYMENT,NAME_CONTRACT_STATUS,DAYS_DECISION,CODE_REJECT_REASON,NAME_CLIENT_TYPE,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,0.0,Approved,-73,XAP,Repeater,12.0,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,,Approved,-164,XAP,Repeater,36.0,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,,Approved,-301,XAP,Repeater,12.0,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,,Approved,-512,XAP,Repeater,12.0,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,,Refused,-781,HC,Repeater,24.0,,,,,,


In [11]:
# Number of unique classes in each object column
previous_app_drop.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_TYPE      4
NAME_CONTRACT_STATUS    4
CODE_REJECT_REASON      9
NAME_CLIENT_TYPE        4
dtype: int64

In [12]:
# One-Hot-Encoding
previous_app_cat, pa_cat = one_hot_encoder(previous_app_drop)

# Agreggations
# Group by the client id, calculate aggregation statistics
previous_app_agg = previous_app_cat.drop(columns = ['SK_ID_PREV']).\
groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

previous_app_agg.head()

Unnamed: 0_level_0,SK_ID_CURR,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_APPLICATION,AMT_APPLICATION,AMT_APPLICATION,AMT_APPLICATION,...,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_nan,NAME_CLIENT_TYPE_nan,NAME_CLIENT_TYPE_nan,NAME_CLIENT_TYPE_nan,NAME_CLIENT_TYPE_nan
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,min,sum,count,mean,max,min,...,count,mean,max,min,sum,count,mean,max,min,sum
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,...,1,0.0,0,0,0,1,0,0,0,0
1,100002,1,9251.775,9251.775,9251.775,9251.775,1,179055.0,179055.0,179055.0,...,1,0.0,0,0,0,1,0,0,0,0
2,100003,3,56553.99,98356.995,6737.31,169661.97,3,435436.5,900000.0,68809.5,...,3,0.0,0,0,0,3,0,0,0,0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,...,1,0.0,0,0,0,1,0,0,0,0
4,100005,1,4813.2,4813.2,4813.2,4813.2,2,22308.75,44617.5,0.0,...,2,0.0,0,0,0,2,0,0,0,0


In [13]:
# List of column names
columns = ['SK_ID_CURR']

# Iterate through the variables names
for var in previous_app_agg.columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
        for stat in previous_app_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('previous_app_%s_%s' % (var, stat))
            
# Assign the list of columns names as the dataframe column names
previous_app_agg.columns = columns
previous_app_agg.head()

Unnamed: 0,SK_ID_CURR,previous_app_AMT_ANNUITY_count,previous_app_AMT_ANNUITY_mean,previous_app_AMT_ANNUITY_max,previous_app_AMT_ANNUITY_min,previous_app_AMT_ANNUITY_sum,previous_app_AMT_APPLICATION_count,previous_app_AMT_APPLICATION_mean,previous_app_AMT_APPLICATION_max,previous_app_AMT_APPLICATION_min,...,previous_app_NAME_CLIENT_TYPE_XNA_count,previous_app_NAME_CLIENT_TYPE_XNA_mean,previous_app_NAME_CLIENT_TYPE_XNA_max,previous_app_NAME_CLIENT_TYPE_XNA_min,previous_app_NAME_CLIENT_TYPE_XNA_sum,previous_app_NAME_CLIENT_TYPE_nan_count,previous_app_NAME_CLIENT_TYPE_nan_mean,previous_app_NAME_CLIENT_TYPE_nan_max,previous_app_NAME_CLIENT_TYPE_nan_min,previous_app_NAME_CLIENT_TYPE_nan_sum
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,...,1,0.0,0,0,0,1,0,0,0,0
1,100002,1,9251.775,9251.775,9251.775,9251.775,1,179055.0,179055.0,179055.0,...,1,0.0,0,0,0,1,0,0,0,0
2,100003,3,56553.99,98356.995,6737.31,169661.97,3,435436.5,900000.0,68809.5,...,3,0.0,0,0,0,3,0,0,0,0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,...,1,0.0,0,0,0,1,0,0,0,0
4,100005,1,4813.2,4813.2,4813.2,4813.2,2,22308.75,44617.5,0.0,...,2,0.0,0,0,0,2,0,0,0,0


# "DAYS_FIRST_DUE"/ "DAYS_LAST_DUE_1ST_VERSION"/  "DAYS_LAST_DUE"/ "DAYS_TERMINATION"

In [14]:
grp = previous_app[["SK_ID_CURR", "SK_ID_PREV", "DAYS_FIRST_DUE","DAYS_LAST_DUE_1ST_VERSION",  "DAYS_LAST_DUE", "DAYS_TERMINATION"]]

# Checking annomalines 
print("BEFORE: ")
print(grp.describe())

# Replace the anomalous values with nan
grp.replace({365243: np.nan}, inplace = True)

print("AFTER ")
print(grp.describe())

BEFORE: 
         SK_ID_CURR    SK_ID_PREV  DAYS_FIRST_DUE  DAYS_LAST_DUE_1ST_VERSION  \
count  1.670214e+06  1.670214e+06   997149.000000              997149.000000   
mean   2.783572e+05  1.923089e+06    13826.269337               33767.774054   
std    1.028148e+05  5.325980e+05    72444.869708              106857.034789   
min    1.000010e+05  1.000001e+06    -2892.000000               -2801.000000   
25%    1.893290e+05  1.461857e+06    -1628.000000               -1242.000000   
50%    2.787145e+05  1.923110e+06     -831.000000                -361.000000   
75%    3.675140e+05  2.384280e+06     -411.000000                 129.000000   
max    4.562550e+05  2.845382e+06   365243.000000              365243.000000   

       DAYS_LAST_DUE  DAYS_TERMINATION  
count  997149.000000     997149.000000  
mean    76582.403064      81992.343838  
std    149647.415123     153303.516729  
min     -2889.000000      -2874.000000  
25%     -1314.000000      -1270.000000  
50%      -537.000000    

In [15]:
# Difference between the expected data for the last due and the real date. Positive values indicate delays 
# on the final payment.
grp["MEDIAN_DIFF_LAST_DUE_EXPECTATION_REALDATE"] = grp["DAYS_TERMINATION"] - grp["DAYS_LAST_DUE"]

# Difference between the expected data for the first due and the real date. Positive values indicate delays 
# on the first payment.
grp["MEDIAN_DIFF_FIRST_DUE_EXPECTATION_REALDATE"] = grp["DAYS_FIRST_DUE"] - grp["DAYS_LAST_DUE_1ST_VERSION"]


# Median days of delay per client
median_delay_last = grp[["SK_ID_CURR", "MEDIAN_DIFF_LAST_DUE_EXPECTATION_REALDATE"]].groupby("SK_ID_CURR", as_index = False).\
median()

median_delay_first = grp[["SK_ID_CURR", "MEDIAN_DIFF_FIRST_DUE_EXPECTATION_REALDATE"]].groupby("SK_ID_CURR", as_index = False).\
median()

# Merge dataframes
median_delay = median_delay_last.merge(median_delay_first, on = ['SK_ID_CURR'], how = 'left')

del median_delay_last, median_delay_first

previous_app_agg = previous_app_agg.merge(median_delay, on = ['SK_ID_CURR'], how = 'left' )

del median_delay

previous_app_agg.head()


Unnamed: 0,SK_ID_CURR,previous_app_AMT_ANNUITY_count,previous_app_AMT_ANNUITY_mean,previous_app_AMT_ANNUITY_max,previous_app_AMT_ANNUITY_min,previous_app_AMT_ANNUITY_sum,previous_app_AMT_APPLICATION_count,previous_app_AMT_APPLICATION_mean,previous_app_AMT_APPLICATION_max,previous_app_AMT_APPLICATION_min,...,previous_app_NAME_CLIENT_TYPE_XNA_max,previous_app_NAME_CLIENT_TYPE_XNA_min,previous_app_NAME_CLIENT_TYPE_XNA_sum,previous_app_NAME_CLIENT_TYPE_nan_count,previous_app_NAME_CLIENT_TYPE_nan_mean,previous_app_NAME_CLIENT_TYPE_nan_max,previous_app_NAME_CLIENT_TYPE_nan_min,previous_app_NAME_CLIENT_TYPE_nan_sum,MEDIAN_DIFF_LAST_DUE_EXPECTATION_REALDATE,MEDIAN_DIFF_FIRST_DUE_EXPECTATION_REALDATE
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,...,0,0,0,1,0,0,0,0,7.0,-210.0
1,100002,1,9251.775,9251.775,9251.775,9251.775,1,179055.0,179055.0,179055.0,...,0,0,0,1,0,0,0,0,8.0,-690.0
2,100003,3,56553.99,98356.995,6737.31,169661.97,3,435436.5,900000.0,68809.5,...,0,0,0,3,0,0,0,0,8.0,-330.0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,...,0,0,0,1,0,0,0,0,10.0,-90.0
4,100005,1,4813.2,4813.2,4813.2,4813.2,2,22308.75,44617.5,0.0,...,0,0,0,2,0,0,0,0,6.0,-330.0


# Median percentage of the ratio down payment/total credit per client

In [16]:
grp = previous_app[["SK_ID_CURR", "AMT_CREDIT", "AMT_DOWN_PAYMENT" ]]

# Get the percentage of down payment and total value per credit
grp["DOWN_PAYMENT_TOTAL_CREDIT_RATIO"] = grp["AMT_DOWN_PAYMENT"]/grp["AMT_CREDIT"]

# Get the median per client
median_percentage = grp[["SK_ID_CURR", "DOWN_PAYMENT_TOTAL_CREDIT_RATIO"]].groupby("SK_ID_CURR", as_index = False).median()

# Merge dataframes
previous_app_agg = previous_app_agg.merge(median_percentage, on = ['SK_ID_CURR'], how = 'left' )

del median_percentage


In [17]:
previous_app_agg.head()

Unnamed: 0,SK_ID_CURR,previous_app_AMT_ANNUITY_count,previous_app_AMT_ANNUITY_mean,previous_app_AMT_ANNUITY_max,previous_app_AMT_ANNUITY_min,previous_app_AMT_ANNUITY_sum,previous_app_AMT_APPLICATION_count,previous_app_AMT_APPLICATION_mean,previous_app_AMT_APPLICATION_max,previous_app_AMT_APPLICATION_min,...,previous_app_NAME_CLIENT_TYPE_XNA_min,previous_app_NAME_CLIENT_TYPE_XNA_sum,previous_app_NAME_CLIENT_TYPE_nan_count,previous_app_NAME_CLIENT_TYPE_nan_mean,previous_app_NAME_CLIENT_TYPE_nan_max,previous_app_NAME_CLIENT_TYPE_nan_min,previous_app_NAME_CLIENT_TYPE_nan_sum,MEDIAN_DIFF_LAST_DUE_EXPECTATION_REALDATE,MEDIAN_DIFF_FIRST_DUE_EXPECTATION_REALDATE,DOWN_PAYMENT_TOTAL_CREDIT_RATIO
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,...,0,0,1,0,0,0,0,7.0,-210.0,0.10594
1,100002,1,9251.775,9251.775,9251.775,9251.775,1,179055.0,179055.0,179055.0,...,0,0,1,0,0,0,0,8.0,-690.0,0.0
2,100003,3,56553.99,98356.995,6737.31,169661.97,3,435436.5,900000.0,68809.5,...,0,0,3,0,0,0,0,8.0,-330.0,0.050585
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,...,0,0,1,0,0,0,0,10.0,-90.0,0.241719
4,100005,1,4813.2,4813.2,4813.2,4813.2,2,22308.75,44617.5,0.0,...,0,0,2,0,0,0,0,6.0,-330.0,0.111173


# LOAD DATA

In [18]:
# POS_cash
pos_cash= pd.read_csv('/home/convidado/Denise/POS_CASH_balance.csv')
print('Training data shape: ', pos_cash.shape)
pos_cash.head()

Training data shape:  (10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [19]:
# Number of unique classes in each object column
pos_cash.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_STATUS    9
dtype: int64

# FEATURE ENGINEERING : POS_CASH

In [20]:
# One-Hot-Encoding
pos_cash_cat, pc_cat = one_hot_encoder(pos_cash)

# Agreggations
# Group by the client id, calculate aggregation statistics
pos_cash_agg = pos_cash_cat.drop(columns = ['SK_ID_PREV']).\
groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

pos_cash_agg.head()

Unnamed: 0_level_0,SK_ID_CURR,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT,CNT_INSTALMENT,CNT_INSTALMENT,...,NAME_CONTRACT_STATUS_XNA,NAME_CONTRACT_STATUS_XNA,NAME_CONTRACT_STATUS_XNA,NAME_CONTRACT_STATUS_XNA,NAME_CONTRACT_STATUS_XNA,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,min,sum,count,mean,max,min,...,count,mean,max,min,sum,count,mean,max,min,sum
0,100001,9,-72.555556,-53,-96,-653,9,4.0,4.0,4.0,...,9,0.0,0,0,0,9,0,0,0,0
1,100002,19,-10.0,-1,-19,-190,19,24.0,24.0,24.0,...,19,0.0,0,0,0,19,0,0,0,0
2,100003,28,-43.785714,-18,-77,-1226,28,10.107143,12.0,6.0,...,28,0.0,0,0,0,28,0,0,0,0
3,100004,4,-25.5,-24,-27,-102,4,3.75,4.0,3.0,...,4,0.0,0,0,0,4,0,0,0,0
4,100005,11,-20.0,-15,-25,-220,10,11.7,12.0,9.0,...,11,0.0,0,0,0,11,0,0,0,0


In [21]:
# List of column names
columns = ['SK_ID_CURR']

# Iterate through the variables names
for var in pos_cash_agg.columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
        for stat in pos_cash_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('pos_cash_%s_%s' % (var, stat))
            
# Assign the list of columns names as the dataframe column names
pos_cash_agg.columns = columns
pos_cash_agg.head()

Unnamed: 0,SK_ID_CURR,pos_cash_MONTHS_BALANCE_count,pos_cash_MONTHS_BALANCE_mean,pos_cash_MONTHS_BALANCE_max,pos_cash_MONTHS_BALANCE_min,pos_cash_MONTHS_BALANCE_sum,pos_cash_CNT_INSTALMENT_count,pos_cash_CNT_INSTALMENT_mean,pos_cash_CNT_INSTALMENT_max,pos_cash_CNT_INSTALMENT_min,...,pos_cash_NAME_CONTRACT_STATUS_XNA_count,pos_cash_NAME_CONTRACT_STATUS_XNA_mean,pos_cash_NAME_CONTRACT_STATUS_XNA_max,pos_cash_NAME_CONTRACT_STATUS_XNA_min,pos_cash_NAME_CONTRACT_STATUS_XNA_sum,pos_cash_NAME_CONTRACT_STATUS_nan_count,pos_cash_NAME_CONTRACT_STATUS_nan_mean,pos_cash_NAME_CONTRACT_STATUS_nan_max,pos_cash_NAME_CONTRACT_STATUS_nan_min,pos_cash_NAME_CONTRACT_STATUS_nan_sum
0,100001,9,-72.555556,-53,-96,-653,9,4.0,4.0,4.0,...,9,0.0,0,0,0,9,0,0,0,0
1,100002,19,-10.0,-1,-19,-190,19,24.0,24.0,24.0,...,19,0.0,0,0,0,19,0,0,0,0
2,100003,28,-43.785714,-18,-77,-1226,28,10.107143,12.0,6.0,...,28,0.0,0,0,0,28,0,0,0,0
3,100004,4,-25.5,-24,-27,-102,4,3.75,4.0,3.0,...,4,0.0,0,0,0,4,0,0,0,0
4,100005,11,-20.0,-15,-25,-220,10,11.7,12.0,9.0,...,11,0.0,0,0,0,11,0,0,0,0


# Differente between the installment paid and the minimum installment

In [22]:

# Get the maximum value
max_= pos_cash[['SK_ID_CURR', 'SK_ID_PREV','CNT_INSTALMENT']].groupby(["SK_ID_CURR", "SK_ID_PREV"], as_index = False).max().\
rename(index = str, columns = {'CNT_INSTALMENT': 'CNT_INSTALMENT_MAX'})

# Get the minimum value
min_ = pos_cash[['SK_ID_CURR', 'SK_ID_PREV','CNT_INSTALMENT']].groupby(["SK_ID_CURR", "SK_ID_PREV"], as_index = False).min().\
rename(index = str, columns = {'CNT_INSTALMENT': 'CNT_INSTALMENT_MIN'})


# Merge datasets
grp = max_.merge(min_[["SK_ID_PREV", "CNT_INSTALMENT_MIN"]], on = [ "SK_ID_PREV"], how = 'left' )

del max_, min_

# Get the differente
grp["DIFF_CNT_INSTALMENT"] = grp["CNT_INSTALMENT_MAX"] - grp["CNT_INSTALMENT_MIN"]

grp = grp.drop(columns = ["CNT_INSTALMENT_MIN", "CNT_INSTALMENT_MAX"])

# Check if the instalment change in the period (1 : yes, 0 : no)
grp["CHANGE_INSTALMENT"] =  np.where(grp['DIFF_CNT_INSTALMENT'] > 0, 1, 0)

# Get the median os changes
grp = grp[["SK_ID_CURR", "CHANGE_INSTALMENT"]].groupby("SK_ID_CURR", as_index = False).median()


# Merge datasets

pos_cash_agg = pos_cash_agg.merge(grp, on = [ "SK_ID_CURR"], how = 'left' )

del grp

pos_cash_agg.head()

Unnamed: 0,SK_ID_CURR,pos_cash_MONTHS_BALANCE_count,pos_cash_MONTHS_BALANCE_mean,pos_cash_MONTHS_BALANCE_max,pos_cash_MONTHS_BALANCE_min,pos_cash_MONTHS_BALANCE_sum,pos_cash_CNT_INSTALMENT_count,pos_cash_CNT_INSTALMENT_mean,pos_cash_CNT_INSTALMENT_max,pos_cash_CNT_INSTALMENT_min,...,pos_cash_NAME_CONTRACT_STATUS_XNA_mean,pos_cash_NAME_CONTRACT_STATUS_XNA_max,pos_cash_NAME_CONTRACT_STATUS_XNA_min,pos_cash_NAME_CONTRACT_STATUS_XNA_sum,pos_cash_NAME_CONTRACT_STATUS_nan_count,pos_cash_NAME_CONTRACT_STATUS_nan_mean,pos_cash_NAME_CONTRACT_STATUS_nan_max,pos_cash_NAME_CONTRACT_STATUS_nan_min,pos_cash_NAME_CONTRACT_STATUS_nan_sum,CHANGE_INSTALMENT
0,100001,9,-72.555556,-53,-96,-653,9,4.0,4.0,4.0,...,0.0,0,0,0,9,0,0,0,0,0.0
1,100002,19,-10.0,-1,-19,-190,19,24.0,24.0,24.0,...,0.0,0,0,0,19,0,0,0,0,0.0
2,100003,28,-43.785714,-18,-77,-1226,28,10.107143,12.0,6.0,...,0.0,0,0,0,28,0,0,0,0,0.0
3,100004,4,-25.5,-24,-27,-102,4,3.75,4.0,3.0,...,0.0,0,0,0,4,0,0,0,0,1.0
4,100005,11,-20.0,-15,-25,-220,10,11.7,12.0,9.0,...,0.0,0,0,0,11,0,0,0,0,1.0


# How many instalments left to pay

In [23]:
# Get the minimum value per credit
min_ = pos_cash[['SK_ID_CURR', 'SK_ID_PREV','CNT_INSTALMENT_FUTURE']].groupby(["SK_ID_CURR", "SK_ID_PREV"], as_index = False).min().\
rename(index = str, columns = {'CNT_INSTALMENT': 'CNT_INSTALMENT_FUTURE_MIN'})

grp = min_[["SK_ID_CURR", "CNT_INSTALMENT_FUTURE"]]

# Check if the client still have instalments open (1 : yes, 0 : no)
grp["STILL_INSTALMENT_REMAINING"] =  np.where(grp['CNT_INSTALMENT_FUTURE'] > 0, 1, 0)

del min_ 

# Get the median of remaining installments and the median "is there any installments remain?"

grp = grp.groupby("SK_ID_CURR", as_index = False).median().\
rename(index = str,columns = {'CNT_INSTALMENT_FUTURE': 'MEDIAN_REMAINING_INSTALLMENT',\
                              "STILL_INSTALLMENT_REMAINING": 'MEDIAN_STILL_INSTALMENT_REMAINING'})

# Merge dataframes

pos_cash_agg = pos_cash_agg.merge(grp, on = [ "SK_ID_CURR"], how = 'left' )

del grp

pos_cash_agg.head()


Unnamed: 0,SK_ID_CURR,pos_cash_MONTHS_BALANCE_count,pos_cash_MONTHS_BALANCE_mean,pos_cash_MONTHS_BALANCE_max,pos_cash_MONTHS_BALANCE_min,pos_cash_MONTHS_BALANCE_sum,pos_cash_CNT_INSTALMENT_count,pos_cash_CNT_INSTALMENT_mean,pos_cash_CNT_INSTALMENT_max,pos_cash_CNT_INSTALMENT_min,...,pos_cash_NAME_CONTRACT_STATUS_XNA_min,pos_cash_NAME_CONTRACT_STATUS_XNA_sum,pos_cash_NAME_CONTRACT_STATUS_nan_count,pos_cash_NAME_CONTRACT_STATUS_nan_mean,pos_cash_NAME_CONTRACT_STATUS_nan_max,pos_cash_NAME_CONTRACT_STATUS_nan_min,pos_cash_NAME_CONTRACT_STATUS_nan_sum,CHANGE_INSTALMENT,MEDIAN_REMAINING_INSTALLMENT,STILL_INSTALMENT_REMAINING
0,100001,9,-72.555556,-53,-96,-653,9,4.0,4.0,4.0,...,0,0,9,0,0,0,0,0.0,0.0,0.0
1,100002,19,-10.0,-1,-19,-190,19,24.0,24.0,24.0,...,0,0,19,0,0,0,0,0.0,6.0,1.0
2,100003,28,-43.785714,-18,-77,-1226,28,10.107143,12.0,6.0,...,0,0,28,0,0,0,0,0.0,0.0,0.0
3,100004,4,-25.5,-24,-27,-102,4,3.75,4.0,3.0,...,0,0,4,0,0,0,0,1.0,0.0,0.0
4,100005,11,-20.0,-15,-25,-220,10,11.7,12.0,9.0,...,0,0,11,0,0,0,0,1.0,0.0,0.0


# LOAD DATA

In [24]:
# Previous_application
credit_card_balance = pd.read_csv('/home/convidado/Denise/credit_card_balance.csv')
print('Training data shape: ', credit_card_balance.shape)

credit_card_balance.head()

Training data shape:  (3840312, 23)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [25]:
# Number of unique classes in each object column
credit_card_balance.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_STATUS    7
dtype: int64

In [26]:
# Drop some features

credit_card_balance_drop = credit_card_balance.drop(columns = ["AMT_DRAWINGS_ATM_CURRENT", \
                                                               "AMT_DRAWINGS_OTHER_CURRENT", \
                                                               "AMT_DRAWINGS_POS_CURRENT", \
                                                              "AMT_RECEIVABLE_PRINCIPAL" , \
                                                              "AMT_RECIVABLE"])

credit_card_balance_drop.shape

(3840312, 18)

# FEATURE ENGINEERING: CREDIT_CARD_BALANCE

In [27]:
# One-Hot-Encoding
cc_balance_cat, cc_cat = one_hot_encoder(credit_card_balance_drop)

# Agreggations
# Group by the client id, calculate aggregation statistics
cc_balance_agg = cc_balance_cat.drop(columns = ['SK_ID_PREV']).\
groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

cc_balance_agg.head()

Unnamed: 0_level_0,SK_ID_CURR,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,MONTHS_BALANCE,AMT_BALANCE,AMT_BALANCE,AMT_BALANCE,AMT_BALANCE,...,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan,NAME_CONTRACT_STATUS_nan
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,min,sum,count,mean,max,min,...,count,mean,max,min,sum,count,mean,max,min,sum
0,100006,6,-3.5,-1,-6,-21,6,0.0,0.0,0.0,...,6,0.0,0,0,0,6,0,0,0,0
1,100011,74,-38.5,-2,-75,-2849,74,54482.111149,189000.0,0.0,...,74,0.0,0,0,0,74,0,0,0,0
2,100013,96,-48.5,-1,-96,-4656,96,18159.919219,161420.22,0.0,...,96,0.0,0,0,0,96,0,0,0,0
3,100021,17,-10.0,-2,-18,-170,17,0.0,0.0,0.0,...,17,0.0,0,0,0,17,0,0,0,0
4,100023,8,-7.5,-4,-11,-60,8,0.0,0.0,0.0,...,8,0.0,0,0,0,8,0,0,0,0


In [28]:
# List of column names
columns = ['SK_ID_CURR']

# Iterate through the variables names
for var in cc_balance_agg.columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
        for stat in cc_balance_agg.columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('cc_balance_%s_%s' % (var, stat))
            
# Assign the list of columns names as the dataframe column names
cc_balance_agg.columns = columns
cc_balance_agg.head()

Unnamed: 0,SK_ID_CURR,cc_balance_MONTHS_BALANCE_count,cc_balance_MONTHS_BALANCE_mean,cc_balance_MONTHS_BALANCE_max,cc_balance_MONTHS_BALANCE_min,cc_balance_MONTHS_BALANCE_sum,cc_balance_AMT_BALANCE_count,cc_balance_AMT_BALANCE_mean,cc_balance_AMT_BALANCE_max,cc_balance_AMT_BALANCE_min,...,cc_balance_NAME_CONTRACT_STATUS_Signed_count,cc_balance_NAME_CONTRACT_STATUS_Signed_mean,cc_balance_NAME_CONTRACT_STATUS_Signed_max,cc_balance_NAME_CONTRACT_STATUS_Signed_min,cc_balance_NAME_CONTRACT_STATUS_Signed_sum,cc_balance_NAME_CONTRACT_STATUS_nan_count,cc_balance_NAME_CONTRACT_STATUS_nan_mean,cc_balance_NAME_CONTRACT_STATUS_nan_max,cc_balance_NAME_CONTRACT_STATUS_nan_min,cc_balance_NAME_CONTRACT_STATUS_nan_sum
0,100006,6,-3.5,-1,-6,-21,6,0.0,0.0,0.0,...,6,0.0,0,0,0,6,0,0,0,0
1,100011,74,-38.5,-2,-75,-2849,74,54482.111149,189000.0,0.0,...,74,0.0,0,0,0,74,0,0,0,0
2,100013,96,-48.5,-1,-96,-4656,96,18159.919219,161420.22,0.0,...,96,0.0,0,0,0,96,0,0,0,0
3,100021,17,-10.0,-2,-18,-170,17,0.0,0.0,0.0,...,17,0.0,0,0,0,17,0,0,0,0
4,100023,8,-7.5,-4,-11,-60,8,0.0,0.0,0.0,...,8,0.0,0,0,0,8,0,0,0,0


# Mean of ratio betweeen the payment and the minimal installment

In [29]:
grp = credit_card_balance[["SK_ID_CURR", "SK_ID_PREV", "AMT_INST_MIN_REGULARITY", "AMT_PAYMENT_CURRENT"]]

# Get the differente
grp["RATIO_PAYMENT_INSTALL_MIN"] = grp["AMT_PAYMENT_CURRENT"] / grp["AMT_INST_MIN_REGULARITY"]
grp = grp[["SK_ID_CURR", "RATIO_PAYMENT_INSTALL_MIN"]].groupby("SK_ID_CURR", as_index = False).mean()


# Fill inf
grp = grp.replace(np.inf,0)
grp = grp.replace(-np.inf,0)

# Merge datasets
cc_balance_agg = cc_balance_agg.merge(grp, on = ['SK_ID_CURR'], how = 'left')

del grp

cc_balance_agg.head() 



Unnamed: 0,SK_ID_CURR,cc_balance_MONTHS_BALANCE_count,cc_balance_MONTHS_BALANCE_mean,cc_balance_MONTHS_BALANCE_max,cc_balance_MONTHS_BALANCE_min,cc_balance_MONTHS_BALANCE_sum,cc_balance_AMT_BALANCE_count,cc_balance_AMT_BALANCE_mean,cc_balance_AMT_BALANCE_max,cc_balance_AMT_BALANCE_min,...,cc_balance_NAME_CONTRACT_STATUS_Signed_mean,cc_balance_NAME_CONTRACT_STATUS_Signed_max,cc_balance_NAME_CONTRACT_STATUS_Signed_min,cc_balance_NAME_CONTRACT_STATUS_Signed_sum,cc_balance_NAME_CONTRACT_STATUS_nan_count,cc_balance_NAME_CONTRACT_STATUS_nan_mean,cc_balance_NAME_CONTRACT_STATUS_nan_max,cc_balance_NAME_CONTRACT_STATUS_nan_min,cc_balance_NAME_CONTRACT_STATUS_nan_sum,RATIO_PAYMENT_INSTALL_MIN
0,100006,6,-3.5,-1,-6,-21,6,0.0,0.0,0.0,...,0.0,0,0,0,6,0,0,0,0,
1,100011,74,-38.5,-2,-75,-2849,74,54482.111149,189000.0,0.0,...,0.0,0,0,0,74,0,0,0,0,0.0
2,100013,96,-48.5,-1,-96,-4656,96,18159.919219,161420.22,0.0,...,0.0,0,0,0,96,0,0,0,0,0.0
3,100021,17,-10.0,-2,-18,-170,17,0.0,0.0,0.0,...,0.0,0,0,0,17,0,0,0,0,
4,100023,8,-7.5,-4,-11,-60,8,0.0,0.0,0.0,...,0.0,0,0,0,8,0,0,0,0,


# Mean of ratio betweeen the AMT_PAYMENT_CURRENT and AMT_PAYMENT_TOTAL_CURRENT

In [30]:
grp = credit_card_balance[["SK_ID_CURR", "SK_ID_PREV", "AMT_PAYMENT_CURRENT", "AMT_PAYMENT_TOTAL_CURRENT"]]

# Get the differente
grp["RATIO_PAY_CURR_PAY_TOTAL"] = grp["AMT_PAYMENT_CURRENT"] / grp["AMT_PAYMENT_TOTAL_CURRENT"]
grp = grp[["SK_ID_CURR", "RATIO_PAY_CURR_PAY_TOTAL"]].groupby("SK_ID_CURR", as_index = False).mean()


# Fill inf
grp = grp.replace(np.inf,0)
grp = grp.replace(-np.inf,0)

# Merge datasets
cc_balance_agg = cc_balance_agg.merge(grp, on = ['SK_ID_CURR'], how = 'left')

del grp



In [31]:
cc_balance_agg.head() 

Unnamed: 0,SK_ID_CURR,cc_balance_MONTHS_BALANCE_count,cc_balance_MONTHS_BALANCE_mean,cc_balance_MONTHS_BALANCE_max,cc_balance_MONTHS_BALANCE_min,cc_balance_MONTHS_BALANCE_sum,cc_balance_AMT_BALANCE_count,cc_balance_AMT_BALANCE_mean,cc_balance_AMT_BALANCE_max,cc_balance_AMT_BALANCE_min,...,cc_balance_NAME_CONTRACT_STATUS_Signed_max,cc_balance_NAME_CONTRACT_STATUS_Signed_min,cc_balance_NAME_CONTRACT_STATUS_Signed_sum,cc_balance_NAME_CONTRACT_STATUS_nan_count,cc_balance_NAME_CONTRACT_STATUS_nan_mean,cc_balance_NAME_CONTRACT_STATUS_nan_max,cc_balance_NAME_CONTRACT_STATUS_nan_min,cc_balance_NAME_CONTRACT_STATUS_nan_sum,RATIO_PAYMENT_INSTALL_MIN,RATIO_PAY_CURR_PAY_TOTAL
0,100006,6,-3.5,-1,-6,-21,6,0.0,0.0,0.0,...,0,0,0,6,0,0,0,0,,
1,100011,74,-38.5,-2,-75,-2849,74,54482.111149,189000.0,0.0,...,0,0,0,74,0,0,0,0,0.0,0.0
2,100013,96,-48.5,-1,-96,-4656,96,18159.919219,161420.22,0.0,...,0,0,0,96,0,0,0,0,0.0,0.0
3,100021,17,-10.0,-2,-18,-170,17,0.0,0.0,0.0,...,0,0,0,17,0,0,0,0,,
4,100023,8,-7.5,-4,-11,-60,8,0.0,0.0,0.0,...,0,0,0,8,0,0,0,0,,


# Mean of ratio betweeen the AMT_RECIVABLE and AMT_TOTAL_RECEIVABLE

In [32]:
grp = credit_card_balance[["SK_ID_CURR", "SK_ID_PREV", "AMT_RECIVABLE", "AMT_TOTAL_RECEIVABLE"]]

# Get the differente
grp["RATIO_RECEIVABLE_TOTAL_RECEIVABLE"] = grp["AMT_RECIVABLE"] / grp["AMT_TOTAL_RECEIVABLE"]
grp = grp[["SK_ID_CURR", "RATIO_RECEIVABLE_TOTAL_RECEIVABLE"]].groupby("SK_ID_CURR", as_index = False).mean()


# Fill inf
grp = grp.replace(np.inf,0)
grp = grp.replace(-np.inf,0)

# Merge datasets
cc_balance_agg = cc_balance_agg.merge(grp, on = ['SK_ID_CURR'], how = 'left')

del grp

cc_balance_agg.head()


Unnamed: 0,SK_ID_CURR,cc_balance_MONTHS_BALANCE_count,cc_balance_MONTHS_BALANCE_mean,cc_balance_MONTHS_BALANCE_max,cc_balance_MONTHS_BALANCE_min,cc_balance_MONTHS_BALANCE_sum,cc_balance_AMT_BALANCE_count,cc_balance_AMT_BALANCE_mean,cc_balance_AMT_BALANCE_max,cc_balance_AMT_BALANCE_min,...,cc_balance_NAME_CONTRACT_STATUS_Signed_min,cc_balance_NAME_CONTRACT_STATUS_Signed_sum,cc_balance_NAME_CONTRACT_STATUS_nan_count,cc_balance_NAME_CONTRACT_STATUS_nan_mean,cc_balance_NAME_CONTRACT_STATUS_nan_max,cc_balance_NAME_CONTRACT_STATUS_nan_min,cc_balance_NAME_CONTRACT_STATUS_nan_sum,RATIO_PAYMENT_INSTALL_MIN,RATIO_PAY_CURR_PAY_TOTAL,RATIO_RECEIVABLE_TOTAL_RECEIVABLE
0,100006,6,-3.5,-1,-6,-21,6,0.0,0.0,0.0,...,0,0,6,0,0,0,0,,,
1,100011,74,-38.5,-2,-75,-2849,74,54482.111149,189000.0,0.0,...,0,0,74,0,0,0,0,0.0,0.0,1.0
2,100013,96,-48.5,-1,-96,-4656,96,18159.919219,161420.22,0.0,...,0,0,96,0,0,0,0,0.0,0.0,1.0
3,100021,17,-10.0,-2,-18,-170,17,0.0,0.0,0.0,...,0,0,17,0,0,0,0,,,
4,100023,8,-7.5,-4,-11,-60,8,0.0,0.0,0.0,...,0,0,8,0,0,0,0,,,


In [33]:
print("-----------------------Missing Values------------------------------")
missing_values = missing_values_table(cc_balance_agg)
missing_values.head(20)

-----------------------Missing Values------------------------------
Your selected dataframe has 119 columns.
There are 15 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
RATIO_RECEIVABLE_TOTAL_RECEIVABLE,33033,31.9
RATIO_PAYMENT_INSTALL_MIN,31638,30.6
RATIO_PAY_CURR_PAY_TOTAL,31581,30.5
cc_balance_AMT_PAYMENT_CURRENT_mean,31438,30.4
cc_balance_AMT_PAYMENT_CURRENT_max,31438,30.4
cc_balance_AMT_PAYMENT_CURRENT_min,31438,30.4
cc_balance_CNT_DRAWINGS_ATM_CURRENT_mean,31364,30.3
cc_balance_CNT_DRAWINGS_ATM_CURRENT_max,31364,30.3
cc_balance_CNT_DRAWINGS_ATM_CURRENT_min,31364,30.3
cc_balance_CNT_DRAWINGS_OTHER_CURRENT_mean,31364,30.3


# MERGE ALL DATASETS

In [34]:
previous_app_feat_agg = previous_app_agg.merge(pos_cash_agg, on = ['SK_ID_CURR'], how = 'left' )

In [35]:
previous_app_feat_agg = previous_app_feat_agg.merge(cc_balance_agg, on = ['SK_ID_CURR'], how = 'left' )

In [36]:
previous_app_feat_agg.head()

Unnamed: 0,SK_ID_CURR,previous_app_AMT_ANNUITY_count,previous_app_AMT_ANNUITY_mean,previous_app_AMT_ANNUITY_max,previous_app_AMT_ANNUITY_min,previous_app_AMT_ANNUITY_sum,previous_app_AMT_APPLICATION_count,previous_app_AMT_APPLICATION_mean,previous_app_AMT_APPLICATION_max,previous_app_AMT_APPLICATION_min,...,cc_balance_NAME_CONTRACT_STATUS_Signed_min,cc_balance_NAME_CONTRACT_STATUS_Signed_sum,cc_balance_NAME_CONTRACT_STATUS_nan_count,cc_balance_NAME_CONTRACT_STATUS_nan_mean,cc_balance_NAME_CONTRACT_STATUS_nan_max,cc_balance_NAME_CONTRACT_STATUS_nan_min,cc_balance_NAME_CONTRACT_STATUS_nan_sum,RATIO_PAYMENT_INSTALL_MIN,RATIO_PAY_CURR_PAY_TOTAL,RATIO_RECEIVABLE_TOTAL_RECEIVABLE
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,...,,,,,,,,,,
1,100002,1,9251.775,9251.775,9251.775,9251.775,1,179055.0,179055.0,179055.0,...,,,,,,,,,,
2,100003,3,56553.99,98356.995,6737.31,169661.97,3,435436.5,900000.0,68809.5,...,,,,,,,,,,
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,...,,,,,,,,,,
4,100005,1,4813.2,4813.2,4813.2,4813.2,2,22308.75,44617.5,0.0,...,,,,,,,,,,


# Get the CSV

In [37]:
# Save the CSV
previous_app_feat_agg.to_csv('/home/convidado/Denise/15_07/previous_app_feat_agg.csv')
