# Home Credit Loan Default Prediction

In [None]:
# First, import the libraries needed for this project
import pandas as pd
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import re
import time
import numpy as np
import gc
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import math
import pickle
import os

### Here we use reduce_memory_usage to downcast numeric columns so the dataframes take less memory in the notebook.
#### Ref = https://www.kaggle.com/rinnqd/reduce-memory-usage.

In [None]:
def reduce_memory_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned, so the call can wrap
    ``pd.read_csv(...)`` directly. Memory usage before and after is printed.

    Args:
        df: pandas DataFrame whose columns should be downcast.
        use_float16: When True (the default, matching the original
            behaviour) float columns whose range fits are stored as
            ``float16``. Half precision keeps only about three significant
            decimal digits, so pass False to stop at ``float32``.

    Returns:
        The same DataFrame object, with integer and float columns downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory before: {start_mem:.2f} MB')

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are downcast. Object, bool,
        # datetime and pandas extension dtypes are left exactly as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            candidates = [(kind, np.iinfo(kind)) for kind in int_candidates]
        elif np.issubdtype(col_type, np.floating):
            candidates = [(kind, np.finfo(kind)) for kind in float_candidates]
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Bounds are inclusive: a column whose maximum is exactly 127 still
        # fits in int8. If min/max are NaN (empty or all-NaN column) or
        # infinite, no candidate matches and the column keeps its dtype.
        for kind, limits in candidates:
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(kind)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory after: {end_mem:.2f} MB')
    # Guard against a zero-byte frame so the percentage never divides by zero.
    if start_mem > 0:
        print(f'{100 * (start_mem - end_mem) / start_mem:.1f}% Less')

    return df

In [None]:
# Loading all data
# Every CSV lives in one directory. It defaults to the original local path and
# can be overridden with the HOME_CREDIT_DATA_DIR environment variable so the
# notebook also runs on other machines.
DATA_DIR = os.environ.get(
    'HOME_CREDIT_DATA_DIR',
    r'C:\Users\lucas\OneDrive\Documentos\Codes\HomeCredit_Default_Risk\1-Data',
)


def _load_table(file_name):
    """Read ``file_name`` from DATA_DIR and downcast its numeric columns."""
    return reduce_memory_usage(pd.read_csv(os.path.join(DATA_DIR, file_name)))


data_train = _load_table('application_train.csv')
data_test = _load_table('application_test.csv')
data_bureau = _load_table('bureau.csv')
data_bureau_balance = _load_table('bureau_balance.csv')
data_application = _load_table('previous_application.csv')
data_pos_cash = _load_table('POS_CASH_balance.csv')
data_installments = _load_table('installments_payments.csv')
data_credit_card = _load_table('credit_card_balance.csv')
data_sample = _load_table('sample_submission.csv')
data_description = _load_table('HomeCredit_columns_description.csv')

# Data-Train

In [None]:
# Display the table loaded from application_train.csv as the cell output.
data_train

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_train.
data_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_train.
data_train.describe()

In [None]:
# Column names of data_train as a NumPy array.
data_train.columns.values

# Data-Test

In [None]:
# Display the table loaded from application_test.csv as the cell output.
data_test

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_test.
data_test.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_test.
data_test.describe()

In [None]:
# Column names of data_test as a NumPy array.
data_test.columns.values

# Data-Bureau

In [None]:
# Display the table loaded from bureau.csv as the cell output.
data_bureau

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_bureau.
data_bureau.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_bureau.
data_bureau.describe()

In [None]:
# Column names of data_bureau as a NumPy array.
data_bureau.columns.values

# Data-Bureau Balance

In [None]:
# Display the table loaded from bureau_balance.csv as the cell output.
data_bureau_balance

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_bureau_balance.
data_bureau_balance.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_bureau_balance.
data_bureau_balance.describe()

In [None]:
# Column names of data_bureau_balance as a NumPy array.
data_bureau_balance.columns.values

# Data-Previous Application

In [None]:
# Display the table loaded from previous_application.csv as the cell output.
data_application

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_application.
data_application.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_application.
data_application.describe()

In [None]:
# Column names of data_application as a NumPy array.
data_application.columns.values

# Data-Pos Cash

In [None]:
# Display the table loaded from POS_CASH_balance.csv as the cell output.
data_pos_cash

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_pos_cash.
data_pos_cash.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_pos_cash.
data_pos_cash.describe()

In [None]:
# Column names of data_pos_cash as a NumPy array.
data_pos_cash.columns.values

# Data-Installments

In [None]:
# Display the table loaded from installments_payments.csv as the cell output.
data_installments

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_installments.
data_installments.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_installments.
data_installments.describe()

In [None]:
# Column names of data_installments as a NumPy array.
data_installments.columns.values

# Data-Credit Card

In [None]:
# Display the table loaded from credit_card_balance.csv as the cell output.
data_credit_card

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_credit_card.
data_credit_card.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_credit_card.
data_credit_card.describe()

In [None]:
# Column names of data_credit_card as a NumPy array.
data_credit_card.columns.values

# Data-Sample Submission

In [None]:
# Display the table loaded from sample_submission.csv as the cell output.
data_sample

In [None]:
# Print the column dtypes, non-null counts and memory usage of data_sample.
data_sample.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of data_sample.
data_sample.describe()

In [None]:
# Column names of data_sample as a NumPy array.
data_sample.columns.values

# Data-Description

In [None]:
# Display the table loaded from HomeCredit_columns_description.csv as the cell output.
data_description

# Proportion

In [None]:
# Count how many rows fall under each TARGET value, then report both classes
# with their share of the combined total.
counted_values = data_train['TARGET'].value_counts()
both_classes = counted_values[0] + counted_values[1]
late_share = ((counted_values[1] / both_classes) * 100).round(2)
on_time_share = ((counted_values[0] / both_classes) * 100).round(2)
print("Customers who won't repay the loan on time:", counted_values[1], "(", late_share, "%)")
print("Customers who will repay the loan on time:", counted_values[0], "(", on_time_share, "%)")

# Plot Functions

In [None]:
def stack_plot(data, xtick, col2='TARGET', col3='Total'):
    """Draw overlapping bars of ``col3`` and ``col2`` for each row of ``data``.

    Args:
        data: DataFrame with one row per category to plot.
        xtick: Name of the column whose values label the x axis.
        col2: Name of the column drawn in front (legend 'Not Capable').
        col3: Name of the column drawn behind (legend 'Capable').

    The figure size grows with the number of distinct ``xtick`` values:
    fewer than 5 -> 5x5, 5 to 9 -> 7x7, 10 or more -> 15x15.
    """
    ind = np.arange(data.shape[0])

    # The original test `len(..) > 5 & len(..) < 10` parsed as
    # `n > (5 & n) < 10` because `&` binds tighter than comparisons, so the
    # large figure was only ever used for exactly 5 categories.
    n_categories = len(data[xtick].unique())
    if n_categories < 5:
        figsize = (5, 5)
    elif n_categories < 10:
        figsize = (7, 7)
    else:
        figsize = (15, 15)
    plt.figure(figsize=figsize)

    # Both series share the same x positions, so the second bar is painted
    # over the first one.
    p1 = plt.bar(ind, data[col3].values)
    p2 = plt.bar(ind, data[col2].values)

    plt.ylabel('Loans')
    plt.title('Aproved vs Rejected')
    plt.xticks(ticks=ind, rotation=90, labels=list(data[xtick].values))
    plt.legend((p1[0], p2[0]), ('Capable', 'Not Capable'))
    plt.show()

In [None]:
def univariate_barplots(data, col1, col2='TARGET', top=False):
    """Plot and print, per value of ``col1``, how often ``col2`` equals 1.

    Args:
        data: DataFrame containing ``col1`` and ``col2``.
        col1: Column to group by; its values become the x-axis labels.
        col2: Column whose 1-values are counted within each group.
        top: If truthy, keep only the ``top`` groups with the most rows.
            False (the default) keeps every group.

    For each group the table holds the number of rows where ``col2 == 1``
    (stored under ``col2``), the number of non-null rows ('Total') and the
    mean of ``col2`` ('Average'). Groups are sorted by 'Total' descending,
    drawn with ``stack_plot`` and the first and last five rows are printed.
    """
    # Group the frame passed in by the caller (the original read the global
    # `data_train` and silently ignored `data`).
    grouped = data.groupby(col1)[col2]

    # Number of rows per group where col2 equals 1.
    temp = grouped.agg(lambda x: x.eq(1).sum()).reset_index()

    # All three aggregations come from the same groupby, so their rows are in
    # the same group order and can be attached positionally.
    temp['Total'] = grouped.count().to_numpy()
    temp['Average'] = grouped.mean().to_numpy()

    temp = temp.sort_values(by='Total', ascending=False)

    if top:
        temp = temp.iloc[:top]

    stack_plot(temp, xtick=col1, col2=col2, col3='Total')
    print(temp.head(5))
    print("=" * 50)
    print(temp.tail(5))

# Plots

In [None]:
# TARGET breakdown for every contract type (no top-N cut).
univariate_barplots(data=data_train, col1='NAME_CONTRACT_TYPE', col2='TARGET', top=False)

In [None]:
# TARGET breakdown for every CODE_GENDER value (no top-N cut).
univariate_barplots(data=data_train, col1='CODE_GENDER', col2='TARGET', top=False)

In [None]:
# TARGET breakdown for every FLAG_OWN_CAR value (no top-N cut).
univariate_barplots(data=data_train, col1='FLAG_OWN_CAR', col2='TARGET', top=False)

In [None]:
# TARGET breakdown for every FLAG_OWN_REALTY value (no top-N cut).
univariate_barplots(data=data_train, col1='FLAG_OWN_REALTY', col2='TARGET', top=False)

In [None]:
# TARGET breakdown for every CNT_CHILDREN value (no top-N cut).
univariate_barplots(data=data_train, col1='CNT_CHILDREN', col2='TARGET', top=False)

### Reset indexes of a dataframe for all groups in one step
#### Ref = https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step

In [None]:
# Mean AMT_INCOME_TOTAL per SK_ID_CURR, with the id moved back from the index
# into a regular column.
mean_income_by_id = data_train.groupby('SK_ID_CURR').agg({'AMT_INCOME_TOTAL': 'mean'})
income_data = mean_income_by_id.reset_index()
income_data.head(2)

In [None]:
# Left-join the per-id mean income onto the training table. Both frames carry
# AMT_INCOME_TOTAL, so the training table's own column gets the `_x` suffix.
income_data_final = data_train.merge(income_data, on='SK_ID_CURR', how='left')

# Split the original income values by TARGET value (0 vs 1).
target_is_zero = income_data_final['TARGET'] == 0
target_is_one = income_data_final['TARGET'] == 1
approved_income = income_data_final.loc[target_is_zero, 'AMT_INCOME_TOTAL_x'].values
rejected_income = income_data_final.loc[target_is_one, 'AMT_INCOME_TOTAL_x'].values

In [None]:
# Plot a boxplot of total income for each target group.
# The arrays hold AMT_INCOME_TOTAL values, so the title and y-axis are labelled
# as income (the previous 'Cost' / 'Price' wording did not match the data).
plt.boxplot([approved_income, rejected_income])
plt.title('Box Plots: Income per Approved and Not Approved Loans')
plt.xticks([1,2],('Loans Approved','Loans Rejected'))
plt.ylabel('Income')
plt.grid()
plt.show()

In [None]:
# Print a table of income percentiles (0, 5, ..., 100) for both groups,
# rounded to three decimals.
x = PrettyTable()
x.field_names = ["Percentile", "Loans Approved", "Loans Not Approved"]

for pct in range(0, 101, 5):
    approved_value = np.round(np.percentile(approved_income, pct), 3)
    rejected_value = np.round(np.percentile(rejected_income, pct), 3)
    x.add_row([pct, approved_value, rejected_value])
print(x)