## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from sklearn import metrics

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): the "ignore" action with no category filter hides all warning
# categories, including deprecation warnings — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Read the train and test splits of the loan data, in that order,
# from the data folder one level above the notebook.
loan_train_df, loan_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/loan-train.csv', '../data/loan-test.csv')
)

In [3]:
# Print the column names, non-null counts and dtypes of the training frame
# to spot which columns contain missing values.
loan_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
# Count the distinct Loan_ID values; a count equal to the number of rows
# means the column is a pure identifier.
loan_train_df['Loan_ID'].nunique()

614

We can see that all the entries of this column are unique. Hence, this column would not add any value to our analysis.
Let's drop this column.

## EDA and Data Preprocessing

### Dropping the Loan_ID column

In [7]:
# Remove the identifier column from both splits.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after Loan_ID is already gone
# no longer raises a KeyError.
loan_train_df = loan_train_df.drop(columns=["Loan_ID"], errors="ignore")
loan_test_df = loan_test_df.drop(columns=["Loan_ID"], errors="ignore")

### Summary Statistics for numerical columns

In [13]:
# Numerical features to summarise
num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Transposed summary: one row per feature, one column per statistic
loan_train_df.loc[:, num_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12041,12.0,360.0,360.0,360.0,480.0


Observations:

- The average income of applicants is about 5.4K dollars. It has a large range of values from 150 to 81,000. It would be interesting to see if low applicant income implies a low loan amount.
- The average co-applicant income is about 1.6K dollars which is much lower than the applicant's income. The 25th percentile value is 0 and the median value is 1,188 dollars which implies that the number of joint home loans is more than the number of non-joint loans.
- The two columns ApplicantIncome and CoapplicantIncome give the same information i.e. income of applicants. It would be better to have that information in one column only. We can add these two columns to get the total income per application.
- The 25th, 50th, and 75th percentile value of the loan term is 360 months i.e. 30 years. This implies that the majority of home loans in this dataset are for 30 years of term.
- We can convert the scale of the loan term from months to years to make it easier to read.
- The average loan amount is about 146 in the dataset's units (mean 146.41). It has a large range of values, from 9 to 700, which is to be expected.
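- A loan amount of 0 would be absurd, since loan amounts are meant to be non-zero, and any such values should be treated as missing. Here the minimum loan amount is 9, so there are no zeros, but 22 of the 614 entries are missing (592 non-null) and need to be imputed.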

Before imputing the missing values in LoanAmount, let's do some feature engineering and check the distributions, counts, and outliers for different variables.

### Feature Engineering

In [14]:
# Express the loan term in years rather than months (12 months per year).
# The column is overwritten with its own scaled value, so running this cell
# a second time divides by 12 again — run it exactly once per fresh load.
loan_train_df['Loan_Amount_Term'] = loan_train_df['Loan_Amount_Term'].div(12)

In [15]:
# Total income per application = applicant income + co-applicant income
loan_train_df['total_income'] = loan_train_df['ApplicantIncome'].add(
    loan_train_df['CoapplicantIncome']
)

In [16]:
# The two income columns are now captured by total_income, so remove them.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
loan_train_df = loan_train_df.drop(
    columns=['ApplicantIncome', 'CoapplicantIncome'], errors='ignore'
)

### Checking the percentage of each category for categorical variables

In [None]:
# Columns inspected as categorical variables
cat_col = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
    'Loan_Status',
]

# For each column, print the share of each category followed by a row of
# 40 asterisks as a separator. normalize=True returns proportions (0-1),
# and value_counts leaves missing values out by default.
for col in cat_col:
    print(loan_train_df[col].value_counts(normalize=True), '*' * 40, sep='\n')

In [17]:
# Preview the first rows of the training frame after the feature engineering
# steps (term in years, total_income added, raw income columns removed).
loan_train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,total_income
0,Male,No,0,Graduate,No,,30.0,1.0,Urban,Y,5849.0
1,Male,Yes,1,Graduate,No,128.0,30.0,1.0,Rural,N,6091.0
2,Male,Yes,0,Graduate,Yes,66.0,30.0,1.0,Urban,Y,3000.0
3,Male,Yes,0,Not Graduate,No,120.0,30.0,1.0,Urban,Y,4941.0
4,Male,No,0,Graduate,No,141.0,30.0,1.0,Urban,Y,6000.0


In [19]:
# Preview the first rows of the test frame.
# NOTE(review): the recorded output still shows ApplicantIncome,
# CoapplicantIncome and a Loan_Amount_Term of 360.0 — confirm the same feature
# engineering is applied to the test frame before it is used for prediction.
loan_test_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
