In [1]:
import pandas as pd                 # Data Processing
import numpy as np                  # Mathematics
import matplotlib.pyplot as plt     # Visualization


In [2]:
# Read the files of both test and train data
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [3]:
# Find general information regarding test data
print(test_data.head)
print(test_data.shape)
print(test_data.columns)

<bound method NDFrame.head of       Loan_ID Gender Married Dependents     Education Self_Employed  \
0    LP001015   Male     Yes          0      Graduate            No   
1    LP001022   Male     Yes          1      Graduate            No   
2    LP001031   Male     Yes          2      Graduate            No   
3    LP001035   Male     Yes          2      Graduate            No   
4    LP001051   Male      No          0  Not Graduate            No   
..        ...    ...     ...        ...           ...           ...   
362  LP002971   Male     Yes         3+  Not Graduate           Yes   
363  LP002975   Male     Yes          0      Graduate            No   
364  LP002980   Male      No          0      Graduate            No   
365  LP002986   Male     Yes          0      Graduate            No   
366  LP002989   Male      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               5720                  0     

In [4]:
# Determine null values within data set
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [5]:
# Delete rows with null values within data set
test_data = test_data.dropna()

In [6]:
# Ensure that all rows with null values have been deleted
print(test_data.shape)
test_data.isnull().sum()

(289, 12)


Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [7]:
# Display number of loan applications over time

In [8]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.colors as colors
pio.templates.default = "plotly_white"

# Display statistics regarding the gender of all loan applicants using pie chart

genderCount = test_data.groupby(['Gender']).size()
Gender = ['Male','Female']
genderFig = px.pie(genderCount, 
                   values=genderCount, 
                   names=['Female','Male'], 
                   hole = .5, 
                   color_discrete_sequence=px.colors.qualitative.Pastel)
genderFig.update_traces( textposition = 'inside', textinfo = 'percent+label+value')
genderFig.update_layout(title_text='Gender of Loan Applicants', title_font=dict(size=24))
genderFig.show()

In [9]:
# Display married status of applicants with pie chart
married = test_data.groupby(['Married']).size()
genderFig = px.pie(married, 
                   values=married, 
                   names=['No','Yes'], 
                   hole = .5, 
                   color_discrete_sequence=px.colors.qualitative.Pastel)
genderFig.update_traces( textposition = 'inside', textinfo = 'percent+label+value')
genderFig.update_layout(title_text='Married Status of Loan Applicants', title_font=dict(size=24))
genderFig.show()

In [10]:
# Display number of dependants using a bar graph
numDependants = test_data.groupby(['Dependents']).size()
print(numDependants)


fig = px.bar(numDependants, title='Number of Dependants of Applicants', labels={'Number of Dependents','Number of Applicants'})
fig.update_layout(xaxis_title = 'Number of Dependents', yaxis_title = 'Number of Applicants')


Dependents
0     167
1      42
2      50
3+     30
dtype: int64


In [20]:
# Display graduated status or not by pie chart

gradStatus = test_data.groupby(['Education']).size()
print(gradStatus)
gradFig = px.pie(gradStatus,  
                   names=['Graduate','Non-Graduate'], 
                   values=gradStatus,
                   hole = .5, 
                   color_discrete_sequence=px.colors.qualitative.Pastel)
gradFig.update_traces( textposition = 'inside', textinfo = 'percent+label+value')
gradFig.update_layout(title_text='Graduate Status of Loan Applicants', title_font=dict(size=24))
gradFig.show()

Education
Graduate        224
Not Graduate     65
dtype: int64


In [22]:
# Create pie chart for self employed status
selfEmployedStatus = test_data.groupby(['Self_Employed']).size()
print(selfEmployedStatus)
gradFig = px.pie(selfEmployedStatus,  
                   names=['Non Self-Employed','Self-Semployed'], 
                   values=selfEmployedStatus,
                   hole = .5, 
                   color_discrete_sequence=px.colors.qualitative.Pastel)
gradFig.update_traces( textposition = 'inside', textinfo = 'percent+label+value')
gradFig.update_layout(title_text='Self-Employed Status of Loan Applicants', title_font=dict(size=24))
gradFig.show()

Self_Employed
No     257
Yes     32
dtype: int64
