In [23]:
import pandas as pd

In [24]:
data = pd.read_csv('loan_sanction_train.csv')
data.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [25]:
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [26]:
data.shape

(614, 13)

In [27]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [28]:
data.drop(['Gender', 'Married', 'Self_Employed','Dependents'], axis=1, inplace=True)

In [29]:
data.isnull().sum()

Loan_ID               0
Education             0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [30]:
data['LoanAmount'].fillna(data['LoanAmount'].mean(), inplace=True)

In [31]:
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean(), inplace=True)

In [32]:
data['Credit_History'].fillna(data['Credit_History'].mean(), inplace=True)

In [33]:
data.isnull().sum()

Loan_ID              0
Education            0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

<h2> Hypothesis Testing </h2>
<h3> Hypo 1: Higher income applicants are more likely to get loan approval </h3>
<p> Step 1: Formulate the null and alternative hypotheses:

Null Hypothesis (H0): There is no significant difference in loan approval rates between higher income applicants and lower income applicants.
Alternative Hypothesis (H1): Higher income applicants have a higher loan approval rate compared to lower income applicants.</p>

In [36]:
#importing library
from scipy.stats import chi2_contingency

<p> Step 2: Divide the dataset into two groups: </p>

In [41]:
# Divide the dataset into two groups based on income
thrshold = data['ApplicantIncome'].median()
higher_income = data[data['ApplicantIncome'] > thrshold]
lower_income = data[data['ApplicantIncome'] <= thrshold]

<p> Step 3 : Calculate the loan approval rates: </p>

In [47]:
higher_income_approval_rate = higher_income['Loan_Status'].value_counts(normalize=True)['Y']
lower_rate_approval_rate = lower_income['Loan_Status'].value_counts(normalize=True)['Y']

<p>Step 4 : Perform chi-square test</p>

In [52]:
observed = pd.DataFrame([[higher_income_approval_rate, 1 - higher_income_approval_rate],
                         [lower_rate_approval_rate, 1-lower_rate_approval_rate]],
                       columns=['Approved', 'Not Approved'], index=['Higher Income', 'Lower Income'])

chi2, p_value, _, _ = chi2_contingency(observed)

<p> Step 5 : Interpret the results </p>

In [53]:
significance_level = 0.05

In [54]:
if p_value < significance_level:
    print("Reject the null hypothesis.")
    print("Higher income applicants have a significantly higher loan approval rate compared to lower income applicants.")
else:
    print("Failed to reject the null hypothesis.")
    print("There is no significant difference in loan approval rates between higher income and lower income applicants.")

Failed to reject the null hypothesis.
There is no significant difference in loan approval rates between higher income and lower income applicants.


<p>
Based on the output, the hypothesis test results indicate that we failed to reject the null hypothesis. Therefore, we do not have sufficient evidence to conclude that there is a significant difference in loan approval rates between higher income and lower income applicants in the given dataset. This means that income may not be a strong determining factor for loan approval in this particular dataset.</p>

<h3> Hypo 2: Applicants with a good credit history are more likely to get loan approval </h3>

In [55]:
data['Credit_History'].value_counts()

1.000000    475
0.000000     89
0.842199     50
Name: Credit_History, dtype: int64

In [56]:
# Create contingency table for credit history and loan approval
contingency_table = pd.crosstab(data['Credit_History'], data['Loan_Status'])

# Perform chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Interpret the results
significance_level = 0.05

if p_value < significance_level:
    print("Reject the null hypothesis.")
    print("Applicants with a good credit history are more likely to get loan approval.")
else:
    print("Failed to reject the null hypothesis.")
    print("There is no significant difference in loan approval rates based on credit history.")

Reject the null hypothesis.
Applicants with a good credit history are more likely to get loan approval.
