In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px


In [2]:
#upload the dataset
df=pd.read_csv("/content/loan_prediction.csv")
print(df.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [3]:
#drop the LoanId column and move further
df=df.drop("Loan_ID",axis=1)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
#shape of the dataset
df.shape

(614, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [6]:
#Checking for the null values
df.isnull().sum()

print()


df.isnull().sum()/len(df)*100    #percentage of missing values





Unnamed: 0,0
Gender,2.117264
Married,0.488599
Dependents,2.442997
Education,0.0
Self_Employed,5.211726
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,3.583062
Loan_Amount_Term,2.28013
Credit_History,8.143322


In [7]:
#descriptive statistics of the data
df.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.46,6109.04,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.25,2926.25,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.41,85.59,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12,12.0,360.0,360.0,360.0,480.0
Credit_History,564.0,0.84,0.36,0.0,1.0,1.0,1.0,1.0


In [8]:
#fill the null values in the categorical col using mode
cat_col=df.select_dtypes(include="object").columns
for col in cat_col:
  mode_val=df[col].mode()[0]
  df[col].fillna(mode_val,inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_val,inplace=True)


In [9]:
#fill missing values in LoanAmount with median
df["LoanAmount"].fillna(df["LoanAmount"].median(),inplace=True)

#fill missing values Loan_Amount_Term with mode
df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mode()[0],inplace=True)

#fill missing values in Credit_History with mode
df["Credit_History"].fillna(df["Credit_History"].mode()[0],inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["LoanAmount"].fillna(df["LoanAmount"].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

# Exploratory Data Analysis


In [10]:
loan_status_count=df["Loan_Status"].value_counts()
print(loan_status_count)

fig_loan_count=px.pie(loan_status_count,names=loan_status_count.index,values=loan_status_count.values,title="Loan Status Count")
fig_loan_count.show()

Loan_Status
Y    422
N    192
Name: count, dtype: int64


In [11]:
#gender column
gender_count=df["Gender"].value_counts()
print(gender_count)

fig_gender_count=px.bar(gender_count,x=gender_count.index,y=gender_count.values,title="Gender Count")
fig_gender_count.show()

Gender
Male      502
Female    112
Name: count, dtype: int64


In [12]:
#Married column
maritial_status=df["Married"].value_counts()
print(maritial_status)

fig_maritial_count=px.bar(maritial_status,x=maritial_status.index,y=maritial_status.values,title="Maritial Count")
fig_maritial_count.show()


Married
Yes    401
No     213
Name: count, dtype: int64


In [13]:
#Education colum
education_count=df["Education"].value_counts()
print(education_count)

fig_education_count=px.bar(education_count,x=education_count.index,y=education_count.values,title="Education Count")
fig_education_count.show()

Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64


In [14]:
#self-employment column
self_emp_count=df["Self_Employed"].value_counts()
print(self_emp_count)

fig_self_emp_count=px.bar(self_emp_count,x=self_emp_count.index,y=self_emp_count.values,title="Self-employment Count")
fig_self_emp_count.show()

Self_Employed
No     532
Yes     82
Name: count, dtype: int64


In [15]:
#distribution of Applicant Income
fig_applicant_income=px.histogram(df,x="ApplicantIncome",title="Distribution of Applicant Income")
fig_applicant_income.show()

In [16]:
fig_income=px.box(df,x="Loan_Status",y="ApplicantIncome",color="Loan_Status",title="Income vs Loan Status")
fig_income.show()

In [17]:
#remove the ouliers in the applicant income
Q1=df["ApplicantIncome"].quantile(0.25)
Q3=df["ApplicantIncome"].quantile(0.75)

IQR=Q3-Q1

lower_bound=Q1-(1.5*IQR)
upper_bound=Q3+(1.5*IQR)

df=df[(df["ApplicantIncome"]>=lower_bound) & (df["ApplicantIncome"]<=upper_bound)]

In [18]:
fig_income=px.box(df,x="Loan_Status",y="CoapplicantIncome",color="Loan_Status",title="CoapplicantIncome vs Loan Status")
fig_income.show()

In [19]:
#removing coapplicant ouliers
Q1=df["CoapplicantIncome"].quantile(0.25)
Q3=df["CoapplicantIncome"].quantile(0.75)

IQR=Q3-Q1

lower_bound=Q1-(1.5*IQR)
upper_bound=Q3+(1.5*IQR)

df=df[(df["CoapplicantIncome"]>=lower_bound) & (df["CoapplicantIncome"]<=upper_bound)]

In [20]:
fig_income=px.box(df,x="Loan_Status",y="LoanAmount",color="Loan_Status",title="CoapplicantIncome vs Loan Status")
fig_income.show()

In [21]:
#relation btw credit history and loan status
fig_credit_history=px.histogram(df,x="Credit_History",color="Loan_Status",barmode="group",title="Credit History vs Loan Status")
fig_credit_history.show()

In [22]:
fig_credit_history=px.histogram(df,x="Property_Area",color="Loan_Status",barmode="group",title="Credit History vs Loan Status")
fig_credit_history.show()

## Data Preparation and Training Data Model

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Convert categorical columns to numerical using one-hot encoding
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
df = pd.get_dummies(df, columns=cat_cols)



KeyError: "None of [Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',\n       'Property_Area'],\n      dtype='object')] are in the [columns]"

In [27]:
# Split the dataset into features (X) and target (y)
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numerical columns using StandardScaler
scaler = StandardScaler()
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

from sklearn.svm import SVC
model = SVC(random_state=42)
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_test)
print(y_pred)

['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'Y' 'Y'
 'Y' 'Y']


In [29]:
# Convert X_test to a DataFrame
X_test_df = pd.DataFrame(X_test, columns=X_test.columns)

# Add the predicted values to X_test_df
X_test_df['Loan_Status_Predicted'] = y_pred
print(X_test_df.head())

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
277        -0.544528          -0.037922   -0.983772          0.305159   
84         -0.067325          -0.931554   -1.571353         -1.430680   
275        -0.734870           0.334654   -0.298262          0.305159   
392        -0.824919           0.522317   -0.200332          0.305159   
537        -0.267373          -0.931554   -0.454950          0.305159   

     Credit_History  Gender_Female  Gender_Male  Married_No  Married_Yes  \
277        0.402248          False         True       False         True   
84         0.402248          False         True       False         True   
275        0.402248          False         True       False         True   
392        0.402248          False         True       False         True   
537        0.402248          False         True        True        False   

     Dependents_0  ...  Dependents_2  Dependents_3+  Education_Graduate  \
277          True  ...       

In [30]:
!pip freeze > requirements.txt

