In [None]:
#KNN (K-Nearest Neighbors) works like this:
#When a new loan applicant comes in,
#the algorithm finds the K nearest customers (those with the most similar profiles),
#checks their loan status, and assigns the majority class.

#Example:
        #If 5 nearest customers → 4 defaulted, 1 not defaulted
        #Prediction = Default
#KNN is called a lazy learning algorithm because it does not build a model in advance:
#it stores the training data and defers the distance computations to prediction time.

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [66]:
#1. Data loading
#Read the loan dataset from the CSV file in the working directory and display it.
csv_path = "LoanDataSet.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [67]:
# Count the missing (NaN) entries in every column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [68]:
# Flag every row that is an exact duplicate of an earlier row (True = duplicate)
# and print the resulting boolean Series.
duplicate_flags = df.duplicated()
print(duplicate_flags)

0      False
1      False
2      False
3      False
4      False
       ...  
609    False
610    False
611    False
612    False
613    False
Length: 614, dtype: bool


In [69]:
# Impute the categorical columns: every missing entry is replaced by the
# column's most frequent value (mode()[0] is the first mode if there are ties).
for column in ["Gender", "Married", "Dependents", "Self_Employed"]:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)



In [70]:
# Re-count missing values per column to confirm the categorical imputation worked
# (isnull is an alias of isna).
df.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [71]:
# Impute the numerical loan columns with their median, which is less sensitive
# to extreme values than the mean.
# Note: Credit_History is not filled here, so its NaN values are still present
# after this cell.
for column in ["LoanAmount", "Loan_Amount_Term"]:
    column_median = df[column].median()
    df[column] = df[column].fillna(column_median)

In [73]:
#Encoding (Convert Categorical → Numeric)
#Each two-valued categorical column is mapped to 0/1.
binary_encodings = {
    "Loan_Status": {"N": 0, "Y": 1},
    "Gender": {"Male": 0, "Female": 1},
    "Education": {"Not Graduate": 0, "Graduate": 1},
    "Married": {"No": 0, "Yes": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
}

#Reassign instead of using inplace=True so the cell reads as "new frame from old frame".
#The explicit cast pins the encoded columns to int64: pandas is deprecating the silent
#object -> int downcast inside replace(), so without the cast the resulting dtype depends
#on the pandas version. The cast also raises immediately if a column still contains a
#NaN or an unmapped label, instead of letting it flow into the model.
#Re-running the cell is harmless: replace() finds nothing to replace and the cast is a no-op.
df = df.replace(binary_encodings).astype(
    {column: "int64" for column in binary_encodings}
)



In [74]:
# One-hot encoding (turn each multi-valued categorical column into indicator columns)
multi_category_columns = ["Property_Area", "Dependents"]
df = pd.get_dummies(df, columns=multi_category_columns, drop_first=True)

# drop_first=True drops the first category of every encoded column, which avoids
# the dummy variable trap (perfect multicollinearity between the indicators).
# Example: when Property_Area_Semiurban and Property_Area_Urban are both 0,
# the row must be Rural, so a separate Rural column would be redundant.
# Removing that redundancy improves model stability.

In [75]:
# Separate the target (Loan_Status) from the feature matrix.
# Loan_ID is dropped from the features together with the target column.
target_column = "Loan_Status"
y = df[target_column]
X = df.drop(columns=[target_column, "Loan_ID"])

In [76]:
#Train/Test Split + Feature Scaling (Important for KNN)

#Split BEFORE scaling so the scaler never sees the test rows.
#test_size=0.2 holds out 20% of the 614 rows (123 rows, matching the support in the
#classification report); random_state fixes the shuffle so every run gives the same split.
#NOTE: the split that produced the recorded outputs is not part of this notebook, so the
#seed below is a fresh choice and re-running may give slightly different metrics.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#StandardScaler standardizes the data so that:
#Mean of each feature becomes 0 & Standard deviation becomes 1
#This process is called standardization or z-score normalization.
#KNN is distance-based, so without scaling the large-valued features (e.g. incomes)
#would dominate the distance; scaling makes all features contribute equally.
scaler = StandardScaler()

#Calculates the mean and standard deviation of each column in x_train,
#then uses those values to scale the training data.
x_train = scaler.fit_transform(x_train)

#The model must see test data scaled using training data statistics,
#so the test set is only transformed, never fitted.
x_test = scaler.transform(x_test)

In [81]:
#Replace the NaN values that are still left after scaling
#(x_train and x_test are NumPy arrays at this point; Credit_History, for example,
#is not filled by the earlier imputation cells).
#Each NaN is filled with the mean of ITS OWN column (axis=0), not with one mean over
#the whole matrix, and the means are computed from the training set only so that no
#test-set statistics leak into preprocessing.
train_column_means = np.nanmean(x_train, axis=0)

#np.where broadcasts the per-column means across the rows and keeps every non-NaN value as is.
x_train = np.where(np.isnan(x_train), train_column_means, x_train)
x_test = np.where(np.isnan(x_test), train_column_means, x_test)

In [82]:
#Train KNN Model
#n_neighbors=5: each prediction is decided by the 5 closest training samples.
#fit() is the last expression, so the fitted estimator is displayed as the cell output.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=x_train, y=y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [83]:
#Prediction
#Predict the loan status (0/1) for every row of the scaled test set.
y_pred = knn_model.predict(X=x_test)

In [84]:
#Model Evaluation
#Compute each metric first, then print it under its heading.

#Fraction of test samples whose predicted class matches the true class.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

#Rows are the actual classes, columns are the predicted classes.
matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(matrix)

#Detailed performance report: precision, recall, f1-score and support for each class.
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 0.7642276422764228

Confusion Matrix:
[[17 26]
 [ 3 77]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.40      0.54        43
           1       0.75      0.96      0.84        80

    accuracy                           0.76       123
   macro avg       0.80      0.68      0.69       123
weighted avg       0.78      0.76      0.74       123



In [None]:
#Precision -> Out of all samples predicted as a class, how many actually belong to it.
#Recall -> Out of all samples that actually belong to a class, how many were found.
#f1-score -> Harmonic mean of precision and recall (a balance between the two).
#support -> Number of actual samples of each class in the test set.