Complete the following exercise using Python.

Loan Approval Prediction:

Using the Loan Approval dataset, create an end-to-end workflow for predicting loan approval. Your workflow should include:

- Data loading and exploration
- Data preprocessing (handling missing values, encoding categorical variables, feature scaling)
- Feature selection
- Model training (using logistic regression and KNN)
- Model evaluation (using accuracy, precision, recall, F1-score and ROC AUC score)

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
loan_data = pd.read_csv('https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Loan-Approval-Prediction.csv')

# Split features and target
X = pd.DataFrame(loan_data.drop('Loan_Status', axis=1))
y = pd.DataFrame(loan_data['Loan_Status'])

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [3]:
print(y.head(5))
print(y.value_counts())

  Loan_Status
0           Y
1           N
2           Y
3           Y
4           Y
Loan_Status
Y              422
N              192
Name: count, dtype: int64


In [4]:
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))
y.sum()

  y = column_or_1d(y, warn=True)


0    422
dtype: int64

In [5]:
X.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [7]:
X.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
dtype: int64

In [8]:
X['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [9]:
X[X['LoanAmount'].isnull() | X['Loan_Amount_Term'].isnull()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
19,LP001041,Male,Yes,0,Graduate,,2600,3500.0,115.0,,1.0,Urban
35,LP001106,Male,Yes,0,Graduate,No,2275,2067.0,,360.0,1.0,Urban
36,LP001109,Male,Yes,0,Graduate,No,1828,1330.0,100.0,,0.0,Urban
44,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban
45,LP001137,Female,No,0,Graduate,No,3410,0.0,88.0,,1.0,Urban
63,LP001213,Male,Yes,1,Graduate,No,4945,0.0,,360.0,0.0,Rural
73,LP001250,Male,Yes,3+,Not Graduate,No,4755,0.0,95.0,,0.0,Semiurban
81,LP001266,Male,Yes,1,Graduate,Yes,2395,0.0,,360.0,1.0,Semiurban
95,LP001326,Male,No,0,Graduate,,6782,0.0,,360.0,,Urban


In [10]:
# Drop rows with missing values in LoanAmount or Loan_Amount_Term
X = X.dropna(subset=['LoanAmount', 'Loan_Amount_Term'])
y = y.loc[X.index]

In [11]:
# Drop Loan_ID column as it's not useful for modeling
X = X.drop('Loan_ID', axis=1)

In [12]:
X['ApplicantIncome']=X['ApplicantIncome'].astype(float)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578 entries, 1 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             565 non-null    object 
 1   Married            576 non-null    object 
 2   Dependents         566 non-null    object 
 3   Education          578 non-null    object 
 4   Self_Employed      548 non-null    object 
 5   ApplicantIncome    578 non-null    float64
 6   CoapplicantIncome  578 non-null    float64
 7   LoanAmount         578 non-null    float64
 8   Loan_Amount_Term   578 non-null    float64
 9   Credit_History     529 non-null    float64
 10  Property_Area      578 non-null    object 
dtypes: float64(5), object(6)
memory usage: 54.2+ KB


In [14]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578 entries, 1 to 613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       578 non-null    int64
dtypes: int64(1)
memory usage: 9.0 KB


In [15]:
y.sum()

0    403
dtype: int64

In [16]:
# 1. Define pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Fill missing values with median
    ('scaler', StandardScaler())                   # Standardize features (mean=0, variance=1)
])

# 2. Define pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Fill missing with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))    # Convert categories to binary vectors
])

# 3. Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # Apply numerical pipeline to specific numeric columns
        ('num', numerical_transformer, ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                                         'Loan_Amount_Term','Credit_History']),
        # Apply categorical pipeline to specific categorical columns
        ('cat', categorical_transformer, ['Gender', 'Married', 'Dependents', 'Education',
                                          'Self_Employed', 'Property_Area'])
    ])

# 4. Create the full end-to-end pipeline including the model
# This ensures raw data flows through preprocessing directly into the model
model = LogisticRegression()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)
                          ])

In [22]:
# Split data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

pipeline.fit(X_train, y_train.values.ravel())

#probs = pipeline.predict_proba(X_test)[:, 1]
#custom_threshold = 0.55
#preds = (probs >= custom_threshold).astype(int)

preds = pipeline.predict(X_test)

# Calculate classification metrics
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Calculate ROC curve and AUC score
# Note: We use predict_proba for ROC/AUC to get probability scores instead of class labels
fpr, tpr, thresholds = roc_curve(y_test, pipeline.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Output results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f"AUC: {roc_auc:.2f}")

Accuracy: 0.81
Precision: 0.82
Recall: 0.93
F1 Score: 0.87
AUC: 0.79


In [23]:
from sklearn.neighbors import KNeighborsClassifier
# 1. Define pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Fill missing values with median
    ('scaler', StandardScaler())                   # Standardize features (mean=0, variance=1)
])

# 2. Define pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Fill missing with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))    # Convert categories to binary vectors
])

# 3. Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # Apply numerical pipeline to specific numeric columns
        ('num', numerical_transformer, ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                                         'Loan_Amount_Term','Credit_History']),
        # Apply categorical pipeline to specific categorical columns
        ('cat', categorical_transformer, ['Gender', 'Married', 'Dependents', 'Education',
                                          'Self_Employed', 'Property_Area'])
    ])

# 4. Create the full end-to-end pipeline including the model
# This ensures raw data flows through preprocessing directly into the model
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)
                          ])

In [24]:
# Split data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

pipeline.fit(X_train, y_train.values.ravel())
preds = pipeline.predict(X_test)

# Calculate classification metrics
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Calculate ROC curve and AUC score
# Note: We use predict_proba for ROC/AUC to get probability scores instead of class labels
fpr, tpr, thresholds = roc_curve(y_test, pipeline.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Output results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f"AUC: {roc_auc:.2f}")

Accuracy: 0.81
Precision: 0.82
Recall: 0.93
F1 Score: 0.87
AUC: 0.79
