In [1]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# To download the dataset when running in Google Colab, uncomment the next line:
#!gdown 1LhwzBt5d9sBF5DsDtLjIl2_-L0D0oYFq

In [3]:
# Load the loan dataset from a path relative to the notebook's working
# directory; the CSV must already exist there before this cell is run.
df = pd.read_csv('data/lpp.csv')

In [4]:
# Keep the label column aside before it is removed from the feature frame.
target = df["Loan_Status"]

In [5]:
# Remove the row identifier and the label so that only predictor columns remain.
# `errors="ignore"` makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of a KeyError. The frame is rebound rather
# than modified in place, which is the form pandas recommends.
df = df.drop(columns=["Loan_ID", "Loan_Status"], errors="ignore")

In [6]:
# Partition the feature columns by dtype: object/category columns are encoded
# later, 64-bit integer/float columns are used as numeric features.
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

In [7]:
# Fill gaps column-wise: categorical columns receive their most frequent value,
# numerical columns receive their mean.
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split further down, so held-out rows contribute to the statistics.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='mean')
for _imputer, _cols in ((imputer_cat, categorical_cols), (imputer_num, numerical_cols)):
    df[_cols] = _imputer.fit_transform(df[_cols])

In [8]:
# One-hot encode the categorical columns into a dense array. The first level of
# every column is dropped, so it becomes the implicit baseline category.
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_cols])
categorical_df = pd.DataFrame(
    data=categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)

In [9]:
# Assemble the design matrix: numeric columns first, one-hot columns after them.
X = pd.concat([df[numerical_cols], categorical_df], axis="columns")
# Remember the column order so inference can rebuild the matrix identically.
feature_names = list(X.columns)
# Convert the label values into integer class ids.
y = LabelEncoder().fit_transform(target)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Two-stage pipeline: standardise every feature, then fit a logistic-regression
# classifier on the scaled values.
scaler_step = ("scaler", StandardScaler())
classifier_step = ("classifier", LogisticRegression())
model = Pipeline(steps=[scaler_step, classifier_step])
model.fit(X_train, y_train)

In [12]:
# Score the held-out split: overall accuracy, per-class metrics and the
# confusion matrix.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.7886
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123

Confusion Matrix:
 [[18 25]
 [ 1 79]]


In [13]:
# Directory (relative to the notebook's working directory) where the fitted
# model and preprocessing objects are written and later read back.
models_save_dir = "./../models"

In [14]:
# Persist the fitted pipeline together with every preprocessing object needed
# to rebuild the feature matrix at prediction time.
# The target directory is created first: joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(models_save_dir, exist_ok=True)
joblib.dump(model, os.path.join(models_save_dir, "loan_prediction_model.pkl"))
joblib.dump(encoder, os.path.join(models_save_dir, "encoder.pkl"))
joblib.dump(imputer_cat, os.path.join(models_save_dir, "imputer_cat.pkl"))
joblib.dump(imputer_num, os.path.join(models_save_dir, "imputer_num.pkl"))
joblib.dump(feature_names, os.path.join(models_save_dir, "feature_names.pkl"))

['./../models/feature_names.pkl']

In [15]:
def predict_loan_status(data, models_dir=None):
    """Predict loan approval status for a single applicant.

    Parameters
    ----------
    data : dict
        Raw applicant fields keyed by the training column names. Expected
        columns that are absent are treated as missing and imputed; keys that
        were not training columns are ignored.
    models_dir : str, optional
        Directory holding the saved artifacts. Defaults to the notebook-level
        ``models_save_dir``.

    Returns
    -------
    str
        ``"Approved"`` when the model predicts class 1, otherwise ``"Rejected"``.
    """
    if models_dir is None:
        models_dir = models_save_dir

    def load_artifact(file_name):
        # NOTE: joblib.load unpickles arbitrary objects; only point this at
        # artifacts written by this notebook, never at untrusted files.
        return joblib.load(os.path.join(models_dir, file_name))

    model = load_artifact("loan_prediction_model.pkl")
    encoder = load_artifact("encoder.pkl")
    imputer_cat = load_artifact("imputer_cat.pkl")
    imputer_num = load_artifact("imputer_num.pkl")
    feature_names = load_artifact("feature_names.pkl")

    # Take the expected input columns from the fitted imputers so the function
    # also works in a session where the training cells were never executed.
    # Fall back to the notebook globals for artifacts fitted without column names.
    if hasattr(imputer_cat, "feature_names_in_"):
        cat_cols = list(imputer_cat.feature_names_in_)
    else:
        cat_cols = list(categorical_cols)
    if hasattr(imputer_num, "feature_names_in_"):
        num_cols = list(imputer_num.feature_names_in_)
    else:
        num_cols = list(numerical_cols)

    # One-row frame containing exactly the expected raw columns; fields missing
    # from `data` become NaN so the imputers can fill them.
    data_df = pd.DataFrame([data]).reindex(columns=cat_cols + num_cols)

    # Impute, then one-hot encode the categorical part.
    cat_values = imputer_cat.transform(data_df[cat_cols])
    num_values = imputer_num.transform(data_df[num_cols])
    encoded = encoder.transform(pd.DataFrame(cat_values, columns=cat_cols))

    # Merge numerical and categorical features.
    X_input = pd.concat(
        [
            pd.DataFrame(num_values, columns=num_cols),
            pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_cols)),
        ],
        axis=1,
    )

    # Match the training column order; any feature still absent defaults to 0.
    X_input = X_input.reindex(columns=feature_names, fill_value=0)

    prediction = model.predict(X_input)
    # NOTE(review): assumes class 1 is the approved label as produced by the
    # LabelEncoder at training time - confirm against the label values.
    return "Approved" if prediction[0] == 1 else "Rejected"

In [16]:
# Example applicant with Credit_History set to 1.
sample_data = {
    "Gender": "Male",
    "Married": "Yes",
    "Dependents": "1",
    "Education": "Graduate",
    "Self_Employed": "No",
    "ApplicantIncome": 5000,
    "CoapplicantIncome": 0,
    "LoanAmount": 120,
    "Loan_Amount_Term": 360,
    "Credit_History": 1,
    "Property_Area": "Urban",
}
print(predict_loan_status(sample_data))

Approved


In [17]:
# Example applicant built to exercise the rejection path: Credit_History is 0
# and the remaining fields are chosen to look unfavourable.
sample_data_rejected = {
    "Gender": "Male",
    "Married": "No",
    "Dependents": "0",
    "Education": "Not Graduate",
    "Self_Employed": "Yes",
    "ApplicantIncome": 1500,  # Low income
    "CoapplicantIncome": 0,   # No co-applicant income
    "LoanAmount": 200,        # High loan amount
    "Loan_Amount_Term": 360,
    "Credit_History": 0,      # Poor credit history
    "Property_Area": "Rural"  # Less preferred property area
}

print(predict_loan_status(sample_data_rejected))


Rejected


In [18]:
# A more extreme variant of the rejection example: lower income, larger loan
# and more dependents, again with Credit_History set to 0.
sample_data_extreme_rejection = {
    "Gender": "Male",
    "Married": "No",
    "Dependents": "3+",
    "Education": "Not Graduate",
    "Self_Employed": "Yes",
    "ApplicantIncome": 1000,  # Extremely low income
    "CoapplicantIncome": 0,
    "LoanAmount": 300,        # Very high loan amount
    "Loan_Amount_Term": 360,
    "Credit_History": 0,      # Unfavourable credit-history value (TODO confirm exact meaning of 0 in the data dictionary)
    "Property_Area": "Rural"
}

print(predict_loan_status(sample_data_extreme_rejection))


Rejected


In [19]:
# Rank the features by the absolute size of their classifier coefficient.
# The pipeline standardises its inputs before the classifier sees them, so the
# coefficients refer to scaled features; the sign gives the direction of the
# effect on class 1.
# numpy is already imported in the first cell, so it is not re-imported here.
classifier = model.named_steps["classifier"]

if hasattr(classifier, "coef_"):  # For Logistic Regression
    # Use a dedicated name: rebinding the global `feature_names` here would
    # replace the list saved for inference with a pandas Index.
    coef_feature_names = X_train.columns
    importance = classifier.coef_[0]
    sorted_idx = np.argsort(np.abs(importance))[::-1]  # Largest magnitude first

    print("Feature Importance (Most Impactful Features First):")
    for idx in sorted_idx:
        print(f"{coef_feature_names[idx]}: {importance[idx]:.4f}")


Feature Importance (Most Impactful Features First):
Credit_History: 1.3093
Property_Area_Semiurban: 0.4588
Married_Yes: 0.3426
Dependents_1: -0.1824
LoanAmount: -0.1511
CoapplicantIncome: -0.1479
Education_Not Graduate: -0.1357
Property_Area_Urban: 0.1251
Dependents_3+: 0.1205
Dependents_2: 0.1044
Gender_Male: -0.0512
Self_Employed_Yes: 0.0436
ApplicantIncome: -0.0268
Loan_Amount_Term: 0.0066
