In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so warnings emitted by pandas or
# scikit-learn in later cells are hidden as well -- consider narrowing it.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

In [27]:
# Read the dataset from data/cleaned_data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer=r'data/cleaned_data.csv')

## Model Building

In [30]:
# Target column first, then every remaining column becomes a predictor.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Scale numerical features
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
            'TotalIncome', 'EMI', 'BalanceIncome']
scaler = StandardScaler()

# train_test_split returns row subsets of X; take explicit copies so the column
# assignments below write to independent frames instead of triggering pandas'
# SettingWithCopyWarning (which the global warnings filter would silently hide).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse those statistics for the
# test rows so no information from the test split leaks into the transform.
# NOTE: X_train / X_test are overwritten with scaled values, so re-running this
# cell on its own would refit the scaler on already-scaled data -- re-run from
# the train/test split cell instead.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [36]:
# Build a 100-tree random forest (seeded for repeatable results) and fit it
# on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [38]:
# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Model Accuracy: 0.7723577235772358

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.47      0.59        43
           1       0.77      0.94      0.84        80

    accuracy                           0.77       123
   macro avg       0.78      0.70      0.72       123
weighted avg       0.78      0.77      0.75       123



In [40]:
# Save model and scaler
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handles open until garbage collection.
with open('loan_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [44]:
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only load artifacts
    # written by this notebook, never files from an untrusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def predict_loan_eligibility(input_data, model=None, scaler=None):
    """
    Predict loan eligibility for a single applicant.

    Parameters:
    input_data (dict): Dictionary containing customer details with these keys:
        - 'Gender' (str): 'Male' or 'Female'
        - 'Married' (str): 'Yes' or 'No'
        - 'Dependents' (int): 0, 1, 2, or 3
        - 'Education' (str): 'Graduate' or 'Not Graduate'
        - 'Self_Employed' (str): 'Yes' or 'No'
        - 'ApplicantIncome' (float)
        - 'CoapplicantIncome' (float)
        - 'LoanAmount' (float)
        - 'Loan_Amount_Term' (float): must be greater than 0
        - 'Credit_History' (int): 0 or 1
        - 'Property_Area' (str): 'Urban', 'Semiurban', or 'Rural'
    model (optional): Fitted classifier exposing ``predict`` and
        ``predict_proba``. When None it is loaded from 'loan_model.pkl'.
        Pass it explicitly to avoid re-reading the file on every call.
    scaler (optional): Fitted transformer exposing ``transform``. When None
        it is loaded from 'scaler.pkl'.

    Returns:
    dict: Prediction and probability with keys:
        - 'eligible' (bool): True if approved
        - 'probability' (float): The model's predicted probability for the
          second class column of ``predict_proba`` (label 1, i.e. approved)
        - 'message' (str): 'Approved' or 'Not Approved', with a heuristic
          reason appended for some rejections

    Raises:
    KeyError: If a key read by this function is missing from ``input_data``.
    ValueError: If a categorical value is not one of the documented options,
        if 'Loan_Amount_Term' is not greater than 0, or if the model expects
        a feature that cannot be derived from ``input_data``.
    """
    binary_encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Married': {'Yes': 1, 'No': 0},
        'Education': {'Graduate': 1, 'Not Graduate': 0},
        'Self_Employed': {'Yes': 1, 'No': 0},
    }
    property_areas = ('Urban', 'Semiurban', 'Rural')
    num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                'TotalIncome', 'EMI', 'BalanceIncome']

    # Validate up front: Series.map would silently turn an unknown category
    # into NaN, and an unknown property area would be encoded as Rural.
    for column, encoding in binary_encodings.items():
        value = input_data[column]
        if value not in encoding:
            raise ValueError(
                f"Invalid value {value!r} for '{column}'; expected one of {list(encoding)}"
            )
    if input_data['Property_Area'] not in property_areas:
        raise ValueError(
            f"Invalid value {input_data['Property_Area']!r} for 'Property_Area'; "
            f"expected one of {list(property_areas)}"
        )
    if not input_data['Loan_Amount_Term'] > 0:
        # A zero term would make EMI infinite; a negative one is meaningless.
        raise ValueError("'Loan_Amount_Term' must be greater than 0")

    # Load the persisted artifacts only when the caller did not supply them
    if model is None:
        model = _load_pickle('loan_model.pkl')
    if scaler is None:
        scaler = _load_pickle('scaler.pkl')

    # Create DataFrame from input
    features = pd.DataFrame([input_data])

    # Feature engineering (same as training)
    features['TotalIncome'] = features['ApplicantIncome'] + features['CoapplicantIncome']
    features['EMI'] = features['LoanAmount'] / features['Loan_Amount_Term']
    features['BalanceIncome'] = features['TotalIncome'] - (features['EMI'] * 1000)

    # Keep the unscaled figures for the rejection reasons below: after
    # standardisation a negative value only means "below the training mean".
    # EMI is multiplied by 1000 to put it in the same units as income, matching
    # the factor used for BalanceIncome above.
    total_income = float(features['TotalIncome'].iloc[0])
    emi_amount = float(features['EMI'].iloc[0]) * 1000
    balance_income = float(features['BalanceIncome'].iloc[0])

    # Convert categorical to numerical
    for column, encoding in binary_encodings.items():
        features[column] = features[column].map(encoding)

    # One-hot encoding for Property_Area (Rural is the all-zero baseline)
    features['Property_Area_Semiurban'] = (features['Property_Area'] == 'Semiurban').astype(int)
    features['Property_Area_Urban'] = (features['Property_Area'] == 'Urban').astype(int)
    features = features.drop(columns='Property_Area')

    # Scale numerical features
    features[num_cols] = scaler.transform(features[num_cols])

    # Align columns with the order seen at fit time when the model recorded it;
    # scikit-learn rejects frames whose feature names are in a different order.
    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is not None:
        expected_columns = list(expected_columns)
        missing = [name for name in expected_columns if name not in features.columns]
        if missing:
            raise ValueError(f"Model expects features that could not be derived: {missing}")
        features = features[expected_columns]

    # Make prediction
    proba = model.predict_proba(features)[0][1]
    prediction = model.predict(features)[0]
    approved = bool(prediction)

    # Prepare response
    result = {
        'eligible': approved,
        'probability': float(proba),
        'message': 'Approved' if approved else 'Not Approved'
    }

    # Add a heuristic reason for rejection, evaluated on the raw (unscaled) values
    if not approved:
        if input_data['Credit_History'] == 0:
            result['message'] += ' - Poor Credit History'
        elif balance_income < 0:
            result['message'] += ' - Insufficient Balance Income'
        elif emi_amount > 0.5 * total_income:
            result['message'] += ' - High EMI to Income Ratio'

    return result

In [46]:
# Sample applicant used to exercise the prediction helper
customer_data = {
    # Demographics
    'Gender': 'Male', 'Married': 'Yes', 'Dependents': 2,
    'Education': 'Graduate', 'Self_Employed': 'No',
    # Income and requested loan
    'ApplicantIncome': 5000, 'CoapplicantIncome': 2000,
    'LoanAmount': 150, 'Loan_Amount_Term': 360,
    # Credit record and property location
    'Credit_History': 1, 'Property_Area': 'Urban',
}

# Run the prediction and print each field of the returned dict on its own line
prediction = predict_loan_eligibility(customer_data)
print("\nPrediction Result:")
for key, value in prediction.items():
    print(f"{key}: {value}")


Prediction Result:
eligible: True
probability: 0.96
message: Approved
