In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset, using the loan_id column as the row index
loan_data = pd.read_csv('../data/loan_approval_dataset.csv', index_col='loan_id')

# Data Preprocessing
# Convert loan_status to binary (Approved -> 1, Rejected -> 0).
# Series.map() with a dict turns every label that is not a key into NaN
# without complaint, so the encoded target is checked before it is written
# back: bad or missing labels are reported here, by value, instead of
# reaching model training as NaN targets.
raw_loan_status = loan_data['loan_status']
encoded_loan_status = raw_loan_status.map({'Approved': 1, 'Rejected': 0})
unmapped_status = encoded_loan_status.isna()
if unmapped_status.any():
    bad_labels = raw_loan_status[unmapped_status].unique().tolist()
    raise ValueError(
        f"loan_status has {int(unmapped_status.sum())} row(s) that are not "
        f"'Approved' or 'Rejected': {bad_labels!r}"
    )
loan_data['loan_status'] = encoded_loan_status

# Convert categorical variables to numeric
# NOTE(review): any label other than the keys below (including whitespace or
# case variants) becomes NaN in these columns; they are left unchecked on
# purpose because neither column is part of the selected model features.
loan_data['education'] = loan_data['education'].map({'Graduate': 1, 'NotGraduate': 0})
loan_data['self_employed'] = loan_data['self_employed'].map({'Yes': 1, 'No': 0})

# Feature Engineering
# All derived columns are plain element-wise arithmetic on existing columns.
# NOTE(review): nothing here guards against a zero denominator (income_annum,
# loan_term or total_assets); pandas would produce inf/NaN for such rows.

# Loan amount relative to annual income
loan_data['loan_to_income_ratio'] = loan_data['loan_amount'].div(loan_data['income_annum'])

# Loan amount spread evenly over loan_term * 12 instalments
# (presumably loan_term is expressed in years - confirm against the dataset)
loan_data['emi'] = loan_data['loan_amount'].div(loan_data['loan_term'].mul(12))

# Sum of the four asset value columns
loan_data['total_assets'] = (
    loan_data['residential_assets_value']
    .add(loan_data['commercial_assets_value'])
    .add(loan_data['luxury_assets_value'])
    .add(loan_data['bank_asset_value'])
)

# Loan amount relative to the combined asset value
loan_data['loan_to_assets_ratio'] = loan_data['loan_amount'].div(loan_data['total_assets'])

# Annual income left after twelve instalments
loan_data['balance_income'] = loan_data['income_annum'].sub(loan_data['emi'].mul(12))

# Feature subset chosen from the results of step 1
selected_features = [
    'loan_term',
    'cibil_score',
    'loan_to_income_ratio',
    'emi',
    'loan_to_assets_ratio',
    'balance_income',
]
X = loan_data.loc[:, selected_features]
y = loan_data.loc[:, 'loan_status']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the Random Forest with the best parameters and fit it on the scaled
# training split (fit() returns the estimator itself, so the call is chained)
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=231,
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=4,
).fit(X_train_scaled, y_train)

# Score the fitted model on the held-out test split
y_pred = rf_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
# Heading and report are emitted on separate lines via sep="\n"
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

# Save the model and scaler
# Create the output directory first so that a missing '../models' folder does
# not make the script fail at the very last step, after training has finished.
os.makedirs('../models', exist_ok=True)
joblib.dump(rf_model, '../models/loan_eligibility_model.joblib')
joblib.dump(scaler, '../models/feature_scaler.joblib')

print("Model and scaler have been saved successfully.")