In [1]:
import pandas as pd

df = pd.read_csv('C:\\Users\\DVK\\Desktop\\EMIPredict AI\\data\\processed\\emi_prediction_dataset_cleaned.csv')
df.head()

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan EMI,850000.0,15.0,Not_Eligible,500.0
1,38,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19.0,Not_Eligible,700.0
2,38,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education EMI,306000.0,16.0,Eligible,27775.0
3,58,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle EMI,304000.0,83.0,Eligible,16170.0
4,48,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances EMI,252000.0,7.0,Not_Eligible,500.0


In [2]:
# ================================
# FEATURE ENGINEERING - EMI PROJECT
# ================================

import numpy as np
import pandas as pd

# ---------- 1. TOTAL MONTHLY EXPENSE ----------
df['total_monthly_expense'] = (
    df['school_fees'] +
    df['college_fees'] +
    df['travel_expenses'] +
    df['groceries_utilities'] +
    df['other_monthly_expenses']
)

# ---------- 2. DEBT TO INCOME RATIO (DTI) ----------
df['debt_to_income'] = (
    df['current_emi_amount'] + df['total_monthly_expense']
) / df['monthly_salary']

# ---------- 3. EXPENSE TO INCOME RATIO ----------
df['expense_to_income'] = (
    df['total_monthly_expense'] / df['monthly_salary']
)

# ---------- 4. DISPOSABLE INCOME ----------
df['disposable_income'] = (
    df['monthly_salary'] -
    (df['current_emi_amount'] + df['total_monthly_expense'])
)

# ---------- 5. AFFORDABILITY RATIO ----------
df['affordability_ratio'] = (
    df['disposable_income'] / df['monthly_salary']
)

# ---------- 6. EMI BURDEN RATIO ----------
df['emi_burden_ratio'] = (
    df['current_emi_amount'] / df['monthly_salary']
)

# ---------- 7. EMPLOYMENT STABILITY SCORE ----------
df['employment_stability_score'] = np.where(
    df['years_of_employment'] >= 5, 1,
    np.where(df['years_of_employment'] >= 2, 0.5, 0)
)

# ---------- 8. CREDIT SCORE NORMALIZATION ----------
df['credit_score_norm'] = df['credit_score'] / 850

# ---------- 9. EMERGENCY FUND RATIO ----------
df['emergency_fund_ratio'] = (
    df['emergency_fund'] / df['monthly_salary']
)

# ---------- 10. COMPOSITE FINANCIAL RISK SCORE ----------
df['financial_risk_score'] = (
    0.35 * df['debt_to_income'] +
    0.25 * df['emi_burden_ratio'] +
    0.20 * (1 - df['credit_score_norm']) +
    0.20 * (1 - df['employment_stability_score'])
)

# ---------- 11. HANDLE INF & NA VALUES ----------
df.replace([np.inf, -np.inf], 0, inplace=True)
df.fillna(0, inplace=True)

# ---------- 12. FINAL CHECK ----------
df[['debt_to_income',
    'affordability_ratio',
    'financial_risk_score']].describe()


Unnamed: 0,debt_to_income,affordability_ratio,financial_risk_score
count,187767.0,187767.0,187767.0
mean,0.758152,0.241848,0.421354
std,0.550262,0.550262,0.232281
min,0.011144,-19.430448,-0.065987
25%,0.522644,0.14433,0.303804
50%,0.670659,0.329341,0.400029
75%,0.85567,0.477356,0.500904
max,20.430448,0.988856,8.154311


In [3]:
# =====================================
# ENCODING & SCALING - EMI PROJECT
# =====================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ---------- 1. COPY DATA ----------
df_model = df.copy()

# ---------- 2. TARGET VARIABLES ----------
y_classification = df_model['emi_eligibility']      # Classification target
y_regression = df_model['max_monthly_emi']          # Regression target

# ---------- 3. DROP TARGETS FROM FEATURES ----------
X = df_model.drop(['emi_eligibility', 'max_monthly_emi'], axis=1)

# ---------- 4. LABEL ENCODING (TARGET) ----------
le_target = LabelEncoder()
y_classification = le_target.fit_transform(y_classification)

# Save mapping for Streamlit
label_mapping = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))

# ---------- 5. ENCODE CATEGORICAL FEATURES ----------
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# ---------- 6. SCALING NUMERICAL FEATURES ----------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------- 7. FINAL DATA SHAPE CHECK ----------
print("Features shape:", X_scaled.shape)
print("Classification target shape:", y_classification.shape)
print("Regression target shape:", y_regression.shape)

# ---------- 8. SAMPLE CHECK ----------
pd.DataFrame(X_scaled, columns=X.columns).head()


Features shape: (187767, 46)
Classification target shape: (187767,)
Regression target shape: (187767,)


Unnamed: 0,age,monthly_salary,years_of_employment,monthly_rent,family_size,dependents,school_fees,college_fees,travel_expenses,groceries_utilities,...,company_type_Mid-size,company_type_Small,company_type_Startup,house_type_Own,house_type_Rented,existing_loans_Yes,emi_scenario_Education EMI,emi_scenario_Home Appliances EMI,emi_scenario_Personal Loan EMI,emi_scenario_Vehicle EMI
0,-0.091098,0.533969,-0.734212,1.648183,0.05792,0.05792,-0.911441,-0.554644,0.445834,0.959485,...,1.730668,-0.230156,-0.420119,-0.740542,1.22958,1.224321,-0.500413,-0.500296,2.005723,-0.499572
1,-0.091098,-0.877852,0.270131,-0.669574,-0.870809,-0.870809,0.096215,-0.554644,-1.264786,-1.060891,...,-0.577811,-0.230156,-0.420119,-0.740542,-0.813286,1.224321,-0.500413,-0.500296,-0.498573,-0.499572
2,-0.091098,0.614843,0.072556,-0.669574,0.986648,0.986648,-0.911441,-0.554644,1.330637,0.945156,...,-0.577811,-0.230156,2.38028,1.350363,-0.813286,-0.816779,1.998351,-0.500296,-0.498573,-0.499572
3,2.06184,0.168883,-0.520172,-0.669574,1.915376,1.915376,1.340967,-0.554644,0.150899,-0.129512,...,1.730668,-0.230156,-0.420119,1.350363,-0.813286,-0.816779,-0.500413,-0.500296,-0.498573,2.001713
4,0.985371,-0.050631,-0.322596,-0.669574,0.986648,0.986648,0.945807,2.359161,-0.61593,0.486631,...,1.730668,-0.230156,-0.420119,-0.740542,-0.813286,-0.816779,-0.500413,1.998816,-0.498573,-0.499572


In [4]:
# =====================================
# TRAIN - TEST SPLIT (CLASS + REG)
# =====================================

from sklearn.model_selection import train_test_split

# ---------- 1. CLASSIFICATION SPLIT ----------
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_scaled,
    y_classification,
    test_size=0.2,
    random_state=42,
    stratify=y_classification
)

# ---------- 2. REGRESSION SPLIT ----------
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_scaled,
    y_regression,
    test_size=0.2,
    random_state=42
)

# ---------- 3. SHAPE VERIFICATION ----------
print("Classification Train:", X_train_c.shape, y_train_c.shape)
print("Classification Test :", X_test_c.shape, y_test_c.shape)

print("Regression Train:", X_train_r.shape, y_train_r.shape)
print("Regression Test :", X_test_r.shape, y_test_r.shape)


Classification Train: (150213, 46) (150213,)
Classification Test : (37554, 46) (37554,)
Regression Train: (150213, 46) (150213,)
Regression Test : (37554, 46) (37554,)


In [5]:
# =====================================
# CLASSIFICATION MODELS - EMI PROJECT
# =====================================

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# ---------- 1. LOGISTIC REGRESSION ----------
log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train_c, y_train_c)
y_pred_lr = log_reg.predict(X_test_c)
y_prob_lr = log_reg.predict_proba(X_test_c)[:, 1]   # ✅ CORRECT


# ---------- 2. RANDOM FOREST CLASSIFIER ----------
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_clf.fit(X_train_c, y_train_c)
y_pred_rf = rf_clf.predict(X_test_c)
y_prob_rf = rf_clf.predict_proba(X_test_c)[:, 1]

# ---------- 3. XGBOOST CLASSIFIER ----------
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)

xgb_clf.fit(X_train_c, y_train_c)
y_pred_xgb = xgb_clf.predict(X_test_c)
y_prob_xgb = xgb_clf.predict_proba(X_test_c)[:, 1]

# ---------- 4. EVALUATION FUNCTION ----------
def evaluate_model(y_true, y_pred, y_prob, model_name):
    print(f"\n{model_name}")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='weighted'))
    print("Recall   :", recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score :", f1_score(y_true, y_pred, average='weighted'))

    # ---- SAFE ROC-AUC HANDLING ----
    try:
        # Multiclass case (probability matrix)
        if y_prob.ndim == 2 and y_prob.shape[1] > 2:
            roc_auc = roc_auc_score(y_true, y_prob, multi_class='ovr')
        # Binary case (single probability vector)
        else:
            roc_auc = roc_auc_score(y_true, y_prob)
        print("ROC-AUC  :", roc_auc)
    except Exception as e:
        print("ROC-AUC  : Not computable", "| Reason:", e)
# ---------- 5. MODEL EVALUATION ----------
evaluate_model(y_test_c, y_pred_lr, y_prob_lr, "Logistic Regression")
evaluate_model(y_test_c, y_pred_rf, y_prob_rf, "Random Forest")
evaluate_model(y_test_c, y_pred_xgb, y_prob_xgb, "XGBoost")



Logistic Regression
Accuracy : 0.8919688981200404
Precision: 0.8591591690017035
Recall   : 0.8919688981200404
F1 Score : 0.8725317784369375
ROC-AUC  : Not computable | Reason: multi_class must be in ('ovo', 'ovr')

Random Forest
Accuracy : 0.8991851733503755
Precision: 0.8583382225494609
Recall   : 0.8991851733503755
F1 Score : 0.8768900125946196
ROC-AUC  : Not computable | Reason: multi_class must be in ('ovo', 'ovr')

XGBoost
Accuracy : 0.9518293657133727
Precision: 0.9432934426554551
Recall   : 0.9518293657133727
F1 Score : 0.935366363942379
ROC-AUC  : Not computable | Reason: multi_class must be in ('ovo', 'ovr')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
# =====================================
# REGRESSION MODELS - EMI PROJECT
# =====================================

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

import numpy as np

# ---------- 1. LINEAR REGRESSION ----------
lr_reg = LinearRegression()

lr_reg.fit(X_train_r, y_train_r)
y_pred_lr = lr_reg.predict(X_test_r)

# ---------- 2. RANDOM FOREST REGRESSOR ----------
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_reg.fit(X_train_r, y_train_r)
y_pred_rf = rf_reg.predict(X_test_r)

# ---------- 3. XGBOOST REGRESSOR ----------
xgb_reg = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_reg.fit(X_train_r, y_train_r)
y_pred_xgb = xgb_reg.predict(X_test_r)

# ---------- 4. EVALUATION FUNCTION ----------
def evaluate_regression(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name}")
    print("RMSE :", rmse)
    print("MAE  :", mae)
    print("R²   :", r2)

# ---------- 5. MODEL EVALUATION ----------
evaluate_regression(y_test_r, y_pred_lr, "Linear Regression")
evaluate_regression(y_test_r, y_pred_rf, "Random Forest Regressor")
evaluate_regression(y_test_r, y_pred_xgb, "XGBoost Regressor")



Linear Regression
RMSE : 4158.652735934089
MAE  : 2957.0045622213356
R²   : 0.717254055042997

Random Forest Regressor
RMSE : 1366.1830434841497
MAE  : 651.9552861220453
R²   : 0.9694853176438478

XGBoost Regressor
RMSE : 910.9867942482056
MAE  : 406.75234495602996
R²   : 0.9864320187449065


In [7]:
import pickle



In [8]:
from xgboost import XGBClassifier


model = XGBClassifier(n_estimators=200)
model.fit(X_train_c, y_train_c)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [9]:
with open("emi_eligibility_model.pkl", "wb") as file:
    pickle.dump(model, file)


In [10]:

with open("emi_eligibility_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)


In [11]:
prediction = loaded_model.predict(X_test_c)


In [12]:
from xgboost import XGBRegressor


model = XGBRegressor(n_estimators=200)
model.fit(X_train_c, y_train_c)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
with open("max_emi_model.pkl", "wb") as file:
    pickle.dump(model, file)


In [14]:

with open("max_emi_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)


In [15]:
prediction = loaded_model.predict(X_test_c)


In [16]:
# ===============================
# Classification Data Split
# ===============================

from sklearn.model_selection import train_test_split

# Features & Target
X_c = df.drop(columns=["emi_eligibility", "max_monthly_emi"])
y_c = df["emi_eligibility"]

# Train-Test Split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c,
    y_c,
    test_size=0.2,
    random_state=42,
    stratify=y_c
)
