In [1]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer


df=pd.read_csv('emi_data_final2.csv')
df

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi,total_expenses,expense_to_income_ratio,employment_group,affordability_ratio,credit_risk,employment_stability
0,38,Female,Married,Professional,82600,Private,0.9,Mid-size,Rented,20000.0,...,850000.0,15,Not_Eligible,500.0,59900.0,0.725182,0–2 yrs,0.006053,Medium Risk,Unstable
1,38,Female,Married,Graduate,21500,Private,7.0,MNC,Family,0.0,...,128000.0,19,Not_Eligible,700.0,15400.0,0.716279,6–10 yrs,0.032558,Medium Risk,Stable
2,38,Male,Married,Professional,86100,Private,5.8,Startup,Own,0.0,...,306000.0,16,Eligible,27775.0,35600.0,0.413473,6–10 yrs,0.322590,Medium Risk,Stable
3,58,Female,Married,High School,66800,Private,2.2,Mid-size,Own,0.0,...,304000.0,83,Eligible,16170.0,37400.0,0.559880,3–5 yrs,0.242066,Medium Risk,Moderate
4,48,Female,Married,Professional,57300,Private,3.4,Mid-size,Family,0.0,...,252000.0,7,Not_Eligible,500.0,58600.0,1.022688,3–5 yrs,0.008726,Low Risk,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400019,27,Male,Married,Graduate,32400,Private,5.0,Large Indian,Rented,10400.0,...,506000.0,47,Not_Eligible,500.0,33400.0,1.030864,3–5 yrs,0.015432,Medium Risk,Moderate
400020,38,Male,Married,Post Graduate,49200,Private,1.9,MNC,Own,0.0,...,708000.0,33,Not_Eligible,5200.0,38800.0,0.788618,0–2 yrs,0.105691,Medium Risk,Unstable
400021,32,Male,Single,Graduate,25700,Private,3.2,MNC,Rented,6300.0,...,93000.0,21,High_Risk,5665.0,15400.0,0.599222,3–5 yrs,0.220428,Medium Risk,Moderate
400022,48,Male,Married,Graduate,47200,Private,3.0,MNC,Own,0.0,...,144000.0,36,Eligible,14460.0,23100.0,0.489407,3–5 yrs,0.306356,Low Risk,Moderate


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400024 entries, 0 to 400023
Data columns (total 33 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   age                      400024 non-null  int64  
 1   gender                   400024 non-null  object 
 2   marital_status           400024 non-null  object 
 3   education                400024 non-null  object 
 4   monthly_salary           400024 non-null  int64  
 5   employment_type          400024 non-null  object 
 6   years_of_employment      400024 non-null  float64
 7   company_type             400024 non-null  object 
 8   house_type               400024 non-null  object 
 9   monthly_rent             400024 non-null  float64
 10  family_size              400024 non-null  int64  
 11  dependents               400024 non-null  int64  
 12  school_fees              400024 non-null  float64
 13  college_fees             400024 non-null  float64
 14  trav

In [3]:
categorial_cols=df.select_dtypes(include=['object']).columns 

In [4]:
for col in categorial_cols:
    print(col)

gender
marital_status
education
employment_type
company_type
house_type
existing_loans
emi_scenario
emi_eligibility
employment_group
credit_risk
employment_stability


In [5]:
for col in categorial_cols:
    print(col,':',df[col].unique())

gender : ['Female' 'Male']
marital_status : ['Married' 'Single']
education : ['Professional' 'Graduate' 'High School' 'Post Graduate' 'Unknown']
employment_type : ['Private' 'Government' 'Self-employed']
company_type : ['Mid-size' 'MNC' 'Startup' 'Large Indian' 'Small']
house_type : ['Rented' 'Family' 'Own']
existing_loans : ['Yes' 'No']
emi_scenario : ['Personal Loan EMI' 'E-commerce Shopping EMI' 'Education EMI'
 'Vehicle EMI' 'Home Appliances EMI']
emi_eligibility : ['Not_Eligible' 'Eligible' 'High_Risk']
employment_group : ['0–2 yrs' '6–10 yrs' '3–5 yrs' '20+ yrs' '11–20 yrs']
credit_risk : ['Medium Risk' 'Low Risk' 'High Risk']
employment_stability : ['Unstable' 'Stable' 'Moderate' 'Highly Stable']


In [6]:
nominal_cols = ['gender','marital_status','employment_type','company_type','house_type','existing_loans','emi_scenario']
ordinal_cols = ['education','employment_group','credit_risk','employment_stability']  

In [7]:
education_order = ['Unknown','High School','Graduate','Post Graduate','Professional']
employment_group_order = ['0–2 yrs', '6–10 yrs', '3–5 yrs', '20+ yrs', '11–20 yrs'] 
credit_risk_order=['Low Risk','Medium Risk','High Risk'] 
employment_stability_order=['Unstable','Moderate','Stable','Highly Stable']

In [8]:
ct=ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False,handle_unknown='ignore'), nominal_cols),
        ('ordinal', OrdinalEncoder(categories=[education_order, employment_group_order, credit_risk_order, employment_stability_order]), ordinal_cols)
    ], remainder='passthrough'
)

In [9]:
from sklearn.model_selection import train_test_split
X= df.drop('emi_eligibility', axis=1)
y= df['emi_eligibility']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 

In [10]:
le=LabelEncoder()

X_train_encoded=ct.fit_transform(X_train)
X_test_encoded=ct.transform(X_test)

y_train_encoded=le.fit_transform(y_train)
y_test_encoded=le.transform(y_test)
print(X_train_encoded.shape, X_test_encoded.shape, y_train_encoded.shape, y_test_encoded.shape)

(320019, 40) (80005, 40) (320019,) (80005,)


In [11]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights to handle any class imbalance
classes = np.unique(y_train_encoded)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_encoded)
class_weight = dict(zip(classes, weights))
print("Class weights:", class_weight)

Class weights: {np.int64(0): np.float64(1.8128101420705593), np.int64(1): np.float64(7.713159797541576), np.int64(2): np.float64(0.4312721098061412)}


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix

log_reg_model=LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train_encoded, y_train_encoded)

y_pred_lr=log_reg_model.predict(X_test_encoded) 

accuracy=accuracy_score(y_test_encoded, y_pred_lr)
precision=precision_score(y_test_encoded, y_pred_lr, average='weighted')
recall=recall_score(y_test_encoded, y_pred_lr, average='weighted')
f1=f1_score(y_test_encoded, y_pred_lr, average='weighted')
print("Logistic Regression Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_lr))

Logistic Regression Model Performance:
Accuracy: 0.8830
Precision: 0.8509
Recall: 0.8830
F1 Score: 0.8636

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76     14711
           1       0.15      0.01      0.01      3458
           2       0.91      0.96      0.94     61836

    accuracy                           0.88     80005
   macro avg       0.61      0.57      0.57     80005
weighted avg       0.85      0.88      0.86     80005



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.ensemble import RandomForestClassifier 
RF_model=RandomForestClassifier(n_estimators=200, random_state=42)

RF_model.fit(X_train_encoded, y_train_encoded)
y_pred_rf=RF_model.predict(X_test_encoded)


accuracy_rf=accuracy_score(y_test_encoded, y_pred_rf)
precision_rf=precision_score(y_test_encoded, y_pred_rf, average='weighted')
recall_rf=recall_score(y_test_encoded, y_pred_rf, average='weighted')
f1_rf=f1_score(y_test_encoded, y_pred_rf, average='weighted')


print("Random Forest Model Performance:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1 Score: {f1_rf:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_rf))

Random Forest Model Performance:
Accuracy: 0.9512
Precision: 0.9484
Recall: 0.9512
F1 Score: 0.9328

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     14711
           1       0.88      0.04      0.08      3458
           2       0.96      1.00      0.98     61836

    accuracy                           0.95     80005
   macro avg       0.92      0.67      0.67     80005
weighted avg       0.95      0.95      0.93     80005



In [14]:
from xgboost import XGBClassifier

# sample_weights = np.array([class_weight[i] for i in y_train_encoded])
xgb_model = XGBClassifier(
    objective='multi:softmax',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
    random_state=42,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train_encoded, y_train_encoded) 

y_pred_xgb=xgb_model.predict(X_test_encoded) 

accuracy_xgb=accuracy_score(y_test_encoded, y_pred_xgb)
precision_xgb=precision_score(y_test_encoded, y_pred_xgb, average='weighted')
recall_xgb=recall_score(y_test_encoded, y_pred_xgb, average='weighted')
f1_xgb=f1_score(y_test_encoded, y_pred_xgb, average='weighted')
print("XGBoost Model Performance:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1 Score: {f1_xgb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_xgb))

XGBoost Model Performance:
Accuracy: 0.9829
Precision: 0.9824
Recall: 0.9829
F1 Score: 0.9814

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     14711
           1       0.94      0.65      0.77      3458
           2       0.99      1.00      0.99     61836

    accuracy                           0.98     80005
   macro avg       0.96      0.88      0.91     80005
weighted avg       0.98      0.98      0.98     80005



In [15]:
import pickle 
model_filename='CL_XGB_model.pkl'
model_filename2='CL_RF_model.pkl'
model_filename3='CL_Log_reg_model.pkl'

with open(model_filename, 'wb') as f:
    pickle.dump(xgb_model, f)

with open(model_filename2, 'wb') as f:
    pickle.dump(RF_model, f)

with open(model_filename3, 'wb') as f:
    pickle.dump(log_reg_model, f)

In [16]:
df

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi,total_expenses,expense_to_income_ratio,employment_group,affordability_ratio,credit_risk,employment_stability
0,38,Female,Married,Professional,82600,Private,0.9,Mid-size,Rented,20000.0,...,850000.0,15,Not_Eligible,500.0,59900.0,0.725182,0–2 yrs,0.006053,Medium Risk,Unstable
1,38,Female,Married,Graduate,21500,Private,7.0,MNC,Family,0.0,...,128000.0,19,Not_Eligible,700.0,15400.0,0.716279,6–10 yrs,0.032558,Medium Risk,Stable
2,38,Male,Married,Professional,86100,Private,5.8,Startup,Own,0.0,...,306000.0,16,Eligible,27775.0,35600.0,0.413473,6–10 yrs,0.322590,Medium Risk,Stable
3,58,Female,Married,High School,66800,Private,2.2,Mid-size,Own,0.0,...,304000.0,83,Eligible,16170.0,37400.0,0.559880,3–5 yrs,0.242066,Medium Risk,Moderate
4,48,Female,Married,Professional,57300,Private,3.4,Mid-size,Family,0.0,...,252000.0,7,Not_Eligible,500.0,58600.0,1.022688,3–5 yrs,0.008726,Low Risk,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400019,27,Male,Married,Graduate,32400,Private,5.0,Large Indian,Rented,10400.0,...,506000.0,47,Not_Eligible,500.0,33400.0,1.030864,3–5 yrs,0.015432,Medium Risk,Moderate
400020,38,Male,Married,Post Graduate,49200,Private,1.9,MNC,Own,0.0,...,708000.0,33,Not_Eligible,5200.0,38800.0,0.788618,0–2 yrs,0.105691,Medium Risk,Unstable
400021,32,Male,Single,Graduate,25700,Private,3.2,MNC,Rented,6300.0,...,93000.0,21,High_Risk,5665.0,15400.0,0.599222,3–5 yrs,0.220428,Medium Risk,Moderate
400022,48,Male,Married,Graduate,47200,Private,3.0,MNC,Own,0.0,...,144000.0,36,Eligible,14460.0,23100.0,0.489407,3–5 yrs,0.306356,Low Risk,Moderate


In [17]:
df.columns 

Index(['age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure', 'emi_eligibility',
       'max_monthly_emi', 'total_expenses', 'expense_to_income_ratio',
       'employment_group', 'affordability_ratio', 'credit_risk',
       'employment_stability'],
      dtype='object')

In [18]:
X.columns

Index(['age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure', 'max_monthly_emi',
       'total_expenses', 'expense_to_income_ratio', 'employment_group',
       'affordability_ratio', 'credit_risk', 'employment_stability'],
      dtype='object')

In [19]:
import pickle 

In [20]:
# Save ColumnTransformer
with open("CL_ct.pkl", "wb") as f:
    pickle.dump(ct, f)

# Save LabelEncoder
with open("CL_le.pkl", "wb") as f:
    pickle.dump(le, f)


In [22]:
df['credit_risk'].unique()

array(['Medium Risk', 'Low Risk', 'High Risk'], dtype=object)