In [10]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, recall_score, roc_auc_score, classification_report



In [11]:
train_path = '/home/danial/Data Science/Credit Risk Analysis/data/processed/Final/train.csv'
val_path = '/home/danial/Data Science/Credit Risk Analysis/data/processed/Final/val.csv'

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

train_df.head()

Unnamed: 0.1,Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,fe_recent_delay,fe_underpaid_last_month,fe_high_utilization_flag,SEX_2,EDUCATION_2,EDUCATION_3,EDUCATION_4,MARRIAGE_2,MARRIAGE_3,default.payment.next.month
0,8291,-0.905498,-0.812074,0.014861,0.111736,0.138865,0.188746,0.234917,0.253137,-0.03875,...,0.014861,0.524322,1.661144,0.810161,1.0669,-0.442752,-0.125886,0.937672,-0.112812,0
1,17433,0.327685,1.032193,0.904712,-1.558876,-1.532192,-1.521944,-1.530046,-1.486041,-0.695642,...,0.904712,-1.907224,-0.601995,0.810161,1.0669,-0.442752,-0.125886,0.937672,-0.112812,0
2,22227,1.715017,-0.812074,0.014861,0.111736,0.138865,0.188746,0.234917,0.253137,0.128658,...,0.014861,0.524322,-0.601995,0.810161,-0.937295,-0.442752,-0.125886,-1.066471,-0.112812,0
3,29239,-1.13672,-1.24602,0.014861,0.111736,0.138865,0.188746,-1.530046,-1.486041,-0.479168,...,0.014861,0.524322,-0.601995,-1.234323,1.0669,-0.442752,-0.125886,0.937672,-0.112812,0
4,5383,-1.13672,-1.137534,0.904712,1.782348,2.64545,1.899436,0.234917,0.253137,-0.469009,...,0.904712,0.524322,1.661144,-1.234323,-0.937295,-0.442752,-0.125886,0.937672,-0.112812,1


In [12]:
target_col = 'default.payment.next.month'

X_train = train_df.drop(columns = [target_col])
y_train = train_df[target_col]

X_val = val_df.drop(columns = [target_col])
y_val = val_df[target_col]

X_train.shape , y_train.shape , X_val.shape , y_val.shape

((24000, 48), (24000,), (3000, 48), (3000,))

In [14]:
models = {
    "Logistic Regression" : LogisticRegression(max_iter = 1000 , class_weight = 'balanced') , 
    "Decesion Tree" : DecisionTreeClassifier(random_state = 7 , class_weight = 'balanced') , 
    "Naive Bayes" : GaussianNB()
}

In [15]:
results = {}

for name, model in models.items():

    model.fit(X_train, y_train)
    

    y_pred = model.predict(X_val)
    

    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None


    f1 = f1_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_proba) if y_proba is not None else None


    results[name] = {
        "F1-score": f1,
        "Recall": recall,
        "ROC-AUC": roc_auc
    }


    print(f"\n=== {name} ===")
    print(classification_report(y_val, y_pred, digits=4))
    if roc_auc is not None:
        print("ROC-AUC:", round(roc_auc, 4))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression ===
              precision    recall  f1-score   support

           0     0.8828    0.7899    0.8338      2337
           1     0.4598    0.6305    0.5318       663

    accuracy                         0.7547      3000
   macro avg     0.6713    0.7102    0.6828      3000
weighted avg     0.7894    0.7547    0.7670      3000

ROC-AUC: 0.7832

=== Decesion Tree ===
              precision    recall  f1-score   support

           0     0.8274    0.8327    0.8300      2337
           1     0.3966    0.3876    0.3921       663

    accuracy                         0.7343      3000
   macro avg     0.6120    0.6102    0.6110      3000
weighted avg     0.7322    0.7343    0.7332      3000

ROC-AUC: 0.6102

=== Naive Bayes ===
              precision    recall  f1-score   support

           0     0.9096    0.3188    0.4721      2337
           1     0.2701    0.8884    0.4142       663

    accuracy                         0.4447      3000
   macro avg     0.5899

In [16]:

results_df = pd.DataFrame(results).T

print("\n📊 Summary of Results:")
print(results_df)


best_model_recall = results_df["Recall"].idxmax()
best_model_f1 = results_df["F1-score"].idxmax()

print("\n✅ Best Model (Recall):", best_model_recall)
print("✅ Best Model (F1):", best_model_f1)



📊 Summary of Results:
                     F1-score    Recall   ROC-AUC
Logistic Regression  0.531807  0.630468  0.783169
Decesion Tree        0.392067  0.387632  0.610162
Naive Bayes          0.414205  0.888386  0.747318

✅ Best Model (Recall): Naive Bayes
✅ Best Model (F1): Logistic Regression
