In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, f1_score


In [3]:
train_path = "/home/danial/Data Science/Credit Risk Analysis/data/processed/Final/train.csv"
df = pd.read_csv(train_path)

target_col = "default.payment.next.month"


X = df.drop(columns=[target_col])
y = df[target_col]


In [4]:
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)


scoring = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score)
}


In [5]:
log_reg = LogisticRegression(C=0.01, solver='lbfgs', max_iter=10000)


acc_scores_lr = cross_val_score(log_reg, X, y, cv=skf, scoring='accuracy')
f1_scores_lr = cross_val_score(log_reg, X, y, cv=skf, scoring=make_scorer(f1_score))

print("Logistic Regression - Accuracy: %.4f ± %.4f" % (acc_scores_lr.mean(), acc_scores_lr.std()))
print("Logistic Regression - F1-Score: %.4f ± %.4f" % (f1_scores_lr.mean(), f1_scores_lr.std()))


Logistic Regression - Accuracy: 0.8077 ± 0.0022
Logistic Regression - F1-Score: 0.3942 ± 0.0121


In [6]:
gnb = GaussianNB(var_smoothing=1e-07)

acc_scores_gnb = cross_val_score(gnb, X, y, cv=skf, scoring='accuracy')
f1_scores_gnb = cross_val_score(gnb, X, y, cv=skf, scoring=make_scorer(f1_score))

print("GaussianNB - Accuracy: %.4f ± %.4f" % (acc_scores_gnb.mean(), acc_scores_gnb.std()))
print("GaussianNB - F1-Score: %.4f ± %.4f" % (f1_scores_gnb.mean(), f1_scores_gnb.std()))


GaussianNB - Accuracy: 0.8022 ± 0.0019
GaussianNB - F1-Score: 0.3273 ± 0.0217


In [8]:
if acc_scores_lr.mean() > acc_scores_gnb.mean():
    print("✅ Logistic Regression performs better based on cross-validation.")
    best_model = log_reg
else:
    print("✅ GaussianNB performs better based on cross-validation.")
    best_model = gnb


✅ Logistic Regression performs better based on cross-validation.


In [None]:
from joblib import dump


train_df = pd.read_csv("/home/danial/Data Science/Credit Risk Analysis/data/processed/Final/train.csv")
val_df = pd.read_csv("/home/danial/Data Science/Credit Risk Analysis/data/processed/Final/val.csv")


full_train_df = pd.concat([train_df, val_df], ignore_index=True)


X_full = full_train_df.drop(columns=['default'])
y_full = full_train_df['default']


log_reg = LogisticRegression(C=0.01, solver='lbfgs', max_iter=10000)


log_reg.fit(X_full, y_full)


dump(log_reg, "/home/danial/Data Science/Credit Risk Analysis/models/logistic_regression_final.joblib")

print("✅ Final Logistic Regression model has been trained on Train+Val and saved successfully.")
