In [6]:
# --- Cell 1: Imports ---
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [7]:
# Directory holding the processed train/validation splits; change it here
# (one place) if the data is relocated.
DATA_DIR = Path('/home/danial/Data Science/Credit Risk Analysis/data/processed/Final')

train = pd.read_csv(DATA_DIR / 'train.csv')
val = pd.read_csv(DATA_DIR / 'val.csv')

target_col = 'default.payment.next.month'

# Fail fast with a readable message if either split lacks the label column,
# instead of a bare KeyError from the drop/indexing below.
splits_missing_target = [
    split_name
    for split_name, frame in (("train", train), ("val", val))
    if target_col not in frame.columns
]
if splits_missing_target:
    raise KeyError(
        f"target column {target_col!r} not found in split(s): {splits_missing_target}"
    )

# Features are every column except the label. `columns=` alone selects the
# axis, so no extra `axis=1` is needed.
X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_val, y_val = val.drop(columns=[target_col]), val[target_col]



In [8]:
# Every feature column of X_train is routed through the scaler; no column is
# handled separately in this baseline.
num_features = X_train.columns.tolist()

# Single transformer entry: (name, transformer, columns it applies to).
scaling_step = ("num", StandardScaler(), num_features)

# Columns not listed in `num_features` are discarded by remainder="drop".
preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder="drop",
)


In [9]:

# Each baseline gets its own clone of the shared preprocessor so the two
# pipelines never share a fitted scaler.
logreg_steps = [
    ("preprocessor", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=2000, random_state=7)),
]
gnb_steps = [
    ("preprocessor", clone(preprocessor)),
    ("clf", GaussianNB()),
]

# Baseline 1: scaled features -> logistic regression.
pipeline_lr = Pipeline(steps=logreg_steps)

# Baseline 2: scaled features -> Gaussian naive Bayes.
pipeline_gnb = Pipeline(steps=gnb_steps)


In [10]:
# Fit both baseline pipelines (preprocessor + classifier) on the training split.
# Only X_train / y_train are used here; the validation split is not touched.
pipeline_lr.fit(X_train, y_train)
# This is the cell's last expression, so the fitted GNB pipeline returned by
# fit() is what the notebook renders as the cell output.
pipeline_gnb.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,priors,
,var_smoothing,1e-09


In [11]:

def _acc_f1_summary(y_true_train, y_hat_train, y_true_val, y_hat_val):
    """Summarise accuracy and F1 on the train and validation splits.

    Parameters
    ----------
    y_true_train, y_true_val : array-like
        Ground-truth labels for the train / validation split.
    y_hat_train, y_hat_val : array-like
        Predicted labels for the matching split.

    Returns
    -------
    dict
        Keys ``train_acc``, ``val_acc``, ``train_f1``, ``val_f1``. Values are
        cast to built-in ``float`` so they print cleanly and serialise to JSON
        regardless of whether the metric functions return NumPy scalars.
    """
    return {
        "train_acc": float(accuracy_score(y_true_train, y_hat_train)),
        "val_acc": float(accuracy_score(y_true_val, y_hat_val)),
        # f1_score is called with its defaults, exactly as before.
        "train_f1": float(f1_score(y_true_train, y_hat_train)),
        "val_f1": float(f1_score(y_true_val, y_hat_val)),
    }


# Hard label predictions from each fitted baseline on both splits.
y_pred_train_lr = pipeline_lr.predict(X_train)
y_pred_val_lr   = pipeline_lr.predict(X_val)

y_pred_train_gnb = pipeline_gnb.predict(X_train)
y_pred_val_gnb   = pipeline_gnb.predict(X_val)

# Same metric set for both models, computed by one helper instead of two
# copy-pasted dict literals.
metrics_lr = _acc_f1_summary(y_train, y_pred_train_lr, y_val, y_pred_val_lr)
metrics_gnb = _acc_f1_summary(y_train, y_pred_train_gnb, y_val, y_pred_val_gnb)

print("LR metrics:", metrics_lr)
print("GNB metrics:", metrics_gnb)


LR metrics: {'train_acc': 0.8069583333333333, 'val_acc': 0.8153333333333334, 'train_f1': 0.3969803462189249, 'val_f1': 0.4192872117400419}
GNB metrics: {'train_acc': 0.399625, 'val_acc': 0.41533333333333333, 'train_f1': 0.39673435210383085, 'val_f1': 0.40823211875843457}


In [13]:
# 5-fold stratified cross-validation on the training split, scored with F1.
# cross_val_score and StratifiedKFold come from the notebook's import cell.
# The splitter shuffles with a fixed seed and the same `cv` object is passed
# to both calls.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One F1 score per fold for each baseline pipeline.
cv_lr = cross_val_score(pipeline_lr, X_train, y_train, cv=cv, scoring="f1")
cv_gnb = cross_val_score(pipeline_gnb, X_train, y_train, cv=cv, scoring="f1")

print("LR CV mean F1:", cv_lr.mean())
print("GNB CV mean F1:", cv_gnb.mean())


LR CV mean F1: 0.3967051560650045
GNB CV mean F1: 0.39910957765664967


In [16]:


# Kept as a plain string (with trailing slash) so any code that still does
# `model_dir + "<file>"` keeps working.
model_dir = "/home/danial/Data Science/Credit Risk Analysis/models/"

# Create the output directory if it is missing; joblib.dump and open() would
# otherwise fail with FileNotFoundError on a fresh checkout.
model_path = Path(model_dir)
model_path.mkdir(parents=True, exist_ok=True)

# Persist the fitted pipelines (preprocessor + classifier together).
# NOTE: these are pickle-based files; only load them from trusted sources.
joblib.dump(pipeline_lr, model_path / "pipeline_logreg_v1.pkl")
joblib.dump(pipeline_gnb, model_path / "pipeline_gnb_v1.pkl")

# Store the train/validation metrics of both baselines next to the models.
all_metrics = {"LR": metrics_lr, "GNB": metrics_gnb}
with open(model_path / "baseline_metrics_v1.json", "w") as f:
    json.dump(all_metrics, f, indent=2)

print("Models and metrics saved in:", model_dir)


Models and metrics saved in: /home/danial/Data Science/Credit Risk Analysis/models/
