In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import mlflow
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import os

In [None]:
# load the loan data, separate the target column from the features,
# and hold out a stratified 20% test set using a fixed seed
seed = 666
loans = pd.read_csv("../data/loan_data.csv")
y = loans["loan_status"]
X = loans.drop(columns="loan_status")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
X_train.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
10591,24.0,male,Master,84007.0,2,MORTGAGE,7200.0,MEDICAL,7.88,0.09,2.0,601,Yes
20512,27.0,male,Bachelor,60018.0,3,RENT,4800.0,DEBTCONSOLIDATION,14.65,0.08,7.0,676,Yes
42547,24.0,female,Bachelor,71962.0,0,RENT,9958.0,MEDICAL,14.71,0.14,4.0,674,No
1526,26.0,male,High School,49040.0,4,RENT,1400.0,VENTURE,11.86,0.03,4.0,611,Yes
33678,26.0,male,Associate,116414.0,2,MORTGAGE,5000.0,VENTURE,11.84,0.04,5.0,661,No


In [3]:
# preprocessing step: one hot encode every object-dtype column (dropping the first
# level of each) and pass all remaining columns through unchanged
categorical_features = X.select_dtypes(include="object").columns

ohe_step = ("ohe", OneHotEncoder(drop="first"), categorical_features)
preprocessor = ColumnTransformer(
    transformers=[ohe_step],
    remainder="passthrough",
)

In [4]:
# dictionary of models to be compared, keyed by the short name that is also used
# as the pipeline step name and as the prefix of the hyperparameter keys
# every estimator receives the notebook-wide seed so that a fresh
# "restart and run all" reproduces the same fitted models and scores
models = {
    "logreg": LogisticRegression(random_state=seed),
    "rf": RandomForestClassifier(random_state=seed),
    "xgb": xgb.XGBClassifier(random_state=seed),
}

In [5]:
# define model preprocessing and evaluation pipelines
def create_pipeline(name, model):
    """Assemble the modelling pipeline for a single estimator.

    The pipeline applies the notebook-level ``preprocessor``, then a
    ``StandardScaler``, then ``model``. The final step is registered under
    ``name``, so hyperparameters are addressed as ``f"{name}__<param>"``.

    Parameters
    ----------
    name : str
        Step name given to the estimator.
    model : estimator
        The estimator placed at the end of the pipeline.

    Returns
    -------
    Pipeline
        The unfitted three-step pipeline.
    """
    stages = [
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        (name, model),
    ]
    return Pipeline(steps=stages)

def fit_evaluate_log(name, model):
    """Fit a hyperparameter search, score it on the test set, and log the run to MLflow.

    The run is recorded in the experiment ``f"{name}_model_tuning"``. It logs
    the model type, the winning hyperparameters, the search's best
    cross-validation score, and the test-set accuracy, precision, recall and
    F1 (each rounded to two decimals).

    Parameters
    ----------
    name : str
        Short model identifier; used for the experiment name and the
        ``model_type`` parameter.
    model : fitted-on-call search object
        An unfitted search (e.g. ``RandomizedSearchCV``) that exposes ``fit``,
        ``best_estimator_``, ``best_params_`` and ``best_score_``.

    Returns
    -------
    tuple
        ``(f1, best_params)``: the rounded test-set F1 and the dictionary of
        best hyperparameters found by the search.
    """
    mlflow.set_experiment(f"{name}_model_tuning")
    with mlflow.start_run():
        model.fit(X_train, y_train)
        y_pred = model.best_estimator_.predict(X_test)
        metrics = {
            "accuracy": round(accuracy_score(y_test, y_pred), 2),
            "precision": round(precision_score(y_test, y_pred), 2),
            "recall": round(recall_score(y_test, y_pred), 2),
            "f1": round(f1_score(y_test, y_pred), 2),
        }
        mlflow.log_param("model_type", name)
        # record what the tuning actually selected, so a run can be reproduced
        # from the tracking store alone
        mlflow.log_params(model.best_params_)
        mlflow.log_metrics(metrics)
        # mean cross-validated score of the winning candidate, kept next to the
        # test-set metrics so the two can be compared per run
        mlflow.log_metric("best_cv_score", model.best_score_)
        mlflow.set_tag("Model tuning", "Hyperparameter tuning")
    return metrics["f1"], model.best_params_

In [6]:
# hyperparameter search spaces for logistic regression, random forest and xgboost
# keys carry the pipeline step name as a prefix ("<step>__<param>")
# scipy's uniform(loc, scale) samples from [loc, loc + scale]
logreg_params = dict(
    logreg__max_iter=np.arange(100, 1100, 100),
    logreg__tol=[0.0001, 0.001, 0.01],
    logreg__C=np.logspace(-4, 4, 20),
)
rf_params = dict(
    rf__n_estimators=randint(200, 800),
    rf__max_depth=randint(20, 100),
    rf__min_samples_split=randint(2, 10),
    rf__min_samples_leaf=randint(2, 10),
)
xgb_params = dict(
    xgb__n_estimators=randint(200, 800),
    xgb__max_depth=randint(20, 100),
    xgb__colsample_bytree=uniform(0.5, 0.5),
    xgb__learning_rate=uniform(0.01, 0.3),
)

In [7]:
# train and compare model f1 metrics and log to mlflow
model = []
f1_scores = []
best_params = []

# search space for each entry of `models`, keyed by the same short name
search_spaces = {"logreg": logreg_params, "rf": rf_params, "xgb": xgb_params}

# fail before any (slow) training if a model has no search space, instead of
# silently recording the previous model's score for it
missing_spaces = [name for name in models if name not in search_spaces]
if missing_spaces:
    raise KeyError(f"no hyperparameter search space defined for: {missing_spaces}")

path = os.path.abspath(os.path.join(os.getcwd(), ".."))
mlflow.set_tracking_uri(f"file://{path}/mlruns")

# fitted search objects, keyed by model name
rand_searches = {}
for name, clf in models.items():
    pipeline = create_pipeline(name, clf)
    rand_search = RandomizedSearchCV(
        pipeline,
        param_distributions=search_spaces[name],
        cv=10,
        n_iter=10,
        scoring="f1",
        random_state=seed,
        n_jobs=-1,
    )
    f1, params = fit_evaluate_log(name, rand_search)
    rand_searches[name] = rand_search
    model.append(name)
    f1_scores.append(f1)
    best_params.append(params)

# per-model handles kept under their original names so that any other cell
# referring to them keeps working
logreg_rand_search = rand_searches.get("logreg")
rf_rand_search = rand_searches.get("rf")
xgb_rand_search = rand_searches.get("xgb")

scores = pd.DataFrame({"Models": model, "F1": f1_scores, "Best Params": best_params})
scores

2025/10/12 21:14:39 INFO mlflow.tracking.fluent: Experiment with name 'logreg_model_tuning' does not exist. Creating a new experiment.


2025/10/12 21:14:48 INFO mlflow.tracking.fluent: Experiment with name 'rf_model_tuning' does not exist. Creating a new experiment.
2025/10/12 21:19:07 INFO mlflow.tracking.fluent: Experiment with name 'xgb_model_tuning' does not exist. Creating a new experiment.


Unnamed: 0,Models,F1,Best Params
0,logreg,0.77,"{'logreg__tol': 0.001, 'logreg__max_iter': 100..."
1,rf,0.83,"{'rf__max_depth': 31, 'rf__min_samples_leaf': ..."
2,xgb,0.85,"{'xgb__colsample_bytree': 0.5244063969, 'xgb__..."


In [8]:
# best hyperparameters found for the xgboost pipeline
scores.loc[scores["Models"] == "xgb", "Best Params"].values

array([{'xgb__colsample_bytree': np.float64(0.5244063969), 'xgb__learning_rate': np.float64(0.03997856839636343), 'xgb__max_depth': 83, 'xgb__n_estimators': 600}],
      dtype=object)