In [3]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import src.helpers.model_helpers as mh
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, uniform
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import src.utils.file_utils as fu
import joblib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the train/test feature matrices and targets through the project helper.
# NOTE(review): presumably load_model_dataset applies a fixed, pre-computed split so
# that X_test/y_test stay untouched during tuning — confirm in src.helpers.model_helpers.
X_train, y_train, X_test, y_test = mh.load_model_dataset()

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet


In [5]:
# Numeric columns that get a log1p transform followed by standardization.
# NOTE(review): log1p yields NaN/-inf for values <= -1; presumably these columns
# are non-negative after upstream cleaning — confirm.
transform_features = [
    "income",
    "property_value",
    "loan_amount",
    "combined_loan_to_value_ratio",
]

# Two-step numeric pipeline: compress the scale with log1p, then z-score.
log_scale_pipe = Pipeline(
    steps=[
        ("log1p", FunctionTransformer(func=np.log1p, feature_names_out="one-to-one")),
        ("scaler", StandardScaler()),
    ]
)

# Apply the numeric pipeline to transform_features only; every other column is
# passed through unchanged, and column names are kept without a step prefix.
preprocessor = ColumnTransformer(
    transformers=[("logscale", log_scale_pipe, transform_features)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
# Emit pandas DataFrames from transform (set_output returns the same object).
preprocessor = preprocessor.set_output(transform="pandas")

In [6]:
# Tuning knobs for the randomized search, gathered in one place.
SEED = 42        # shared by the classifier, the CV splitter and the parameter sampler
N_ITER = 1       # kept tiny (with CV_SPLITS) for functional testing before longer runs
CV_SPLITS = 3

# Base classifier: elastic-net regularized logistic regression.
base_log_reg = LogisticRegression(
    solver="saga",  # handles larger datasets and supports the elasticnet penalty
    penalty="elasticnet",
    max_iter=5000,
    tol=1e-3,  # loosened convergence criterion to save compute/time
    # n_jobs is deliberately not set here: on LogisticRegression it only parallelizes
    # over classes (one-vs-rest), so it does nothing for a binary target, and it would
    # nest inside the search-level parallelism configured below.
    random_state=SEED
)

# Preprocessing lives inside the pipeline so it is re-fit on each CV training fold.
pipe = Pipeline([
    ("preprocess", preprocessor),
    ("clf", base_log_reg)
])

# Distributions / choices the randomized search samples hyperparameters from.
param_grid = {
    "clf__C": loguniform(1e-3, 1e2),        # inverse regularization strength, log scale
    "clf__l1_ratio": uniform(0.0, 1.0),     # scipy uniform(loc, scale): samples in [0, 1]
    "clf__class_weight": [None, "balanced"]
}

# Stratified folds keep the class ratio per fold; shuffling is seeded for repeatability.
cv = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=SEED)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=N_ITER,
    scoring="f1",
    cv=cv,
    refit=True,   # refit the best candidate on all of X_train after the search
    n_jobs=-1,    # parallelize across candidate/fold fits using all cores
    verbose=1,
    random_state=SEED
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


0,1,2
,estimator,Pipeline(step... tol=0.001))])
,param_distributions,"{'clf__C': <scipy.stats....t 0x129d82f90>, 'clf__class_weight': [None, 'balanced'], 'clf__l1_ratio': <scipy.stats....t 0x12b29e990>}"
,n_iter,1
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('logscale', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'elasticnet'
,dual,False
,tol,0.001
,C,np.float64(0.0745934328572655)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,5000


In [7]:
# Report the selected hyperparameters and their mean cross-validated F1, then
# score the best pipeline found by the search on the held-out test split.
best_lr = search.best_estimator_

print("Best params:", search.best_params_)
print("Best CV F1:", round(search.best_score_, 4))

y_pred = best_lr.predict(X_test)
print("Test F1:", round(f1_score(y_true=y_test, y_pred=y_pred), 4))

Best params: {'clf__C': np.float64(0.0745934328572655), 'clf__class_weight': None, 'clf__l1_ratio': np.float64(0.1834347898661638)}
Best CV F1: 0.4541
Test F1: 0.4709


In [8]:
# Save the fitted best pipeline (preprocessing + classifier) to the path that the
# project file utility resolves for the "log_reg_model" key.
model_path = fu.get_path("log_reg_model")
joblib.dump(value=best_lr, filename=model_path)

['/Users/c1burns/Documents/UTD/BUAN 6341/project_repo/models/logreg_model.pkl']