In [17]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import src.helpers.model_helpers as mh
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Load the modeling dataset as train/test features and labels
X_train, y_train, X_test, y_test = mh.load_model_dataset()

# Fail fast on a misaligned split. The test split is not touched until after the
# hyperparameter search, so a mismatch there would otherwise only surface once
# that expensive search has already finished.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet


In [None]:
# Create and train model
#
# Parallelism is requested at ONE level only. The forest keeps n_jobs=-1, so each
# fit (including the final refit on the full training set) is parallelized over
# its trees, while the search evaluates its candidates one after another.
# Asking for n_jobs=-1 at both levels nests one pool of workers inside another
# and oversubscribes the CPU. The sampled candidates and their scores do not
# depend on n_jobs; both random_state seeds are unchanged.
#
# bootstrap=True is required for the `max_samples` values searched below:
# max_samples only applies to bootstrap sampling.
random_forest_classifier = RandomForestClassifier(n_jobs=-1, bootstrap=True, random_state=42)

# Search space for RandomizedSearchCV: lists are sampled uniformly, and
# scipy's randint(low, high) draws integers from the half-open range [low, high).
param_grid = {
    "n_estimators": randint(200, 800),  # 200..799 trees
    "max_depth": [None, 10, 20, 40],  # None = grow until the leaf constraints stop it
    "max_features": ["sqrt", 0.3, 0.5],  # floats are fractions of the feature count
    "min_samples_split": randint(2, 21),  # 2..20
    "min_samples_leaf": randint(1, 9),  # 1..8
    "class_weight": [None, "balanced"],
    "max_samples": [None, 0.5, 0.7, 0.9],  # fraction of rows drawn per tree; None = all rows
}

# 30 sampled candidates x 3 folds = 90 fits, plus one refit of the best candidate.
search = RandomizedSearchCV(
    random_forest_classifier,
    param_distributions=param_grid,
    n_iter=30,
    scoring="f1",
    cv=3,
    random_state=42,
    n_jobs=1,  # sequential candidates; the forest itself already uses every core
)

search.fit(X_train, y_train)

In [22]:
# Summarize the search: reports the best hyperparameters and the best CV F1
mh.output_cv_summary(search)

Best params: {'class_weight': 'balanced', 'max_depth': 40, 'max_features': 0.3, 'max_samples': 0.7, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 661}
Best CV F1: 0.7108821542275495


In [23]:
# Calculate metrics on the test split using the fitted search object
results = mh.calculate_test_metrics(search, X_test, y_test)

# Save metrics to CSV
# NOTE(review): the name ends in "_csv" rather than ".csv" — confirm whether
# save_metrics_to_csv appends the extension itself or whether this is a typo.
mh.save_metrics_to_csv(results, "random_forest_metrics_csv")

# Display metrics (F1, Accuracy, Precision, Recall, ROC AUC, PR AUC)
print("Test Performance:")
display(results)

Test Performance:


Unnamed: 0,Score
F1,0.721074
Accuracy,0.877846
Precision,0.794989
Recall,0.659735
ROC AUC,0.897096
PR AUC,0.807997
