## Import Library

In [49]:
# Data processing, JSON-handling, & visualization libraries
import joblib
import pandas as pd
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

# Sklearn libraries, preprocessing steps, & decision tree model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from mlflow import (
    set_tracking_uri, get_tracking_uri, set_experiment, start_run, 
    log_params, log_metric
)
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Metric to track and log for MLflow
from sklearn.metrics import accuracy_score

from sklearn.model_selection import (
    train_test_split,
    ParameterGrid
)


In [50]:
# Load the *fitted* ColumnTransformer you saved earlier
preprocessor = joblib.load("../data/processed/14_processed_df.pkl")

# Rebuild X (same as in notebook A)
import pandas as pd, json
df = pd.read_csv("../data/processed/11_biz_merged_clean.csv")

In [51]:
preprocessor

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False


Ordinary: 
PriceRange2: [1,2,3,4] → [0,1,2,3]
NoiseLevel: ["quiet","average","loud","very_loud"] → [0,1,2,3]
Attire: ["casual","dressy","formal"] → [0,1,2]

In [52]:
df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,review_count,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,...,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rev_count_2019,avg_stars_2019,first_review_2019,last_review_2019,rl_word_mean,rl_share_short24
0,MTSW4McQd7CbVtyjqoe9mw,Philadelphia,PA,39.955505,-75.155564,80.0,True,4.394449,False,False,...,0,0,1,0,20,4.55,2019-03-12 17:04:09,2021-11-01 18:22:07,81.45,0.05
1,CF33F8-E6oudUQ46HnavjQ,Ashland City,TN,36.269593,-87.058943,6.0,True,1.94591,False,True,...,0,0,0,0,3,1.333333,2020-06-26 19:22:36,2021-03-06 07:18:00,70.0,0.0
2,bBDDEgkFA1Otx9Lfe7BZUQ,Nashville,TN,36.208102,-86.76817,10.0,True,2.397895,False,True,...,0,0,0,0,5,1.8,2019-01-05 01:28:55,2021-04-15 19:16:33,111.2,0.0
3,eEOYSgkmpB90uNA7lDOMRA,Tampa Bay,FL,27.955269,-82.45632,10.0,True,2.397895,,,...,0,0,0,0,8,4.25,2019-01-16 18:22:34,2022-01-03 01:18:29,91.875,0.0
4,il_Ro8jwPlHresjw9EGmBg,Indianapolis,IN,39.637133,-86.127217,28.0,True,3.367296,,True,...,0,0,0,0,12,2.25,2019-01-01 19:58:17,2021-04-22 13:58:42,97.833336,0.0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_id                      36261 non-null  object 
 1   city                             36261 non-null  object 
 2   state                            36261 non-null  object 
 3   latitude                         36261 non-null  float64
 4   longitude                        36261 non-null  float64
 5   review_count                     36261 non-null  float64
 6   is_open                          36261 non-null  bool   
 7   review_count_log1p               36261 non-null  float64
 8   attr_ByAppointmentOnly           3139 non-null   object 
 9   attr_BusinessAcceptsCreditCards  31372 non-null  object 
 10  attr_BikeParking                 26853 non-null  object 
 11  attr_RestaurantsPriceRange2      29672 non-null  float64
 12  attr_RestaurantsTa

In [54]:
import pandas as pd
import numpy as np

# --- 1. Convert datetime columns ---
datetime_cols = ["first_review_2019", "last_review_2019"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# --- 2. Convert boolean columns (True/False or Yes/No or t/f) ---
bool_cols = [
    "is_open", "attr_ByAppointmentOnly", "attr_BusinessAcceptsCreditCards",
    "attr_BikeParking", "attr_RestaurantsTakeOut", "attr_RestaurantsDelivery",
    "attr_Caters", "attr_WheelchairAccessible", "attr_HappyHour",
    "attr_OutdoorSeating", "attr_HasTV", "attr_RestaurantsReservations",
    "attr_DogsAllowed", "attr_GoodForKids", "attr_RestaurantsTableService",
    "attr_RestaurantsGoodForGroups", "attr_DriveThru", "has_hours_info"
]

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().replace(
            {"True": True, "False": False, "Yes": True, "No": False, "None": np.nan, "nan": np.nan}
        )
        df[col] = df[col].astype("boolean")

# --- 3. Convert category columns ---
category_cols = [
    "attr_RestaurantsPriceRange2", "attr_WiFi", "attr_Alcohol",
    "attr_RestaurantsAttire", "attr_NoiseLevel", "attr_Smoking"
]
for col in category_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# --- 4. Convert small integer columns to int8 for memory efficiency ---
int8_cols = [
    "cat__Sandwiches", "cat__American (Traditional)", "cat__Pizza",
    "cat__Fast Food", "cat__Breakfast & Brunch", "cat__American (New)",
    "cat__Burgers", "cat__Mexican", "cat__Italian", "cat__Coffee & Tea",
    "cat__Seafood", "cat__Chinese", "cat__Salad", "cat__Chicken Wings",
    "cat__Cafes", "cat__Delis", "cat__Caterers", "cat__Specialty Food",
    "cat__Bakeries", "cat__Desserts"
]
for col in int8_cols:
    if col in df.columns:
        df[col] = df[col].astype("int8")

# --- 5. Convert others explicitly to float if not already ---
float_cols = [
    "latitude", "longitude", "review_count", "review_count_log1p",
    "total_weekly_hours", "days_open", "weekend_hours", "avg_daily_hours",
    "avg_stars_2019", "rl_word_mean", "rl_share_short24"
]
for col in float_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

# --- 6. Optional: compress integers like rev_count_2019 ---
df["rev_count_2019"] = df["rev_count_2019"].astype("int64")




  df[col] = df[col].astype(str).str.strip().replace(
  df[col] = df[col].astype(str).str.strip().replace(


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   business_id                      36261 non-null  object        
 1   city                             36261 non-null  object        
 2   state                            36261 non-null  object        
 3   latitude                         36261 non-null  float64       
 4   longitude                        36261 non-null  float64       
 5   review_count                     36261 non-null  float64       
 6   is_open                          36261 non-null  boolean       
 7   review_count_log1p               36261 non-null  float64       
 8   attr_ByAppointmentOnly           3139 non-null   boolean       
 9   attr_BusinessAcceptsCreditCards  31372 non-null  boolean       
 10  attr_BikeParking                 26853 non-null  boolean  

In [56]:
# Target
y = df["avg_stars_2019"].astype(float)

# Columns to exclude from predictors
exclude = {
    "business_id", "city", "state",
    "avg_stars_2019", "review_count",
    "rev_count_2019", "first_review_2019", "last_review_2019",
}

# Build X (everything except target + excluded)

feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].copy()



In [57]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 53 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   latitude                         36261 non-null  float64 
 1   longitude                        36261 non-null  float64 
 2   is_open                          36261 non-null  boolean 
 3   review_count_log1p               36261 non-null  float64 
 4   attr_ByAppointmentOnly           3139 non-null   boolean 
 5   attr_BusinessAcceptsCreditCards  31372 non-null  boolean 
 6   attr_BikeParking                 26853 non-null  boolean 
 7   attr_RestaurantsPriceRange2      29672 non-null  category
 8   attr_RestaurantsTakeOut          33187 non-null  boolean 
 9   attr_RestaurantsDelivery         31363 non-null  boolean 
 10  attr_Caters                      25596 non-null  boolean 
 11  attr_WiFi                        27525 non-null  category
 12  attr

In [58]:

# Configure this week into central ML repository for course
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Set experiment name for this module
mlflow.set_experiment("BDA602 Yelp project")


<Experiment: artifact_location='mlflow-artifacts:/786926549055850120', creation_time=1759981678118, experiment_id='786926549055850120', last_update_time=1759981678118, lifecycle_stage='active', name='BDA602 Yelp project', tags={'mlflow.experimentKind': 'custom_model_development'}>

## Binary Classification Target
Make a binary target (≥4★ = 1, else 0)

In [59]:
# Binary classification target
y_cls = (y >= 4.0).astype(int)
y_cls.value_counts(normalize=True).round(3)  # quick class balance check

avg_stars_2019
0    0.614
1    0.386
Name: proportion, dtype: float64

## Build Pipeline

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression



log_rec = Pipeline(steps=[
    ("preprocessor", preprocessor),          # your ColumnTransformer
    ("model", LogisticRegression(max_iter=5000, solver="lbfgs"))
])


In [61]:
log_rec

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


## 1. Condition default set with 80 20 split

In [62]:
# Sklearn imports for optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Partition predictors & response into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cls,
    test_size = 0.2,   # Reserve 20% for "hold-out" data, 
    random_state=42
)

X_train.head(5)

Unnamed: 0,latitude,longitude,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_BikeParking,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_RestaurantsDelivery,...,cat__Salad,cat__Chicken Wings,cat__Cafes,cat__Delis,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rl_word_mean,rl_share_short24
8442,38.738936,-90.397281,True,4.574711,,True,False,2.0,True,True,...,0,0,0,0,0,0,0,0,71.375,0.208333
5934,40.209943,-75.225566,True,4.812184,,True,True,2.0,True,True,...,1,0,0,0,0,0,0,0,106.0625,0.09375
31281,39.752035,-75.541795,True,2.079442,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,66.0,0.0
18393,53.517787,-113.50945,True,4.025352,,,False,2.0,True,True,...,0,0,0,0,0,0,0,0,145.42857,0.0
21544,30.02008,-90.2506,True,2.397895,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,65.85714,0.142857


In [63]:
log_rec.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [64]:
# 3) Predict on the held-out test set
y_pred  = log_rec.predict(X_test)
y_proba = log_rec.predict_proba(X_test)[:, 1]   # needed for ROC AUC

In [65]:
# 4) Compute metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

acc   = accuracy_score(y_test, y_pred)
prec  = precision_score(y_test, y_pred, zero_division=0)
rec   = recall_score(y_test, y_pred, zero_division=0)
f1    = f1_score(y_test, y_pred, zero_division=0)
auc   = roc_auc_score(y_test, y_proba)
cm    = confusion_matrix(y_test, y_pred)

print(f"Accuracy   : {acc:.3f}")
print(f"Precision  : {prec:.3f}")
print(f"Recall     : {rec:.3f}")
print(f"F1         : {f1:.3f}")
print(f"ROC AUC    : {auc:.3f}")
print("\nConfusion matrix:\n", cm)

# Optional: nice per-class table
print("\nClassification report:\n",
      classification_report(y_test, y_pred, zero_division=0))


Accuracy   : 0.700
Precision  : 0.617
Recall     : 0.546
F1         : 0.579
ROC AUC    : 0.761

Confusion matrix:
 [[3580  931]
 [1244 1498]]

Classification report:
               precision    recall  f1-score   support

           0       0.74      0.79      0.77      4511
           1       0.62      0.55      0.58      2742

    accuracy                           0.70      7253
   macro avg       0.68      0.67      0.67      7253
weighted avg       0.69      0.70      0.70      7253



## 2. Grid Search Hyperprameter and CV

In [66]:
params= [
    # L2-only solvers
    {
        "model__solver": ["lbfgs", "newton-cg", "sag"],
        "model__penalty": ["l2"],
        "model__C": [100, 10, 1.0, 0.1, 0.01],
    },
    # saga supports L1 and L2
    {
        "model__solver": ["saga"],
        "model__penalty": ["l1", "l2"],
        "model__C": [100, 10, 1.0, 0.1, 0.01],
    },
    # saga + elasticnet needs l1_ratio
    {
        "model__solver": ["saga"],
        "model__penalty": ["elasticnet"],
        "model__l1_ratio": [0.3, 0.5, 0.7],
        "model__C": [100, 10, 1.0, 0.1, 0.01],

    },
]

In [67]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [68]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from mlflow.models.signature import infer_signature
import mlflow

# Use multi-metric scoring so CV accuracy/F1 exist
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_cv = GridSearchCV(
    estimator=log_rec,                # your pipeline with step name 'model'
    param_grid=params,                # uses keys like model__C, model__solver, ...
    scoring={"roc_auc":"roc_auc", "f1":"f1", "accuracy":"accuracy"},
    refit="roc_auc",                  # pick best by AUC
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# Fit search
grid_search_cv.fit(X_train, y_train)

# Best model on test
best_model = grid_search_cv.best_estimator_
y_pred  = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Pull CV-best metrics at chosen params
cv = grid_search_cv.cv_results_
idx = grid_search_cv.best_index_
cv_best_auc = float(cv["mean_test_roc_auc"][idx])
cv_best_f1  = float(cv["mean_test_f1"][idx])
cv_best_acc = float(cv["mean_test_accuracy"][idx])

# Metrics in your exact format
metrics = {
    "test_auc": float(roc_auc_score(y_test, y_proba)),
    "test_f1": float(f1_score(y_test, y_pred)),
    "test_accuracy": float(accuracy_score(y_test, y_pred)),
    # extras to match your Elastic Net / RF logs
    "cv_best_auc": cv_best_auc,
    "cv_best_f1": cv_best_f1,
    "cv_best_accuracy": cv_best_acc,
}

# Log to MLflow
with mlflow.start_run(run_name="logreg_grid_search"):
    mlflow.log_params(grid_search_cv.best_params_)  # e.g., {'model__solver': ..., 'model__C': ...}
    for k, v in metrics.items():
        mlflow.log_metric(k, v)
    signature = infer_signature(X_test, y_pred)  # input -> predicted labels
    mlflow.sklearn.log_model(best_model, "model", signature=signature)

print("Best CV AUC:", cv_best_auc)
print("Test metrics:", metrics)
print("Best params:", grid_search_cv.best_params_)
print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")



Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END model__C=100, model__penalty=l2, model__solver=lbfgs; total time=   3.4s
[CV] END model__C=100, model__penalty=l2, model__solver=lbfgs; total time=   3.3s
[CV] END model__C=100, model__penalty=l2, model__solver=newton-cg; total time=   3.5s
[CV] END model__C=100, model__penalty=l2, model__solver=newton-cg; total time=   3.5s
[CV] END model__C=100, model__penalty=l2, model__solver=lbfgs; total time=   3.8s
[CV] END model__C=100, model__penalty=l2, model__solver=newton-cg; total time=   2.7s
[CV] END model__C=100, model__penalty=l2, model__solver=lbfgs; total time=   3.6s
[CV] END model__C=100, model__penalty=l2, model__solver=lbfgs; total time=   3.7s
[CV] END model__C=100, model__penalty=l2, model__solver=newton-cg; total time=   3.9s
[CV] END model__C=100, model__penalty=l2, model__solver=newton-cg; total time=   2.5s
[CV] END .model__C=100, model__penalty=l2, model__solver=sag; total time=  11.6s
[CV] END model__C



🏃 View run logreg_grid_search at: http://127.0.0.1:5000/#/experiments/786926549055850120/runs/fd1abf78d97c4b3da8bd35c54fbd3457
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/786926549055850120
Best CV AUC: 0.7681066389649052
Test metrics: {'test_auc': 0.7606723883153926, 'test_f1': 0.5803966878490275, 'test_accuracy': 0.6995725906521439, 'cv_best_auc': 0.7681066389649052, 'cv_best_f1': 0.5892103250755116, 'cv_best_accuracy': 0.7056670446518719}
Best params: {'model__C': 100, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
Test ROC AUC (unseen data): 0.7607


### Run Optimization Method & View Best Performing Model

In [69]:
# # Run optimization process
# from sklearn.model_selection import GridSearchCV
# grid_search_cv.fit(X_train, y_train)
# # Print best hyperparameter configuration
# print("Best Parameters:", grid_search_cv.best_params_)
# print("Best CV Score:", grid_search_cv.best_score_)

# # Extract best model and use dire#ctly to predict
# best_model = grid_search_cv.best_estimator_
# y_pred = best_model.predict(X_test)

In [70]:
# # Extract best model and use directly to predict
# best_model = grid_search_cv.best_estimator_
# y_pred = best_model.predict(X_test)
# y_proba = best_model.predict_proba(X_test)[:, 1]

In [71]:
# # After you've fitted grid_search_cv and computed y_pred, y_proba
# from mlflow.models.signature import infer_signature
# import mlflow, mlflow.sklearn

# # --- Pull best CV metrics (robust to different AUC variants) ---
# cv  = grid_search_cv.cv_results_
# idx = grid_search_cv.best_index_

# auc_keys = [k for k in cv.keys() if k.startswith("mean_test_roc_auc")]
# cv_best_auc = float(cv[auc_keys[0]][idx]) if auc_keys else None
# cv_best_f1  = float(cv["mean_test_f1"][idx]) if "mean_test_f1" in cv else None
# cv_best_acc = float(cv["mean_test_accuracy"][idx]) if "mean_test_accuracy" in cv else None

# # --- Metrics dict in the exact format you requested ---
# metrics = {
#     "test_auc": float(roc_auc_score(y_test, y_proba)),
#     "test_f1": float(f1_score(y_test, y_pred)),
#     "test_accuracy": float(accuracy_score(y_test, y_pred)),
#     # extras to match your Elastic Net / RF logs
#     "cv_best_auc": cv_best_auc,
#     "cv_best_f1": cv_best_f1,
#     "cv_best_accuracy": cv_best_acc,
# }

# # --- Log to MLflow (same keys as Elastic Net) ---
# with mlflow.start_run(run_name="logreg_grid_search"):
#     mlflow.log_params(grid_search_cv.best_params_)  # e.g., {'clf__C': ..., 'clf__penalty': ...}
#     for k, v in metrics.items():
#         if v is not None:
#             mlflow.log_metric(k, v)

#     signature = infer_signature(X_test, y_pred)  # input -> predicted labels
#     mlflow.sklearn.log_model(best_model, "model", signature=signature)

# print("Best CV AUC:", cv_best_auc)
# print("Test metrics:", metrics)
# print("Best params:", grid_search_cv.best_params_)
# print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")

In [72]:
# # Evaluate on the hold-out test set
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# print("Test Accuracy :", accuracy_score(y_test, y_pred))
# print("Test Precision:", precision_score(y_test, y_pred, zero_division=0))
# print("Test Recall   :", recall_score(y_test, y_pred, zero_division=0))
# print("Test F1       :", f1_score(y_test, y_pred, zero_division=0))
# print("Test ROC AUC  :", roc_auc_score(y_test, y_proba))
# print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
# print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))