In [6]:
# ===============================================
# STEP 0 — Imports, config, and speed knobs
# ===============================================
import warnings, time, numpy as np, pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix,
    classification_report, precision_recall_curve
)
import joblib

# ---- paths / label (EDIT as needed) ----
CSV_PATH     = "../data/processed/11_biz_merged_clean.csv"
PKL_PATH    = "../data/processed/14_processed_df.pkl"


TARGET_COL   = "avg_stars_2019"
POSITIVE_MIN = 4  # binarize: >= 4 stars => 1

# ---- speed knobs ----
CV_FOLDS             = 5
EN_N_ITER            = 12
RF_N_ITER            = 16
SUBSAMPLE_FOR_TUNING = None   # e.g. 25000 to tune on subset, then refit on full; or None
OHE_MIN_FREQ         = 20     # merge rare levels to reduce feature count
SEARCH_N_JOBS        = -1
VERBOSE              = 2

warnings.filterwarnings("ignore", message="Skipping features without any observed values")

In [7]:
# ===============================================
# STEP 1 — Load data
# ===============================================
df = pd.read_csv(CSV_PATH)
print("Loaded:", df.shape)

Loaded: (36261, 61)


In [8]:
# Load the *fitted* ColumnTransformer you saved earlier
preprocessor = joblib.load(PKL_PATH)

In [9]:
preprocessor

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False


In [10]:
# Configure this week into central ML repository for course
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Set experiment name for this module
mlflow.set_experiment("BDA602 Yelp project")

<Experiment: artifact_location='mlflow-artifacts:/786926549055850120', creation_time=1759981678118, experiment_id='786926549055850120', last_update_time=1759981678118, lifecycle_stage='active', name='BDA602 Yelp project', tags={'mlflow.experimentKind': 'custom_model_development'}>

## Coercing

In [11]:
import pandas as pd
import numpy as np

# --- 1. Convert datetime columns ---
datetime_cols = ["first_review_2019", "last_review_2019"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# --- 2. Convert boolean columns (True/False or Yes/No or t/f) ---
bool_cols = [
    "is_open", "attr_ByAppointmentOnly", "attr_BusinessAcceptsCreditCards",
    "attr_BikeParking", "attr_RestaurantsTakeOut", "attr_RestaurantsDelivery",
    "attr_Caters", "attr_WheelchairAccessible", "attr_HappyHour",
    "attr_OutdoorSeating", "attr_HasTV", "attr_RestaurantsReservations",
    "attr_DogsAllowed", "attr_GoodForKids", "attr_RestaurantsTableService",
    "attr_RestaurantsGoodForGroups", "attr_DriveThru", "has_hours_info"
]

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().replace(
            {"True": True, "False": False, "Yes": True, "No": False, "None": np.nan, "nan": np.nan}
        )
        df[col] = df[col].astype("boolean")

# --- 3. Convert category columns ---
category_cols = [
    "attr_RestaurantsPriceRange2", "attr_WiFi", "attr_Alcohol",
    "attr_RestaurantsAttire", "attr_NoiseLevel", "attr_Smoking"
]
for col in category_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# --- 4. Convert small integer columns to int8 for memory efficiency ---
int8_cols = [
    "cat__Sandwiches", "cat__American (Traditional)", "cat__Pizza",
    "cat__Fast Food", "cat__Breakfast & Brunch", "cat__American (New)",
    "cat__Burgers", "cat__Mexican", "cat__Italian", "cat__Coffee & Tea",
    "cat__Seafood", "cat__Chinese", "cat__Salad", "cat__Chicken Wings",
    "cat__Cafes", "cat__Delis", "cat__Caterers", "cat__Specialty Food",
    "cat__Bakeries", "cat__Desserts"
]
for col in int8_cols:
    if col in df.columns:
        df[col] = df[col].astype("int8")

# --- 5. Convert others explicitly to float if not already ---
float_cols = [
    "latitude", "longitude", "review_count", "review_count_log1p",
    "total_weekly_hours", "days_open", "weekend_hours", "avg_daily_hours",
    "avg_stars_2019", "rl_word_mean", "rl_share_short24"
]
for col in float_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

# --- 6. Optional: compress integers like rev_count_2019 ---
df["rev_count_2019"] = df["rev_count_2019"].astype("int64")


  df[col] = df[col].astype(str).str.strip().replace(
  df[col] = df[col].astype(str).str.strip().replace(


## Define Target and Predictor

In [12]:
# Target
y = df["avg_stars_2019"].astype(float)

# Columns to exclude from predictors
exclude = {
    "business_id", "city", "state",
    "avg_stars_2019", "review_count",
    "rev_count_2019", "first_review_2019", "last_review_2019",
}

# Build X (everything except target + excluded)

feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].copy()

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   business_id                      36261 non-null  object        
 1   city                             36261 non-null  object        
 2   state                            36261 non-null  object        
 3   latitude                         36261 non-null  float64       
 4   longitude                        36261 non-null  float64       
 5   review_count                     36261 non-null  float64       
 6   is_open                          36261 non-null  boolean       
 7   review_count_log1p               36261 non-null  float64       
 8   attr_ByAppointmentOnly           3139 non-null   boolean       
 9   attr_BusinessAcceptsCreditCards  31372 non-null  boolean       
 10  attr_BikeParking                 26853 non-null  boolean  

In [14]:
# Binary classification target
y_cls = (y >= 4.0).astype(int)
y_cls.value_counts(normalize=True).round(3)  # quick class balance check

avg_stars_2019
0    0.614
1    0.386
Name: proportion, dtype: float64

## Split X train and y train

In [15]:
# Sklearn imports for optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Partition predictors & response into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cls,
    test_size = 0.2,   # Reserve 20% for "hold-out" data, 
    random_state=42
)

X_train.head(5)

Unnamed: 0,latitude,longitude,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_BikeParking,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_RestaurantsDelivery,...,cat__Salad,cat__Chicken Wings,cat__Cafes,cat__Delis,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rl_word_mean,rl_share_short24
8442,38.738936,-90.397281,True,4.574711,,True,False,2.0,True,True,...,0,0,0,0,0,0,0,0,71.375,0.208333
5934,40.209943,-75.225566,True,4.812184,,True,True,2.0,True,True,...,1,0,0,0,0,0,0,0,106.0625,0.09375
31281,39.752035,-75.541795,True,2.079442,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,66.0,0.0
18393,53.517787,-113.50945,True,4.025352,,,False,2.0,True,True,...,0,0,0,0,0,0,0,0,145.42857,0.0
21544,30.02008,-90.2506,True,2.397895,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,65.85714,0.142857


In [16]:
# # ===============================================
# # STEP 7 — Random Forest (fast search) + STEP 7B refit
# # ===============================================
# pipe_rf = SkPipeline([
#     ("preprocessor", preprocessor),
#     ("clf", RandomForestClassifier(
#         n_estimators=400, random_state=42,
#         class_weight="balanced_subsample", n_jobs=-1
#     )),
# ])

# param_dist_rf = {
#     "clf__n_estimators": [300, 400, 600, 800],
#     "clf__max_depth": [None, 10, 20, 40],
#     "clf__min_samples_split": [2, 5, 10],
#     "clf__min_samples_leaf": [1, 2, 4],
#     "clf__max_features": ["sqrt", "log2", None, 0.3],
# }

# print("[RF] RandomizedSearch starting...")
# t0 = time.time()
# rs_rf = RandomizedSearchCV(
#     estimator=pipe_rf,
#     param_distributions=param_dist_rf,
#     n_iter=RF_N_ITER,
#     scoring={"f1":"f1","roc_auc":"roc_auc"},
#     refit="f1",
#     cv=5,
#     n_jobs=SEARCH_N_JOBS,
#     verbose=VERBOSE,
#     random_state=42,
#     error_score="raise"
# )
# rs_rf.fit(X_train, y_train)


# best_params_rf = rs_rf.best_params_
# pipe_rf.set_params(**best_params_rf)
# pipe_rf.fit(X_train, y_train)
# print("[RF] Refit complete.")

In [17]:
# --- Imports ---
import time
import numpy as np
from scipy.stats import randint  # distributions for RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score,
    confusion_matrix, classification_report
)
from mlflow.models.signature import infer_signature
import mlflow

# --- Pipeline (step name 'clf' matches param keys below) ---
pipe_rf = SkPipeline([
    ("preprocessor", preprocessor),  # your fitted ColumnTransformer
    ("clf", RandomForestClassifier(
        n_estimators=400,            # will be overridden by search
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"
    )),
])

# --- Search space (wide but sensible) ---
param_dist_rf = {
    "clf__n_estimators": randint(300, 1001),              # 300–1000
    "clf__max_depth": [None] + list(range(8, 41, 4)),     # None or 8..40
    "clf__min_samples_split": randint(2, 51),             # 2..50
    "clf__min_samples_leaf": randint(1, 21),              # 1..20
    # categorical choices or numeric fractions in (0,1]
    "clf__max_features": ["sqrt", "log2", None, 0.3, 0.5, 0.8],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rs_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=param_dist_rf,
    n_iter=30,  # bump up if you have time
    scoring={"roc_auc": "roc_auc", "f1": "f1", "accuracy": "accuracy"},
    refit="roc_auc",      # choose by ROC AUC
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# --- Run the search ONCE and time it ---
t0 = time.time()
rs_rf.fit(X_train, y_train)
elapsed = time.time() - t0

# --- Extract best params + CV metrics ---
best_index  = rs_rf.best_index_
best_params = rs_rf.best_params_
best_cv_auc = float(rs_rf.best_score_)  # because refit="roc_auc"
best_cv_f1  = float(rs_rf.cv_results_["mean_test_f1"][best_index])
best_cv_acc = float(rs_rf.cv_results_["mean_test_accuracy"][best_index])

print("Best params (CV by ROC AUC):", best_params)
print(f"Best CV ROC AUC: {best_cv_auc:.4f}")
print(f"Best CV F1:      {best_cv_f1:.4f}")
print(f"Best CV Acc:     {best_cv_acc:.4f}")
print(f"Search elapsed:  {elapsed:.1f}s")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END clf__max_depth=20, clf__max_features=log2, clf__min_samples_leaf=19, clf__min_samples_split=24, clf__n_estimators=630; total time=  13.0s
[CV] END clf__max_depth=20, clf__max_features=log2, clf__min_samples_leaf=19, clf__min_samples_split=24, clf__n_estimators=630; total time=  12.9s
[CV] END clf__max_depth=20, clf__max_features=log2, clf__min_samples_leaf=19, clf__min_samples_split=24, clf__n_estimators=630; total time=  13.1s
[CV] END clf__max_depth=20, clf__max_features=log2, clf__min_samples_leaf=19, clf__min_samples_split=24, clf__n_estimators=630; total time=  13.1s
[CV] END clf__max_depth=20, clf__max_features=log2, clf__min_samples_leaf=19, clf__min_samples_split=24, clf__n_estimators=630; total time=  13.5s
[CV] END clf__max_depth=28, clf__max_features=0.3, clf__min_samples_leaf=15, clf__min_samples_split=44, clf__n_estimators=371; total time=  16.2s
[CV] END clf__max_depth=28, clf__max_features=0.3, clf__m

In [18]:
import mlflow

In [19]:
# --- Evaluate the refit best estimator on the holdout set ---
best_rf  = rs_rf.best_estimator_           # already refit on full train
y_proba  = best_rf.predict_proba(X_test)[:, 1]
y_pred   = best_rf.predict(X_test)

test_auc = float(roc_auc_score(y_test, y_proba))
test_f1  = float(f1_score(y_test, y_pred))
test_acc = float(accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\n=== RandomForest (test set) ===")
print(f"Accuracy:  {test_acc:.3f} | F1: {test_f1:.3f} | ROC-AUC: {test_auc:.3f}")
print("Confusion matrix (TN FP / FN TP):\n", cm)
print("\n" + classification_report(y_test, y_pred, digits=3))

# --- Package metrics for logging ---
metrics = {
    "test_auc": test_auc,
    "test_f1": test_f1,
    "test_accuracy": test_acc,
    # extras (handy for comparing runs)
    "cv_best_auc": best_cv_auc,
    "cv_best_f1": best_cv_f1,
    "cv_best_accuracy": best_cv_acc,
    "search_seconds": float(elapsed),
}

# --- Log to MLflow ---
with mlflow.start_run(run_name="randomforest_randomsearch"):
    # Log best params (clf__* keys match your search space)
    mlflow.log_params(best_params)

    # Log metrics
    for k, v in metrics.items():
        mlflow.log_metric(k, v)

    # Log the fitted pipeline as "model"
    signature = infer_signature(X_test, y_pred)  # input -> predicted labels
    mlflow.sklearn.log_model(best_rf, "model", signature=signature)

print("\nSummary")
print("Best params:", best_params)
print("Best CV AUC:", best_cv_auc)
print("Test metrics:", metrics)
print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")



=== RandomForest (test set) ===
Accuracy:  0.713 | F1: 0.647 | ROC-AUC: 0.788
Confusion matrix (TN FP / FN TP):
 [[3264 1247]
 [ 834 1908]]

              precision    recall  f1-score   support

           0      0.796     0.724     0.758      4511
           1      0.605     0.696     0.647      2742

    accuracy                          0.713      7253
   macro avg      0.701     0.710     0.703      7253
weighted avg      0.724     0.713     0.716      7253





🏃 View run randomforest_randomsearch at: http://127.0.0.1:5000/#/experiments/786926549055850120/runs/6b4809b74a114ada97ce822d33c5c195
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/786926549055850120

Summary
Best params: {'clf__max_depth': 24, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 13, 'clf__n_estimators': 613}
Best CV AUC: 0.7977969926195121
Test metrics: {'test_auc': 0.7875512504404099, 'test_f1': 0.6471086993386468, 'test_accuracy': 0.7130842410037226, 'cv_best_auc': 0.7977969926195121, 'cv_best_f1': 0.659838646436876, 'cv_best_accuracy': 0.7233521470254893, 'search_seconds': 858.4823400974274}
Test ROC AUC (unseen data): 0.7876


In [20]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "plotly"])

import plotly.graph_objects as go




In [21]:
import numpy as np, pandas as pd
import plotly.graph_objects as go

res = pd.DataFrame(rs_rf.cv_results_)[
    ["param_clf__n_estimators", "param_clf__max_features", "mean_test_roc_auc"]
].copy()

# Clean types / labels
res["n_estimators"] = res["param_clf__n_estimators"].astype(int)
res["max_features_label"] = res["param_clf__max_features"].apply(lambda v: "None" if v is None else str(v))

# Pivot to 2D grid (rows = max_features, cols = n_estimators)
pivot = res.pivot_table(index="max_features_label", columns="n_estimators",
                        values="mean_test_roc_auc", aggfunc="mean")

x = pivot.columns.to_list()         # n_estimators
y_labels = pivot.index.to_list()    # max_features categories
z = pivot.values                    # ROC AUC matrix (NaNs possible if combos not sampled)


In [22]:
fig = go.Figure(data=[go.Heatmap(z=z, x=x, y=y_labels, colorbar_title="CV ROC AUC")])
fig.update_layout(
    title="RF: CV ROC AUC over (n_estimators, max_features)",
    xaxis_title="n_estimators",
    yaxis_title="max_features"
)
fig.show()


In [23]:
y_pos = np.arange(len(y_labels))
fig = go.Figure(data=[go.Contour(z=z, x=x, y=y_pos, contours=dict(showlabels=True))])
fig.update_layout(
    title="RF: CV ROC AUC (contour)",
    xaxis_title="n_estimators",
    yaxis=dict(title="max_features", tickmode="array", tickvals=y_pos, ticktext=y_labels)
)
fig.show()


In [24]:
import numpy as np, pandas as pd
import plotly.graph_objects as go

# 1) Build the grid from RandomizedSearchCV results (using ROC AUC)
res = pd.DataFrame(rs_rf.cv_results_)[
    ["param_clf__n_estimators", "param_clf__max_features", "mean_test_roc_auc"]
].copy()

# Clean types / labels
res["n_estimators"] = res["param_clf__n_estimators"].astype(int)
res["max_features_label"] = res["param_clf__max_features"].apply(lambda v: "None" if v is None else str(v))

# Pivot to 2D matrix: rows = max_features, cols = n_estimators
pivot = (res
         .pivot_table(index="max_features_label", columns="n_estimators",
                      values="mean_test_roc_auc", aggfunc="mean")
         .sort_index(axis=0).sort_index(axis=1))

# 2) Prepare x, y, z
x = pivot.columns.to_numpy()                  # n_estimators (numeric)
y_labels = pivot.index.to_list()              # categorical labels
y = np.arange(len(y_labels))                  # numeric positions for 3D axis
z = pivot.to_numpy(dtype=float)               # ROC AUC matrix (may contain NaNs)

# Optional: fill small holes so the surface is continuous (keeps edges)
z = (pd.DataFrame(z)
       .interpolate(axis=1, limit_direction="both")
       .interpolate(axis=0, limit_direction="both")
       .to_numpy())

# 3) Plot 3D surface
fig = go.Figure(data=[go.Surface(z=z, x=x, y=y)])
fig.update_layout(
    title="RF: CV ROC AUC Surface (n_estimators × max_features)",
    scene=dict(
        xaxis_title="n_estimators",
        yaxis_title="max_features",
        yaxis=dict(tickmode="array", tickvals=y, ticktext=y_labels),
        zaxis_title="CV ROC AUC",
    ),
    width=800, height=800, margin=dict(l=65, r=50, b=65, t=90)
)
fig.show()
