## Import Library

In [79]:
#!mlflow

In [80]:
# Data processing, JSON-handling, & visualization libraries
import joblib
import pandas as pd
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns


# Sklearn libraries, preprocessing steps, & decision tree model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Manual Grid Search
from sklearn.model_selection import (
    train_test_split,
    ParameterGrid
)

# XGBoost Classifier Algorithm
from xgboost import XGBClassifier
# MLflow Library Imports and OS Directory Handling
import os
from mlflow import (
    set_tracking_uri, get_tracking_uri, set_experiment, start_run, 
    log_params, log_metric
)
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Metric to track and log for MLflow
from sklearn.metrics import accuracy_score



In [81]:
# Load the *fitted* ColumnTransformer you saved earlier
preprocessor = joblib.load("../data/processed/14_processed_df.pkl")

# Rebuild X (same as in notebook A)
import pandas as pd, json
df = pd.read_csv("../data/processed/11_biz_merged_clean.csv")

In [82]:
preprocessor

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False


Ordinary: 
PriceRange2: [1,2,3,4] → [0,1,2,3]
NoiseLevel: ["quiet","average","loud","very_loud"] → [0,1,2,3]
Attire: ["casual","dressy","formal"] → [0,1,2]

In [83]:
df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,review_count,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,...,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rev_count_2019,avg_stars_2019,first_review_2019,last_review_2019,rl_word_mean,rl_share_short24
0,MTSW4McQd7CbVtyjqoe9mw,Philadelphia,PA,39.955505,-75.155564,80.0,True,4.394449,False,False,...,0,0,1,0,20,4.55,2019-03-12 17:04:09,2021-11-01 18:22:07,81.45,0.05
1,CF33F8-E6oudUQ46HnavjQ,Ashland City,TN,36.269593,-87.058943,6.0,True,1.94591,False,True,...,0,0,0,0,3,1.333333,2020-06-26 19:22:36,2021-03-06 07:18:00,70.0,0.0
2,bBDDEgkFA1Otx9Lfe7BZUQ,Nashville,TN,36.208102,-86.76817,10.0,True,2.397895,False,True,...,0,0,0,0,5,1.8,2019-01-05 01:28:55,2021-04-15 19:16:33,111.2,0.0
3,eEOYSgkmpB90uNA7lDOMRA,Tampa Bay,FL,27.955269,-82.45632,10.0,True,2.397895,,,...,0,0,0,0,8,4.25,2019-01-16 18:22:34,2022-01-03 01:18:29,91.875,0.0
4,il_Ro8jwPlHresjw9EGmBg,Indianapolis,IN,39.637133,-86.127217,28.0,True,3.367296,,True,...,0,0,0,0,12,2.25,2019-01-01 19:58:17,2021-04-22 13:58:42,97.833336,0.0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_id                      36261 non-null  object 
 1   city                             36261 non-null  object 
 2   state                            36261 non-null  object 
 3   latitude                         36261 non-null  float64
 4   longitude                        36261 non-null  float64
 5   review_count                     36261 non-null  float64
 6   is_open                          36261 non-null  bool   
 7   review_count_log1p               36261 non-null  float64
 8   attr_ByAppointmentOnly           3139 non-null   object 
 9   attr_BusinessAcceptsCreditCards  31372 non-null  object 
 10  attr_BikeParking                 26853 non-null  object 
 11  attr_RestaurantsPriceRange2      29672 non-null  float64
 12  attr_RestaurantsTa

In [85]:
import pandas as pd
import numpy as np

# --- 1. Convert datetime columns ---
datetime_cols = ["first_review_2019", "last_review_2019"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# --- 2. Convert boolean columns (True/False or Yes/No or t/f) ---
bool_cols = [
    "is_open", "attr_ByAppointmentOnly", "attr_BusinessAcceptsCreditCards",
    "attr_BikeParking", "attr_RestaurantsTakeOut", "attr_RestaurantsDelivery",
    "attr_Caters", "attr_WheelchairAccessible", "attr_HappyHour",
    "attr_OutdoorSeating", "attr_HasTV", "attr_RestaurantsReservations",
    "attr_DogsAllowed", "attr_GoodForKids", "attr_RestaurantsTableService",
    "attr_RestaurantsGoodForGroups", "attr_DriveThru", "has_hours_info"
]

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().replace(
            {"True": True, "False": False, "Yes": True, "No": False, "None": np.nan, "nan": np.nan}
        )
        df[col] = df[col].astype("boolean")

# --- 3. Convert category columns ---
category_cols = [
    "attr_RestaurantsPriceRange2", "attr_WiFi", "attr_Alcohol",
    "attr_RestaurantsAttire", "attr_NoiseLevel", "attr_Smoking"
]
for col in category_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# --- 4. Convert small integer columns to int8 for memory efficiency ---
int8_cols = [
    "cat__Sandwiches", "cat__American (Traditional)", "cat__Pizza",
    "cat__Fast Food", "cat__Breakfast & Brunch", "cat__American (New)",
    "cat__Burgers", "cat__Mexican", "cat__Italian", "cat__Coffee & Tea",
    "cat__Seafood", "cat__Chinese", "cat__Salad", "cat__Chicken Wings",
    "cat__Cafes", "cat__Delis", "cat__Caterers", "cat__Specialty Food",
    "cat__Bakeries", "cat__Desserts"
]
for col in int8_cols:
    if col in df.columns:
        df[col] = df[col].astype("int8")

# --- 5. Convert others explicitly to float if not already ---
float_cols = [
    "latitude", "longitude", "review_count", "review_count_log1p",
    "total_weekly_hours", "days_open", "weekend_hours", "avg_daily_hours",
    "avg_stars_2019", "rl_word_mean", "rl_share_short24"
]
for col in float_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

# --- 6. Optional: compress integers like rev_count_2019 ---
df["rev_count_2019"] = df["rev_count_2019"].astype("int64")




  df[col] = df[col].astype(str).str.strip().replace(
  df[col] = df[col].astype(str).str.strip().replace(


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   business_id                      36261 non-null  object        
 1   city                             36261 non-null  object        
 2   state                            36261 non-null  object        
 3   latitude                         36261 non-null  float64       
 4   longitude                        36261 non-null  float64       
 5   review_count                     36261 non-null  float64       
 6   is_open                          36261 non-null  boolean       
 7   review_count_log1p               36261 non-null  float64       
 8   attr_ByAppointmentOnly           3139 non-null   boolean       
 9   attr_BusinessAcceptsCreditCards  31372 non-null  boolean       
 10  attr_BikeParking                 26853 non-null  boolean  

In [87]:
# Target
y = df["avg_stars_2019"].astype(float)

# Columns to exclude from predictors
exclude = {
    "business_id", "city", "state",
    "avg_stars_2019", "review_count",
    "rev_count_2019", "first_review_2019", "last_review_2019",
}

# Build X (everything except target + excluded)

feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].copy()



In [88]:
#!pip install xgboost

## Define Target
Make a binary target (≥4★ = 1, else 0)

In [89]:
# Binary classification target
y_cls = (y >= 4.0).astype(int)
y_cls.value_counts(normalize=True).round(3)  # quick class balance check

avg_stars_2019
0    0.614
1    0.386
Name: proportion, dtype: float64

In [90]:
# Sklearn imports for optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Partition predictors & response into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cls,
    test_size = 0.2,   # Reserve 20% for "hold-out" data, 
    random_state=42
)

X_train.head(5)

Unnamed: 0,latitude,longitude,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_BikeParking,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_RestaurantsDelivery,...,cat__Salad,cat__Chicken Wings,cat__Cafes,cat__Delis,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rl_word_mean,rl_share_short24
8442,38.738936,-90.397281,True,4.574711,,True,False,2.0,True,True,...,0,0,0,0,0,0,0,0,71.375,0.208333
5934,40.209943,-75.225566,True,4.812184,,True,True,2.0,True,True,...,1,0,0,0,0,0,0,0,106.0625,0.09375
31281,39.752035,-75.541795,True,2.079442,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,66.0,0.0
18393,53.517787,-113.50945,True,4.025352,,,False,2.0,True,True,...,0,0,0,0,0,0,0,0,145.42857,0.0
21544,30.02008,-90.2506,True,2.397895,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,65.85714,0.142857


## Configure MLflow Directory

In [91]:
# Configure this week into central ML repository for course
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Set experiment name for this module
mlflow.set_experiment("BDA602 Yelp project")

<Experiment: artifact_location='mlflow-artifacts:/786926549055850120', creation_time=1759981678118, experiment_id='786926549055850120', last_update_time=1759981678118, lifecycle_stage='active', name='BDA602 Yelp project', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [92]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from scipy.stats import loguniform, randint, uniform
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd


## Manual Hyperparameter RandomizedSearchCV with MLflow Tracking

In [93]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from scipy.stats import loguniform, randint, uniform
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

In [94]:
import time
import numpy as np
from xgboost import XGBClassifier
from scipy.stats import randint, loguniform
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from mlflow.models.signature import infer_signature
import mlflow

# --- Base model ---
base_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",      # or "gpu_hist" if you have a GPU
    n_jobs=-1,
    random_state=42
)

pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),     # your fitted ColumnTransformer
    ("classifier", base_xgb)
])

# --- Search space ---
param_distributions = {
    "classifier__n_estimators": randint(400, 1400),
    "classifier__learning_rate": loguniform(1e-2, 3e-1),  # ~0.01–0.3
    "classifier__max_depth": randint(3, 9),               # 3–8
    # Optional extras:
    # "classifier__subsample": loguniform(0.6, 1.0),
    # "classifier__colsample_bytree": loguniform(0.6, 1.0),
    # "classifier__reg_lambda": loguniform(1e-3, 10),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=80,                                  # 50–100 is a good first pass
    scoring={"roc_auc": "roc_auc", "f1": "f1", "accuracy": "accuracy"},
    refit="roc_auc",                            # choose best by ROC AUC
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# --- Fit & time the search ---
t0 = time.time()
search.fit(X_train, y_train)
elapsed = time.time() - t0

# --- CV metrics for the chosen params ---
best_idx     = search.best_index_
best_params  = search.best_params_
cv_best_auc  = float(search.best_score_)  # refit='roc_auc'
cv_best_f1   = float(search.cv_results_["mean_test_f1"][best_idx])
cv_best_acc  = float(search.cv_results_["mean_test_accuracy"][best_idx])

# --- Evaluate on the untouched test set ---
best_xgb = search.best_estimator_
y_proba  = best_xgb.predict_proba(X_test)[:, 1]
y_pred   = (y_proba >= 0.5).astype(int)

metrics = {
    "test_auc": float(roc_auc_score(y_test, y_proba)),
    "test_f1": float(f1_score(y_test, y_pred)),
    "test_accuracy": float(accuracy_score(y_test, y_pred)),
    # extras to match your Elastic Net / RF logs
    "cv_best_auc": cv_best_auc,
    "cv_best_f1": cv_best_f1,
    "cv_best_accuracy": cv_best_acc,
}

# # --- Log to MLflow (same keys as Elastic Net) ---
# with mlflow.start_run(run_name="xgb_randomsearch"):
#     mlflow.log_params(best_params)
#     for k, v in metrics.items():
#         mlflow.log_metric(k, v)

#     signature = infer_signature(X_test, y_pred)  # input -> predicted labels
#     mlflow.sklearn.log_model(search.best_estimator_, "model", signature=signature)

print("Best CV AUC:", cv_best_auc)
print("Test metrics:", metrics)
print("Best params:", best_params)
print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")


Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best CV AUC: 0.8017660322516452
Test metrics: {'test_auc': 0.7872820729488383, 'test_f1': 0.6101246694370986, 'test_accuracy': 0.7154280987177719, 'cv_best_auc': 0.8017660322516452, 'cv_best_f1': 0.6383313312309237, 'cv_best_accuracy': 0.7333147935779476}
Best params: {'classifier__learning_rate': 0.08167611317882542, 'classifier__max_depth': 4, 'classifier__n_estimators': 421}
Test ROC AUC (unseen data): 0.7873


In [95]:
print(f"Unseen test set -> ROC AUC: {test_auc:.4f} | F1: {test_f1:.4f} | Accuracy: {test_acc:.4f}")

NameError: name 'test_auc' is not defined

In [None]:
pipe

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
