# AirBnb NY Listing Price Prediction: ML Tuning


Distributed hyperparameter tuning:
* Fugue Tune tutorial [link](https://fugue-tutorials.readthedocs.io/tutorials/tune/index.html), YouTube [demo](https://www.youtube.com/watch?v=_GBjqskD8Qk&t=1s)
* Official Github repo [link](https://github.com/fugue-project/tune)


In [0]:
%pip install mlflow
%pip install scikit-learn==1.4.1.post1
%pip install fugue
%pip install tune
%pip install hyperopt
%pip install optuna

In [0]:
#dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import mlflow

from typing import Any
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from tune import Space, Rand, RandInt, Grid, suggest_for_noniterative_objective
from tune_hyperopt import HyperoptLocalOptimizer
from tune_optuna import  OptunaLocalOptimizer

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

### Split into Train & Test

In [0]:
# Point the Spark session at the catalog and schema holding the listing
# tables; the table read further down uses an unqualified name.
catalog_ = "price_prediction"
schema_ = "ny_listing"
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

In [0]:
# Seed handed to the random split (and reused by the tuning cells).
SEED_ = 142

# Read the gold table, split it 85% / 15% on the Spark side, then convert
# each part to a pandas DataFrame.
gold_data = spark.sql("SELECT * from gold_data")
train_df, test_df = (
    part.toPandas()
    for part in gold_data.randomSplit([.85, .15], seed=SEED_)
)

# Quick visual check of the first training rows
display(train_df.head(2))
#display(test_df.head(2))

In [0]:
# Convert data into np arrays for scikit-learn.
# Features are picked by position (every column except the first and the
# last); the target is picked by name.
# NOTE(review): presumably the first column is an identifier and the last
# column is 'price' — confirm against the gold_data schema, otherwise the
# target could end up among the features.
xTrain, yTrain = train_df.iloc[:, 1:-1].to_numpy(), train_df['price'].to_numpy()
xTest, yTest = test_df.iloc[:, 1:-1].to_numpy(), test_df['price'].to_numpy()

### Hyperparameter tuning with Fugue-Tune

In [0]:
# MLflow experiment path passed to mlflow.set_experiment by the tuning cells.
# NOTE(review): hard-coded to one user's workspace folder — adjust when the
# notebook is run by someone else.
_mlflow_exp = '/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings'

In [0]:
## Define an objective function
# Number of cross-validation folds and the data every trial is scored on.
num_CV = 3
x = xTrain
y = yTrain


def objective(model: Any, **hp) -> float:
    """Return the mean cross-validated MAPE of ``model`` built with ``hp``.

    Args:
        model: Estimator class (not an instance); it is instantiated as
            ``model(**hp)``. The parameter keeps the name ``model`` because
            the search spaces supply the estimator under that key.
        **hp: Keyword arguments forwarded to the estimator constructor.

    Returns:
        Mean of the per-fold mean absolute percentage errors, computed with
        ``num_CV`` folds on the module-level ``x`` / ``y`` arrays.
    """
    estimator = model(**hp)
    mape_scorer = make_scorer(mean_absolute_percentage_error)  # MAPE
    fold_scores = cross_val_score(
        estimator, x, y, cv=num_CV, scoring=mape_scorer
    )
    return fold_scores.mean()

In [0]:
# Train basic models as baseline (default hyperparameters).
# random_state is pinned to SEED_ so the baseline scores are reproducible
# and use the same seed as the tuned candidates, which also set
# random_state = SEED_ in their search space.
print(objective(RandomForestRegressor, random_state=SEED_))
print(objective(GradientBoostingRegressor, random_state=SEED_))

In [0]:
## Define Search spaces
## Each grid combination will be trained

# Random forest: only the estimator class is fixed here.
RFspace1 = Space(model=RandomForestRegressor)

# Gradient boosting (named "XGB" in this notebook, but the estimator is
# scikit-learn's GradientBoostingRegressor). Three learning-rate values are
# sampled up front with the shared seed.
XBGspace1 = Space(
    model=GradientBoostingRegressor,
    learning_rate=Rand(0.05, 1.6, 0.05),
).sample(3, seed=SEED_)

# Hyperparameters shared by both model families.
space2 = Space(
    random_state=SEED_,
    # Max number of features to consider when looking for the best split
    max_features=Grid('sqrt', 'log2', None),
    # Maximum depth of the individual estimators
    max_depth=Grid(5, 50, None),
    # Nr of trees; not enumerated as a grid, tuned by the local optimizer (Bayesian)
    n_estimators=RandInt(80, 1000, include_high=True),
)

# Combine each model-specific space with the shared hyperparameters.
space_RF = RFspace1 * space2
space_XGB = XBGspace1 * space2

print("RF tot combinations: %3d\n" % sum(1 for _ in space_RF))
#print(list(space_RF))

print("XGB tot combinations: %3d\n" % sum(1 for _ in space_XGB))
#print(list(space_XGB))

In [0]:
## Run hyper parameter search on RF
Max_Iter = 10  # max_iter handed to the local optimizer
experiment_ = mlflow.set_experiment(_mlflow_exp)
with mlflow.start_run(
    experiment_id=experiment_.experiment_id,
    run_name="RF_FugueTune",
) as run:
    # HyperoptLocalOptimizer or OptunaLocalOptimizer
    rf_optimizer = HyperoptLocalOptimizer(max_iter=Max_Iter)

    # The search runs on the Spark session; the active MLflow run is passed
    # as the logger.
    result_ = suggest_for_noniterative_objective(
        objective=objective,
        space=space_RF,
        local_optimizer=rf_optimizer,
        execution_engine=spark,
        logger=run,
    )

In [0]:
## Run hyper parameter search on XGB
Max_Iter = 15  # max_iter handed to the local optimizer
experiment_ = mlflow.set_experiment(_mlflow_exp)
with mlflow.start_run(
    experiment_id=experiment_.experiment_id,
    run_name="XGB_FugueTune",
) as run:
    # HyperoptLocalOptimizer or OptunaLocalOptimizer
    gb_optimizer = OptunaLocalOptimizer(max_iter=Max_Iter)

    # The search runs on the Spark session; the active MLflow run is passed
    # as the logger.
    result_ = suggest_for_noniterative_objective(
        objective=objective,
        space=space_XGB,
        local_optimizer=gb_optimizer,
        execution_engine=spark,
        logger=run,
    )