# Airbnb NY Listing Price Prediction: ML Models Setup


Distributed hyperparameter tuning:
* Fugue Tune [link](https://fugue-tutorials.readthedocs.io/tutorials/tune/index.html), YouTube [demo](https://www.youtube.com/watch?v=_GBjqskD8Qk&t=1s)
* Official GitHub repo [link](https://github.com/fugue-project/tune)


In [0]:
%pip install mlflow
%pip install scikit-learn==1.4.1.post1
%pip install fugue
%pip install tune
%pip install hyperopt

In [0]:
from typing import Any

import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from tune import Grid, Rand, RandInt, Space, suggest_for_noniterative_objective
from tune_hyperopt import HyperoptLocalOptimizer

### Split into Train & Test

In [0]:
# Select the catalog and schema for this session so that the unqualified
# table names used in later cells (e.g. gold_data) are looked up there.
catalog_ = "price_prediction"
schema_ = "ny_listing"
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

In [0]:
# Seed shared by the train/test split and by the seeded searches further down.
SEED_ = 142

# Read the gold table and split it on the Spark side with weights 0.85 / 0.15.
gold_data = spark.sql("SELECT * from gold_data")
train_sdf_, test_sdf_ = gold_data.randomSplit([.85, .15], seed=SEED_)

# Collect both splits to pandas; the scikit-learn code below works on
# in-memory arrays.
train_df = train_sdf_.toPandas()
test_df = test_sdf_.toPandas()

display(train_df.head(2))
#display(test_df.head(2))

In [0]:
# Convert data into np arrays.
# Features: every column except the first and the last. Target: 'price'.
# NOTE(review): this only keeps 'price' out of the features if 'price' is the
# last column of gold_data -- TODO confirm against the table schema.
feature_cols_ = slice(1, -1)

xTrain = train_df.iloc[:, feature_cols_].to_numpy()
yTrain = train_df['price'].to_numpy()

xTest = test_df.iloc[:, feature_cols_].to_numpy()
yTest = test_df['price'].to_numpy()

### Hyperparameter tuning with Fugue-Tune

In [0]:
## Define an objective function
num_CV = 3  # number of cross-validation folds
x = xTrain
y = yTrain


def objective(model: Any, **hp) -> float:
    """Return the mean cross-validated MAPE of ``model(**hp)``.

    ``model`` is an estimator class (not an instance). It is instantiated
    with the keyword hyperparameters in ``hp`` and scored with ``num_CV``-fold
    cross-validation on the module-level ``x`` / ``y`` arrays.

    The first parameter has to stay named ``model``: the search spaces pass
    the estimator class under that keyword.

    The returned value is the average of the per-fold mean absolute
    percentage errors, so smaller is better.
    """
    estimator = model(**hp)
    mape_scorer = make_scorer(mean_absolute_percentage_error)
    fold_scores = cross_val_score(estimator, x, y, cv=num_CV, scoring=mape_scorer)
    return fold_scores.mean()

In [0]:
## Define a Search space
# Random forest: full grid over max_features x bootstrap.
RFspace1 = Space(
    model=RandomForestRegressor,
    max_features=Grid('sqrt', 'log2', None),
    bootstrap=Grid(True, False),
)

# Gradient boosting: 3 seeded random draws of max_depth.
XBGspace1 = Space(
    model=GradientBoostingRegressor,
    max_depth=RandInt(3, 10),
).sample(3, seed=SEED_)

# Shared by both model spaces. It is not sampled here; presumably it is left
# for the local optimizer to tune (per the variable name) -- TODO confirm.
space2_bayesian = Space(
    n_estimators=RandInt(100, 120),  # Nr of trees
)

space_RF = RFspace1 * space2_bayesian
space_XGB = XBGspace1 * space2_bayesian

# Materialise each space once and reuse the lists for the count and the dump.
rf_candidates_ = list(space_RF)
xgb_candidates_ = list(space_XGB)

print("Tot combinations to be tested: %3d\n" % (len(rf_candidates_) + len(xgb_candidates_)))
print(rf_candidates_)
print(xgb_candidates_)

In [0]:
## Run hyper parameter search
# Local optimizer applied to each candidate of the space: at most 20 iterations.
local_optimizer_ = HyperoptLocalOptimizer(max_iter=20)

experiment_ = mlflow.set_experiment("/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings_Testing")
with mlflow.start_run(experiment_id=experiment_.experiment_id, run_name="Fugue Tune RF") as run:
    # The Spark session is passed as the execution engine and the active
    # MLflow run as the logger.
    result_ = suggest_for_noniterative_objective(
        objective=objective,
        space=space_RF,
        local_optimizer=local_optimizer_,
        execution_engine=spark,
        logger=run,
    )

### Baseline model performances

In [0]:
# Baselines: score each estimator class with its default hyperparameters
# (no keyword overrides are passed). objective() returns the mean
# cross-validated MAPE, so lower is better.
RF_baseline = objective(RandomForestRegressor)
print("RF Baseline score: %.4f" % RF_baseline)

# "XGB" is the label this notebook uses for scikit-learn's
# GradientBoostingRegressor.
XGB_baseline = objective(GradientBoostingRegressor)
print("XGB Baseline score: %.4f" % XGB_baseline)

### Random Forest : RandomSearch

In [0]:
# Estimator with default settings; it is passed as `estimator` to the
# randomized search below, which supplies the hyperparameter values.
base_model_ = RandomForestRegressor()

# Set a wide range of hyper parameters (candidate values for the randomized search)
hyper_par = {
    # Nr of trees
    'n_estimators': list(range(2, 2000, 10)),
    # Maximum number of levels in tree; None means no depth limit
    'max_depth': [*range(10, 110, 11), None],
    # Minimum number of samples required to split a node
    'min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'min_samples_leaf': [1, 2, 4],
    # Number of features to consider at every split.
    # 'auto' is no longer accepted for random forests (removed in
    # scikit-learn 1.3, and this notebook pins 1.4.1.post1), so every sampled
    # candidate using it would fail to fit. For the regressor it used to mean
    # "all features", which None already covers; 'log2' takes its place to
    # match the grid used in the Fugue Tune search space.
    'max_features': ['sqrt', 'log2', None],
    # Method of selecting samples for training each tree
    'bootstrap': [True, False],
}

print(hyper_par)

In [0]:
# Start Random Search (4hrs in pure python)
# 100 sampled candidates x 3 folds, scored with R^2 (note: the Fugue Tune
# objective above uses MAPE instead).
search_settings_ = dict(
    estimator=base_model_,
    param_distributions=hyper_par,
    n_iter=100,
    cv=3,
    scoring='r2',
    random_state=SEED_,
    n_jobs=-1,
    verbose=2,
)
RFRandom = RandomizedSearchCV(**search_settings_)

# Enable MLflow autologging for scikit-learn before fitting inside the run.
mlflow.sklearn.autolog()
experiment_ = mlflow.set_experiment("/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings_Testing")
with mlflow.start_run(experiment_id=experiment_.experiment_id, run_name="RandomForest Random Search") as run:
    RFRandom.fit(xTrain, yTrain)

In [0]:
# Expose the cross-validation results to SQL through a temporary view.
RFRandom_result_df = pd.DataFrame(RFRandom.cv_results_)
rf_random_sdf_ = spark.createDataFrame(RFRandom_result_df)
rf_random_sdf_.createOrReplaceTempView("RF_Random_Search")

In [0]:
%sql
DROP TABLE IF EXISTS RandomForest_RandomSearch;
CREATE TABLE RandomForest_RandomSearch AS
  select * from RF_Random_Search;

In [0]:
%sql
select * from RandomForest_RandomSearch order by rank_test_score limit 200;

### Random Forest : GridSearch

In [0]:
# bootstrap, max_features, min_samples_leaf and min_samples_split are fixed
# here; the grid defined below varies only n_estimators and max_depth.
base_model_ = RandomForestRegressor(bootstrap=True, max_features=None, min_samples_leaf=1, min_samples_split=2)

# Candidate values for the exhaustive grid search
hyper_par = {
    'n_estimators': list(range(250, 2100, 250)),  # Nr of trees
    'max_depth': list(range(50, 101, 25)),  # Maximum number of levels in tree
}

# Reducing grid for testing purposes
# hyper_par = {'max_depth' : [None],'max_features': [None],'n_estimators': [10,30]}

In [0]:
# Create Grid Search: every combination in hyper_par, scored with R^2.
RFSearch = GridSearchCV(
    estimator=base_model_,
    param_grid=hyper_par,
    scoring='r2',
    cv=2,  # 2 folds; cv=None would fall back to the default 5-fold cross validation
    n_jobs=-1,
    verbose=2,
)

In [0]:
# Start and Log GridSearch
# Autologging is enabled first, then the fit runs inside a named MLflow run.
mlflow.sklearn.autolog()
experiment_ = mlflow.set_experiment("/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings_Testing")
grid_run_ctx_ = mlflow.start_run(
    experiment_id=experiment_.experiment_id,
    run_name="RandomForest Grid Search",
)
with grid_run_ctx_ as run:
    RFSearch.fit(xTrain, yTrain)

In [0]:
# Expose the grid-search cross-validation results to SQL through a temporary view.
RFSearch_result_df = pd.DataFrame(RFSearch.cv_results_)
rf_grid_sdf_ = spark.createDataFrame(RFSearch_result_df)
rf_grid_sdf_.createOrReplaceTempView("RF_Grid_Search")

In [0]:
%sql
DROP TABLE IF EXISTS RandomForest_GridSearch;
CREATE TABLE RandomForest_GridSearch AS
  select * from RF_Grid_Search;
-- select * from RandomForest_GridSearch order by rank_test_score limit 200;

In [0]:
%sql
select * from RandomForest_GridSearch order by rank_test_score limit 200;