# Airbnb NY Listing Price Prediction: Python Model Tuning


Distributed hyperparameter tuning:
* Fugue Tune [link](https://fugue-tutorials.readthedocs.io/tutorials/tune/index.html), YouTube [demo](https://www.youtube.com/watch?v=_GBjqskD8Qk&t=1s)
* Official GitHub repo [link](https://github.com/fugue-project/tune)

In this example, a Gradient Boosting model (`GradientBoostingRegressor`) from the scikit-learn framework is tuned.


In [0]:
import pandas as pd
import os
import numpy as np
import mlflow
from datetime import datetime

from typing import Any
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate
from tune import Space, Rand, RandInt, Grid, suggest_for_noniterative_objective
from tune_hyperopt import HyperoptLocalOptimizer
from tune_optuna import  OptunaLocalOptimizer

from sklearn.ensemble import GradientBoostingRegressor

### Split into Train & Test

In [0]:
# Set up variables
# Catalog and schema names are read from the environment.
catalog_ = os.getenv('CATALOG_NAME')
schema_ = os.getenv('SCHEMA_NAME')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and "USE CATALOG " + None would otherwise raise an opaque TypeError.
_missing_env = [
    name
    for name, value in (('CATALOG_NAME', catalog_), ('SCHEMA_NAME', schema_))
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

spark.sql("USE CATALOG " + catalog_)
spark.sql("USE SCHEMA " + schema_)

SEED_ = 111  # seed reused for the train/test split and the search space
target_metric = 'price_log'  # name of the target column
experiment_name_ = 'Airbnb_NY_Tuning'  # last component of the MLflow experiment path

In [0]:
# Split data
# Load the gold table, split it with weights 0.85 / 0.15 (seeded), and
# bring both parts into pandas.
gold_data = spark.sql("SELECT * from airbnb_ny_gold_data")
train_df, test_df = (
    part.toPandas()
    for part in gold_data.randomSplit([.85, .15], seed=SEED_)
)

# Preview the first two training rows
display(train_df.head(2))

In [0]:
# Convert data into np arrays
# Features are picked by position (second column up to, but excluding, the
# last two columns); the target is picked by name.
# NOTE(review): assumes the first and the last two columns are not features —
# confirm against the gold table schema.
feature_cols = slice(1, -2)

xTrain, yTrain = (
    train_df.iloc[:, feature_cols].to_numpy(),
    train_df[target_metric].to_numpy(),
)

xTest, yTest = (
    test_df.iloc[:, feature_cols].to_numpy(),
    test_df[target_metric].to_numpy(),
)

### Hyperparameter tuning with Fugue-Tune

In [0]:
## Define Cross Validation params
num_CV = 3  # passed as `cv` to cross_val_score in the objective function
x, y = xTrain, yTrain  # training features / target read by the objective function

In [0]:
## Define an objective function (single metric)
def objective(model: Any, **hp) -> float:
    """Return the mean cross-validated R2 score for one hyperparameter set.

    ``model`` is called with the keyword arguments in ``hp`` to build an
    estimator, which is then scored with ``cross_val_score`` on the
    module-level ``x`` / ``y`` arrays using ``cv=num_CV``.

    Args:
        model: Callable (e.g. an estimator class) that builds the estimator.
        **hp: Hyperparameters forwarded to ``model``.

    Returns:
        The mean of the per-fold R2 scores.
    """
    # The parameter has to stay named ``model`` because of tune.Space()
    estimator = model(**hp)
    r2_scorer = make_scorer(r2_score)
    fold_scores = cross_val_score(estimator, x, y, cv=num_CV, scoring=r2_scorer)
    return fold_scores.mean()

In [0]:
# Train basic models as baseline
# Calling the objective without hyperparameters builds the estimator with
# no constructor arguments.
baseline_score = objective(GradientBoostingRegressor)
print(baseline_score)

In [0]:
## Define Search space
## Each grid combination will be trained

# Model class plus three learning-rate values sampled up front (seeded)
XBGspace1 = Space(
    model=GradientBoostingRegressor,
    learning_rate=Rand(0.03, 1.6, 0.03),
).sample(3, seed=SEED_)

space2 = Space(
    random_state=SEED_,
    # Max number of features to consider when looking for the best split
    max_features=Grid('sqrt', 'log2', None),
    # Maximum depth of the individual estimators
    max_depth=Grid(5, 50, None),
    # Nr of trees, Bayesian
    n_estimators=RandInt(10, 700, include_high=True),
)

# Combine both spaces with tune's `*` operator
space_XGB = XBGspace1 * space2

n_combinations = len(list(space_XGB))
print("XGB tot combinations: %3d\n" % n_combinations)
# Uncomment to inspect every combination:
#print(list(space_XGB))

In [0]:
## Set up the MLFlow Experiment
experiment_path = f'/Users/gabriele.albini@databricks.com/{experiment_name_}'
experiment = mlflow.get_experiment_by_name(experiment_path)

# Reuse the experiment when the lookup finds one; otherwise create it.
experiment_id = (
    mlflow.create_experiment(name=experiment_path)
    if experiment is None
    else experiment.experiment_id
)

print(experiment_id)

In [0]:
## Run hyper parameter search on XGB
Max_Iter = 10

with mlflow.start_run(experiment_id=experiment_id, run_name="XGB_sklearn_FugueTune") as run:
    # Swap in HyperoptLocalOptimizer here to use Hyperopt instead of Optuna
    optimizer_ = OptunaLocalOptimizer(max_iter=Max_Iter)
    search_kwargs = dict(
        objective=objective,
        space=space_XGB,
        local_optimizer=optimizer_,
        execution_engine=spark,
        logger=run,
    )
    result_ = suggest_for_noniterative_objective(**search_kwargs)