# Airbnb NY Listing Price Prediction: Best ML Models

In [0]:
import pandas as pd
import numpy as np
import mlflow
import os

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import clone

In [0]:
### Define Global Variables
# Catalog and schema are injected through the environment so the notebook can
# run unchanged against different workspaces.
catalog_ = os.getenv('CATALOG_NAME')
schema_ = os.getenv('SCHEMA_NAME')

# Fail fast with an explicit message: without this check a missing variable
# surfaces as an opaque `TypeError` from concatenating `str` and `None`.
_missing_env_vars = [
  env_name
  for env_name, env_value in (('CATALOG_NAME', catalog_), ('SCHEMA_NAME', schema_))
  if not env_value
]
if _missing_env_vars:
  raise RuntimeError(
    "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
  )

# Make the catalog/schema the session defaults so that tables can be
# referenced by their short name in the rest of the notebook.
spark.sql("USE CATALOG "+catalog_)
spark.sql("USE SCHEMA "+schema_)

SEED_ = 334                              # seed shared by the data split and the estimators
Train_tbl_Name = 'airbnb_ny_gold_data'   # table holding the modelling data
Target_Var_ = 'price'                    # name of the target column
experiment_name_ = 'Airbnb_NY_Tuning'    # MLflow experiment (leaf) name

### Split into Train & Test

In [0]:
# Read the full gold table, then split it 85% / 15% with a fixed seed.
gold_data = spark.sql(f"SELECT * from {Train_tbl_Name}")
_train_sdf, _test_sdf = gold_data.randomSplit([.85, .15], seed=SEED_)

# Collect both partitions to pandas; the models below are trained with scikit-learn.
train_df, test_df = _train_sdf.toPandas(), _test_sdf.toPandas()

# Quick visual check of the first two training rows.
display(train_df.head(2))

In [0]:
# Convert data into np arrays
def _split_features_target(frame):
  """Return `(features, target)` numpy arrays extracted from *frame*.

  Features are selected by position: every column from index 1 up to, but
  not including, the last two columns. The target is selected by name
  using `Target_Var_`.
  """
  features = frame.iloc[:, 1:-2].to_numpy()
  target = frame.loc[:, Target_Var_].to_numpy()
  return features, target


xTrain, yTrain = _split_features_target(train_df)
xTest, yTest = _split_features_target(test_df)

### Extract Model Params from Runs

In [0]:
## Define Base Models
# Untuned template estimators; both receive the same seed.
# NOTE: despite its name, `xgb_base` is scikit-learn's GradientBoostingRegressor.
_base_model_kwargs = {'random_state': SEED_}
rf_base = RandomForestRegressor(**_base_model_kwargs)
xgb_base = GradientBoostingRegressor(**_base_model_kwargs)

In [0]:
## Set up the MLFlow Experiment
experiment_path = f'/Users/gabriele.albini@databricks.com/{experiment_name_}'

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(experiment_path)
experiment_id = (
  mlflow.create_experiment(name=experiment_path)
  if experiment is None
  else experiment.experiment_id
)

# Autologging is switched off: params, metrics and models are logged explicitly below.
mlflow.sklearn.autolog(disable=True)

In [0]:
## Define Model details to retrieve
# Evaluation metrics applied to every model type, keyed by the prefix used
# when the metric is logged. Each entry below receives its own copy.
_shared_eval_metrics = {
  'mape': mean_absolute_percentage_error,
  'r2': r2_score,
  'mse': mean_squared_error
}

# Mapping: param name logged into mlflow -> corresponding param name in model.
# This is needed as tuning was done on spark.ml while here we're using sklearn.ensemble.
_rf_params_tuned = {
  'featureSubsetStrategy': 'max_features',
  'maxDepth': 'max_depth',
  'numTrees': 'n_estimators'
}

# For gradient boosting the logged names already match the sklearn names.
_gbr_params_tuned = {
  'learning_rate': 'learning_rate',
  'max_depth': 'max_depth',
  'max_features': 'max_features',
  'n_estimators': 'n_estimators'
}

# One entry per model type:
#   model        - base estimator to clone (use set_params() on the clone)
#   run_ids      - MLflow runs whose logged params are reused
#   params_tuned - logged-name -> sklearn-name mapping
#   eval_metrics - metric name -> metric function
models_meta = {
  'RandomForestRegressor': dict(
    model=rf_base,
    run_ids=[
      'c4a291febe8144649fc3b032ed905e06',
      '2ca308a2e2a54c6fae2f047a6eb57274',
      '92848115fcc941698edddec111dea043'
    ],
    params_tuned=_rf_params_tuned,
    eval_metrics=dict(_shared_eval_metrics)
  ),
  'GradientBoostingRegressor': dict(
    model=xgb_base,
    run_ids=[
      '9eb02478b0a24684a1d0392e3b4ab809',
      '89be137be9c54f8c800a414f22bbc134'
    ],
    params_tuned=_gbr_params_tuned,
    eval_metrics=dict(_shared_eval_metrics)
  )
}

In [0]:
## Retrieve info from MLflow and Initialize Models
def _coerce_param_value(param_name, param_value):
  """Convert a parameter value read from an MLflow run into a Python value.

  Args:
    param_name: name of the parameter on the sklearn estimator.
    param_value: value as returned by MLflow (a string).

  Returns:
    An `int` for all-digit strings, `1/3` for 'onethird', `None` for 'None',
    a `float` for `learning_rate` and for any other string that parses as a
    number; otherwise the string itself (e.g. 'sqrt').
  """
  if param_value.isdigit() and param_name != 'learning_rate':
    return int(param_value)
  if param_value == 'onethird':
    # 'onethird' is translated to the equivalent feature fraction
    return float(1/3)
  if param_value == 'None':
    return None
  if param_name == 'learning_rate':
    return float(param_value)
  try:
    # Decimal strings (e.g. '0.5') must reach the estimator as numbers, not str
    return float(param_value)
  except ValueError:
    return param_value


def init_models(models_meta_):
  """Build one un-fitted estimator per MLflow run listed in *models_meta_*.

  Args:
    models_meta_: dict keyed by model type; each value must provide
      'model' (base estimator to clone), 'run_ids' (MLflow run ids) and
      'params_tuned' (logged param name -> estimator param name).

  Returns:
    List of cloned estimators, in the order of the model types and their
    run ids, each configured with the mapped parameters found in its run.
    Logged parameters that are not listed in 'params_tuned' are ignored.
  """
  models_fin = []
  client = mlflow.tracking.MlflowClient()

  for model_spec in models_meta_.values():

    ### Get model type details (read from the argument, not from a global)
    param_name_map = model_spec['params_tuned']

    for run_id in model_spec['run_ids']:

      ### Extract parameters from mlflow run and convert them to the corresponding name and type
      logged_params = client.get_run(run_id).data.params
      parameters_to_set = {
        param_name_map[logged_name]: _coerce_param_value(param_name_map[logged_name], logged_value)
        for logged_name, logged_value in logged_params.items()
        if logged_name in param_name_map
      }

      ### Initialize a new model and set identified parameters on it
      model_ = clone(model_spec['model'])
      model_.set_params(**parameters_to_set)
      models_fin.append(model_)

  return models_fin

In [0]:
# Instantiate one estimator per configured MLflow run.
my_models_ = init_models(models_meta)

# Print each estimator to check the parameters that were applied.
for configured_model in my_models_:
  print(configured_model)

### Train Models

In [0]:
# Train every configured model inside its own nested MLflow run, grouped under
# a single parent run named "Best Models".
with mlflow.start_run(experiment_id=experiment_id, run_name="Best Models") as run:

  # The training table is the same for every model: build the dataset handle once.
  dataset_train = mlflow.data.load_delta(table_name=f"{catalog_}.{schema_}.{Train_tbl_Name}")

  for j, model_ in enumerate(my_models_):

    ## Determine model type from the estimator class name, e.g. 'GradientBoostingRegressor'
    model_type = type(model_).__name__
    if model_type not in models_meta:
      # Fail loudly instead of silently evaluating with another type's metrics
      raise ValueError(
        f"No entry in models_meta for model type '{model_type}' (model index {j})"
      )
    metrics = models_meta[model_type]['eval_metrics']

    # NOTE: `run` is rebound to the nested run for the duration of this block.
    with mlflow.start_run(experiment_id=experiment_id, run_name=f"Model-{j}", nested=True) as run:

      ## Train & predict
      m_trained_ = model_.fit(xTrain, yTrain)
      pred_yTrain = m_trained_.predict(xTrain)
      pred_yTest = m_trained_.predict(xTest)

      ## Log mlflow params
      mlflow.log_params(m_trained_.get_params())

      ## Calculate and log evaluation metrics on both splits
      for metric_k, metric_f in metrics.items():
        mlflow.log_metric(metric_k+'_train', metric_f(yTrain, pred_yTrain))
        mlflow.log_metric(metric_k+'_test', metric_f(yTest, pred_yTest))

      ## Log mlflow model
      signature_ = mlflow.models.infer_signature(model_input=xTrain[:10], model_output=pred_yTrain[:10])
      mlflow.log_input(dataset_train, context="training") # Allows to build UC Lineage
      # A few rows are enough to illustrate the expected input; passing the
      # whole test set would store it alongside every logged model.
      mlflow.sklearn.log_model(sk_model = m_trained_, artifact_path = "model", signature=signature_, input_example=xTest[:5])