# Airbnb NY Listing Price Prediction: ML Models Setup


Resources on distributing Python with Ray:
* Databricks [doc](https://docs.databricks.com/en/machine-learning/ray-integration.html)
* Ray [documentation](https://docs.ray.io/en/latest/ray-more-libs/joblib.html) for Scikit-learn
* Random Search [link](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)
* Logging GridSearchCV in MLFlow: [link](https://seunghan96.github.io/mlops/mlops4/)


In [0]:
%pip install mlflow
%pip install scikit-learn==1.4.1.post1

In [0]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

### Split into Train & Test

In [0]:
# Unity Catalog location of the listing tables; plain string literals are
# enough here because nothing is interpolated into the names themselves.
catalog_ = "price_prediction"
schema_ = "ny_listing"

# Make the catalog/schema the session defaults so that later queries can use
# unqualified table names.
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

In [0]:
# Fixed seed used for the train/test split.
SEED_ = 142

# Read the gold table, split it 85% / 15% and bring both parts to pandas.
gold_data = spark.sql("SELECT * from gold_data")
train_df, test_df = (
    part.toPandas()
    for part in gold_data.randomSplit([0.85, 0.15], seed=SEED_)
)

# Quick look at the first rows of the training set.
display(train_df.head(2))
#display(test_df.head(2))

In [0]:
# Turn the pandas frames into NumPy arrays for scikit-learn.
# Features: every column except the first and the last (positional slice).
# Target: the 'price' column, selected by name.
# NOTE(review): the positional slice presumably skips an id column and a
# trailing 'price' column — confirm against the gold_data schema.
feature_slice = slice(1, -1)

xTrain, xTest = (
    frame.iloc[:, feature_slice].to_numpy() for frame in (train_df, test_df)
)
yTrain, yTest = (
    frame['price'].to_numpy() for frame in (train_df, test_df)
)

### Random Forest : RandomSearch

In [0]:
# Base estimator whose hyper-parameters are sampled by the random search.
base_model_ = RandomForestRegressor()

# Wide search space: each key is a RandomForestRegressor constructor argument
# and each value is the list of candidate settings to sample from.
hyper_par = {
    # Number of trees: 2, 12, ..., 1992
    'n_estimators': list(range(2, 2000, 10)),
    # Maximum number of levels in a tree: 10, 21, ..., 109, plus None
    'max_depth': [*range(10, 110, 11), None],
    # Minimum number of samples required to split a node
    'min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'min_samples_leaf': [1, 2, 4],
    # Number of features to consider at every split.
    # 'auto' is intentionally absent: it was deprecated in scikit-learn 1.1 and
    # removed in 1.3, so with the scikit-learn 1.4.1 pinned by this notebook
    # every candidate that sampled it would fail to fit. For regressors 'auto'
    # meant "all features", which None already covers; 'log2' keeps this list
    # aligned with the grid-search space used later in the notebook.
    'max_features': ['sqrt', 'log2', None],
    # Whether each tree is trained on a bootstrap sample
    'bootstrap': [True, False],
}

print(hyper_par)

In [0]:
# Randomized search: 100 sampled candidates, 3-fold CV, scored by R^2,
# using all available cores (n_jobs=-1).
RFRandom = RandomizedSearchCV(
    base_model_,
    hyper_par,
    n_iter=100,
    cv=3,
    scoring='r2',
    random_state=SEED_,
    n_jobs=-1,
    verbose=2,
)

# Enable MLflow autologging for scikit-learn and run the search inside a named run.
mlflow.sklearn.autolog()
experiment_ = mlflow.set_experiment("/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings_Testing")
with mlflow.start_run(
    run_name="RandomForest Random Search",
    experiment_id=experiment_.experiment_id,
) as run:
    RFRandom.fit(xTrain, yTrain)

In [0]:
# Tabulate the cross-validation results of the random search and show the
# first 50 rows.
RFRandom_result_df = pd.DataFrame(data=RFRandom.cv_results_)
display(RFRandom_result_df.iloc[:50])

In [0]:
# Expose the random-search results to SQL as the temp view "RF_Random_Search".
random_search_sdf = spark.createDataFrame(RFRandom_result_df)
random_search_sdf.createOrReplaceTempView("RF_Random_Search")

In [0]:
%sql
-- Persist the random-search results: drop any previous copy of the table,
-- then rebuild it from the RF_Random_Search temp view.
DROP TABLE IF EXISTS RandomForest_RandomSearch;
CREATE TABLE RandomForest_RandomSearch AS
  select * from RF_Random_Search;

### Random Forest : GridSearch

In [0]:
# Fresh base estimator for the exhaustive grid search.
base_model_ = RandomForestRegressor()

# Grid: 1 x 3 x 5 = 15 parameter combinations.
hyper_par = dict(
    max_depth=[None],
    max_features=['sqrt', 'log2', None],
    n_estimators=[10, 50, 100, 150, 200],
)

# Reducing grid for testing purposes
# hyper_par = {'max_depth' : [None],'max_features': [None],'n_estimators': [10,30]}

In [0]:
# Exhaustive search over hyper_par, scored by R^2.
# cv=None keeps scikit-learn's default cross-validation (5-fold).
RFSearch = GridSearchCV(
    estimator=base_model_,
    param_grid=hyper_par,
    scoring='r2',
    cv=None,
    verbose=1,
)

In [0]:
# Run the grid search inside a named MLflow run, with scikit-learn
# autologging enabled.
mlflow.sklearn.autolog()
experiment_ = mlflow.set_experiment("/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings_Testing")
with mlflow.start_run(
    run_name="RandomForest Grid Search",
    experiment_id=experiment_.experiment_id,
) as run:
    RFSearch.fit(xTrain, yTrain)

# Tabulate the cross-validation results and show the first 50 rows.
RFSearch_result_df = pd.DataFrame(data=RFSearch.cv_results_)
display(RFSearch_result_df.iloc[:50])