```python
# TODO: run it once finished to create the package requirements file
pip freeze > requirements.txt
# conda list -e > requirements.txt
```

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy.stats import uniform, randint

Load the clean dataset created during data eng. steps:

In [2]:
df_listings = pd.read_parquet('data/listings_clean.parquet.gzip')

In [3]:
# Show all columns (instead of cascading columns in the middle)
pd.set_option("display.max_columns", None)

A quick sneak peak to the data:

In [4]:
df_listings.head()

Unnamed: 0,id,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,neighbourhood,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_maximum_nights,maximum_maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,bathrooms_type,bathrooms_qty
0,10803,38901,within a few hours,1.0,0.95,f,t,"Brunswick East, Victoria, Australia",Moreland,-37.76606,144.97951,Private room,2,,1.0,49,5,14,14,14,173,2013-01-12,2023-07-19,4.49,4.65,3.98,4.72,4.69,4.66,4.61,True,1,1.33,shared,1.0
1,12936,50121,,,,f,t,"St Kilda, Victoria, Australia",Port Phillip,-37.85999,144.97662,Entire home/apt,2,1.0,1.0,95,3,14,14,14,42,2010-08-04,2020-03-15,4.68,4.78,4.71,4.83,4.83,4.78,4.66,True,10,0.26,private,1.0
2,38271,164193,within an hour,1.0,0.91,t,t,"Berwick, Victoria, Australia",Casey,-38.05723,145.33982,Entire home/apt,5,3.0,3.0,116,1,14,14,14,228,2010-11-24,2023-08-26,4.86,4.92,4.98,4.91,4.94,4.9,4.88,True,1,1.47,private,1.0
3,41836,182833,,,,f,t,"Reservoir, Victoria, Australia",Darebin,-37.69761,145.00066,Private room,2,,1.0,40,7,365,1125,1125,159,2010-11-16,2018-08-22,4.71,4.68,4.65,4.89,4.83,4.39,4.69,True,2,1.02,shared,1.0
4,43429,189684,within an hour,1.0,0.99,t,t,"Oakleigh East, Victoria, Australia",Monash,-37.89983,145.11579,Entire home/apt,2,1.0,1.0,117,2,1125,1125,1125,248,2010-12-05,2023-09-02,4.87,4.91,4.93,4.94,4.93,4.79,4.86,True,3,1.6,private,1.0


Checking dataset types:

In [5]:
df_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23185 entries, 0 to 23184
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              23185 non-null  int64         
 1   host_id                         23185 non-null  int64         
 2   host_response_time              15418 non-null  category      
 3   host_response_rate              15418 non-null  float64       
 4   host_acceptance_rate            16579 non-null  float64       
 5   host_is_superhost               22468 non-null  category      
 6   host_identity_verified          23183 non-null  object        
 7   neighbourhood                   13257 non-null  object        
 8   neighbourhood_cleansed          23185 non-null  object        
 9   latitude                        23185 non-null  float64       
 10  longitude                       23185 non-null  float64       
 11  ro

TODO:
- ~~Decide strategy for category attributes~~
- ~~Explore which attributes would need normalization~~
- ~~Decide regression algorithm~~
- How to deal with location?

Let's separate target attribute and the rest of dataset and split the dataset on train and test sets and set the strategy for preprocessing:

In [28]:
df_listings.columns

Index(['id', 'host_id', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_identity_verified',
       'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude',
       'room_type', 'accommodates', 'bedrooms', 'beds', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'number_of_reviews', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'calculated_host_listings_count', 'reviews_per_month', 'bathrooms_type',
       'bathrooms_qty'],
      dtype='object')

In [29]:
# Drop unnecessary columns and features. Arguably, the don't offer any value for the purpose of this project
X = df_listings.drop(columns=['id', 'price', 'first_review', 'last_review', 'host_id', 
                              'calculated_host_listings_count', 'neighbourhood'])
# neighbourhood was dropped because we'll use neighbourhood_cleansed instead, which doesn't have any missing values and a lower cardinality

# Define the target variable
y = df_listings['price']

# Define categorical and numerical features
categorical_features = ['host_response_time', 'host_is_superhost', 'host_identity_verified',
                        'neighbourhood_cleansed', 'room_type', 'bathrooms_type', 'instant_bookable']

numerical_features = ['host_response_rate', 'host_acceptance_rate', 'latitude', 'longitude',
                      'accommodates', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
                      'minimum_maximum_nights', 'maximum_maximum_nights', 'number_of_reviews',
                      'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                      'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'bathrooms_qty',
                      'review_scores_value', 'reviews_per_month']

# Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


X_train, X_test, y_train, y_test = train_test_split(X, 
                                              y,
                                              test_size=0.2,
                                              random_state=99
                                              )

Now, let's use a simple linear regressor for our baseline model:

In [30]:
# Handle missing values with imputation bfill or ffill
def handle_missing_values(df):
    df.ffill(inplace=True)
    df.bfill(inplace=True)
    return df


In [31]:
# Instantiate the linear regression model
linear_model = LinearRegression()

# Create pipeline with preprocessor and model
pipeline_linear = Pipeline(steps=[('preprocessor', preprocessor),
                           ('random_search', linear_model)])

pipeline_linear.fit(handle_missing_values(X_train), y_train)

# Predict the prices
y_pred_train = pipeline_linear.predict(handle_missing_values(X_train))
y_pred_test = pipeline_linear.predict(handle_missing_values(X_test))

# evaluate the model with cross-validation and mean absolute error on the test set
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

Train MAE: 167.43620654558424
Test MAE: 172.61146473919328


In [32]:
# let's plot the predictions vs. the actual values
fig = px.scatter(x=y_test, 
                y=y_pred_test      
                )
fig.update_layout(
    title='Linear model - predictions vs. actual values',
    xaxis_title="Actuals",
    yaxis_title="Predictions"
)

In [33]:
# let's plot the residuals
residuals = y_test - y_pred_test
px.histogram(x=residuals)

In [34]:
# let's create a dataframe with the actual, predicted values and the residuals
df_pred = pd.DataFrame({'actual': y_test, 'predicted': y_pred_test, 'residuals': y_test - y_pred_test})

In [35]:
df_pred.sort_values(by='residuals', ascending=False).head(10)

Unnamed: 0,actual,predicted,residuals
15498,85000,841.782775,84158.217225
3492,13379,66.661716,13312.338284
13542,9180,393.260759,8786.739241
5136,8000,185.123988,7814.876012
3934,4262,149.663521,4112.336479
10783,3988,257.298687,3730.701313
7967,4000,659.023535,3340.976465
10781,3584,281.744566,3302.255434
8772,3000,7.384498,2992.615502
6097,3000,185.013386,2814.986614


We would pass any tweaks or tuning for now as this is a baseline model.

Now let's train a boosting model and see if we can get a better performance:

In [36]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate, RandomizedSearchCV

histogram_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=300, 
    random_state=0,
    early_stopping=True
)

# Define parameter grid for RandomizedSearchCV
param_dist = {
    'learning_rate': uniform(0.01, 0.1),
    'max_leaf_nodes': randint(20, 200)
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=histogram_gradient_boosting, 
                                   param_distributions=param_dist, 
                                   scoring='neg_mean_absolute_error',
                                   cv=5, 
                                   n_iter=10, 
                                   random_state=42
                                   )

In [37]:
cv_results_hgbdt = cross_validate(
    random_search,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_search.py", line 970, in fit
    self._run_search(evaluate_candidates)
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_search.py", line 1914, in _run_search
    evaluate_candidates(
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_search.py", line 947, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_validation.py", line 536, in _warn_or_raise_about_fit_failures
    raise ValueError(all_fits_failed_message)
ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\arrays\categorical.py", line 564, in astype
    new_cats = new_cats.astype(dtype=dtype, copy=copy)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'a few days or more'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 560, in fit
    X, known_categories = self._preprocess_X(X, reset=True)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 277, in _preprocess_X
    X = self._validate_data(X, **check_X_kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\sklearn\utils\validation.py", line 921, in check_array
    array = array.astype(new_dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\generic.py", line 6534, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\internals\managers.py", line 414, in astype
    return self.apply(
           ^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\internals\managers.py", line 354, in apply
    applied = getattr(b, f)(**kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\internals\blocks.py", line 616, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\dtypes\astype.py", line 238, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\dtypes\astype.py", line 180, in astype_array
    values = values.astype(dtype, copy=copy)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\carlos.orjuela\AppData\Local\miniconda3\envs\airbnb\Lib\site-packages\pandas\core\arrays\categorical.py", line 575, in astype
    raise ValueError(msg)
ValueError: Cannot cast object dtype to float64



In [None]:
print("Histogram Gradient Boosting Regressor Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_hgbdt['test_score'].mean():.3f} ± "
    f"{cv_results_hgbdt['test_score'].std():.3f} $"
)
print(f"Average fit time: {cv_results_hgbdt['fit_time'].mean():.3f} seconds")
print(
    f"Average score time: {cv_results_hgbdt['score_time'].mean():.3f} seconds"
)

Let's plot the target variable to check the MAE in this context:

In [None]:
# Price histogram
px.histogram(x=pd.concat([y_train, y_test]))