```python
# TODO: run it once finished to create the package requirements file
pip freeze > requirements.txt
# conda list -e > requirements.txt
```

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline

from scipy.stats import uniform, randint

Load the clean dataset created during data eng. steps:

In [2]:
df_listings = pd.read_parquet('data/listings_clean.parquet.gzip')

In [3]:
# Show all columns (instead of cascading columns in the middle)
pd.set_option("display.max_columns", None)

A quick sneak peak to the data:

In [4]:
df_listings.head()

Unnamed: 0,id,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,neighbourhood,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_maximum_nights,maximum_maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_shared_rooms,reviews_per_month,bathrooms_type,bathrooms_qty
0,10803,38901,within a few hours,1.0,0.95,f,t,"Brunswick East, Victoria, Australia",Moreland,-37.76606,144.97951,Private room,2,,1.0,49,5,14,14,14,173,2013-01-12,2023-07-19,4.49,4.65,3.98,4.72,4.69,4.66,4.61,True,1,0,1.33,shared,1
1,12936,50121,,,,f,t,"St Kilda, Victoria, Australia",Port Phillip,-37.85999,144.97662,Entire home/apt,2,1.0,1.0,95,3,14,14,14,42,2010-08-04,2020-03-15,4.68,4.78,4.71,4.83,4.83,4.78,4.66,True,10,0,0.26,private,1
2,38271,164193,within an hour,1.0,0.91,t,t,"Berwick, Victoria, Australia",Casey,-38.05723,145.33982,Entire home/apt,5,3.0,3.0,116,1,14,14,14,228,2010-11-24,2023-08-26,4.86,4.92,4.98,4.91,4.94,4.9,4.88,True,1,0,1.47,private,1
3,41836,182833,,,,f,t,"Reservoir, Victoria, Australia",Darebin,-37.69761,145.00066,Private room,2,,1.0,40,7,365,1125,1125,159,2010-11-16,2018-08-22,4.71,4.68,4.65,4.89,4.83,4.39,4.69,True,2,0,1.02,shared,1
4,43429,189684,within an hour,1.0,0.99,t,t,"Oakleigh East, Victoria, Australia",Monash,-37.89983,145.11579,Entire home/apt,2,1.0,1.0,117,2,1125,1125,1125,248,2010-12-05,2023-09-02,4.87,4.91,4.93,4.94,4.93,4.79,4.86,True,3,0,1.6,private,1


Checking dataset types:

In [5]:
df_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23185 entries, 0 to 23184
Data columns (total 36 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   id                                           23185 non-null  int64         
 1   host_id                                      23185 non-null  int64         
 2   host_response_time                           15418 non-null  category      
 3   host_response_rate                           15418 non-null  float64       
 4   host_acceptance_rate                         16579 non-null  float64       
 5   host_is_superhost                            22468 non-null  category      
 6   host_identity_verified                       23183 non-null  object        
 7   neighbourhood                                13257 non-null  object        
 8   neighbourhood_cleansed                       23185 non-null  object        


TODO:
- Decide strategy for category attributes
- ~~Explore which attributes would need normalization~~
- Decide regression algorithm
- How to include location?

Let's separate target attribute and the rest of dataset and split the dataset on train ~~, validation~~ and test sets ~~(ratio ~70:20:10~~):

In [6]:
# Drop unnecessary columns and features
X = df_listings.drop(columns=['id', 'price', 'first_review', 'last_review'])

# Define the target variable
y = df_listings['price']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X)

# X_train, X_t, y_train, y_t = train_test_split(X, 
#                                               y,
#                                               test_size=0.3,
#                                               random_state=99
#                                               )

# X_val, X_test, y_val, y_test = train_test_split(X_t, 
#                                                 y_t,
#                                                 test_size=0.3,
#                                                 random_state=99
#                                               )

X_train, X_test, y_train, y_test = train_test_split(X, 
                                              y,
                                              test_size=0.2,
                                              random_state=99
                                              )

Now, let's use a simple linear regressor for our baseline model:

In [7]:
# Handle missing values with imputation bfill or ffill
def handle_missing_values(df):
    df.ffill(inplace=True)
    df.bfill(inplace=True)
    return df


In [8]:
# Normalize the features, Handle missing values with imputation bfill or ffill
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(handle_missing_values(X_train))
X_test_scaled = scaler.transform(handle_missing_values(X_test))

# Train the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict the prices
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# evaluate the model with cross-validation and mean absolute error on the test set
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

Train MAE: 162.24253791696097
Test MAE: 1235572162521.914


In [9]:
# let's plot the predictions vs. the actual values
px.scatter(x=y_test, 
           y=y_pred_test
           )

In [10]:
# let's plot the residuals
residuals = y_test - y_pred_test
px.histogram(x=residuals)

In [11]:
# let's create a dataframe with the actual, predicted values and the residuals
df_pred = pd.DataFrame({'actual': y_test, 'predicted': y_pred_test, 'residuals': y_test - y_pred_test})

In [12]:
df_pred.sort_values(by='residuals', ascending=False).head(10)

Unnamed: 0,actual,predicted,residuals
11172,418,-971528000000000.0,971528000000000.0
13648,476,-431019200000000.0,431019200000000.0
2754,32,-319924600000000.0,319924600000000.0
18054,250,-258217000000000.0,258217000000000.0
2786,187,-245421900000000.0,245421900000000.0
16002,465,-207571300000000.0,207571300000000.0
16743,181,-203298800000000.0,203298800000000.0
3694,82,-32174170000000.0,32174170000000.0
561,159,-15194110000000.0,15194110000000.0
10086,194,-728695800000.0,728695800000.0


Although the linear regressor model has a terrible performance, we would pass any tweaks or tunning for now.

Now let's train a boosting model and see if we can get a better performance:

In [24]:
# Use X_train, y_train to train a HistGradientBoostingRegressor model
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate, RandomizedSearchCV

histogram_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=200, random_state=0
)
cv_results_hgbdt = cross_validate(
    histogram_gradient_boosting,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [28]:
print("Histogram Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_hgbdt['test_score'].mean():.3f} ± "
    f"{cv_results_hgbdt['test_score'].std():.3f} $"
)
print(f"Average fit time: {cv_results_hgbdt['fit_time'].mean():.3f} seconds")
print(
    f"Average score time: {cv_results_hgbdt['score_time'].mean():.3f} seconds"
)

Histogram Gradient Boosting Decision Tree
Mean absolute error via cross-validation: 142.322 ± 15.271 $
Average fit time: 1.320 seconds
Average score time: 0.023 seconds


Let's plot the target variable to check the MAE in this context:

In [35]:
# Price histogram
px.histogram(x=pd.concat([y_train, y_test]))

Now, let's analyze 