In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

In [8]:
%store -r listings_encoded

In [9]:
listings_encoded['price'] = np.log(listings_encoded['price'])

In [10]:
def train_and_eval(model, X_train, y_train, X_test, y_test, depth, estim=None):
    """
    Runs training and evaluation of given decision tree or random forest
    regressor and prints resulting metrics.
    """

    model.fit(X_train, y_train)
    extra_str = "" if estim is None else f"{estim} estimators and "
    print(f"\nTraining {model.__class__.__name__} with {extra_str} max_depth={depth}\n")

    # results on training set
    y_train_pred = model.predict(X_train)
    y_train_exp , y_train_pred_exp = np.exp(y_train), np.exp(y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_rmse = root_mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    # results on test set
    y_pred = model.predict(X_test)
    y_test_exp , y_train_pred_exp = np.exp(y_test), np.exp(y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error - train: {train_mae}, test: {mae}")
    print(f"Root Mean Squared Error - train: {train_rmse}, test: {rmse}")
    print(f"R^2 Score - train: {train_r2}, test: {r2}")

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    listings_encoded.drop("price", axis=1),
    listings_encoded["price"],
    test_size=0.2,
)

In [12]:
y_train, y_test = np.log(y_train),np.log(y_test)

In [13]:
# ----------------------------------
# Random Forest Hyperparameter Search
# ----------------------------------
for estim in [100, 200]:
    for depth in [5,6,8]:

        model = RandomForestRegressor(
            n_estimators=estim, random_state=42, max_depth=depth
        )

        train_and_eval(
            model, X_train, y_train, X_test, y_test, estim=estim, depth=depth
        )

# ----------------------------------
# Decision Tree Hyperparameter Search
# ----------------------------------

for depth in [6, 8, 10, 12]:
    model = DecisionTreeRegressor(max_depth=depth)
    train_and_eval(model, X_train, y_train, X_test, y_test, depth=depth)


Training RandomForestRegressor with 100 estimators and  max_depth=5

Mean Absolute Error - train: 0.0664621524851559, test: 0.06937508004687862
Root Mean Squared Error - train: 0.08735749919631819, test: 0.09181740014869313
R^2 Score - train: 0.5600743981338463, test: 0.5084664787294877

Training RandomForestRegressor with 100 estimators and  max_depth=6

Mean Absolute Error - train: 0.06238017393514769, test: 0.06622786949138688
Root Mean Squared Error - train: 0.08153133971761975, test: 0.0880885036762356
R^2 Score - train: 0.6167977870147021, test: 0.5475801786558727

Training RandomForestRegressor with 100 estimators and  max_depth=8

Mean Absolute Error - train: 0.053863181620844, test: 0.06187113844106134
Root Mean Squared Error - train: 0.06998473864899764, test: 0.08277071118091264
R^2 Score - train: 0.7176514632903208, test: 0.6005554208988475

Training RandomForestRegressor with 200 estimators and  max_depth=5

Mean Absolute Error - train: 0.06636836950548651, test: 0.069315

In [17]:
import joblib
from sklearn.metrics import mean_squared_error
grid = [(n, d) for n in [100, 200] for d in [5, 6, 8]]
scored = [(
    np.sqrt(mean_squared_error(
        y_test,
        RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42, n_jobs=-1)
        .fit(X_train, y_train)
        .predict(X_test))),
    n, d)
    for n, d in grid]

rmse, n, d = min(scored)
best = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42, n_jobs=-1).fit(X_train, y_train)
joblib.dump(best, f"rf_best_estim{n}_depth{d}_rmse{rmse:.5f}.joblib")
print(f"  Best RF saved → rf_best_estim{n}_depth{d}_rmse{rmse:.5f}.joblib")

✅  Best RF saved → rf_best_estim100_depth8_rmse0.08277.joblib
