In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Load the pickled object at data/preprocessed_data.pkl (used as a DataFrame below).
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
df_raw = pd.read_pickle("data/preprocessed_data.pkl")

In [3]:
# df_raw.columns

Index(['age', 'size', 'organization_name', 'published_at', 'status_changed_at',
       'los', 'breed_secondary', 'breed_mixed', 'good_with_children',
       'good_with_dogs',
       ...
       'color_secondary_Merle (Red)',
       'color_secondary_Red / Chestnut / Orange', 'color_secondary_Sable',
       'color_secondary_Tricolor (Brown, Black, & White)',
       'color_secondary_White / Cream',
       'color_secondary_Yellow / Tan / Blond / Fawn',
       'color_tertiary_Brown / Chocolate',
       'color_tertiary_Red / Chestnut / Orange',
       'color_tertiary_White / Cream',
       'color_tertiary_Yellow / Tan / Blond / Fawn'],
      dtype='object', length=129)

In [5]:
# Target: the "los" column. Indexing it first means a missing column fails right
# here with a KeyError instead of being silently skipped.
y = df_raw["los"]

# Every other column, in its original order, becomes a model input.
# NOTE(review): the breed_secondary values printed further down are strings/None —
# confirm every feature column is numeric before fitting the forest.
# NOTE(review): published_at / status_changed_at are kept as features too; if "los"
# is derived from them this leaks the target — confirm.
features = [name for name in df_raw.columns if name != "los"]
X = df_raw[features]


In [13]:
# Lift pandas' row and column display limits (a global, session-wide setting).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Distinct secondary-breed values present in the feature frame.
X.loc[:, 'breed_secondary'].unique()

array(['Poodle', None, 'American Staffordshire Terrier',
       'Yorkshire Terrier', 'Golden Retriever', 'Pug', 'American Bulldog',
       'Terrier', 'Havanese', 'Pit Bull Terrier', 'Labrador Retriever',
       'Jack Russell Terrier', 'Shepherd', 'Miniature Pinscher',
       'Pomeranian', 'Chihuahua', 'Spitz', 'Greyhound', 'Shih Tzu',
       'Boxer', 'Doberman Pinscher', 'Flat-Coated Retriever',
       'Cattle Dog', 'Lhasa Apso', 'Siberian Husky', 'Rat Terrier',
       'Akita', 'Dachshund', 'Great Dane', 'Boston Terrier', 'Beagle',
       'Chow Chow', 'Hound', 'English Bulldog', 'Bichon Frise',
       'Australian Shepherd', 'American Eskimo Dog', 'Brussels Griffon',
       'Retriever', 'Maltese', 'Husky', 'Alaskan Malamute', 'Spaniel',
       'Collie', 'Border Collie', 'Australian Cattle Dog / Blue Heeler',
       'Old English Sheepdog', 'German Shepherd Dog',
       'Belgian Shepherd / Malinois', 'Basset Hound',
       'Staffordshire Bull Terrier', 'Pointer', 'Dalmatian', 'Papillon',


In [8]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=312,
)

In [None]:
# Hyperparameter search space for the random forest regressor. Each key is an
# estimator parameter name and each value is the list of candidates to sample from.
random_grid = {
    "bootstrap": [True, False],
    # 10, 20, ..., 110 (exact integers, no float round-trip)
    "max_depth": list(range(10, 111, 10)),
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For a forest
    # regressor it meant "consider every feature", which is now spelled 1.0.
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4, 8],
    "min_samples_split": [2, 4, 8, 16],
    # 200, 400, ..., 2000
    "n_estimators": list(range(200, 2001, 200)),
    # Not part of the search: max_leaf_nodes, max_samples,
    # min_impurity_decrease, min_weight_fraction_leaf.
}

In [None]:
# Randomized hyperparameter search: 100 sampled candidates, each evaluated with
# 5-fold cross-validation on the training split, using all available cores.
# The max_depth=5 set on the base estimator is replaced for every candidate,
# because random_grid supplies its own max_depth values.
regr = RandomForestRegressor(
    criterion="squared_error",
    max_depth=5,
    random_state=312,
)
regr_random = RandomizedSearchCV(
    estimator=regr,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=312,
)

regr_random.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score first, then the parameter set that
# achieved it (kept in best_params for the retraining step).
print(regr_random.best_score_)

best_params = regr_random.best_params_
print(best_params)

In [None]:
# Build a fresh forest, apply the winning hyperparameters, and fit it on the
# whole training split.
# NOTE(review): the search is not created with refit=False, so
# regr_random.best_estimator_ may already hold an equivalent fitted model —
# confirm before paying for a second fit.
regr = RandomForestRegressor(n_jobs=-1, random_state=312)
regr.set_params(**best_params)
regr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the retrained forest.
yhat = regr.predict(X_test)

In [None]:
# Predicted-vs-actual scatter for the test rows. The dashed red diagonal spans
# the range of actual values and marks where a perfect prediction would land.
fig, ax = plt.subplots()
ax.scatter(yhat, y_test, edgecolors=(0, 0, 1))

actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--', lw=3)

ax.set(xlabel='Predicted', ylabel='Actual')
plt.show()

In [None]:
# Pair each importance score from the fitted forest with the name of the
# feature it was computed for.
feature_importances = pd.Series(
    data=regr.feature_importances_,
    index=regr.feature_names_in_,
)

In [None]:
# Show the 20 largest importance scores, highest first.
display(
    feature_importances
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# evaluate performance
def _regression_metrics(y_true, y_pred):
    """Return the MSE, MAE and R^2 of ``y_pred`` against ``y_true`` as a dict.

    The keys are "mse", "mae" and "r2".
    """
    return {
        "mse": mean_squared_error(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
    }


# Score the forest's predictions on the test split.
metrics_rfr = _regression_metrics(y_test, yhat)

# Baseline: always guess the training-set mean. It is stored under its own name
# so that `yhat` keeps holding the model's predictions; overwriting `yhat` here
# would make a re-run of this cell (or of the plot cell) silently score/plot the
# baseline instead of the forest.
yhat_baseline = np.full(y_test.shape[0], y_train.mean())
metrics_baseline = _regression_metrics(y_test, yhat_baseline)

In [None]:
# Side-by-side comparison table: one column per model, one row per metric key.
df_metrics = pd.DataFrame(
    {
        "Baseline": metrics_baseline,
        "RandomForestRegressor": metrics_rfr,
    }
)

In [None]:
# Render the baseline-vs-forest metrics table.
display(df_metrics)