In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature tables are loaded with every column as a string; the columns that
# are actually used get their real dtypes from get_correct_types_x later on.
x_train = pd.read_csv("../data/processed/x_train.csv", dtype=str, index_col=0)
x_test = pd.read_csv("../data/processed/x_test.csv", dtype=str, index_col=0)

# Targets are parsed as floats; squeeze("columns") collapses a one-column
# frame into a Series (presumably each file holds a single target column —
# TODO confirm against the preprocessing step that wrote them).
y_train = (
    pd.read_csv("../data/processed/y_train.csv", dtype=float, index_col=0)
    .squeeze("columns")
)
y_test = (
    pd.read_csv("../data/processed/y_test.csv", dtype=float, index_col=0)
    .squeeze("columns")
)

In [None]:
def get_correct_types_x(df, numeric_cols):
    """Convert string-typed feature columns to their working dtypes.

    The two timestamp columns ('deenergize_time', 'restoration_time') are
    parsed as datetimes and every column named in ``numeric_cols`` is cast
    to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the timestamp columns and all of ``numeric_cols``.
        It is modified in place.
    numeric_cols : iterable of str
        Names of the columns to cast to float.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after conversion.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    timestamp_cols = ('deenergize_time', 'restoration_time')

    for name in timestamp_cols:
        df[name] = pd.to_datetime(df[name], format=timestamp_format)

    for name in numeric_cols:
        df[name] = df[name].astype(float)

    return df
# Columns that get cast to float and then form the model's feature matrix.
numeric_cols = [
    'hftd_tier', 'total_affected', 'residential_affected',
    'longitude', 'latitude',
    'total_pop', 'median_age', 'median_income', 'white_pct',
    # tmin / tmax / wspd triples for the suffixes d-4 .. d-1 (presumably
    # weather readings for the days before the event — TODO confirm).
    'tmin_d-4', 'tmax_d-4', 'wspd_d-4',
    'tmin_d-3', 'tmax_d-3', 'wspd_d-3',
    'tmin_d-2', 'tmax_d-2', 'wspd_d-2',
    'tmin_d-1', 'tmax_d-1', 'wspd_d-1',
]

# Fix dtypes on both splits (train first, then test), then keep only the
# numeric feature columns for modelling.
x_train, x_test = (
    get_correct_types_x(split, numeric_cols) for split in (x_train, x_test)
)
rel_x_train, rel_x_test = x_train[numeric_cols], x_test[numeric_cols]

In [None]:
# The scaler is fitted on the training features only and then applied
# unchanged to both splits.
scaler = StandardScaler().fit(rel_x_train)
scaled_train_x, scaled_test_x = (
    scaler.transform(features) for features in (rel_x_train, rel_x_test)
)

We will compare three different split criteria for the random forest: squared error, absolute error, and the reduction in Poisson deviance.

In [None]:
# Hyperparameter grid shared by every grid search in this notebook.
param_grid = {
    # Number of trees in the forest.
    'n_estimators': [100, 200, 300, 400, 500],
    # Features considered at each split. 1.0 must be a float (= all
    # features); the string '1.0' is not a valid `max_features` value and is
    # rejected by scikit-learn's parameter validation.
    'max_features': [1.0, 'sqrt', 'log2'],
    # Maximum tree depth: 3 through 10 inclusive.
    'max_depth': np.arange(3, 11),
}

In [None]:
# Squared-error forest: exhaustive 5-fold grid search over param_grid, then
# predict the test split with the fitted search object.
rf_se = RandomForestRegressor(random_state=6, criterion='squared_error')
rf_se_cv = GridSearchCV(rf_se, param_grid, cv=5)
se_preds = rf_se_cv.fit(scaled_train_x, y_train).predict(scaled_test_x)

In [None]:
# Absolute-error forest: exhaustive 5-fold grid search over param_grid, then
# predict the test split with the fitted search object.
rf_ae = RandomForestRegressor(random_state=6, criterion='absolute_error')
rf_ae_cv = GridSearchCV(rf_ae, param_grid, cv=5)
ae_preds = rf_ae_cv.fit(scaled_train_x, y_train).predict(scaled_test_x)

In [None]:
# Poisson-criterion forest: exhaustive 5-fold grid search over param_grid,
# then predict the test split with the fitted search object.
rf_pois = RandomForestRegressor(random_state=6, criterion='poisson')
rf_pois_cv = GridSearchCV(rf_pois, param_grid, cv=5)
pois_preds = rf_pois_cv.fit(scaled_train_x, y_train).predict(scaled_test_x)

In [None]:
def calc_test_r2(pred_vals, true_vals, baseline_rmse):
    """Score predictions against a baseline, returning R-squared and RMSE.

    R-squared is computed as ``1 - SSE / SST``, where SSE is the sum of
    squared prediction errors and SST is reconstructed from the supplied
    baseline RMSE as ``baseline_rmse ** 2 * n``.

    Parameters
    ----------
    pred_vals : array-like
        Predicted target values.
    true_vals : array-like
        Observed target values; its length is used as ``n``.
    baseline_rmse : float
        RMSE of the reference model that SST is derived from.

    Returns
    -------
    tuple
        ``(r_squared, rmse)`` for the predictions.
    """
    n_obs = len(true_vals)
    # Scale the mean squared error back up to a sum of squared errors.
    sum_sq_err = mean_squared_error(pred_vals, true_vals) * n_obs
    sum_sq_total = (baseline_rmse ** 2) * n_obs
    r_squared = 1 - sum_sq_err / sum_sq_total
    rmse = np.sqrt(sum_sq_err / n_obs)
    return r_squared, rmse

In [None]:
# Baseline: RMSE of predicting the test-set mean for every observation.
# calc_test_r2 squares this and multiplies by n to get the total sum of squares.
baseline_rmse = np.sqrt(((y_test - y_test.mean()) ** 2).mean())

# One (test R-squared, RMSE) pair per fitted forest, in the same order as the
# model labels below. The Poisson predictions are stored as `pois_preds`;
# the previous reference to `poisson_preds` raised a NameError.
test_stats = [
    calc_test_r2(preds, y_test, baseline_rmse)
    for preds in [se_preds, ae_preds, pois_preds]
]

# Build the summary table directly rather than creating a temporary 'stats'
# column and dropping it in place, so re-running the cell is side-effect free.
regularization_results = pd.DataFrame({
    'model': ['RF (Squared Error)', 'RF (Absolute Error)', 'RF (Poisson)'],
    'test_r_sq': [r_sq for r_sq, _ in test_stats],
    'rmse': [rmse for _, rmse in test_stats],
})
regularization_results