In [1]:
import numpy as np
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import re
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


## Reading in Data

In [2]:
x_train = pd.read_csv(
  "../data/processed/x_train_w_OHE.csv", index_col=0, dtype=str
)
x_test = pd.read_csv(
  "../data/processed/x_test_w_OHE.csv", index_col=0, dtype=str
)
y_train = pd.read_csv(
  "../data/processed/y_train.csv", index_col=0, dtype=float
).squeeze("columns").reset_index(drop=True)
y_test = pd.read_csv(
  "../data/processed/y_test.csv", index_col=0, dtype=float
).squeeze("columns").reset_index(drop=True)

x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42)

In [3]:
def get_correct_types_x(df, numeric_cols):
    for col in ['deenergize_time', 'restoration_time']:
        df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
    for col in numeric_cols:
        df[col] = df[col].astype(float)
    return df
numeric_cols = [
    'hftd_tier', 'total_affected', 'residential_affected',
    'longitude', 'latitude', 'total_pop', 'median_age', 'median_income',
    'white_pct', 'tmin_d-5', 'tmax_d-5', 'wspd_d-5', 'tmin_d-4', 'tmax_d-4',
    'wspd_d-4', 'tmin_d-3', 'tmax_d-3', 'wspd_d-3', 'tmin_d-2', 'tmax_d-2',
    'wspd_d-2', 'tmin_d-1', 'tmax_d-1', 'wspd_d-1', 'day_in_year'
]

In [4]:
#Scale all numeric columns then add back in zip columns
zip_cols = x_train.columns[
    [re.search('zip_is', col) is not None for col in x_train.columns]
]
x_train = get_correct_types_x(x_train, numeric_cols)
x_valid = get_correct_types_x(x_valid, numeric_cols)
x_test = get_correct_types_x(x_test, numeric_cols)
rel_x_train = x_train[numeric_cols]
rel_x_valid = x_valid[numeric_cols]
rel_x_test = x_test[numeric_cols]

scaler = StandardScaler()
scaler.fit(rel_x_train)
scaled_x_train = np.hstack([
    scaler.transform(rel_x_train), np.array(x_train[zip_cols])
])
scaled_x_valid = np.hstack([
    scaler.transform(rel_x_valid), np.array(x_valid[zip_cols])
])
scaled_x_test = np.hstack([
    scaler.transform(rel_x_test), np.array(x_test[zip_cols])
])

## Baseline Model

In [5]:
baseline_params = {'max_depth':6, 'eta':.3, 'objective':'reg:squarederror'}
num_round = 5
baseline_d_train = xgb.DMatrix(scaled_x_train, label = y_train)
xgb_model = xgb.train(baseline_params, baseline_d_train, num_round)
d_test = xgb.DMatrix(scaled_x_test, label = y_test)
baseline_preds = xgb_model.predict(d_test)
print("RMSE = ", np.sqrt(mean_squared_error(baseline_preds, y_test)))

RMSE =  1070.4475962877377


## Optimizing Hyperparameters

In [6]:
def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = xgb.XGBRegressor(**params)
    optuna_model.fit(scaled_x_train, y_train)

    # Make predictions
    y_valid_pred = optuna_model.predict(scaled_x_valid)

    # Evaluate predictions
    accuracy = np.sqrt(mean_squared_error(y_valid_pred, y_valid))
    return accuracy

In [7]:
sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler, direction='minimize')

[32m[I 2022-11-30 20:15:08,114][0m A new study created in memory with name: no-name-81b9ca32-50b7-4308-a462-af72ba6f7ffd[0m


In [8]:
study.optimize(objective, n_trials=100)
warnings.filterwarnings('ignore')

print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  RMSE: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
[32m[I 2022-11-30 20:15:20,257][0m Trial 0 finished with value: 913.9017412925404 and parameters: {'max_depth': 1, 'learning_rate': 0.3629422978396401, 'n_estimators': 247, 'min_child_weight': 8, 'gamma': 0.6666779571482637, 'subsample': 0.11939653980558809, 'colsample_bytree': 0.10051732611851687, 'reg_alpha': 3.77058787704939e-08, 'reg_lambda': 1.404467975150631e-06}. Best is trial 0 with value: 913.9017412925404.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'subsample': trial.suggest_logunifor

Number of finished trials: 100
Best trial:
  RMSE: 701.5266077870028
  Params: 
    max_depth: 9
    learning_rate: 0.02451126688680403
    n_estimators: 300
    min_child_weight: 3
    gamma: 0.011610916810254264
    subsample: 0.42277103505325614
    colsample_bytree: 0.5948569807572572
    reg_alpha: 4.625677840870809e-06
    reg_lambda: 1.645016077967484e-08


## Fit model with best parameters and predict

In [9]:
def calc_test_r2(pred_vals, true_vals, baseline_rmse):
    sse = mean_squared_error(pred_vals, true_vals) * len(true_vals)
    sst = (baseline_rmse ** 2) * len(true_vals)
    return (
        1 - sse / sst, np.sqrt(sse / len(true_vals)),
        mean_absolute_error(pred_vals, true_vals)
    )

In [10]:
best_params = trial.params
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(scaled_x_train, y_train)
final_preds = best_model.predict(scaled_x_test)
# print("RMSE = ", np.sqrt(mean_squared_error(final_preds, y_test)))
baseline_rmse = np.sqrt(((y_test - y_test.mean()) ** 2).mean())
test_r2, rmse, mae = calc_test_r2(final_preds, y_test, baseline_rmse)
print('Test R-Squared:', test_r2)
print('RMSE:', rmse)
print('MAE:', mae)

Test R-Squared: 0.7795826603057782
RMSE: 741.783317774549
MAE: 506.77986933196166
