In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pre-split feature frames. Every feature column is read as text
# (dtype=str); the first CSV column becomes the index.
x_train = pd.read_csv(
    "../data/processed/x_train_w_OHE.csv", dtype=str, index_col=0
)
x_test = pd.read_csv(
    "../data/processed/x_test_w_OHE.csv", dtype=str, index_col=0
)

# Targets are parsed as floats; squeeze("columns") turns a one-column frame
# into a Series.
y_train = pd.read_csv(
    "../data/processed/y_train.csv", dtype=float, index_col=0
).squeeze("columns")
y_test = pd.read_csv(
    "../data/processed/y_test.csv", dtype=float, index_col=0
).squeeze("columns")

In [3]:
# Pick out every column whose name contains 'zip_is' (these are appended,
# unscaled, to the design matrix later in the notebook).
zip_pattern = re.compile('zip_is')
is_zip_col = [zip_pattern.search(name) is not None for name in x_train.columns]
zip_cols = x_train.columns[is_zip_col]

In [4]:
def get_correct_types_x(df, numeric_cols):
    """Return a copy of ``df`` with timestamp and numeric columns parsed.

    The input frame is left untouched, so calling this function (or
    re-running the cell that calls it) never changes a frame that another
    cell still refers to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'deenergize_time' and 'restoration_time' columns
        formatted as '%Y-%m-%d %H:%M:%S', plus every column named in
        ``numeric_cols``.
    numeric_cols : iterable of str
        Names of the columns to cast to float.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two time columns converted to datetimes and
        the ``numeric_cols`` converted to float; all other columns are
        carried over unchanged.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    ValueError
        If a value cannot be parsed as a datetime / float.
    """
    # Work on a copy: the original implementation wrote into the caller's
    # frame, silently changing its dtypes as a side effect.
    typed = df.copy()
    for col in ['deenergize_time', 'restoration_time']:
        typed[col] = pd.to_datetime(typed[col], format='%Y-%m-%d %H:%M:%S')
    for col in numeric_cols:
        typed[col] = typed[col].astype(float)
    return typed
# Columns cast to float by get_correct_types_x; the same list selects the
# features that are passed to the scaler.
numeric_cols = [
    'hftd_tier',
    'total_affected', 'residential_affected',
    'longitude', 'latitude',
    'total_pop', 'median_age', 'median_income', 'white_pct',
    # tmin / tmax / wspd triplets for the d-5 through d-1 suffixes
    'tmin_d-5', 'tmax_d-5', 'wspd_d-5',
    'tmin_d-4', 'tmax_d-4', 'wspd_d-4',
    'tmin_d-3', 'tmax_d-3', 'wspd_d-3',
    'tmin_d-2', 'tmax_d-2', 'wspd_d-2',
    'tmin_d-1', 'tmax_d-1', 'wspd_d-1',
]

# Parse both splits the same way, then keep only the numeric features.
x_train, x_test = (
    get_correct_types_x(frame, numeric_cols) for frame in (x_train, x_test)
)
rel_x_train, rel_x_test = x_train[numeric_cols], x_test[numeric_cols]

In [5]:
# Fit the standardization on the training features only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(rel_x_train)
scaled_train_x, scaled_test_x = (
    scaler.transform(split) for split in (rel_x_train, rel_x_test)
)

In [6]:
# Append the ZIP indicator columns to the standardized numeric features.
#
# * The standardized part is rebuilt from `scaler` instead of being read back
#   from `scaled_train_x` / `scaled_test_x`, so re-running this cell cannot
#   append the ZIP columns a second time.
# * The ZIP columns were loaded with dtype=str and are not part of
#   `numeric_cols`, so they are cast to float here; without the cast hstack
#   returns an object array that mixes floats with strings.
scaled_train_x = np.hstack([
    scaler.transform(rel_x_train),
    x_train[zip_cols].astype(float).to_numpy(),
])
scaled_test_x = np.hstack([
    scaler.transform(rel_x_test),
    x_test[zip_cols].astype(float).to_numpy(),
])

In [7]:
# Candidate ridge penalties: powers of ten from 1e-7 up to 1e1 (nine values).
alphas = np.power(10., np.arange(-7., 2.))

In [8]:
# Ridge regression: penalty chosen from `alphas` by 5-fold cross-validation
# on the training split, then scored on the held-out test features.
ridge = RidgeCV(cv=5, alphas=alphas).fit(scaled_train_x, y_train)
ridge_preds = ridge.predict(scaled_test_x)

In [9]:
# LASSO with 5-fold CV; no explicit alpha grid is passed. The iteration cap
# is raised to one million and the seed is fixed at 6.
lasso = LassoCV(cv=5, max_iter=1_000_000, random_state=6).fit(
    scaled_train_x, y_train
)
lasso_preds = lasso.predict(scaled_test_x)

In [10]:
# Elastic net with 5-fold CV over the listed l1_ratio candidates (up to and
# including 1); same iteration cap and seed as the LASSO cell.
e_net = ElasticNetCV(
    cv=5,
    max_iter=1_000_000,
    random_state=6,
    l1_ratio=[0.01, 0.1, 0.3, 0.5, 0.65, 0.8, 0.9, 0.95, 0.975, 0.99, 1],
).fit(scaled_train_x, y_train)
e_net_preds = e_net.predict(scaled_test_x)

In [11]:
def calc_test_r2(pred_vals, true_vals, baseline_rmse):
    """Compute R-squared against a baseline, plus the RMSE of the predictions.

    Parameters
    ----------
    pred_vals : array-like
        Model predictions.
    true_vals : array-like
        Observed target values, same length as ``pred_vals``.
    baseline_rmse : float
        RMSE of the reference model the predictions are compared against.

    Returns
    -------
    tuple of (float, float)
        ``(r_squared, rmse)`` where ``r_squared`` is one minus the ratio of
        the predictions' squared error to the baseline's squared error, and
        ``rmse`` is the root mean squared error of the predictions.
    """
    n_obs = len(true_vals)
    # Sums of squares are rebuilt from the mean-level quantities.
    residual_ss = n_obs * mean_squared_error(pred_vals, true_vals)
    baseline_ss = n_obs * (baseline_rmse ** 2)

    r_squared = 1 - residual_ss / baseline_ss
    rmse = np.sqrt(residual_ss / n_obs)
    return r_squared, rmse

In [12]:
# Baseline: RMSE of always predicting the mean of the test targets.
centered_y_test = y_test - y_test.mean()
baseline_rmse = np.sqrt((centered_y_test ** 2).mean())

# Score each model once, then split the (r_squared, rmse) pairs into columns.
model_names = ['Ridge', 'LASSO', 'Elastic Net']
model_stats = [
    calc_test_r2(preds, y_test, baseline_rmse)
    for preds in (ridge_preds, lasso_preds, e_net_preds)
]
r_sq_vals, rmse_vals = zip(*model_stats)

regularization_results = pd.DataFrame({
    'model': model_names,
    'test_r_sq': list(r_sq_vals),
    'rmse': list(rmse_vals),
})
regularization_results

Unnamed: 0,model,test_r_sq,rmse
0,Ridge,0.53009,1083.081942
1,LASSO,0.526052,1087.725956
2,Elastic Net,0.526052,1087.725956
