In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import xgboost as xgb

# Grab seaborn's current colour palette so later plots can pick colours by index.
color_pal = sns.color_palette()
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures created below.
plt.style.use('fivethirtyeight')

In [None]:
# Read the CSV with the "Datetime" column as the index, turn the string labels into
# real timestamps, and put the rows in chronological order.
df = pd.read_csv("hourly-energy-consumption/PJME_hourly.csv", index_col="Datetime")
df.index = pd.to_datetime(df.index)
df = df.sort_index()

In [None]:
# Preview the first rows of the sorted frame.
df.head()

In [None]:
# Preview the last rows of the sorted frame.
df.tail()

In [None]:
# Overview of the whole series; style="." draws one marker per row with no connecting line.
df.plot(style=".",
        figsize=(15,5),
        color=color_pal[0],
        title="PJME Energy Use in MW")

In [None]:
# Distribution of the target column, using 500 bins so sparse extreme values stay visible.
df["PJME_MW"].plot(kind="hist", bins=500)

In [None]:
# Zoom in on the readings below 20,000 to see when the low values occur.
df.query("PJME_MW < 20_000").plot(figsize=(15,5), style=".")

In [None]:
# Same view with the cutoff tightened to 19,000.
df.query("PJME_MW < 19_000").plot(figsize=(15,5), style=".")

In [None]:
# Keep only readings strictly above 19,000 (a value of exactly 19,000 is dropped as well).
# .copy() makes the filtered frame independent of the original.
df = df.query("PJME_MW > 19_000").copy()

In [None]:
# Five chronological folds. Each validation window is 24*365 rows (one year, assuming
# one row per hour with no gaps — TODO confirm), and 24 rows are skipped between the
# end of each training window and the start of its validation window.
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)

In [None]:
# Visualise the train/validation windows of every fold, one panel per fold.
# The panel count is taken from the splitter so the figure follows n_splits
# instead of a separately hard-coded 5.
fig, axs = plt.subplots(tss.n_splits, 1, figsize=(15,15), sharex=True)

# enumerate/zip pair each fold with its axis, replacing the manual fold counter.
for fold, (ax, (train_idx, val_idx)) in enumerate(zip(axs, tss.split(df))):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train["PJME_MW"].plot(ax=ax,
                          label="Training Set",
                          title=f"Data Train/Test Split Fold {fold}")
    test["PJME_MW"].plot(ax=ax,
                         label="Test Set")
    # Dashed line marks the first timestamp of the validation window.
    ax.axvline(test.index.min(), color="black", ls="--")

In [None]:
def create_features(df):
    """
    Return a copy of ``df`` with calendar columns derived from its index.

    Adds ``hour``, ``dayofweek``, ``quarter``, ``month``, ``year`` and
    ``dayofyear``, each read from the same-named attribute of the index
    (so the index must be datetime-like). The input frame is not modified.
    """
    featured = df.copy()

    # Column name and index attribute name are identical, so one loop covers all six.
    calendar_fields = ("hour", "dayofweek", "quarter", "month", "year", "dayofyear")
    for field in calendar_fields:
        featured[field] = getattr(featured.index, field)

    return featured

In [None]:
# Attach the calendar features to the full frame (create_features returns a new copy).
df = create_features(df=df)

In [None]:
def add_lags(df, target="PJME_MW", lag_days=(364, 728, 1092)):
    """
    Return a copy of ``df`` with lagged copies of the target column.

    For the n-th entry of ``lag_days`` a column ``lag<n>`` (``lag1``, ``lag2``,
    ...) is added. It holds the target value found at ``index - <days>``.
    Rows whose shifted timestamp is not present in the index get NaN.

    Parameters
    ----------
    df : DataFrame
        Frame with a datetime-like index that contains ``target``.
    target : str, default "PJME_MW"
        Name of the column to lag.
    lag_days : sequence of int, default (364, 728, 1092)
        Offsets in days, one lag column per entry. The defaults are whole
        multiples of 7, so each lagged value falls on the same weekday.

    Returns
    -------
    DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()

    # timestamp -> value lookup. A dict is used so duplicate timestamps are tolerated
    # (the last occurrence wins).
    target_map = df[target].to_dict()
    for n, days in enumerate(lag_days, start=1):
        df[f"lag{n}"] = (df.index - pd.Timedelta(days=days)).map(target_map)

    return df

In [None]:
# Add the lag columns once, on the full frame, before any train/validation split.
df = add_lags(df=df)

In [None]:
# Display the frame to check the new calendar and lag columns.
df

In [None]:
# Cross-validated training: fit one XGBoost regressor per TimeSeriesSplit fold and
# collect the held-out predictions and RMSE of each fold.

# Column selection does not change between folds, so it is defined once up front.
FEATURES = ['hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'lag1', 'lag2', 'lag3']
TARGET = "PJME_MW"

preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]

    # Recompute the calendar features on each slice; the lag columns come from df.
    train = create_features(train)
    test = create_features(test)

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    reg = xgb.XGBRegressor(base_score=0.5,
                           booster="gbtree",
                           n_estimators=2000,
                           early_stopping_rounds=50,
                           # "reg:squarederror" is the current name of the deprecated "reg:linear".
                           objective="reg:squarederror",
                           max_depth=3,
                           learning_rate=0.01)

    # NOTE: early stopping monitors the last eval_set entry, which is this fold's own
    # test slice, so the RMSE reported below is somewhat optimistic.
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=200)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    # RMSE on the held-out slice.
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [None]:
# Summarise the validation RMSE: the mean over all folds, then each fold's value,
# both rounded to 4 decimals.
print(f"Score across folds: {np.mean(scores).round(4)}")
print(f"Fold Scores: {np.array(scores).round(4)}")

# Predicting the Future

In [None]:
# Display the full training frame before refitting on all of it.
df

In [None]:
# List the available columns; the FEATURES and TARGET names used below must be among them.
df.columns

In [None]:
# Final model: select the same feature/target columns from every available row.
FEATURES = ['hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'lag1', 'lag2', 'lag3']
TARGET = 'PJME_MW'

X = df[FEATURES]
y = df[TARGET]

# No early stopping is configured here, so the number of trees is fixed at n_estimators.
reg = xgb.XGBRegressor(base_score=0.5,
                       booster="gbtree",
                       n_estimators=1000,
                       # "reg:squarederror" is the current name of the deprecated "reg:linear".
                       objective="reg:squarederror",
                       max_depth=3,
                       learning_rate=0.01)

In [None]:
# Fit on the full history. The only eval_set entry is the training data itself, so the
# metric logged during fitting is training error, not a validation score.
reg.fit(X, y,
        eval_set=[(X, y)],
        verbose=200)

In [None]:
# Latest timestamp present in the data.
df.index.max()

In [None]:
# Build the hourly forecast horizon and give it the same features as the history.
# Start one hour after the last observed timestamp. A hard-coded start that coincides
# with df.index.max() would duplicate that timestamp, and the timestamp -> value lookup
# in add_lags would then keep the empty future row instead of the observed value.
future_start = df.index.max() + pd.Timedelta("1h")
future = pd.date_range(future_start, "2019-08-03", freq="1h")
future_df = pd.DataFrame(index=future)
future_df["isFuture"] = True
# Flag the history on a copy so this cell never mutates df and can be re-run safely.
df_and_future = pd.concat([df.assign(isFuture=False), future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)

In [None]:
# Display the combined history + future frame to check the flag, feature and lag columns.
df_and_future

In [None]:
# Keep only the rows flagged as future. .copy() makes the result independent, so the
# prediction column added afterwards does not touch df_and_future.
future_with_features = df_and_future.query("isFuture").copy()

In [None]:
# Display the future rows that will be fed to the model.
future_with_features

In [None]:
# Predict the future rows with the full-history model, using the same FEATURES columns it was fitted on.
future_with_features["pred"] = reg.predict(future_with_features[FEATURES])

In [None]:
# Plot the forecast as a thin line in the third palette colour.
future_with_features["pred"].plot(figsize=(15,5),
                                  color=color_pal[2],
                                  ms=1,
                                  lw=1,
                                  title="Future Predictions")

# Save your Models

In [None]:
# Save through the scikit-learn wrapper rather than the raw Booster, so the file is
# written by the same interface (XGBRegressor) that is used to load it back.
reg.save_model("model.json")

In [None]:
# Load the saved model

# Start from a fresh, unfitted wrapper.
reg_new = xgb.XGBRegressor()

# Restore the trained model from the JSON file written by the save cell.
reg_new.load_model("model.json")

In [None]:
# Repeat the forecast with the reloaded model; this overwrites the "pred" column.
future_with_features["pred"] = reg_new.predict(future_with_features[FEATURES])
# Same plot as for the in-memory model, for visual comparison.
future_with_features["pred"].plot(figsize=(15,5),
                                  color=color_pal[2],
                                  ms=1,
                                  lw=1,
                                  title="Future Predictions")