In [None]:
%pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Keep a palette handle so later plots can pick an explicit color from it.
# NOTE(review): with no arguments, color_palette() presumably returns the color
# cycle active at this moment, i.e. before the style sheet below is applied —
# confirm that ordering is intentional before swapping these two lines.
color_pal = sns.color_palette()
# Switch matplotlib to the "fivethirtyeight" style sheet.
plt.style.use('fivethirtyeight')

In [None]:
# Load the hourly readings, parse the "Datetime" column into timestamps, use it
# as the index, and order the rows chronologically.
df = (
    pd.read_csv("hourly-energy-consumption/PJME_hourly.csv")
    .assign(Datetime=lambda frame: pd.to_datetime(frame["Datetime"]))
    .set_index("Datetime")
    .sort_index()
)

In [None]:
# First rows of the index-sorted frame, i.e. the earliest readings.
df.head(n=5)

In [None]:
# Last rows of the index-sorted frame, i.e. the most recent readings.
df.tail(n=5)

In [None]:
# Draw every row as a point marker, using the first palette color.
df.plot(
    title="PJME Energy Use in MW",
    color=color_pal[0],
    figsize=(15, 5),
    style=".",
)

In [None]:
# Chronological hold-out: rows before the cutoff train the model, rows on or
# after it are kept for evaluation.
train, test = (
    df.loc[df.index < "01-01-2015"],
    df.loc[df.index >= "01-01-2015"],
)

In [None]:
# Show both partitions on one axes with a dashed line at the split date.
fig, ax = plt.subplots(figsize=(15,5))

train.plot(ax=ax, label="Training Set", title="Data Train/Test Split")
test.plot(ax=ax, label="Test Set")

ax.axvline("01-01-2015", color="black", ls="--")
# One label per plotted series, in plotting order. The previous single string
# "Training Set, Test Set" labelled only the first line and left the test
# series out of the legend.
ax.legend(["Training Set", "Test Set"])

In [None]:
# Zoom in on a short window to expose the intra-day pattern.
# NOTE(review): the bounds are presumably parsed month-first (Jan 1 to Jan 8,
# 2010), which matches the "Week of Data" title — confirm.
df.loc[
    (df.index > "01-01-2010") & (df.index < "01-08-2010")
].plot(title="Week of Data", figsize=(15, 5))

In [None]:
def create_features(df):
    """
    Return a copy of ``df`` with calendar columns derived from its index.

    The index must expose the ``hour``, ``dayofweek``, ``quarter``, ``month``,
    ``year`` and ``dayofyear`` attributes (as a ``DatetimeIndex`` does). One
    column per attribute is added under the same name, in that order; a
    column that already exists under one of those names is overwritten.
    The frame passed in is left unmodified.
    """
    stamps = df.index
    calendar_parts = ("hour", "dayofweek", "quarter", "month", "year", "dayofyear")

    # assign() works on a copy, so the caller's frame is never touched.
    return df.assign(**{part: getattr(stamps, part) for part in calendar_parts})

In [None]:
# Add the calendar columns to the full frame for the exploratory plots below.
df = create_features(df)

In [None]:
# Check that the calendar columns now sit next to the target column.
df.head(n=5)

In [None]:
# Spread of the load for each hour of the day.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(x="hour", y="PJME_MW", data=df, ax=ax)
ax.set_title("MW by Hour")

In [None]:
# Spread of the load for each calendar month.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(x="month", y="PJME_MW", data=df, ax=ax)
ax.set_title("MW by Month")

In [None]:
# Give both partitions the same calendar columns (each call returns a copy).
train, test = create_features(train), create_features(test)

# Model inputs and the column to predict. TARGET is a one-element list, so
# selecting with it yields a single-column DataFrame rather than a Series.
FEATURES = ["hour", "dayofweek", "quarter", "month", "year", "dayofyear"]
TARGET = ["PJME_MW"]

In [None]:
# Separate predictors from the target for each partition.
X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

In [None]:
# Gradient-boosted regressor: up to 5000 trees at a small learning rate,
# stopping once the monitored metric has not improved for 50 rounds.
# NOTE(review): early stopping presumably tracks the last eval_set entry, which
# here is the test split — that would make the test score reported later
# optimistic; confirm, and consider a separate validation window.
reg = xgb.XGBRegressor(
    learning_rate=0.001,
    n_estimators=5000,
    early_stopping_rounds=50,
    n_jobs=-1,
)

# verbose=300 is passed so the evaluation log is printed only periodically.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=300,
)

In [None]:
# One row per input feature, holding the importance reported by the fitted model.
fi = pd.DataFrame(
    reg.feature_importances_,
    index=reg.feature_names_in_,
    columns=["importance"],
)

In [None]:
# Horizontal bars ordered by importance value.
fi.sort_values(by="importance").plot(
    title="Feature Importance",
    kind="barh",
)

In [None]:
# Score the hold-out rows; the column is simply overwritten on a re-run.
test["prediction"] = reg.predict(X_test)

# Attach the predictions to the full frame by timestamp. The left join keeps
# every row of df; rows without a match in `test` get NaN.
# Dropping any "prediction" column left by an earlier run keeps this cell
# re-runnable: otherwise merge would emit prediction_x / prediction_y and the
# later df["prediction"] lookups would fail.
# NOTE(review): an index-on-index merge repeats rows for any timestamp that
# occurs more than once on both sides — confirm the index is unique.
df = df.drop(columns="prediction", errors="ignore").merge(
    test[["prediction"]], how="left", left_index=True, right_index=True
)

In [None]:
# Overlay the predictions (point markers) on the complete observed series.
ax = df[["PJME_MW"]].plot(figsize=(15, 5))
df["prediction"].plot(style=".", ax=ax)

ax.legend(["Truth Data", "Predictions"])
ax.set_title("Raw Data and Predictions")

In [None]:
# Zoom in on a short window of the test period to compare predictions with the
# observed load point by point.
# NOTE(review): the bounds are presumably parsed month-first (Jan 1 to Jan 8,
# 2018), which matches the "Week of Data" title — confirm.
in_week = (df.index > "01-01-2018") & (df.index < "01-08-2018")

# Select the row window and the column in a single .loc call instead of
# chained indexing, and build the window mask only once.
ax = df.loc[in_week, "PJME_MW"].plot(figsize=(15,5), title="Week of Data")
# Draw onto the same axes explicitly rather than relying on pyplot's
# current-axes state.
df.loc[in_week, "prediction"].plot(ax=ax, style=".")

ax.legend(["Truth Data", "Predictions"])

In [None]:
# Root of the mean squared error between observed and predicted test values.
score = np.sqrt(
    mean_squared_error(y_true=test["PJME_MW"], y_pred=test["prediction"])
)
print(f"RMSE Score on Test Set: {score:0.2f}")

In [None]:
# Absolute miss per test row, regardless of direction.
test["error"] = (test["PJME_MW"] - test["prediction"]).abs()

In [None]:
# Calendar date of each row's timestamp, used as the grouping key for the
# per-day error summary.
test["date"] = test.index.date

In [None]:
# Ten dates with the highest mean absolute error, worst first.
(
    test.groupby("date")["error"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)