# Hourly prediction
- Goal: predict the net load as accurately as possible.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from tbats import TBATS
from green_city.utils import metrics_dict, datetime2index, index2datetime

# Default figure size for every plot in this notebook.
plt.rcParams.update({'figure.figsize': [25, 8]})

In [None]:
## MLFLOW ##
import mlflow
from green_city.mlflow_config import get_mlflow_config

# Read the project's MLflow settings and point the client at the configured
# tracking server and experiment.
flow_conf = get_mlflow_config()
tracking_uri = flow_conf["TRACKING_URI"]
mlflow.set_tracking_uri(tracking_uri)
experiment_name = flow_conf["EXPERIMENT_NAME"]
mlflow.set_experiment(experiment_name);

In [None]:
# Notebook-wide parameters that stay fixed across all experiments.
# They are merged into every run's parameter dict and logged to the MLflow server.
global_params = dict(
    feature="net_load_kWh",
    building_nr=5,
    resolution="H",
    pred_steps=24,
)

In [None]:
# Guard: stop early unless the configured resolution is hourly ("H"), which is
# what the rest of this notebook is written for.
assert global_params["resolution"] == "H"

# Load the dataset
def get_df(building_nr, feature):
    """Load one building's preprocessed CSV as a two-column DataFrame.

    Parameters
    ----------
    building_nr : int or str
        Building number; selects ``../data/preprocessed/Building_<nr>.csv``
        (path is relative to the current working directory).
    feature : str
        Name of the CSV column to extract. It is renamed to ``"actual"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``["actual", "datetime"]`` with ``datetime`` converted to
        ``datetime64[ns]``; the row index is the one produced by ``read_csv``.
    """
    raw = pd.read_csv(f"../data/preprocessed/Building_{building_nr}.csv")
    # The unit must be spelled out: pandas >= 2.0 raises a TypeError for the
    # unit-less "datetime64" dtype, while "datetime64[ns]" works on old and
    # new versions alike.
    typed = raw.astype({"datetime": "datetime64[ns]"})
    df = typed[[feature, "datetime"]].rename(columns={feature: "actual"})
    return df

# Series for the building / feature selected in global_params.
df = get_df(
    building_nr=global_params["building_nr"],
    feature=global_params["feature"],
)

In [None]:
# Flip to True for a quick visual overview of the data.
plotting = False
if plotting:
    # Top panel: the raw series. Bottom panel: its autocorrelation up to lag 370.
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    df["actual"].plot(ax=ax1, xlabel="")
    plot_acf(df["actual"], lags=370, ax=ax2)
    plt.subplots_adjust(hspace=0.2)

In [None]:
# prediction function
def predict_next_hours(index=None,
                       time=None,
                       data=None,
                       pred_steps=24,
                       train_steps=None,
                       model='TBATS',
                       seasonalities=None,
                       **_):
    """Forecast the ``pred_steps`` rows that follow a given row of ``data``.

    Parameters
    ----------
    index : int, optional
        Row label of the last observed value. Training data ends at this row
        (inclusive) and the forecast starts at ``index + 1``.
    time : optional
        Used only when ``index`` is None: the first row whose ``datetime``
        column equals ``time`` becomes ``index``.
    data : pandas.DataFrame
        Must provide the columns ``actual`` (values to model) and
        ``datetime``. The forecast index is built as ``index + 1 ...``, so an
        integer row index is assumed.
    pred_steps : int
        Number of steps to forecast.
    train_steps : int, optional
        Number of rows, ending at ``index``, to train on. None means every
        row up to and including ``index``.
    model : str
        Only ``'TBATS'`` is implemented.
    seasonalities : list of int, optional
        Seasonal periods handed to TBATS. When None, ``24`` is always used and
        ``7*24`` / ``365*24`` are added if the training window is longer than
        the respective period.
    **_
        Extra keyword arguments are ignored, so a full run-parameter dict can
        be splatted into the call.

    Returns
    -------
    pandas.DataFrame
        One column ``forecast`` indexed ``index + 1 .. index + pred_steps``.

    Raises
    ------
    ValueError
        If ``data`` is missing, ``model`` is unsupported, or neither ``index``
        nor ``time`` is given.
    IndexError
        If ``time`` matches no row of ``data``.
    """
    if data is None:
        raise ValueError("`data` must be a DataFrame with 'actual' and 'datetime' columns")
    # Reject unknown models up front instead of failing later on an unbound
    # `forecast` variable.
    if model != 'TBATS':
        raise ValueError(f"Unsupported model {model!r}; only 'TBATS' is implemented")

    df = data
    if index is None:
        if time is None:
            raise ValueError("Either `index` or `time` must be given")
        matches = df.index[df.datetime == time]
        if len(matches) == 0:
            raise IndexError(f"No row of `data` has datetime == {time!r}")
        index = matches[0]

    print(f"[Predicting the next {pred_steps}h after {df.loc[index, 'datetime']} (row {index})]")

    # .loc slices by label and includes both end points.
    if train_steps is not None:
        df_train = df.loc[index-train_steps+1:index]
    else:
        df_train = df.loc[:index]

    if seasonalities is None:
        # Derive the periods from the rows actually available for training;
        # this also covers train_steps=None, which previously crashed on the
        # `None > int` comparison.
        n_train = len(df_train)
        seasonalities = [24] + [period for period in (7*24, 365*24) if n_train > period]

    # Fit with the requested periods (they used to be ignored in favour of a
    # hard-coded [24]).
    estimator = TBATS(seasonal_periods=list(seasonalities))
    fitted_model = estimator.fit(df_train.actual)
    forecast = fitted_model.forecast(steps=pred_steps)

    return pd.DataFrame({"forecast": forecast}, index=range(index+1, index+pred_steps+1))

In [None]:
# Sweep over several forecast start times and training-window lengths; every
# combination is logged to MLflow as its own run.
# (We could also define multiple parameter dictionaries in one place and let them run.)

for timestr in ["2008-10-03 14:00", "2009-06-06 19:00", "2008-04-02 01:00"]:
    for train_steps in [100, 200, 400, 1000]:
        params = global_params.copy()
        params.update({
            "index": datetime2index(timestr),
            "datetime": pd.Timestamp(timestr),
            # The key must be the string "pred_steps"; the bare name is undefined here.
            "pred_steps": 24,
            # Use the loop variable (the previous `time_steps` does not exist).
            "train_steps": train_steps,
            "model": "TBATS",
            "seasonalities": [24],
        })

        with mlflow.start_run(run_name="test_TBATS") as r:

            ## predict ##
            predictions = predict_next_hours(**params, data=df)
            # Align forecast and measurements on the shared row index.
            df_forecast = pd.concat([df, predictions], axis=1)
            df_forecast["error"] = df_forecast.actual - df_forecast.forecast

            ## evaluate ##
            # Score only the forecast window (.loc includes both end points).
            pred_lims = (params["index"]+1, params["index"]+params["pred_steps"])
            df_predrange = df_forecast.loc[pred_lims[0]:pred_lims[1]]

            metrics = metrics_dict(df_predrange.actual, df_predrange.forecast, ["mae", "mse", "r2_score"])
            print("mae: {mae}, mse: {mse}, r2: {r2_score}".format(**metrics))
            mlflow.log_metrics(metrics)

            ## logging ##
            mlflow.log_params(params)

In [None]:
index = 64

# Run parameters: the notebook-wide defaults plus the settings of this run.
params = {
    **global_params,
    "index": index,
    "datetime": index2datetime(index),
    "pred_steps": 24,
    "train_steps": 200,
    "model": "TBATS",
    "seasonalities": [24],
}

with mlflow.start_run(run_name="24h_TBATS") as r:

    ## predict ##
    predictions = predict_next_hours(data=df, **params)
    # Put forecast and measurements side by side on the shared row index.
    df_forecast = pd.concat([df, predictions], axis=1)
    df_forecast["error"] = df_forecast["actual"] - df_forecast["forecast"]

    ## evaluate ##
    # First and last forecast row; .loc slicing includes both end points.
    first_pred = params["index"] + 1
    last_pred = params["index"] + params["pred_steps"]
    pred_lims = (first_pred, last_pred)
    df_predrange = df_forecast.loc[first_pred:last_pred]

    metrics = metrics_dict(
        df_predrange["actual"],
        df_predrange["forecast"],
        ["mae", "mse", "r2_score"],
    )
    print("mae: {mae}, mse: {mse}, r2: {r2_score}".format(**metrics))
    mlflow.log_metrics(metrics)

    ## logging ##
    mlflow.log_params(params)

# Plot everything up to ten rows past the end of the forecast window.
plot_end = pred_lims[1] + 10
df_forecast.loc[:plot_end].set_index("datetime").plot();

In [None]:
start_time = "2009-01-03 22:00"

index_dt = pd.Timestamp.fromisoformat(start_time)
# Second forecast: notebook-wide defaults plus the settings of this run.
params = {
    **global_params,
    "datetime": index_dt,
    "index": datetime2index(index_dt),
    "pred_steps": 24,
    "train_steps": 40,
    "model": "TBATS",
}

# Inclusive row ranges of the forecast window and of the training window.
last_observed = params["index"]
pred_lims = (last_observed + 1, last_observed + params["pred_steps"])
train_lims = (last_observed - params["train_steps"] + 1, last_observed)

with mlflow.start_run(run_name="predict_2009") as r:
    forecast_2009 = predict_next_hours(data=df, **params)
    # Keep this forecast under its own column name next to the measurements.
    fc_column = forecast_2009.rename(columns={"forecast": "fc_2009"})
    df_w_fc = pd.concat([df, fc_column], axis=1)
    df_predrange = df_w_fc.loc[pred_lims[0]:pred_lims[1]]

    # Forecast error; NaN wherever no forecast exists.
    df_w_fc["error"] = df_w_fc["actual"] - df_w_fc["fc_2009"]

    metrics = metrics_dict(
        df_predrange["actual"],
        df_predrange["fc_2009"],
        ["mae", "mse", "r2_score"],
    )
    print("mae: {mae}, mse: {mse}, r2: {r2_score}".format(**metrics))
    mlflow.log_metrics(metrics)

    mlflow.log_params(params)

# Show the training window, the forecast window and ten rows beyond it.
df_w_fc.loc[train_lims[0]:pred_lims[1] + 10].set_index('datetime').plot()