This notebook was used to generate all the plots for the REPORT.md file. Please read REPORT.md for the results and their discussion rather than this notebook.

## Imports

In [None]:
import pandas as pd
from ens_load_forecast.paths import PATH_LOAD_ACTUAL, PATH_WEATHER, PATH_LOAD_FORECAST, PATH_ZONES_AND_STATIONS, PATH_MAP_DATA, PATH_DATA
from ens_load_forecast.graphs import plot_load_per_zone, plot_load_seasonal, plot_on_map, correlation_heatmap, scatter_matrix
from ens_load_forecast.data_preprocessing import get_load_forecast, get_load_actual, get_weather,get_preprocessed_weather, get_merged_dataset
from ens_load_forecast.features_engineering import extract_features
from ens_load_forecast.models import train_models_for_each_zone
import ens_load_forecast.constants as cst
import numpy as np


## Actual load

In [None]:
# Load the actual-load dataset; it is passed to get_merged_dataset in the
# "Merge data" section. Presumably a pandas DataFrame (it is handed to the
# plotting helpers as `df=`) — confirm in data_preprocessing.get_load_actual.
df_load_actual = get_load_actual()
# Plot call left disabled; uncomment to draw the per-zone actual load plot.
# plot_load_per_zone(df = df_load_actual, title="Actual load evolution (per zone)")

In [None]:
# plot_load_seasonal(df = df_load_actual, zone="N.Y.C.", title="Actual load yearly heatmap in N.Y.C. (MW)")

In [None]:
# df = df_load_actual.groupby("zone").apply(lambda x:x.mean())
# df["zone"] = df.index
# plot_on_map(df=df, quantity_key="load", title="Average yearly load (MW)")

## Load forecast

In [None]:
# Load the forecast-load dataset; it is passed to get_merged_dataset in the
# "Merge data" section.
df_load_forecast = get_load_forecast()
# Plot call left disabled; uncomment to draw the per-zone forecast load plot.
# plot_load_per_zone(df = df_load_forecast, title="Forecast load evolution (per zone)")

In [None]:
# df_load_forecast[cst.FORECAST_HORIZON].unique()

In [None]:
# plot_load_seasonal(df = df_load_forecast[df_load_forecast[cst.FORECAST_HORIZON] == 2], zone="NYISO")

## Weather

In [None]:
# Load the weather dataset used by the merge step.
# NOTE(review): force_recompute=False presumably reuses a previously computed
# result instead of rebuilding it — confirm in data_preprocessing.get_weather.
df_weather = get_weather(force_recompute=False)

## Merge data

In [None]:
# Combine the actual load, forecast load and weather datasets into a single
# dataset; how they are joined is defined in get_merged_dataset.
df_merged = get_merged_dataset(df_load_actual=df_load_actual, df_load_forecast=df_load_forecast, df_weather=df_weather)

In [None]:
# Build the feature table from the merged dataset; it feeds both the
# exploratory cells below and train_models_for_each_zone.
df_features = extract_features(df_merged)
# Display left disabled; uncomment to inspect the feature table.
# df_features

In [None]:
# Keep only the rows of the "MHK VL" zone (Mohawk Valley, per the plot titles
# below). Both plots are left disabled; uncomment to draw them.
df = df_features[df_features[cst.ZONE] == "MHK VL"]
# correlation_heatmap(df= df, title="Features correlation heatmap (pearson) (Mohawk Valley)")
# scatter_matrix(df=df, title="Scatter matrix (Mohawk Valley)")

In [None]:

# Plot, for the "N.Y.C." zone, the difference between the actual load and the
# forecast load using the plotly plotting backend.
df = df_features[df_features[cst.ZONE] == "N.Y.C."]
load_minus_forecast = df[cst.LOAD] - df[cst.LOAD_FORECAST]
# Keep the plot call as the last expression so the figure is shown as the
# cell output.
load_minus_forecast.plot(backend="plotly")

In [None]:

# series = df_merged.groupby(cst.ZONE).apply(lambda x: (np.abs((x["load"] - x["load_forecast"])/x["load"])).mean())
# df = pd.DataFrame(data=series, index=series.index, columns=["load_forecast_error"])
# df["zone"] = df.index
# plot_on_map(df = df, quantity_key="load_forecast_error", title="Forecast error %MAE")

## Models

In [None]:
# Train the models for every zone from the feature table.
# `scores` is read further down as scores[zone][model_name]["train"]["rmse"]
# and must contain a "naive_model" entry per zone, which serves as the
# baseline. `models` is not used by the cells visible in this notebook.
models, scores = train_models_for_each_zone(df_features=df_features)

In [None]:
# For every zone, keep the model whose train RMSE, divided by the train RMSE
# of that zone's "naive_model", is the smallest. The winning model name goes
# into `best_models` and its ratio into `best_scores`, both keyed by zone.
best_models = {}
best_scores = {}
for zone in scores:
    zone_scores = scores[zone]
    # Start from an empty name and a 1.1 threshold: a candidate only replaces
    # the current leader when its ratio is strictly lower, so ties keep the
    # model encountered first.
    leader, leader_ratio = "", 1.1
    for model_name, model_scores in zone_scores.items():
        ratio = model_scores["train"]["rmse"] / zone_scores["naive_model"]["train"]["rmse"]
        if ratio < leader_ratio:
            leader, leader_ratio = model_name, ratio
    best_models[zone] = leader
    best_scores[zone] = leader_ratio



In [None]:
# Turn the per-zone best ratios into a one-column frame indexed by zone, copy
# the zone labels into a regular column, and draw them on the map.
nrmse_by_zone = pd.Series(best_scores)
df = nrmse_by_zone.to_frame(name="NRMSE")
df[cst.ZONE] = df.index
plot_on_map(
    df=df,
    quantity_key="NRMSE",
    title="Best achieved NRMSE per zone (the lower the better) (train set)",
)

In [None]:
# Turn the per-zone winning model names into a one-column frame indexed by
# zone, copy the zone labels into a regular column, and draw them on the map.
model_by_zone = pd.Series(best_models)
df = model_by_zone.to_frame(name="model_name")
df[cst.ZONE] = df.index
plot_on_map(
    df=df,
    quantity_key="model_name",
    title="Best model (train set)",
)