# Write ERA5 demand predictions

In [35]:
import os
import glob
import xarray as xr
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

import matplotlib.pyplot as plt
import matplotlib.cm

In [20]:
%cd /g/data/w42/dr6273/work/demand_model/

import functions as fn

/g/data/w42/dr6273/work/demand_model


In [15]:
%load_ext autoreload
%autoreload 2

### Set global variables

In [16]:
# Root directory of the model results: forwarded to fn.read_results when loading
# inputs and used as the base of the CSV path when predictions are written.
RESULTS_PATH = "/g/data/w42/dr6273/work/projects/Aus_energy/model_results/"

In [3]:
# Market identifier forwarded to the fn helpers (read_results, get_predictor_files, get_filename).
MARKET = "NEM" # "NEM" or "EU"

In [4]:
# Forwarded to fn.remove_time and to the results-filename helpers.
# Presumably drops weekend days from the demand record — confirm in functions.py.
REMOVE_WEEKEND = True

In [5]:
# Forwarded to fn.remove_time and to the results-filename helpers.
# Presumably drops the Christmas period from the demand record — confirm in functions.py.
REMOVE_XMAS = True

In [6]:
# Month to exclude, forwarded to fn.remove_time and to the results-filename helpers.
# NOTE(review): the documented range is [1, 12] but the value here is 0; presumably
# 0 means "remove no month" — confirm against fn.remove_time.
REMOVE_MONTH = 0 # integer: [1, 12]

In [7]:
# Mask name forwarded to fn.get_predictor_files, fn.read_results and fn.get_filename.
MASK_NAME = "pop_dens_mask"

In [8]:
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
TIME_COLUMNS = []

In [9]:
# Training-period years. Both are forwarded to the results-filename helpers;
# FIRST_TRAIN_YEAR is also the lower bound of the data selected for model fitting.
FIRST_TRAIN_YEAR = 2010
LAST_TRAIN_YEAR = 2019

In [10]:
# Test-period years. The number of demand time steps between them sets the size of
# the hold-out split when fitting; both are also forwarded to the filename helpers.
FIRST_TEST_YEAR = 2020
LAST_TEST_YEAR = 2020

In [11]:
# Feature-set label forwarded to fn.read_results and fn.get_filename.
N_FEATURES = "parsimonious"

In [12]:
# Regions to process; every per-region loop in this notebook iterates over this list.
# NOTE(review): these look specific to MARKET = "NEM" — confirm before switching MARKET.
regions = ["NEM", "QLD", "NSW", "VIC", "SA", "TAS"]

### Load features and hyperparameters

In [23]:
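# NOTE: the commented-out helper below appears to be superseded by `fn.read_results`,
# which the following cells call instead; it is kept here for reference only.
# def read_results(results_name, mask_name, rm_weekend, rm_xmas, n_features):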
#     """
#     Read in results dataframes as dictionary items
#     """
#     if results_name == "feature_selection":
#         name = "feature_selection_results"
#     elif (results_name == "training") | (results_name == "test"):
#         name = results_name + "_predictions"
#     else:
#         name = results_name
        
#     results = dict()
#     for r in regions:
#         filename = fn.get_filename(
#             name, MARKET, r, mask_name,
#             FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, FIRST_TEST_YEAR, LAST_TEST_YEAR,
#             rm_weekend, rm_xmas, REMOVE_MONTH, n_features
#         )
#         results[r] = pd.read_csv(
#             RESULTS_PATH + results_name + "/random_forest/" + filename + ".csv",
#             index_col=0
#         )
#     return results

In [24]:
# Feature-selection results loaded through fn.read_results.
# The result is indexed by region name further down (features[r]).
features = fn.read_results(
    "feature_selection", MARKET, regions, MASK_NAME,
    FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, FIRST_TEST_YEAR, LAST_TEST_YEAR,
    REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES, RESULTS_PATH
)

In [27]:
# Tuned hyperparameters loaded through fn.read_results.
# The result is indexed by region name and then by hyperparameter name
# (hyps[r].loc["n_estimators"], ...) when the models are built.
hyps = fn.read_results(
    "hyperparameters", MARKET, regions, MASK_NAME,
    FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, FIRST_TEST_YEAR, LAST_TEST_YEAR,
    REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES, RESULTS_PATH
)

### Fit model

In [28]:
# Load the "demand_stl" variable of the daily demand file and pass it through
# fn.remove_time together with the REMOVE_* settings.
demand_path = "/g/data/w42/dr6273/work/projects/Aus_energy/data/energy_demand/daily_demand_2010-2020_stl.nc"
dem_da = fn.remove_time(
    xr.open_dataset(demand_path)["demand_stl"],
    REMOVE_WEEKEND,
    REMOVE_XMAS,
    REMOVE_MONTH,
)

In [29]:
# Rebuild the demand array region by region so that its "region" dimension
# contains exactly the entries of `regions`, in that order.
da_list = []
for r in regions:
    regional_demand = dem_da.sel(region=r)
    # Selecting a single region drops the dimension; add it back as length one.
    da_list.append(regional_demand.expand_dims({"region": [r]}))
demand = xr.concat(da_list, dim="region")

In [30]:
# Prepare predictors
files = fn.get_predictor_files(MARKET, MASK_NAME)
pred_ds = xr.open_mfdataset(files, combine="nested", compat="override")

In [34]:
# Prepare dataframe for machine learning
region_dfs = {}
for r in regions:
    df = fn.to_dataframe(dem_da, pred_ds, r)
    selected_preds = fn.parse_features(fn.sel_model(features[r])["feature_names"])[:]
    print(selected_preds[::-1])
    df = df[["demand"] + selected_preds[::-1]]
    region_dfs[r] = df

['t2m', 't2m3', 'cdd', 'hdd', 'cdd4', 'q', 'hdd4', 't2max', 't2min', 't2m4', 'msdwswrf', 'w10', 'mtpr']
['t2m', 't2m3', 'q', 'hdd4', 't2max', 'cdd3', 'rh', 'msdwswrf']
['t2m', 't2m3', 'q', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m3', 'cdd', 'hdd', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m3', 'cdd', 'q', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m', 't2m3', 'rh', 'msdwswrf', 'w10']


In [36]:
# Fit one ExtraTreesRegressor per region with that region's tuned hyperparameters
# and keep the fitted models in `models`, keyed by region name.
models = {}

# Number of time steps in the test period. It depends only on the time axis of the
# demand record, not on the region, so it is computed once rather than per iteration.
test_len = dem_da.sel(time=slice(str(FIRST_TEST_YEAR), str(LAST_TEST_YEAR))).time.values.shape[0]

# Hyperparameters read from `hyps` and passed to the regressor as integers.
HYPERPARAMETER_NAMES = ("n_estimators", "min_samples_leaf", "max_depth", "max_leaf_nodes")

for r in regions:
    # Chronological split (shuffle=False): the last `test_len` rows form the test set.
    train_X, test_X, train_y, test_y = fn.split(
        fn.sel_train_test(region_dfs[r], FIRST_TRAIN_YEAR, LAST_TEST_YEAR),
        "demand",
        test_size=test_len,
        random_state=0,
        shuffle=False
    )

    # Each lookup yields a one-element array. Extract the element with .item()
    # before casting: calling int() directly on an array with ndim > 0 is
    # deprecated since NumPy 1.25.
    tuned = {
        name: int(hyps[r].loc[name].values.item())
        for name in HYPERPARAMETER_NAMES
    }

    # Finalise model
    rf = ExtraTreesRegressor(random_state=0, **tuned)

    # Only the first return value is kept (it is used as the fitted model later on);
    # the second, presumably the predictions for the training inputs passed here,
    # is discarded.
    models[r], _ = fn.predict_forest(train_y, train_X, train_X, rf)

In [38]:
# Display the fitted model for one region as a quick sanity check.
models["NSW"]

### Prepare predictors from 1959 and write predictions

In [44]:
# Inspect the predictor table for a single region. The region is named explicitly
# (the last entry of `regions`, which is the value the loop variable `r` holds after
# the earlier per-region loops) so this cell does not depend on a leftover loop
# variable in the kernel.
x = pred_ds.sel(region=regions[-1]).to_array("variable").transpose()

In [45]:
# Show the predictors as a table: one row per time step, one column per variable.
pd.DataFrame(data=x, index=x["time"], columns=x["variable"])

Unnamed: 0,mtpr,w10,msdwswrf,rh,cdd3,t2m4,t2min,t2max,hdd4,q,cdd4,hdd,cdd,t2m3,hdd3,t2m
1959-01-01,6.961498e-06,2.143507,295.833407,67.832597,,,,,,7.165296,,3.337329,-0.000056,,,287.809994
1959-01-02,4.138236e-06,1.547626,296.495098,68.295531,,,282.314842,291.004595,,6.668263,,4.492937,-0.000056,,,286.654383
1959-01-03,6.463298e-07,1.914627,336.022931,65.253390,-0.000056,,283.028545,294.007927,,7.098229,,2.787199,-0.000056,287.607653,3.539570,288.360122
1959-01-04,7.133438e-07,3.561940,322.010753,66.016614,-0.000056,288.017412,283.832506,293.966743,3.129871,7.611438,-0.000056,1.900114,-0.000056,288.086819,3.060498,289.247497
1959-01-05,3.009269e-06,3.646212,329.861403,70.886509,-0.000056,289.015338,287.486882,297.136628,2.320323,9.611721,-0.000056,0.099138,-0.000056,289.802591,1.595898,291.801697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-27,1.076402e-05,3.540114,336.674630,57.457803,0.187025,291.605356,291.101700,302.260550,1.065333,10.546278,0.140283,0.002582,0.560964,292.559360,0.617335,296.664633
2022-12-28,6.566586e-06,3.352393,270.689053,62.519901,0.187025,291.121505,281.360945,295.255835,1.549175,6.175383,0.140283,4.345357,0.000056,291.481027,1.695494,286.807123
2022-12-29,-4.658816e-07,1.758395,281.512600,65.667224,0.187025,290.382313,281.876394,291.125878,2.288237,6.549033,0.140283,4.067125,0.000056,290.186211,2.804607,287.085357
2022-12-30,1.613845e-05,3.059278,249.934595,76.733655,0.000056,290.091305,287.808120,293.266439,2.440684,9.237493,0.140283,1.349576,0.000056,287.899934,3.253604,289.805775


In [61]:
# Prepare dataframe for machine learning
predictions = {}

for r in regions:
    pred_arr = pred_ds.sel(region=r).to_array("variable")
    df = pd.DataFrame(
        pred_arr.transpose(),
        columns=pred_arr["variable"],
        index=pred_arr["time"]
    )
    selected_preds = fn.parse_features(fn.sel_model(features[r])["feature_names"])[:]
    print(selected_preds[::-1])
    df = df[selected_preds[::-1]]
    
    df = df.dropna()
    
    model = models[r]
    df["prediction"] = model.predict(np.array(df))
    predictions[r] = df
    
    filename = fn.get_filename(
        "extrapolated", MARKET, r, MASK_NAME,
        FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, "1959", "2022",
        REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES
    )
    df.to_csv(
        RESULTS_PATH + "/extrapolated/random_forest/" + filename + ".csv",
    )

['t2m', 't2m3', 'cdd', 'hdd', 'cdd4', 'q', 'hdd4', 't2max', 't2min', 't2m4', 'msdwswrf', 'w10', 'mtpr']
['t2m', 't2m3', 'q', 'hdd4', 't2max', 'cdd3', 'rh', 'msdwswrf']
['t2m', 't2m3', 'q', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m3', 'cdd', 'hdd', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m3', 'cdd', 'q', 't2max', 't2m4', 'msdwswrf', 'w10']
['t2m', 't2m3', 'rh', 'msdwswrf', 'w10']
