# Get dates for which predictors in test set are outside domain of training set

In [1]:
import xarray as xr
import numpy as np
import pandas as pd

In [2]:
%cd /g/data/w42/dr6273/work/demand_model/

import functions as fn

/g/data/w42/dr6273/work/demand_model


In [3]:
%load_ext autoreload
%autoreload 2

### Set global variables

In [4]:
RESULTS_PATH = "/g/data/w42/dr6273/work/projects/Aus_energy/model_results/"

In [5]:
MARKET = "NEM" # "NEM" or "EU"

In [6]:
REMOVE_WEEKEND = True

In [7]:
REMOVE_XMAS = True

In [8]:
REMOVE_MONTH = 0 # integer: [1, 12]

In [9]:
MASK_NAME = "pop_dens_mask"

In [10]:
FIRST_TRAIN_YEAR = 2010
LAST_TRAIN_YEAR = 2019

In [11]:
FIRST_TEST_YEAR = 1959
LAST_TEST_YEAR = 2022

In [12]:
N_FEATURES = "parsimonious"

In [13]:
DETREND = True

In [14]:
regions = ["NEM", "QLD", "NSW", "VIC", "SA", "TAS"]

### Load model data

In [15]:
predictions = fn.read_results(
    "extrapolated", MARKET, regions, MASK_NAME,
    FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, FIRST_TEST_YEAR, LAST_TEST_YEAR,
    REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES, RESULTS_PATH,
    DETREND
)

In [35]:
predictions["NSW"]["hdd"].isnull().sum()

0

### Exclude out-of-range days

In [38]:
excluded = {}
excluded_binary = {} # series of zeros (no NaNs) and ones (NaNs)
# for r in predictions.keys():
for r in ["NSW"]:
    cols = predictions[r].columns
    exc = pd.DataFrame(index=predictions[r].index, columns=cols)
    
    # for col in cols:
    for col in ["hdd"]:
        s = predictions[r][col]
        s_min = s.loc[str(FIRST_TRAIN_YEAR) : str(LAST_TRAIN_YEAR+1)].min()
        s_max = s.loc[str(FIRST_TRAIN_YEAR) : str(LAST_TRAIN_YEAR+1)].max()
        exc[col] = s.where(
                        (s >= s_min) &
                        (s <= s_max)
                    )
    excluded[r] = exc
    
    exc_b = exc.isna().sum(axis=1)
    excluded_binary[r] = exc_b.where(exc_b == 0, other=1)

In [42]:
print(s_min, s_max)

0.170676396312813 11.706513597200004


In [43]:
exc["hdd"].loc["1959"].max()

11.188224415009731

In [22]:
def pc_excluded(df):
    """
    Return percentage of rows in df with at least one NaN
    """
    row_has_nan = df.isna().sum(axis=1) # rows of zero if no NaNs in columns
    total_row_nan = row_has_nan.where(row_has_nan == 0).isna().sum() # number of rows with NaNs
    return (total_row_nan / len(df)) * 100 # percentage of rows with NaNs

In [23]:
for r in excluded.keys():
    pc = pc_excluded(excluded[r])
    print(r, str(np.round(pc, 2)) + "%")

NEM 9.96%
QLD 41.78%
NSW 20.11%
VIC 0.26%
SA 0.21%
TAS 0.19%


### Write binary series to file

In [19]:
for r, df in zip(excluded_binary.keys(), excluded_binary.values()):
    filename = fn.get_filename(
        "predictors_excluded", MARKET, r, MASK_NAME,
        FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, "1959", "2022",
        REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES
    )
    if DETREND:
        filename = filename + "_detrended"
    df.to_csv(
        RESULTS_PATH + "/predictors_excluded/random_forest/" + filename + ".csv",
    )