# Get dates for which predictors in test set are outside domain of training set

In [1]:
import xarray as xr
import numpy as np
import pandas as pd

In [2]:
%cd /g/data/w42/dr6273/work/demand_model/

import functions as fn

/g/data/w42/dr6273/work/demand_model


In [3]:
%load_ext autoreload
%autoreload 2

### Set global variables

In [4]:
RESULTS_PATH = "/g/data/w42/dr6273/work/projects/Aus_energy/model_results/"

In [5]:
MARKET = "NEM" # "NEM" or "EU"

In [6]:
REMOVE_WEEKEND = True

In [7]:
REMOVE_XMAS = True

In [8]:
REMOVE_MONTH = 0 # integer: [1, 12]

In [9]:
MASK_NAME = "pop_dens_mask"

In [10]:
FIRST_TRAIN_YEAR = 2010
LAST_TRAIN_YEAR = 2019

In [11]:
FIRST_TEST_YEAR = 1959
LAST_TEST_YEAR = 2022

In [12]:
N_FEATURES = "parsimonious"

In [13]:
DETREND = True

In [14]:
regions = ["NEM", "QLD", "NSW", "VIC", "SA", "TAS"]

### Load model data

In [15]:
predictions = fn.read_results(
    "extrapolated", MARKET, regions, MASK_NAME,
    FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, FIRST_TEST_YEAR, LAST_TEST_YEAR,
    REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES, RESULTS_PATH,
    DETREND
)

In [17]:
predictions["VIC"]

Unnamed: 0,t2max,msdwswrf,t2m,w10,cdd,t2m4,t2m3,prediction
1959-01-04,302.629356,372.055798,295.258111,1.607908,0.131555,292.031826,293.156114,138433.257152
1959-01-05,303.765055,351.496326,296.204196,0.832812,0.165222,293.917949,294.913937,145101.873880
1959-01-06,295.849309,285.878661,290.249039,3.560216,0.012032,293.747520,293.903072,125959.784064
1959-01-07,295.064362,346.514225,290.337147,2.691591,0.012031,293.011409,292.262745,125299.084462
1959-01-08,296.099648,303.895529,290.539345,4.677562,0.013005,291.831707,290.374460,123588.183765
...,...,...,...,...,...,...,...,...
2022-12-27,307.431720,324.358435,302.475985,4.944655,5.316111,296.251678,298.216891,165910.569965
2022-12-28,298.648019,174.052472,288.824506,3.756586,-0.012034,295.868979,295.889977,125021.224012
2022-12-29,293.206471,304.140598,289.193490,2.681623,-0.012035,294.216046,293.498716,121081.657929
2022-12-30,299.553684,330.703908,293.914331,1.264915,-0.006304,293.602804,290.644827,129588.681960


### Exclude out-of-range days

In [29]:
excluded = {}
excluded_binary = {} # series of zeros (no NaNs) and ones (NaNs)
# for r in predictions.keys():
for r in ["VIC"]:
    cols = predictions[r].columns
    exc = pd.DataFrame(index=predictions[r].index, columns=cols)
    
    for col in ["cdd"]:#cols:
        s = predictions[r][col]
        s_min = s.loc[str(FIRST_TRAIN_YEAR) : str(LAST_TRAIN_YEAR+1)].min()
        s_max = s.loc[str(FIRST_TRAIN_YEAR) : str(LAST_TRAIN_YEAR+1)].max()
        exc[col] = s.where(
                        (s >= s_min) &
                        (s <= s_max)
                    )
    excluded[r] = exc
    
    exc_b = exc.isna().sum(axis=1)
    excluded_binary[r] = exc_b.where(exc_b == 0, other=1)

In [32]:
s_max

10.200177128067066

In [26]:
exc.loc["2021-07"]

Unnamed: 0,t2max,msdwswrf,t2m,w10,cdd,t2m4,t2m3,prediction
2021-07-01,287.935282,98.11742,284.353657,4.030792,,282.616869,283.616874,136412.880339
2021-07-02,286.616385,77.108702,281.967931,4.580727,,283.20482,283.528436,140381.310793
2021-07-03,281.751876,75.627766,279.481472,5.029364,,282.516871,281.935036,146641.407183
2021-07-04,283.563195,52.73727,280.547114,3.722047,,281.588233,280.666192,148689.039372
2021-07-05,283.920851,73.078857,280.666174,2.272047,,280.666362,280.232261,149158.791312
2021-07-06,283.36355,50.370642,280.620255,2.018257,,280.329436,280.611865,149480.600492
2021-07-07,284.14928,90.540043,278.597615,1.440247,,280.108482,279.962034,150994.906191
2021-07-08,283.777397,109.304647,278.039686,1.079761,,279.481624,279.086526,150595.529863
2021-07-09,283.812398,101.92127,279.060776,1.025179,,279.080266,278.566707,150972.628213
2021-07-10,284.653269,78.940823,280.377365,0.705297,,279.019546,279.159966,150424.784248


In [17]:
def pc_excluded(df):
    """
    Return percentage of rows in df with at least one NaN
    """
    row_has_nan = df.isna().sum(axis=1) # rows of zero if no NaNs in columns
    total_row_nan = row_has_nan.where(row_has_nan == 0).isna().sum() # number of rows with NaNs
    return (total_row_nan / len(df)) * 100 # percentage of rows with NaNs

In [18]:
for r in excluded.keys():
    pc = pc_excluded(excluded[r])
    print(r, str(np.round(pc, 2)) + "%")

NEM 0.77%
QLD 1.33%
NSW 0.25%
VIC 2.85%
SA 2.27%
TAS 0.71%


### Write binary series to file

In [19]:
for r, df in zip(excluded_binary.keys(), excluded_binary.values()):
    filename = fn.get_filename(
        "predictors_excluded", MARKET, r, MASK_NAME,
        FIRST_TRAIN_YEAR, LAST_TRAIN_YEAR, "1959", "2022",
        REMOVE_WEEKEND, REMOVE_XMAS, REMOVE_MONTH, N_FEATURES
    )
    if DETREND:
        filename = filename + "_detrended"
    df.to_csv(
        RESULTS_PATH + "/predictors_excluded/random_forest/" + filename + ".csv",
    )