In [1]:
import pandas as pd
from datetime import timedelta
from graph_traffic.get_data import get_data
from graph_traffic.config import figures_path, project_path
from graph_traffic.dataloading import npzDataset
from graph_traffic.baseline_models import MeanRegressor, MedianRegressor, RepeatRegressor, RepeatLastRegressor, DaytimeRegressor, DriftRegressor
from graph_traffic.model_selection import timeseries_cv, plot_predictions, print_losses, get_node_ids
from graph_traffic.merge_data import merge_data
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pickle

baseline_path = figures_path + "/baseline"

months = ["Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

ids_to_use = get_node_ids()
print(f"total of {len(ids_to_use)} nodes")
data_dict = dict(
    ids_list=ids_to_use,#[3954, 3973, 3976, 3977, 3978],#[3532, 3542, 3544, 3545, 3546, 3577, 3637, 3640, 3641, 4761, 4763],
    seq_len=4*2,
    with_graph=False,
    from_date="2019-01-01",
    to_date="2021-12-31",
    dataset_name="small",
    target="ocupacion",
    interactions="drop"
)

meteo_dict = dict(
    rain="drop",
    wind="drop",
    temperature="drop",
    humidity="drop",
    pressure="drop",
    radiation="drop"
)

temporal_dict = dict(
    year="drop",
    season="drop",
    month="drop",
    day_of_month="drop",
    weekday="drop",
    hour="passthrough",
    minute="drop",
    bank_holiday="drop",
    school_holiday="drop",
    working_day="passthrough",
    state_of_alarm="drop"
)

#get_data(data_dict, meteo_dict, temporal_dict, train_until="2021-05-31")
dataset_name = data_dict["dataset_name"]
n_points = None

  from .autonotebook import tqdm as notebook_tqdm


total of 37 nodes


# 1 Mean regressor

In [9]:
maes_per_sensor = dict()
mses_per_sensor = dict()
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()
for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in data_dict["ids_list"]:
        df = merge_data(node_id, target=data_dict["target"])[["date", "ocupacion"]]
        train_y = df.loc[df.date <= train_until, "ocupacion"]
        test_y = df.loc[(df.date > train_until) & (df.date <= train_until+timedelta(days=30)), "ocupacion"]
        print("train size:", train_y.shape[0])
        print("test size:", test_y.shape[0])
        pred = np.repeat(np.mean(train_y), test_y.shape[0])
        maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y, pred)
        mses_per_sensor[node_id][train_until] = mean_squared_error(test_y, pred)

maes = pd.DataFrame(maes_per_sensor).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.sort_index()
maes = maes.round(4)
#maes = maes[["avg", "std"]].sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/mean_regressor/mean_maes.csv")


mses = pd.DataFrame(mses_per_sensor).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
#mses = mses[["avg", "std"]].sort_index()
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/mean_regressor/mean_mses.csv")

df = pd.merge(maes, mses, left_index=True, right_index=True).round(4)
df = df.sort_index()
df.to_csv(f"{project_path}/training_history/baseline/mean_regressor/mean_losses.csv")

df

2021-05-31 00:00:00
train size: 81325
test size: 2842
train size: 80988
test size: 2804
train size: 80164
test size: 2747
train size: 79584
test size: 2778
train size: 79384
test size: 2636
train size: 79237
test size: 2822
train size: 77568
test size: 2718
train size: 77036
test size: 2765
train size: 79056
test size: 2682
train size: 76846
test size: 2709
train size: 76440
test size: 2735
train size: 76492
test size: 2694
train size: 79599
test size: 2742
train size: 75968
test size: 2736
train size: 76755
test size: 2789
train size: 76078
test size: 2719
train size: 77704
test size: 2705
train size: 75557
test size: 2688
train size: 75309
test size: 2730
train size: 79203
test size: 2715
train size: 74193
test size: 2773
train size: 73968
test size: 2697
train size: 74743
test size: 2609
train size: 74674
test size: 2652
train size: 74078
test size: 2619
train size: 72971
test size: 2767
train size: 73257
test size: 2555
train size: 71778
test size: 2665
train size: 76284
test size:

Unnamed: 0,Jun_x,Jul_x,Aug_x,Sep_x,Oct_x,Nov_x,Dec_x,avg_x,std_x,Jun_y,Jul_y,Aug_y,Sep_y,Oct_y,Nov_y,Dec_y,avg_y,std_y
3481,7.0153,7.115,7.2112,7.0071,6.8886,6.8029,6.9053,6.9922,0.1298,66.74,65.43,62.19,70.99,71.35,66.31,70.82,67.69,3.21
3572,2.8969,2.5039,2.2545,2.8987,3.1151,3.0603,3.0844,2.8305,0.3037,19.57,12.9,8.02,17.03,24.18,20.69,21.95,17.76,5.2
3576,4.233,3.5844,3.1496,5.5771,6.5974,43.4029,4.477,10.1459,13.6203,34.6,21.21,14.01,61.91,124.81,2487.4,34.14,396.87,854.15
3628,2.954,2.8268,2.5342,3.4529,3.637,3.5461,3.6138,3.2235,0.4115,16.85,15.95,9.86,26.93,30.75,25.4,26.99,21.82,7.05
3684,2.2751,1.943,5.146,4.6263,2.4922,2.4255,2.5497,3.0654,1.1744,10.68,6.3,133.56,86.36,10.74,10.3,15.88,39.12,46.62
3685,4.5613,3.821,3.1543,4.4104,4.4399,4.4424,4.4989,4.1897,0.4806,28.38,20.26,14.78,25.65,26.58,27.78,27.61,24.43,4.68
3686,5.3219,4.1548,3.1344,5.4994,5.9766,5.4983,5.3114,4.9853,0.9138,48.11,29.04,14.34,53.36,61.13,52.62,48.51,43.87,15.1
5044,2.1029,1.8384,1.4601,1.9668,2.0751,2.2658,1.9341,1.949,0.2366,6.37,8.75,3.43,5.3,8.31,9.82,5.54,6.79,2.09
5045,3.7217,3.5979,3.9927,3.9802,4.2209,4.1645,4.1369,3.9735,0.2169,21.49,19.91,22.05,22.66,30.13,26.36,24.84,23.92,3.22
5046,3.7113,3.9815,4.738,3.7219,4.2082,3.6514,3.8108,3.9747,0.3589,20.41,21.23,26.55,22.25,34.03,20.19,22.97,23.95,4.57


# 2. Median regressor

In [10]:
maes_per_sensor = dict()
mses_per_sensor = dict()
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()
for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in data_dict["ids_list"]:
        df = merge_data(node_id, target=data_dict["target"])[["date", "ocupacion"]]
        train_y = df.loc[df.date <= train_until, "ocupacion"]
        test_y = df.loc[(df.date > train_until) & (df.date <= train_until+timedelta(days=30)), "ocupacion"]
        print("train size:", train_y.shape[0])
        print("test size:", test_y.shape[0])
        pred = np.repeat(np.median(train_y), test_y.shape[0])
        maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y, pred)
        mses_per_sensor[node_id][train_until] = mean_squared_error(test_y, pred)

maes = pd.DataFrame(maes_per_sensor).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.sort_index()
maes = maes.round(4)
#maes = maes[["avg", "std"]].sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/median_regressor/median_maes.csv")


mses = pd.DataFrame(mses_per_sensor).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
#mses = mses[["avg", "std"]].sort_index()
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/median_regressor/median_mses.csv")

df = pd.merge(maes, mses, left_index=True, right_index=True).round(4)
df = df.sort_index()
df.to_csv(f"{project_path}/training_history/baseline/median_regressor/median_losses.csv")

df

2021-05-31 00:00:00
train size: 81325
test size: 2842
train size: 80988
test size: 2804
train size: 80164
test size: 2747
train size: 79584
test size: 2778
train size: 79384
test size: 2636
train size: 79237
test size: 2822
train size: 77568
test size: 2718
train size: 77036
test size: 2765
train size: 79056
test size: 2682
train size: 76846
test size: 2709
train size: 76440
test size: 2735
train size: 76492
test size: 2694
train size: 79599
test size: 2742
train size: 75968
test size: 2736
train size: 76755
test size: 2789
train size: 76078
test size: 2719
train size: 77704
test size: 2705
train size: 75557
test size: 2688
train size: 75309
test size: 2730
train size: 79203
test size: 2715
train size: 74193
test size: 2773
train size: 73968
test size: 2697
train size: 74743
test size: 2609
train size: 74674
test size: 2652
train size: 74078
test size: 2619
train size: 72971
test size: 2767
train size: 73257
test size: 2555
train size: 71778
test size: 2665
train size: 76284
test size:

Unnamed: 0,Jun_x,Jul_x,Aug_x,Sep_x,Oct_x,Nov_x,Dec_x,avg_x,std_x,Jun_y,Jul_y,Aug_y,Sep_y,Oct_y,Nov_y,Dec_y,avg_y,std_y
3481,5.0379,4.6807,3.8988,5.237,5.5039,5.1984,5.3542,4.9873,0.5056,65.06,58.41,41.33,71.91,76.98,68.24,73.57,65.07,11.21
3572,2.8424,2.2782,1.738,2.8521,3.0847,2.9452,2.9799,2.6744,0.4518,21.3,13.04,6.26,18.57,26.31,22.31,23.6,18.77,6.42
3576,4.2519,3.3369,2.4722,5.5614,6.6042,44.5,4.2923,10.1455,14.0815,37.94,21.15,10.03,68.03,134.12,2606.12,36.13,416.22,894.84
3628,2.9683,2.6085,1.847,3.417,3.6968,3.5483,3.5458,3.0902,0.6193,19.39,17.09,7.59,30.34,34.98,29.04,30.42,24.12,8.99
3684,2.3024,1.9274,5.0646,4.7083,2.5483,2.4913,2.5683,3.0872,1.1597,11.24,6.58,135.85,89.24,11.85,11.42,17.01,40.45,47.35
3685,4.6742,3.8224,2.9223,4.4963,4.5347,4.5138,4.5515,4.2165,0.588,30.29,20.42,12.6,26.99,28.12,29.11,28.96,25.21,5.96
3686,5.6747,4.2475,2.5228,5.8163,6.3459,5.7614,5.4452,5.1163,1.2141,59.45,35.03,12.2,65.26,75.64,64.72,59.21,53.07,20.24
5044,2.1131,1.8317,1.3702,1.9734,2.0784,2.2734,1.9372,1.9396,0.2667,6.49,8.82,3.16,5.38,8.43,9.99,5.65,6.85,2.19
5045,3.5861,3.2983,3.4016,3.8994,4.1616,4.0863,4.036,3.7813,0.3234,19.06,16.45,17.16,21.32,29.37,25.3,23.69,21.76,4.33
5046,2.8835,2.5934,3.0352,3.045,3.7288,3.0132,3.1317,3.0616,0.3172,14.2,12.05,14.36,18.28,33.31,16.09,19.24,18.22,6.57


# 3. Repeat regressor

In [13]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for seq_len in [2, 4, 8, 16]:
    print("\nSequence length:", seq_len)
    data_dict["seq_len"] = seq_len
    for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
        print(train_until)
        for node_id in ids_to_use:
            print(node_id, end="\r")
            data_dict["ids_list"] = [node_id]
            get_data(data_dict, meteo_dict, temporal_dict, train_until)
            #train_data = npzDataset(dataset_name, "train", n_points)
            test_data = npzDataset(dataset_name, "test", n_points)

            #train_x = train_data.x
            #train_y = train_data.y

            test_x = test_data.x
            test_y = test_data.y
            #print(f"{train_until}, test_shape: {test_x.shape}")
            reg = RepeatRegressor()
            reg.fit(None, None)
            pred = reg.predict(test_x)
            if test_x.shape[0] == 0:
                maes_per_sensor[node_id][train_until] = np.nan
                mses_per_sensor[node_id][train_until] = np.nan
            else:
                maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
                mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

        with open(f"{project_path}/training_history/baseline/repeat/repeat-maes_{seq_len}.pkl", "wb") as f:
            pickle.dump(maes_per_sensor, f)
        with open(f"{project_path}/training_history/baseline/repeat/repeat-mses_{seq_len}.pkl", "wb") as f:
            pickle.dump(mses_per_sensor, f)


Sequence length: 2
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 4
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 8
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 16
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [17]:
for seq_len in [2, 4, 8, 16]:
    with open(f"{project_path}/training_history/baseline/repeat/repeat-maes_{seq_len}.pkl", "rb") as f:
        maes = pickle.load(f)
    with open(f"{project_path}/training_history/baseline/repeat/repeat-mses_{seq_len}.pkl", "rb") as f:
        mses = pickle.load(f)

    maes = pd.DataFrame(maes).T
    maes.columns = months
    maes["avg"] = maes.mean(axis=1)
    maes["std"] = maes.std(axis=1)
    maes = maes.round(4)
    maes = maes.sort_index()
    maes.to_csv(f"{project_path}/training_history/baseline/repeat/repeat-maes_{seq_len}.csv")

    mses = pd.DataFrame(mses).T
    mses.columns = months
    mses["avg"] = mses.mean(axis=1)
    mses["std"] = mses.std(axis=1)
    mses = mses.round(2)
    mses = mses.sort_index()
    mses.to_csv(f"{project_path}/training_history/baseline/repeat/repeat-mses_{seq_len}.csv")

In [None]:
import pandas as pd
for seq_len in [2, 4, 8, 16]:
    df = pd.read_csv(f"{project_path}/training_history/baseline/repeat/repeat-mses_{seq_len}.csv", index_col=0)
    df = df.round(2)
    df.to_csv(f"{project_path}/training_history/baseline/repeat/repeat-mses_{seq_len}.csv")

# 4. Repeat last regressor

In [20]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for seq_len in [2, 4, 8, 16]:
    print("\nSequence length:", seq_len)
    data_dict["seq_len"] = seq_len
    for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
        print(train_until)
        for node_id in ids_to_use:
            print(node_id, end="\r")
            data_dict["ids_list"] = [node_id]
            get_data(data_dict, meteo_dict, temporal_dict, train_until)
            #train_data = npzDataset(dataset_name, "train", n_points)
            test_data = npzDataset(dataset_name, "test", n_points)

            #train_x = train_data.x
            #train_y = train_data.y

            test_x = test_data.x
            test_y = test_data.y
            #print(f"{train_until}, test_shape: {test_x.shape}")
            reg = RepeatLastRegressor()
            reg.fit(None, None)
            pred = reg.predict(test_x)
            if test_x.shape[0] == 0:
                maes_per_sensor[node_id][train_until] = np.nan
                mses_per_sensor[node_id][train_until] = np.nan
            else:
                maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
                mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

        with open(f"{project_path}/training_history/baseline/repeat_last/repeat-last-maes_{seq_len}.pkl", "wb") as f:
            pickle.dump(maes_per_sensor, f)
        with open(f"{project_path}/training_history/baseline/repeat_last/repeat-last-mses_{seq_len}.pkl", "wb") as f:
            pickle.dump(mses_per_sensor, f)


Sequence length: 2
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 4
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 8
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 16
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [23]:
for seq_len in [2, 4, 8, 16]:
    with open(f"{project_path}/training_history/baseline/repeat_last/repeat-last-maes_{seq_len}.pkl", "rb") as f:
        maes = pickle.load(f)
    with open(f"{project_path}/training_history/baseline/repeat_last/repeat-last-mses_{seq_len}.pkl", "rb") as f:
        mses = pickle.load(f)

    maes = pd.DataFrame(maes).T
    maes.columns = months
    maes["avg"] = maes.mean(axis=1)
    maes["std"] = maes.std(axis=1)
    maes = maes.round(4)
    maes = maes.sort_index()
    maes.to_csv(f"{project_path}/training_history/baseline/repeat_last/repeat-last-maes_{seq_len}.csv")

    mses = pd.DataFrame(mses).T
    mses.columns = months
    mses["avg"] = mses.mean(axis=1)
    mses["std"] = mses.std(axis=1)
    mses = mses.round(2)
    mses = mses.sort_index()
    mses.to_csv(f"{project_path}/training_history/baseline/repeat_last/repeat-last-mses_{seq_len}.csv")

# 5. Mean per hour without working day

In [32]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in ids_to_use:
        print(node_id, end="\r")
        data_dict["ids_list"] = [node_id]
        get_data(data_dict, meteo_dict, temporal_dict, train_until)
        train_data = npzDataset(dataset_name, "train", n_points)
        test_data = npzDataset(dataset_name, "test", n_points)

        train_x = train_data.x
        train_y = train_data.y

        test_x = test_data.x
        test_y = test_data.y
        #print(f"{train_until}, test_shape: {test_x.shape}")
        reg = DaytimeRegressor()
        reg.fit(train_x, train_y)
        pred = reg.predict(test_x)
        if test_x.shape[0] == 0:
            maes_per_sensor[node_id][train_until] = np.nan
            mses_per_sensor[node_id][train_until] = np.nan
        else:
            maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
            mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

    with open(f"{project_path}/training_history/baseline/daytime_mean/daytime-maes.pkl", "wb") as f:
        pickle.dump(maes_per_sensor, f)
    with open(f"{project_path}/training_history/baseline/daytime_mean/daytime-mses.pkl", "wb") as f:
        pickle.dump(mses_per_sensor, f)

2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [33]:
with open(f"{project_path}/training_history/baseline/daytime_mean/daytime-maes.pkl", "rb") as f:
    maes = pickle.load(f)
with open(f"{project_path}/training_history/baseline/daytime_mean/daytime-mses.pkl", "rb") as f:
    mses = pickle.load(f)

maes = pd.DataFrame(maes).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.round(4)
maes = maes.sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/daytime_mean/daytime-maes.csv")

mses = pd.DataFrame(mses).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/daytime_mean/daytime-mses.csv")

# 6. Mean per hour with working day

In [34]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in ids_to_use:
        print(node_id, end="\r")
        data_dict["ids_list"] = [node_id]
        get_data(data_dict, meteo_dict, temporal_dict, train_until)
        train_data = npzDataset(dataset_name, "train", n_points)
        test_data = npzDataset(dataset_name, "test", n_points)

        train_x = train_data.x
        train_y = train_data.y

        test_x = test_data.x
        test_y = test_data.y
        #print(f"{train_until}, test_shape: {test_x.shape}")
        reg = DaytimeRegressor(agg="mean", by_working_day=True)
        reg.fit(train_x, train_y)
        pred = reg.predict(test_x)
        if test_x.shape[0] == 0:
            maes_per_sensor[node_id][train_until] = np.nan
            mses_per_sensor[node_id][train_until] = np.nan
        else:
            maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
            mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

    with open(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-maes.pkl", "wb") as f:
        pickle.dump(maes_per_sensor, f)
    with open(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-mses.pkl", "wb") as f:
        pickle.dump(mses_per_sensor, f)

2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [35]:
with open(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-maes.pkl", "rb") as f:
    maes = pickle.load(f)
with open(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-mses.pkl", "rb") as f:
    mses = pickle.load(f)

maes = pd.DataFrame(maes).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.round(4)
maes = maes.sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-maes.csv")

mses = pd.DataFrame(mses).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/daytime_mean_with_workingday/daytime-mses.csv")

# 7. Median per hour without working day

In [36]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in ids_to_use:
        print(node_id, end="\r")
        data_dict["ids_list"] = [node_id]
        get_data(data_dict, meteo_dict, temporal_dict, train_until)
        train_data = npzDataset(dataset_name, "train", n_points)
        test_data = npzDataset(dataset_name, "test", n_points)

        train_x = train_data.x
        train_y = train_data.y

        test_x = test_data.x
        test_y = test_data.y
        #print(f"{train_until}, test_shape: {test_x.shape}")
        reg = DaytimeRegressor(agg="median")
        reg.fit(train_x, train_y)
        pred = reg.predict(test_x)
        if test_x.shape[0] == 0:
            maes_per_sensor[node_id][train_until] = np.nan
            mses_per_sensor[node_id][train_until] = np.nan
        else:
            maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
            mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

    with open(f"{project_path}/training_history/baseline/daytime_median/daytime-maes.pkl", "wb") as f:
        pickle.dump(maes_per_sensor, f)
    with open(f"{project_path}/training_history/baseline/daytime_median/daytime-mses.pkl", "wb") as f:
        pickle.dump(mses_per_sensor, f)

2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [37]:
with open(f"{project_path}/training_history/baseline/daytime_median/daytime-maes.pkl", "rb") as f:
    maes = pickle.load(f)
with open(f"{project_path}/training_history/baseline/daytime_median/daytime-mses.pkl", "rb") as f:
    mses = pickle.load(f)

maes = pd.DataFrame(maes).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.round(4)
maes = maes.sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/daytime_median/daytime-maes.csv")

mses = pd.DataFrame(mses).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/daytime_median/daytime-mses.csv")

# 8. Median per hour with working day

In [38]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()

for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
    print(train_until)
    for node_id in ids_to_use:
        print(node_id, end="\r")
        data_dict["ids_list"] = [node_id]
        get_data(data_dict, meteo_dict, temporal_dict, train_until)
        train_data = npzDataset(dataset_name, "train", n_points)
        test_data = npzDataset(dataset_name, "test", n_points)

        train_x = train_data.x
        train_y = train_data.y

        test_x = test_data.x
        test_y = test_data.y
        #print(f"{train_until}, test_shape: {test_x.shape}")
        reg = DaytimeRegressor(agg="median", by_working_day=True)
        reg.fit(train_x, train_y)
        pred = reg.predict(test_x)
        if test_x.shape[0] == 0:
            maes_per_sensor[node_id][train_until] = np.nan
            mses_per_sensor[node_id][train_until] = np.nan
        else:
            maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
            mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

    with open(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-maes.pkl", "wb") as f:
        pickle.dump(maes_per_sensor, f)
    with open(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-mses.pkl", "wb") as f:
        pickle.dump(mses_per_sensor, f)

2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109

In [39]:
with open(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-maes.pkl", "rb") as f:
    maes = pickle.load(f)
with open(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-mses.pkl", "rb") as f:
    mses = pickle.load(f)

maes = pd.DataFrame(maes).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.round(4)
maes = maes.sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-maes.csv")

mses = pd.DataFrame(mses).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/daytime_median_with_workingday/daytime-mses.csv")

# 9. Drift regressor

In [None]:
maes_per_sensor = dict()
mses_per_sensor = dict()
data_dict["ids_list"] = ids_to_use
for node_id in data_dict["ids_list"]:
    maes_per_sensor[node_id] = dict()
    mses_per_sensor[node_id] = dict()
for seq_len in [2, 4, 8, 16]:
    print("\nSequence length:", seq_len)
    data_dict["seq_len"] = seq_len
    for train_until in pd.date_range("2021-05-27", "2021-11-30", freq="1M"):
        print(train_until)
        for node_id in ids_to_use:
            print(node_id, end="\r")
            data_dict["ids_list"] = [node_id]
            get_data(data_dict, meteo_dict, temporal_dict, train_until)
            train_data = npzDataset(dataset_name, "train", n_points)
            test_data = npzDataset(dataset_name, "test", n_points)

            train_x = train_data.x
            train_y = train_data.y

            test_x = test_data.x
            test_y = test_data.y
            #print(f"{train_until}, test_shape: {test_x.shape}")
            reg = DriftRegressor()
            reg.fit(train_x, train_y)
            pred = reg.predict(test_x)
            if test_x.shape[0] == 0:
                maes_per_sensor[node_id][train_until] = np.nan
                mses_per_sensor[node_id][train_until] = np.nan
            else:
                maes_per_sensor[node_id][train_until] = mean_absolute_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])
                mses_per_sensor[node_id][train_until] = mean_squared_error(test_y[:, :, 0, 0], pred[:, :, 0, 0])

        with open(f"{project_path}/training_history/baseline/drift/drift-maes_{seq_len}.pkl", "wb") as f:
            pickle.dump(maes_per_sensor, f)
        with open(f"{project_path}/training_history/baseline/drift/drift-mses_{seq_len}.pkl", "wb") as f:
            pickle.dump(mses_per_sensor, f)


Sequence length: 2
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 4
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 8
2021-05-31 00:00:00
2021-06-30 00:00:00
2021-07-31 00:00:00
2021-08-31 00:00:00
2021-09-30 00:00:00
2021-10-31 00:00:00
2021-11-30 00:00:00
5109
Sequence length: 16
2021-05-31 00:00:00
3685

In [None]:
with open(f"{project_path}/training_history/baseline/drift/drift-maes_{seq_len}.pkl", "rb") as f:
    maes = pickle.load(f)
with open(f"{project_path}/training_history/baseline/drift/drift-mses_{seq_len}.pkl", "rb") as f:
    mses = pickle.load(f)

maes = pd.DataFrame(maes).T
maes.columns = months
maes["avg"] = maes.mean(axis=1)
maes["std"] = maes.std(axis=1)
maes = maes.round(4)
maes = maes.sort_index()
maes.to_csv(f"{project_path}/training_history/baseline/drift/drift-maes_{seq_len}.csv")

mses = pd.DataFrame(mses).T
mses.columns = months
mses["avg"] = mses.mean(axis=1)
mses["std"] = mses.std(axis=1)
mses = mses.round(2)
mses = mses.sort_index()
mses.to_csv(f"{project_path}/training_history/baseline/drift/drift-mses_{seq_len}.csv")