In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from darts import TimeSeries
from darts.metrics import mae, rmse
from darts.models import RNNModel
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.dataprocessing.transformers import Scaler
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Auxiliary functions
from tshelpers.plot import plot_compare
from tshelpers.metrics import mae_score, rmse_score

# Notebook-wide configuration
# Silence UserWarning messages so they do not clutter the cell outputs
warnings.filterwarnings(action="ignore", category=UserWarning)
# Plot styling shared by every figure in the notebook
sns.set_theme(palette="mako", style="ticks")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the short imputed master dataframe, parse DATE_PST as datetimes and use
# it as the index (built as one chain, so no in-place mutation between steps)
master_df = (
    pd.read_csv("data/2016_2021_master_df_short_imputed.csv")
    .assign(DATE_PST=lambda frame: pd.to_datetime(frame["DATE_PST"]))
    .set_index("DATE_PST")
)

# Quick sanity check: first rows and overall (rows, columns) shape
display(master_df.head())
print(master_df.shape)

Unnamed: 0_level_0,STATION,PM 2.5,MISSING,MISSING_SEQ,MISSING_SAMPLE
DATE_PST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01 01:00:00,Vancouver_Clark_Drive_PM25,30.557,0,0,0
2016-01-01 02:00:00,Vancouver_Clark_Drive_PM25,34.661,0,0,0
2016-01-01 03:00:00,Vancouver_Clark_Drive_PM25,35.419,0,0,0
2016-01-01 04:00:00,Vancouver_Clark_Drive_PM25,24.335,0,0,0
2016-01-01 05:00:00,Vancouver_Clark_Drive_PM25,29.336,0,0,0


(228056, 5)


In [3]:
# Redefining experimentation subsets: for every station and target period, keep
# the "PM 2.5" readings from the first day of the previous month up to (and
# including) the first timestamp of the following month, i.e. a two-month window.
target_periods = [(2, 2016), (11, 2017), (10, 2018), (8, 2020)]  # (month, year)
one_month = pd.DateOffset(months=1)

stations = master_df["STATION"].unique()
subsets = {}
for station in stations:
    # Filter the station rows once instead of once per target period
    station_df = master_df.loc[master_df["STATION"] == station, ["PM 2.5"]]
    subsets[station] = {}
    for month, year in target_periods:
        target_start = pd.Timestamp(year=year, month=month, day=1)
        # Date offsets (rather than month-1 / month+1) stay valid when the
        # target month is January or December
        subsets[station][f"{month}-{year}"] = station_df.loc[
            target_start - one_month:target_start + one_month
        ]

for station in subsets:
    print(station, subsets[station].keys())

# Converting every subset dataframe into a Darts TimeSeries
for station in subsets:
    for month in subsets[station]:
        subsets[station][month] = TimeSeries.from_dataframe(subsets[station][month])

Vancouver_Clark_Drive_PM25 dict_keys(['2-2016', '11-2017', '10-2018', '8-2020'])
Vancouver_International_Airport_#2_PM25 dict_keys(['2-2016', '11-2017', '10-2018', '8-2020'])
North_Vancouver_Mahon_Park_PM25 dict_keys(['2-2016', '11-2017', '10-2018', '8-2020'])
North_Vancouver_Second_Narrows_PM25 dict_keys(['2-2016', '11-2017', '10-2018', '8-2020'])


In [None]:
# Iterating through stations subsets: one LSTM is trained per (station, period)
# subset and its validation forecast and error metrics are stored.

# Results keyed by model name; created here so the cell runs on a fresh kernel
results_dict = {}

for station in subsets:
    for month in subsets[station]:
        series = subsets[station][month]
        # First 75% of the subset for training, remaining 25% for validation
        train, val = series.split_before(0.75)

        ## Creating weekday and hour covariates
        # Year attribute series; below it only provides the time index from
        # which the weekday and hour attributes are derived
        year_series = datetime_attribute_timeseries(
            pd.date_range(start=series.start_time(),
                          freq=series.freq_str,
                          periods=len(series)),
            attribute="year",
            one_hot=False,
        )

        # Weekday covariates (one-hot encoded)
        weekday_series = datetime_attribute_timeseries(
            year_series, attribute="weekday", one_hot=True
        )

        # Hour covariates
        hour_series = datetime_attribute_timeseries(
            year_series, attribute="hour", one_hot=False
        )
        # Scaling hour covariates
        hour_series = Scaler().fit_transform(hour_series)

        # The covariates span the whole subset (train + validation); the model
        # receives them in full, so no train/validation split of them is made
        covariates = weekday_series.stack(hour_series)

        # Unique name per station and period: it is both the results key and the
        # checkpoint folder name, so a shared name would make every iteration
        # overwrite the previous one
        model_name = f"LSTM_2-months_training_batch32_400_{station}_{month}"
        results_dict[model_name] = {}

        # Early stopping
        stopper = EarlyStopping(
            monitor="val_loss",
            patience=10,
            min_delta=0.02,
            mode="min"
        )

        # LSTM model
        # NOTE(review): training_length (20) is smaller than input_chunk_length
        # (21); Darts recommends training_length > input_chunk_length and newer
        # releases may reject this combination - confirm against the installed version
        my_model = RNNModel(
            model="LSTM",
            hidden_dim=200,
            dropout=0.1,
            batch_size=32,
            n_epochs=100,
            optimizer_kwargs={"lr": 5e-4},
            model_name=model_name,
            log_tensorboard=True,
            random_state=123,
            training_length=20,
            input_chunk_length=21,
            force_reset=True,
            save_checkpoints=True,
            pl_trainer_kwargs={
                "callbacks": [stopper],
                "accelerator": "gpu",
                "devices": -1,
                "auto_select_gpus": True
            }
        )

        # Training
        my_model.fit(
            train,
            future_covariates=covariates,
            val_series=val,
            val_future_covariates=covariates,
            verbose=True,
        )

        # Validation
        # load_from_checkpoint returns a new model instead of updating the
        # instance it is called on, so the best checkpoint must be captured
        # and used for the forecast
        best_model = RNNModel.load_from_checkpoint(model_name=model_name, best=True)
        # The training series is passed explicitly so the forecast starts right
        # after `train`, whatever state the reloaded model carries
        pred_val = best_model.predict(
            n=len(val), series=train, future_covariates=covariates
        )

        # Storing results
        results_dict[model_name]["covariates"] = covariates
        results_dict[model_name]["train"] = train
        results_dict[model_name]["val"] = val
        results_dict[model_name]["pred_val"] = pred_val
        # Darts metrics take (actual, predicted)
        results_dict[model_name]["MAE"] = mae(val, pred_val)
        results_dict[model_name]["RMSE"] = rmse(val, pred_val)