In [39]:
import sys
import os
sys.path.append("..")
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from webapp.utils.azure_utils import KeyVault, DataLake
import dask.dataframe as dd
import dask.array as da

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer, MultiNormalizer, TorchNormalizer, EncoderNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss, MultiLoss, MAE, RMSE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [40]:
# Connect to Storage Account
vault = KeyVault(keyVaultName = "keyvaultdva2022")
storage_credential = vault.get_secret(secretName = "storagePrimaryKey")
storage = DataLake(account_name = "storageaccountdva", credential = storage_credential)
file_system = "energyhub"

In [174]:
# Read data:
metadata = storage.read(file_system, directory = "data_parq/metadata", file_name = "metadata.parq", extension = "parq")
weather = storage.read(file_system, directory = "data_parq/weather", file_name = "weather.parq", extension = "parq")
electricity = storage.read(file_system, directory = "data_parq/meters", file_name = "electricity.parq", extension = "parq")

In [177]:
sites = metadata.site_id.unique()
site = sites[0]

In [175]:
weather.isna().sum()*100/len(weather)

timestamp                    0.000000
site_id                      0.000000
air_temperature              0.038651
cloud_coverage              51.631810
dew_temperature              0.099044
precipitation_depth_1_hr    40.217293
precipitation_depth_6_hr    94.515741
sea_level_pressure           6.529656
wind_direction               3.927034
wind_speed                   0.173327
dtype: float64

In [176]:
int_cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precipitation_depth_1_hr',
       'precipitation_depth_6_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed']
       
df_w_res = []
for site in sites:
    for col in int_cols:
        df_w = weather[weather["site_id"] == site].copy()
        df_w[col] = df_w[col].interpolate(method = "linear")
        df_w_res.append(df_w)
weather = pd.concat(df_w_res)

weather.isna().sum()*100/len(weather)

timestamp                    0.000000
site_id                      0.000000
air_temperature              0.033820
cloud_coverage              46.451704
dew_temperature              0.086663
precipitation_depth_1_hr    38.499838
precipitation_depth_6_hr    83.391335
sea_level_pressure           6.378485
wind_direction               3.436154
wind_speed                   0.151661
dtype: float64

In [124]:
# Add site_id to electricity:
e = pd.merge(electricity, metadata[['building_id', 'site_id']], on = "building_id", how = "left")

In [125]:
# Use one site only:

m = metadata[metadata["site_id"] == site]
m = metadata[['building_id', 'site_id', 'sq_meter']]
w = weather[weather["site_id"] == site]
e = e[e["site_id"] == site]
buildings = e.building_id.unique()

In [126]:
# Convert to Dask dataframe:
e = dd.from_pandas(e, npartitions=10)
m = dd.from_pandas(m, npartitions=10)
w = dd.from_pandas(w, npartitions=10)

In [137]:
# Merge datasets: 
df = dd.merge(e, m, on = ["building_id", "site_id"], how = "left")
df = dd.merge(df, w, on = ["site_id", "timestamp"], how = "left").head()

In [141]:
# Convert ts to datetime and set as index:
df["timestamp"] = dd.to_datetime(df["timestamp"])

In [144]:
# Convert site id and building id to category:
df["building_id"] = df["building_id"].astype("category")
df["site_id"] = df["site_id"].astype("category")