In [5]:
import sys
sys.path.append("..")
from data_prep.prep import MeterDataSet
import pandas as pd
import numpy as np
from webapp.utils.azure_utils import KeyVault, DataLake
from copy import deepcopy

In [6]:
# Open a handle on the Azure storage account: the access key is read from
# Key Vault instead of being written into the notebook.
vault = KeyVault(keyVaultName="keyvaultdva2022")
storage_credential = vault.get_secret(secretName="storagePrimaryKey")
storage = DataLake(
    account_name="storageaccountdva",
    credential=storage_credential,
)
# File system (container) name passed to every read/upload call below.
file_system = "energyhub"

In [7]:
# Meter type this notebook prepares data for.
meter = "electricity"

# Columns retained from the building metadata table.
metadata_cols = ["building_id", "site_id", "sq_meter"]

# Columns retained from the weather table (join keys first, then measurements).
weather_cols = [
    "site_id",
    "timestamp",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precipitation_depth_1_hr",
    "precipitation_depth_6_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [8]:
# Read the raw tables from the data lake.
metadata = storage.pandas_read(
    file_system,
    directory="data_parq/metadata",
    file_name="metadata.parq",
)
weather = storage.pandas_read(
    file_system,
    directory="data_parq/weather",
    file_name="weather.parq",
)
electricity = storage.pandas_read(
    file_system,
    directory="data_parq/meters",
    file_name="electricity.parq",
)
# Only the ids of the buildings to exclude are kept, as a plain Python list.
bad_buildings = (
    storage.pandas_read(
        file_system,
        directory="bad_buildings",
        file_name="bad_buildings.csv",
    )["building_id"]
    .to_list()
)

In [9]:
# Add cos, sin time features:
def time_features(dff):
    """Return a copy of ``dff`` with calendar and cyclical time features.

    The rows are sorted by the ``timestamp`` column, numbered 0..n-1 in that
    order (``time_idx``), and the result is indexed by ``timestamp``.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame with a datetime-like ``timestamp`` column (the ``.dt`` accessor
        is used on it). The caller's frame is left untouched; any existing
        index is discarded.

    Returns
    -------
    pandas.DataFrame
        Copy of the input indexed by ``timestamp`` with the added columns
        ``hour``, ``weekday``, ``month``, ``year``, ``time_idx``,
        ``time_norm``, ``cos_time``, ``sin_time``, ``month_cos``,
        ``month_sin``, ``weekday_cos`` and ``weekday_sin``.
    """
    dff = dff.copy()

    dff["hour"] = dff.timestamp.dt.hour
    dff["weekday"] = dff.timestamp.dt.weekday
    dff["month"] = dff.timestamp.dt.month
    dff["year"] = dff.timestamp.dt.year

    # Renumber AFTER sorting so time_idx is the chronological row position.
    # Without the reset, the index still carries the pre-sort labels and
    # time_idx is only correct when the input already arrives sorted with a
    # 0..n-1 index.
    dff = dff.sort_values(by="timestamp").reset_index(drop=True)
    dff["time_idx"] = dff.index

    # One full cycle over the whole span of the frame. A single-row (or
    # empty) frame has no span; use phase 0 instead of dividing by zero.
    span = dff["time_idx"].max()
    if span > 0:
        dff["time_norm"] = 2 * np.pi * dff["time_idx"] / span
    else:
        dff["time_norm"] = 0.0
    dff["cos_time"] = np.cos(dff["time_norm"])
    dff["sin_time"] = np.sin(dff["time_norm"])

    # Periods are expressed in rows: 30.4 * 24 and 7 * 24.
    # NOTE(review): this matches a month / a week only if rows are hourly
    # with no gaps -- confirm against the data.
    dff["month_cos"] = np.cos(2 * np.pi * dff["time_idx"] / (30.4 * 24))
    dff["month_sin"] = np.sin(2 * np.pi * dff["time_idx"] / (30.4 * 24))

    dff["weekday_cos"] = np.cos(2 * np.pi * dff["time_idx"] / (7 * 24))
    dff["weekday_sin"] = np.sin(2 * np.pi * dff["time_idx"] / (7 * 24))

    return dff.set_index("timestamp")

In [10]:
# Trim both lookup tables down to the configured column lists.
metadata = metadata.loc[:, metadata_cols]
weather = weather.loc[:, weather_cols]

In [11]:
# Attach the building metadata columns (including site_id) to every meter
# reading; the left join keeps readings whose building has no metadata row.
electricity = electricity.merge(metadata, how="left", on="building_id")

In [12]:
# Keep only the buildings that are not listed in bad_buildings, then drop
# every reading that belongs to an excluded building.
buildings = [
    building_id
    for building_id in electricity.building_id.unique()
    if not (building_id in bad_buildings)
]
electricity = electricity.loc[electricity.building_id.isin(buildings)]

In [13]:
# Distinct site ids remaining after the bad-building filter.
# NOTE(review): `.to_list()` on the result of `.unique()` presumably relies on
# site_id being a pandas extension dtype (e.g. categorical) -- confirm.
sites = electricity["site_id"].unique().to_list()

In [14]:
# Display the site ids found in the filtered electricity data.
sites

['Panther',
 'Robin',
 'Fox',
 'Rat',
 'Bear',
 'Lamb',
 'Peacock',
 'Moose',
 'Gator',
 'Bull',
 'Bobcat',
 'Crow',
 'Wolf',
 'Hog',
 'Eagle',
 'Cockatoo',
 'Mouse']

In [15]:
# Build one cleaned, feature-enriched frame per building of the selected site
# and collect them in `dfb`.
dfb = []
site = "Panther"

e = electricity.loc[electricity.site_id == site, :]
w = weather.loc[weather.site_id == site, :]
# Take the building ids from the site-filtered frame. Using the unfiltered
# `electricity` frame here sent every other site's building through the loop
# as an empty frame, which was then appended to `dfb`.
b = e.building_id.unique()

for building in b:
    df = e.loc[e.building_id == building, :]
    # `w` is already restricted to this site, so joining on timestamp alone is
    # enough; the duplicated site_id columns are collapsed just below.
    df = df.merge(w, on="timestamp", how="left")
    df = df.sort_values("timestamp").set_index("timestamp")
    df["cloud_coverage"] = df["cloud_coverage"].astype("category")
    df = df.rename(columns={"site_id_x": "site_id"})
    df = df.drop(columns=["site_id_y"])
    for col in df.columns:
        if col == "cloud_coverage":
            # Categorical: take the next known value instead of interpolating.
            df.loc[:, col] = df.loc[:, col].bfill()
        elif col in ["site_id", "building_id", "timestamp", "sq_meter"]:
            # Identifiers / static metadata are left untouched.
            pass
        else:
            # Blank everything outside the open interval between the 1st and
            # 99th percentile, then bridge gaps of up to 6 rows with a cubic
            # interpolation.
            # NOTE(review): the comparisons are strict, so values exactly
            # equal to either percentile are blanked as well, and this branch
            # also applies to the `electricity` target column -- confirm that
            # both are intended.
            q_low = df[col].quantile(0.01)
            q_hi = df[col].quantile(0.99)
            df.loc[~((df[col] > q_low) & (df[col] < q_hi)), col] = np.nan
            df.loc[:, col] = df.loc[:, col].interpolate(method="cubic", limit=6)
    df = time_features(df.reset_index())
    dfb.append(df)

In [17]:
# Stack the per-building frames row-wise into a single table.
result = pd.concat(dfb, axis=0)

In [18]:
# Percentage of missing values per column.
result.isna().sum().mul(100).div(len(result))

building_id                  0.000000
electricity                 19.483053
site_id                      0.000000
sq_meter                     0.000000
air_temperature              0.381897
cloud_coverage               0.017100
dew_temperature              0.894893
precipitation_depth_1_hr     0.057000
precipitation_depth_6_hr    82.090743
sea_level_pressure           1.681487
wind_direction               0.661195
wind_speed                   0.626995
hour                         0.000000
weekday                      0.000000
month                        0.000000
year                         0.000000
time_idx                     0.000000
time_norm                    0.000000
cos_time                     0.000000
sin_time                     0.000000
month_cos                    0.000000
month_sin                    0.000000
weekday_cos                  0.000000
weekday_sin                  0.000000
dtype: float64

In [19]:
# Every column except precipitation_depth_6_hr, which is ~82% missing in the
# summary above and would otherwise wipe out most rows in a row-wise dropna.
cols = list(result.columns)
cols.remove("precipitation_depth_6_hr")

In [20]:
# Keep the selected columns and discard any row that still has a missing value.
result = result.loc[:, cols].dropna(axis="index", how="any")

In [21]:
# Order rows by building, then chronologically within each building.
result = result.sort_values(by=["building_id", "timestamp"])

In [22]:
# Re-check the percentage of missing values per column after the dropna.
result.isna().sum().mul(100).div(len(result))

building_id                 0.0
electricity                 0.0
site_id                     0.0
sq_meter                    0.0
air_temperature             0.0
cloud_coverage              0.0
dew_temperature             0.0
precipitation_depth_1_hr    0.0
sea_level_pressure          0.0
wind_direction              0.0
wind_speed                  0.0
hour                        0.0
weekday                     0.0
month                       0.0
year                        0.0
time_idx                    0.0
time_norm                   0.0
cos_time                    0.0
sin_time                    0.0
month_cos                   0.0
month_sin                   0.0
weekday_cos                 0.0
weekday_sin                 0.0
dtype: float64

In [23]:
# Building ids that still have rows after cleaning.
buildings = result["building_id"].unique()

In [24]:
# Store the id-like / discrete columns as pandas categoricals.
for categorical_col in ("building_id", "cloud_coverage"):
    result[categorical_col] = result[categorical_col].astype("category")

In [25]:
# Pull out the first building's rows in time order, with timestamp moved back
# from the index into a regular column.
building = buildings[0]
df = (
    result.loc[result.building_id == building, :]
    .sort_values(["timestamp"])
    .reset_index(drop=False)
)

In [47]:
# Chronological hold-out per building: the earliest 80% of each building's
# rows go to training, the remaining rows to validation.
# NOTE(review): days_test, test_lengths and dfs_test are not used in this
# cell -- presumably left over from a planned test split; confirm.
days_test = 7
test_lengths = days_test * 24

dfs_test = []
dfs_train = []
dfs_val = []

for building in buildings:
    building_rows = result.loc[result.building_id == building, :]
    df = building_rows.sort_values(["timestamp"]).reset_index(drop=False)

    # Row position where the validation part starts.
    val_cutoff = int(0.80 * len(df))
    df_train = df.iloc[:val_cutoff]
    df_val = df.iloc[val_cutoff:]

    dfs_train.append(df_train)
    dfs_val.append(df_val)

train = pd.concat(dfs_train, ignore_index=True)
val = pd.concat(dfs_val, ignore_index=True)

In [48]:
# Persist the train/validation splits as gzip-compressed parquet and upload
# them to the site's folder in the data lake. Only these two splits are
# written; no test split is produced in this notebook.
directory = f"forecasting/data/{site}"

for file_name, split in (("train.parq", train), ("val.parq", val)):
    # The file is first written to the current working directory and then
    # uploaded from that local path under the same name.
    split.to_parquet(path=file_name, engine="pyarrow", compression="gzip", index=False)
    storage.upload(
        file_system,
        directory=directory,
        file_name=file_name,
        file_path=file_name,
        overwrite=True,
    )

train.parq write complete
val.parq write complete
