In [1]:
import sys
sys.path.append("..")
from data_prep.prep import MeterDataSet
import pandas as pd
import numpy as np
from webapp.utils.azure_utils import KeyVault, DataLake
from copy import deepcopy

c:\Users\nmert\OneDrive\Data Science\OMSA\HW & Projects\omsa_dva_project\.venv\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\nmert\OneDrive\Data Science\OMSA\HW & Projects\omsa_dva_project\.venv\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
# Connect to Storage Account
vault = KeyVault(keyVaultName = "keyvaultdva2022")
storage_credential = vault.get_secret(secretName = "storagePrimaryKey")
storage = DataLake(account_name = "storageaccountdva", credential = storage_credential)
file_system = "energyhub"

In [3]:
meter = "electricity"
metadata_cols = ['building_id', 'site_id','sq_meter']
weather_cols = ['site_id', 'timestamp', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precipitation_depth_1_hr',
    'precipitation_depth_6_hr', 'sea_level_pressure', 'wind_direction',
    'wind_speed']

In [4]:
metadata = storage.pandas_read(file_system, directory="data_parq/metadata", file_name="metadata.parq")
weather = storage.pandas_read(file_system, directory="data_parq/weather", file_name="weather.parq")
electricity = storage.pandas_read(file_system, directory="data_parq/meters", file_name="electricity.parq")
bad_buildings = storage.pandas_read(file_system, directory="bad_buildings", file_name="bad_buildings.csv")["building_id"].to_list()

In [5]:
# Add cos, sin time features:
def time_features(dff):
    dff = deepcopy(dff)

    dff["hour"] = dff.timestamp.dt.hour
    dff["weekday"] = dff.timestamp.dt.weekday
    dff["month"] = dff.timestamp.dt.month
    dff["year"] = dff.timestamp.dt.year 
    
    dff = dff.sort_values(by = "timestamp")
    dff["time_idx"] = dff.index

    dff["time_norm"] = 2 * np.pi * dff["time_idx"] / dff["time_idx"].max()
    dff["cos_time"] = np.cos(dff["time_norm"])
    dff["sin_time"] = np.sin(dff["time_norm"])

    dff["month_cos"] = np.cos(2*np.pi*dff["time_idx"]/(30.4*24))
    dff["month_sin"] = np.sin(2*np.pi*dff["time_idx"]/(30.4*24))
        
    dff["weekday_cos"] = np.cos(2*np.pi*dff["time_idx"]/(7*24))
    dff["weekday_sin"] = np.sin(2*np.pi*dff["time_idx"]/(7*24))

    return dff.set_index("timestamp")

In [6]:
metadata = metadata[metadata_cols]
weather = weather[weather_cols]

In [7]:
# Add site_id to electricity:
electricity = pd.merge(electricity, metadata, on = "building_id", how = "left")

In [8]:
buildings = electricity.building_id.unique()
buildings = [building for building in buildings if building not in bad_buildings]
electricity = electricity[electricity.building_id.isin(buildings)]

In [9]:
sites = electricity.site_id.unique().to_list()

In [10]:
sites

['Panther',
 'Robin',
 'Fox',
 'Rat',
 'Bear',
 'Lamb',
 'Peacock',
 'Moose',
 'Gator',
 'Bull',
 'Bobcat',
 'Crow',
 'Wolf',
 'Hog',
 'Eagle',
 'Cockatoo',
 'Mouse']

In [11]:
for site in sites[1:2]:
    dfb = []

    e = electricity.loc[electricity.site_id == site, :]
    w = weather.loc[weather.site_id == site, :]
    b = electricity.building_id.unique()

    for building in b:
        df = e.loc[e.building_id == building, :]
        df = df.merge(w, left_on = ["timestamp"], right_on = ["timestamp"], how = "left")
        df = df.sort_values("timestamp").set_index("timestamp")
        df["cloud_coverage"] = df["cloud_coverage"].astype("category")
        df = df.rename(columns = {"site_id_x": "site_id"})
        df = df.drop(columns= ["site_id_y"])
        for col in df.columns:
            if col == "cloud_coverage":
                df.loc[:, col] = df.loc[:, col].fillna(method="bfill") # Back fill this categorical
            elif col in ["site_id", "building_id", "timestamp", "sq_meter"]:
                pass
            else:
                q_low = df[col].quantile(0.01)
                q_hi  = df[col].quantile(0.99)
                df.loc[~((df[col] > q_low) & (df[col] < q_hi)), col] = np.nan
                df.loc[:, col] = df.loc[:, col].interpolate(method = "cubic", limit = 6)
        df = time_features(df.reset_index())
        dfb.append(df)
    result = pd.concat(dfb)
    result.isna().sum()*100/len(result)
    cols = result.columns.to_list()
    cols.remove("precipitation_depth_6_hr")
    result = result[cols].dropna(how = "any")
    result = result.sort_values(["building_id", "timestamp"])
    result.isna().sum()*100/len(result)
    buildings = result.building_id.unique()
    result.building_id = result.building_id.astype("category")
    result.cloud_coverage = result.cloud_coverage.astype("category")

    days_test = 7
    test_lengths = 24 * days_test

    dfs_test = []
    dfs_train = []
    dfs_val = []

    for building in buildings:
        df = result.loc[result.building_id == building, :].sort_values(["timestamp"]).reset_index(drop = False)
        val_cutoff = int(df.shape[0] * 0.80)
        df_train = df[:val_cutoff]
        df_val = df[val_cutoff:]
        
        dfs_train.append(df_train)
        dfs_val.append(df_val)

    train = pd.concat(dfs_train).reset_index(drop = True)
    val = pd.concat(dfs_val).reset_index(drop = True)

    directory = f"forecasting/data/{site}"
    file_name = f"test.parq"

    directory = f"forecasting/data/{site}"
    file_name = f"train.parq"
    train.to_parquet(path = file_name, engine = "pyarrow", compression = "gzip", index = False)
    storage.upload(file_system, directory = directory, file_name = file_name, file_path = file_name, overwrite=True)

    directory = f"forecasting/data/{site}"
    file_name = f"val.parq"
    val.to_parquet(path = file_name, engine = "pyarrow", compression = "gzip", index = False)
    storage.upload(file_system, directory = directory, file_name = file_name, file_path = file_name, overwrite=True)

ValueError: No objects to concatenate