In [90]:
import sys
import os
from copy import deepcopy
sys.path.append("..")
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from webapp.utils.azure_utils import KeyVault, DataLake
import dask.dataframe as dd
import dask.array as da
import river
from data_prep.prep import MeterDataSet

In [91]:
# Connect to Storage Account
vault = KeyVault(keyVaultName = "keyvaultdva2022")
storage_credential = vault.get_secret(secretName = "storagePrimaryKey")
storage = DataLake(account_name = "storageaccountdva", credential = storage_credential)

In [92]:
file_system = "energyhub"
dest_dir = "/data_parq/norm_data"
meters_dir = "/data_parq/meters"
meta_dir = "/data_parq/metadata"
weather_dir = "/data_parq/weather"
bad_building_dir = "/bad_buildings"

In [93]:
meter = "electricity"
metadata_cols = ['building_id', 'site_id','sq_meter', 'primary_space_usage']
weather_cols = ['site_id', 'timestamp', 'air_temperature', 'dew_temperature','wind_direction', 'wind_speed']

## Initiate the Class

In [94]:
electricity = MeterDataSet(meter, metadata_cols, weather_cols)

## Process weather

In [95]:
w_cols = weather_cols[2:]

In [96]:
weather = electricity.fill_weather_na(w_cols, "linear")

In [97]:
len(weather)

331166

In [98]:
weather.columns

Index(['timestamp', 'site_id', 'air_temperature', 'dew_temperature',
       'wind_direction', 'wind_speed'],
      dtype='object')

In [99]:
# weather_df = weather.compute()

In [100]:
# weather_df.isna().sum()*100/len(weather_df)

## Process Meter

In [101]:
meter = electricity.meter

In [102]:
meter.compute().isna().sum()*100/len(meter)

timestamp      0.0
building_id    0.0
electricity    0.0
dtype: float64

## Merge and filter out bad buildings

In [103]:
electricity.merge()

In [104]:
bad_buildings = storage.pandas_read(
            file_system, directory=bad_building_dir, file_name="bad_buildings.csv")

In [105]:
bad_buildings = bad_buildings.building_id.to_list()

In [106]:
# len(bad_buildings)
#154

154

In [107]:
all_buildings = electricity.df.building_id.unique()

In [108]:
# len(all_buildings)
#1573

1573

In [109]:
good_buildings = [b for b in all_buildings if b not in bad_buildings]


1425

In [None]:
# len(good_buildings)
#1425

In [110]:
electricity.df = electricity.filter_buildings(good_buildings)

In [111]:
df = electricity.df

In [112]:
df.info

<bound method DataFrame.info of Dask DataFrame Structure:
                     timestamp        building_id electricity          site_id sq_meter primary_space_usage air_temperature dew_temperature wind_direction wind_speed
npartitions=19                                                                                                                                                       
                datetime64[ns]  category[unknown]     float64  category[known]  float64   category[unknown]         float64         float64        float64    float64
                           ...                ...         ...              ...      ...                 ...             ...             ...            ...        ...
...                        ...                ...         ...              ...      ...                 ...             ...             ...            ...        ...
                           ...                ...         ...              ...      ...                 ...     

In [45]:
# len(df)

25149355

In [20]:
# len(df.building_id.unique())
# 1573 unique buildings before filtering bad buildings

## Add features

In [114]:
df['usage_lag1'] = df['electricity'].shift(1)
df['usage_lag2'] = df['electricity'].shift(2)
df['usage_lag3'] = df['electricity'].shift(3)

In [115]:
df["hour"] = df.timestamp.dt.hour
df["weekday"] = df.timestamp.dt.weekday
df["month"] = df.timestamp.dt.month
df["year"] = df.timestamp.dt.year 
df["weekday_hour"] = df.weekday.astype(str) + "-" + df.hour.astype(str)

In [116]:
df["ts"] = (df.timestamp - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600

In [117]:
df["hour_x"] = np.cos(2*np.pi*df.ts/24)
df["hour_y"] = np.sin(2*np.pi*df.ts/24)
    
df["month_x"] = np.cos(2*np.pi*df.ts/(30.4*24))
df["month_y"] = np.sin(2*np.pi*df.ts/(30.4*24))
    
df["weekday_x"] = np.cos(2*np.pi*df.ts/(7*24))
df["weekday_y"] = np.sin(2*np.pi*df.ts/(7*24))

In [118]:
def add_time_features(df):
    """
    Adapted from: https://github.com/buds-lab/ashrae-great-energy-predictor-3-solution-analysis/blob/master/solutions/rank-1/scripts/02_preprocess_data.py
    """
    df.timestamp = pd.to_datetime(df.timestamp)
    # time features
    df["hour"] = df.timestamp.dt.hour
    df["weekday"] = df.timestamp.dt.weekday
    df["month"] = df.timestamp.dt.month
    df["year"] = df.timestamp.dt.year    
    
    # time interactions
    df["weekday_hour"] = df.weekday.astype(str) + "-" + df.hour.astype(str)
    
    # apply cyclic encoding of periodic features
    df["ts"] = (df.timestamp - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600
    df["hour_x"] = np.cos(2*np.pi*df.ts/24)
    df["hour_y"] = np.sin(2*np.pi*df.ts/24)
    
    df["month_x"] = np.cos(2*np.pi*df.ts/(30.4*24))
    df["month_y"] = np.sin(2*np.pi*df.ts/(30.4*24))
    
    df["weekday_x"] = np.cos(2*np.pi*df.ts/(7*24))
    df["weekday_y"] = np.sin(2*np.pi*df.ts/(7*24))

In [38]:
# add_time_features(df)

In [119]:
df.columns

Index(['timestamp', 'building_id', 'electricity', 'site_id', 'sq_meter',
       'primary_space_usage', 'air_temperature', 'dew_temperature',
       'wind_direction', 'wind_speed', 'usage_lag1', 'usage_lag2',
       'usage_lag3', 'hour', 'weekday', 'month', 'year', 'weekday_hour', 'ts',
       'hour_x', 'hour_y', 'month_x', 'month_y', 'weekday_x', 'weekday_y'],
      dtype='object')

## Store the data in Azure for use later

In [120]:
file_system = "energyhub"
directory = "data_parq/norm"

In [121]:
#turn to pd so we can save to Azure
df_pd = df.compute()

In [124]:
for building in good_buildings:
    building_df = df_pd.loc[df_pd.building_id == building, :]
    file_name = "norm_" + str(building) + ".parq"
    building_df.to_parquet(path = file_name, engine = "pyarrow", compression = "gzip", index = False)
    storage.upload(file_system, directory = directory, file_name = file_name, file_path = file_name, overwrite=True)
    os.remove(file_name)

norm_Panther_parking_Alaina.parq write complete
norm_Panther_office_Clementine.parq write complete
norm_Panther_retail_Lester.parq write complete
norm_Panther_education_Mohammad.parq write complete
norm_Panther_lodging_Kirk.parq write complete
norm_Panther_education_Cleopatra.parq write complete
norm_Panther_education_Violet.parq write complete
norm_Panther_parking_Mellissa.parq write complete
norm_Panther_education_Emily.parq write complete
norm_Panther_education_Enriqueta.parq write complete
norm_Panther_education_Mattie.parq write complete
norm_Panther_education_Misty.parq write complete
norm_Panther_education_Edna.parq write complete
norm_Panther_lodging_Cora.parq write complete
norm_Panther_education_Scarlett.parq write complete
norm_Panther_education_Tina.parq write complete
norm_Panther_education_Sophia.parq write complete
norm_Panther_lodging_Kara.parq write complete
norm_Panther_retail_Kristina.parq write complete
norm_Panther_education_Karri.parq write complete
norm_Panther_r