In [1]:
import sys
sys.path.append("..")
from webapp.utils.azure_utils import KeyVault, DataLake
from webapp.utils.data_pre_utils import df_val_map
import numpy as np
import pandas as pd
import datetime
import fastparquet

In [2]:
# Connect to Storage Account
# The storage key is fetched from Key Vault at run time instead of being
# written into the notebook.
vault = KeyVault(keyVaultName="keyvaultdva2022")
storage_credential = vault.get_secret(secretName="storagePrimaryKey")
storage = DataLake(
    account_name="storageaccountdva",
    credential=storage_credential,
)

In [3]:
# File-system name and directories passed to storage.read / storage.upload below.
file_system = "energyhub"
dest_dir = "/data_parq/norm_data"  # upload target for the merged feature file
meters_dir = "/data_parq/meters"  # location of electricity.parq
meta_dir = "/data_parq/metadata"  # location of metadata.parq
weather_dir = "/data_parq/weather"  # location of weather.parq


In [4]:
# Raw electricity meter readings (used as a DataFrame in the cells below).
meter = storage.read(
    file_system,
    meters_dir,
    file_name="electricity.parq",
    extension="parq",
)


In [None]:
# Raw building metadata (used as a DataFrame in the cells below).
meta = storage.read(
    file_system,
    meta_dir,
    file_name="metadata.parq",
    extension="parq",
)


In [None]:
# Raw weather observations (used as a DataFrame in the cells below).
weather = storage.read(
    file_system,
    weather_dir,
    file_name="weather.parq",
    extension="parq",
)

In [None]:
# Column dtypes, non-null counts and memory usage of the raw meter readings.
meter.info()

In [None]:
# Percentage of missing values per column in the meter readings.
meter.isna().sum().mul(100).div(len(meter))

In [None]:
# Number of distinct buildings in the dataset (a missing id, if any, is
# counted once, exactly as len(unique()) would).
meter["building_id"].nunique(dropna=False)

In [None]:
# Missing values per building and column: rows in the group (size) minus
# non-null entries in each column (count).
(
    meter.groupby("building_id")
    .count()
    .rsub(meter.groupby("building_id").size(), axis=0)
)

In [None]:
# Restrict to rows without an electricity reading, then count the non-null
# values of the remaining columns per building.
meter.loc[meter["electricity"].isna()].groupby("building_id").count()

In [None]:
# Columns available in the raw building metadata.
meta.columns

In [None]:
# Percentage of missing values per metadata column.
meta.isna().sum().mul(100).div(len(meta))

In [None]:
# Column dtypes, non-null counts and memory usage of the raw weather data.
weather.info()

In [None]:
# Percentage of missing values per weather column.
weather.isna().sum().mul(100).div(len(weather))

## Process Datasets

In [None]:
def process_meter(meter_df):
    """Parse the reading timestamp and derive calendar features from it.

    The ``timestamp`` column is converted with the fixed format
    ``%Y-%m-%d %H:%M:%S`` and three columns are added from it: ``month``,
    ``weekday`` and ``hour``.

    Args:
        meter_df: DataFrame with a ``timestamp`` column.

    Returns:
        The same ``meter_df`` object; it is modified in place, so the
        caller's frame also carries the new columns.
    """
    parsed = pd.to_datetime(meter_df["timestamp"], format='%Y-%m-%d %H:%M:%S')
    meter_df["timestamp"] = parsed
    # Each feature is the datetime accessor attribute of the same name.
    for part in ("month", "weekday", "hour"):
        meter_df[part] = getattr(parsed.dt, part)
    # TODO: maybe add a holiday feature here too?
    return meter_df

In [None]:
def process_meta(meta_df):
    """Reduce the building metadata to the columns used downstream.

    Keeps ``building_id``, ``site_id``, ``primary_space_usage`` and
    ``sq_meter`` (in their original order) and replaces missing
    ``primary_space_usage`` values with ``"Unknown"``.

    Args:
        meta_df: Raw metadata DataFrame. It is not modified.

    Returns:
        A new DataFrame holding only the kept columns.

    Raises:
        KeyError: If ``primary_space_usage`` is not a column of ``meta_df``.
    """
    #use these cols because they have low % of NA values. Other cols are mostly NA
    use_cols = ['building_id', 'site_id', 'primary_space_usage', 'sq_meter']
    keep_cols = [col for col in meta_df.columns if col in use_cols]
    # Copy so the raw frame held by the caller stays intact.
    out = meta_df[keep_cols].copy()

    usage = out['primary_space_usage']
    # A categorical column rejects fill values that are not already a category.
    if isinstance(usage.dtype, pd.CategoricalDtype) and "Unknown" not in usage.cat.categories:
        usage = usage.cat.add_categories("Unknown")
    # Assign the filled column back: ``df[col].fillna(..., inplace=True)`` acts
    # on a temporary object and leaves the frame unchanged under Copy-on-Write.
    out['primary_space_usage'] = usage.fillna("Unknown")
    return out

In [None]:
def process_weather(weather_df):
    """Clean the weather observations before they are joined to the meter data.

    Steps:
      * drop ``cloud_coverage``, ``precipitation_depth_1_hr``,
        ``precipitation_depth_6_hr`` and ``sea_level_pressure`` (columns
        already absent are skipped, so the function can be re-applied);
      * parse ``timestamp`` with the fixed format ``%Y-%m-%d %H:%M:%S``;
      * linearly interpolate gaps in ``air_temperature``, ``dew_temperature``,
        ``wind_direction`` and ``wind_speed``. When a ``site_id`` column is
        present the interpolation runs separately per site, so a gap at the
        start of one site is never filled from another site's readings.

    Args:
        weather_df: Raw weather DataFrame. It is not modified.

    Returns:
        A new DataFrame with the cleaned columns.

    Raises:
        KeyError: If ``timestamp`` or one of the interpolated columns is missing.
    """
    unused_cols = ['cloud_coverage', 'precipitation_depth_1_hr',
                   'precipitation_depth_6_hr', 'sea_level_pressure']
    interp_cols = ['air_temperature', 'dew_temperature', 'wind_direction', 'wind_speed']

    # drop() returns a new frame, so the caller's raw data is left untouched and
    # re-running the calling cell no longer fails on already-removed columns.
    out = weather_df.drop(columns=unused_cols, errors="ignore")
    out["timestamp"] = pd.to_datetime(out["timestamp"], format='%Y-%m-%d %H:%M:%S')

    # Interpolation is positional: assumes rows are in time order within each
    # site -- TODO confirm against the source file.
    # NOTE(review): wind_direction looks like an angle; linear interpolation
    # ignores wrap-around -- confirm this is acceptable.
    if "site_id" in out.columns:
        filled = out.groupby("site_id")[interp_cols].transform(
            lambda grp: grp.interpolate(method="linear")
        )
    else:
        filled = out[interp_cols].interpolate(method="linear")

    # Only NaNs are replaced; observed values are kept as they are. The result
    # is assigned back because ``df[col].interpolate(inplace=True)`` has no
    # effect on the frame under Copy-on-Write.
    out[interp_cols] = out[interp_cols].fillna(filled)
    return out


In [None]:
# Parse timestamps and add the month / weekday / hour features.
meter_df = process_meter(meter)

In [None]:
# Apply process_meta to the raw metadata (column selection and "Unknown"
# fill for primary_space_usage).
meta_df = process_meta(meta)

In [None]:
# Apply process_weather to the raw weather data (column drop, timestamp
# parsing, gap interpolation).
weather_df = process_weather(weather)

In [None]:
# Re-inspect dtypes and non-null counts of the weather data after processing.
weather_df.info()

In [None]:
# Re-inspect dtypes and non-null counts of the meter data after processing.
meter_df.info()

In [None]:
# Build the feature table: attach building metadata to every meter reading,
# then the weather observed at that building's site at the same timestamp.
# Left joins keep every meter row even when no match exists.
df = (
    meter_df
    .merge(meta_df, how="left", on="building_id")
    .merge(weather_df, how="left", on=["timestamp", "site_id"])
)

In [None]:
# Preview the first rows of the merged feature table.
df.head()

In [None]:
# Percentage of missing values per column after the merges.
df.isna().sum().mul(100).div(len(df))

In [None]:
# Number of rows per building that have no air temperature.
df.loc[df["air_temperature"].isna(), "building_id"].value_counts()

In [None]:
# Drop missings: remove every row that has a NaN in any column.
# This modifies df in place.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Write the merged feature table to a local GZIP-compressed parquet file;
# the same name is reused as the remote file name when uploading.
file = "electricity_features.parq"
fastparquet.write(filename=file, data=df, compression="GZIP")

In [None]:
# Write to Azure: upload the local parquet file into dest_dir (overwrite=True).
storage.upload(
    file_system=file_system,
    directory=dest_dir,
    file_name=file,
    file_path=file,
    overwrite=True,
)