# Data Preparation

In [1]:
import sys
import os
sys.path.append("..")
from webapp.utils.azure_utils import KeyVault, DataLake
from webapp.utils.data_pre_utils import df_val_map
import numpy as np
import pandas as pd

In [2]:
# Connect to the storage account: read the secret named "storagePrimaryKey"
# from the key vault and hand it to the DataLake client as its credential.
vault = KeyVault(keyVaultName="keyvaultdva2022")
storage_credential = vault.get_secret(secretName="storagePrimaryKey")
storage = DataLake(
    account_name="storageaccountdva",
    credential=storage_credential,
)

In [3]:
# File system name passed as the first argument to the storage read/upload calls.
file_system = "energyhub"

# Clean Meta Data

In [4]:
# Load the original metadata CSV from the storage account.
meta_data = storage.pandas_read(
    file_system,
    directory="data_original/metadata",
    file_name="metadata.csv",
)

In [5]:
# Change column names:
# Every source column is listed, including the ones whose name is kept as-is,
# so the mapping also documents the columns this notebook expects.
# NOTE(review): 'energy_stars_core' looks like a mis-split of "energy star score".
# The dtype cell further down uses the same spelling, so change both together
# (and any downstream readers of the parquet file) if this is corrected.
meta_column_names = {
    'building_id': 'building_id',
    'site_id': 'site_id',
    'building_id_kaggle': 'building_id_kaggle',
    'site_id_kaggle': 'site_id_kaggle',
    'primaryspaceusage': 'primary_space_usage',
    'sub_primaryspaceusage': 'sub_primary_space_usage',
    'sqm': 'sq_meter',
    'sqft': 'sq_feet',
    'lat': 'latitude',
    'lng': 'longitude',
    'timezone': 'timezone',
    'electricity': 'electricity',
    'hotwater': 'hotwater',
    'chilledwater': 'chilledwater',
    'steam': 'steam',
    'water': 'water',
    'irrigation': 'irrigation',
    'solar': 'solar',
    'gas': 'gas',
    'industry': 'industry',
    'subindustry': 'subindustry',
    'heatingtype': 'heating_type',
    'yearbuilt': 'year_built',
    'date_opened': 'date_opened',
    'numberoffloors': 'number_of_floors',
    'occupants': 'occupants',
    'energystarscore': 'energy_stars_core',
    'eui': 'eui',
    'site_eui': 'site_eui',
    'source_eui': 'source_eui',
    'leed_level': 'leed_level',
    'rating': 'rating',
}
meta_data = meta_data.rename(columns=meta_column_names)

In [6]:
# Map values with Yes flag:
# 'Yes' is mapped to True and a missing value (NaN) to False for each meter column.
mapping_values = {
    'Yes': True,
    np.nan: False,
}
cols_to_map = [
    "electricity",
    "hotwater",
    "chilledwater",
    "steam",
    "water",
    "irrigation",
    "solar",
    "gas",
]

# The return value of df_val_map is discarded, so the helper presumably
# updates meta_data in place -- TODO confirm against webapp.utils.data_pre_utils.
for col in cols_to_map:
    df_val_map(meta_data, col, mapping_values)

In [7]:
# Set data types:
# Target dtype for every metadata column, keyed by the renamed column name.
meta_dtypes = {
    'building_id': 'category',
    'site_id': 'category',
    'building_id_kaggle': 'category',
    'site_id_kaggle': 'category',
    'primary_space_usage': 'category',
    'sub_primary_space_usage': 'category',
    'sq_meter': 'float64',
    'sq_feet': 'float64',
    'latitude': 'float64',
    'longitude': 'float64',
    'timezone': 'category',
    'electricity': 'bool_',
    'hotwater': 'bool_',
    'chilledwater': 'bool_',
    'steam': 'bool_',
    'water': 'bool_',
    'irrigation': 'bool_',
    'solar': 'bool_',
    'gas': 'bool_',
    'industry': 'category',
    'subindustry': 'category',
    'heating_type': 'category',
    'year_built': 'category',
    'date_opened': 'category',
    'number_of_floors': 'int64',
    'occupants': 'int64',
    'energy_stars_core': 'category',
    'eui': 'category',
    'site_eui': 'category',
    'source_eui': 'category',
    'leed_level': 'category',
    'rating': 'category',
}

# errors='ignore' tells pandas to suppress conversion errors and keep the
# original data instead of raising, so a cast that fails does so silently.
# NOTE(review): if number_of_floors / occupants contain missing values the
# int64 cast cannot succeed and those columns keep their original dtype --
# check meta_data.dtypes after this cell to confirm what was actually applied.
meta_data = meta_data.astype(meta_dtypes, errors='ignore')

In [8]:
# Set index:
#meta_data = meta_data.set_index("building_id")

In [9]:
# Write the cleaned metadata to a local parquet file, upload that file to the
# storage account, then delete the local copy.
file_system = "energyhub"
file_name = "metadata.parq"
directory = "data_parq/metadata"

try:
    meta_data.to_parquet(path=file_name, engine="pyarrow", compression="gzip", index=False)
    storage.upload(file_system, directory=directory, file_name=file_name, file_path=file_name, overwrite=True)
finally:
    # Clean up even when the write or the upload raises, so a failed run does
    # not leave a stale parquet file in the working directory. The existence
    # check keeps a missing file from masking the original exception.
    if os.path.exists(file_name):
        os.remove(file_name)

metadata.parq write complete


# Clean Weather

In [10]:
# Load the original weather CSV from the storage account.
weather = storage.pandas_read(
    file_system,
    directory="data_original/weather",
    file_name="weather.csv",
)

In [11]:
# Change column names: camelCase source columns become snake_case.
# Columns whose name is kept as-is are listed too, so the mapping also
# documents the columns this notebook expects.
weather_column_names = {
    'timestamp': 'timestamp',
    'site_id': 'site_id',
    'airTemperature': 'air_temperature',
    'cloudCoverage': 'cloud_coverage',
    'dewTemperature': 'dew_temperature',
    'precipDepth1HR': 'precipitation_depth_1_hr',
    'precipDepth6HR': 'precipitation_depth_6_hr',
    'seaLvlPressure': 'sea_level_pressure',
    'windDirection': 'wind_direction',
    'windSpeed': 'wind_speed',
}
weather = weather.rename(columns=weather_column_names)

In [12]:
# Set data types:
# Target dtype for every weather column, keyed by the renamed column name.
# No errors= argument is given, so a failed conversion raises.
weather_dtypes = {
    'timestamp': 'datetime64[ns]',
    'site_id': 'category',
    'air_temperature': 'float64',
    'cloud_coverage': 'category',
    'dew_temperature': 'float64',
    'precipitation_depth_1_hr': 'float64',
    'precipitation_depth_6_hr': 'float64',
    'sea_level_pressure': 'float64',
    'wind_direction': 'float64',
    'wind_speed': 'float64',
}
weather = weather.astype(weather_dtypes)

In [13]:
# Set index: 
#weather = weather.set_index(["timestamp", "site_id"])

In [14]:
# Write the cleaned weather data to a local parquet file, upload that file to
# the storage account, then delete the local copy.
file_system = "energyhub"
file_name = "weather.parq"
directory = "data_parq/weather"

try:
    weather.to_parquet(path=file_name, engine="pyarrow", compression="gzip", index=False)
    storage.upload(file_system, directory=directory, file_name=file_name, file_path=file_name, overwrite=True)
finally:
    # Clean up even when the write or the upload raises, so a failed run does
    # not leave a stale parquet file in the working directory. The existence
    # check keeps a missing file from masking the original exception.
    if os.path.exists(file_name):
        os.remove(file_name)

weather.parq write complete
