In [None]:
%reload_ext autoreload
%autoreload 2

# import packages
import os
import time
from os import path

import numpy as np
import pandas as pd
import requests

## Download EULP Metadata

In [None]:
def download_parquet_from_s3(url, download_path, file_name, timeout=60):
    """
    Given a url to a parquet file on an amazon S3 bucket, downloads the file to a local folder.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    download_path : str
        Local folder to write into. It is created if it does not exist yet.
    file_name : str
        Name of the file written inside `download_path`.
    timeout : float, optional
        Seconds handed to `requests.get` as its timeout, so a stalled
        connection fails instead of hanging the notebook.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written in that
        case, so an error page is never saved under a `.parquet` name.
    """
    # an empty folder name means "current directory"; makedirs('') would raise
    if download_path:
        os.makedirs(download_path, exist_ok=True)
    local_path = f'{download_path}/{file_name}'

    # stream the body so large files are not held in memory all at once
    with requests.get(url, stream=True, timeout=timeout) as response:
        # fail before opening the output file if the request was rejected
        response.raise_for_status()
        with open(local_path, 'wb') as local_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                local_file.write(chunk)

    return local_path

# locations of the commercial and residential metadata files
com_meta_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/comstock_tmy3_release_1/metadata/metadata.parquet'
# the release folder is "resstock_tmy3_release_1" (same spelling as the other residential URLs in this notebook)
res_meta_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_tmy3_release_1/metadata/metadata.parquet'

# one folder for both writing and reading, so the two steps cannot drift apart
metadata_dir = '../data/downloaded/eulp_usbs/metadata'

# download the commercial and residential metadata as parquet files
download_parquet_from_s3(com_meta_url, metadata_dir, 'commercial_metadata.parquet')
download_parquet_from_s3(res_meta_url, metadata_dir, 'residential_metadata.parquet')

# read the files back from the folder they were just written to
res_metadata = pd.read_parquet(f'{metadata_dir}/residential_metadata.parquet')
com_metadata = pd.read_parquet(f'{metadata_dir}/commercial_metadata.parquet')



## Download Commercial Data Dictionaries

In [None]:
# folder of the commercial release that holds the dictionary tables
com_release_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/comstock_tmy3_release_1'

# fetch each tab-separated dictionary and store it locally as a csv
for dictionary_name in ('data_dictionary', 'enumeration_dictionary', 'upgrade_dictionary'):
    dict_url = f'{com_release_url}/{dictionary_name}.tsv'
    data_dictionary = pd.read_csv(dict_url, sep='\t')
    data_dictionary.to_csv(f'../data/downloaded/eulp_usbs/metadata/com_{dictionary_name}.csv')


## Download Residential Data Dictionaries

In [None]:
# folder of the residential release that holds the dictionary tables
res_release_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_tmy3_release_1'

# fetch each tab-separated dictionary and store it locally as a csv
for dictionary_name in ('data_dictionary', 'enumeration_dictionary', 'upgrade_dictionary'):
    dict_url = f'{res_release_url}/{dictionary_name}.tsv'
    data_dictionary = pd.read_csv(dict_url, sep='\t')
    data_dictionary.to_csv(f'../data/downloaded/eulp_usbs/metadata/res_{dictionary_name}.csv')

## Take stratified random sample of commercial buildings

In [None]:
# where the commercial metadata lives, and the only columns we need from it
com_meta_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/comstock_tmy3_release_1/metadata/metadata.parquet'
com_columns_to_read = ['bldg_id','in.upgrade_name','in.building_type','in.nhgis_tract_gisjoin','in.climate_zone_ashrae_2004','in.state_abbreviation']

# shorter names for the columns that are kept
com_column_names = {
    'in.building_type': 'building_type',
    'in.nhgis_tract_gisjoin': 'nhgis_tract_gisjoin',
    'in.climate_zone_ashrae_2004': 'climate_zone',
    'in.state_abbreviation': 'state',
}

# broader category assigned to each building type
building_categories = {
    'FullServiceRestaurant': 'Restaurant',
    'Hospital': 'Hospital',
    'LargeHotel': 'Hotel',
    'LargeOffice': 'Office',
    'MediumOffice': 'Office',
    'OutPatient': 'Office',
    'Outpatient': 'Office',
    'PrimarySchool': 'School',
    'QuickServiceRestaurant': 'Restaurant',
    'RetailStandalone': 'Retail',
    'RetailStripmall': 'Retail',
    'SecondarySchool': 'School',
    'SmallHotel': 'Hotel',
    'SmallOffice': 'Office',
    'Warehouse': 'Warehouse',
}

com_metadata = (
    pd.read_parquet(com_meta_url, columns=com_columns_to_read)
    # keep the Baseline rows only; the upgrade name is not needed afterwards
    .loc[lambda frame: frame['in.upgrade_name'] == 'Baseline']
    .drop(columns='in.upgrade_name')
    .rename(columns=com_column_names)
    # building types missing from the mapping end up with a NaN category
    .assign(
        building_category=lambda frame: frame['building_type'].map(building_categories),
        building_sector='Commercial',
        scaling_factor=1,
    )
)

com_metadata

In [None]:
# read the table that links each tract to a ba
ba_tract_crosswalk_file = '../data/processed/ba_tract_crosswalk_2019.csv'
ba_tract_crosswalk = pd.read_csv(ba_tract_crosswalk_file)

# show the table as the cell output
ba_tract_crosswalk

In [None]:
# one sampled frame per ba is collected here and stacked after the loop
commercial_sample = []

# every sample is stratified by these two columns
strata_columns = ['climate_zone','building_type']

for ba in ba_tract_crosswalk['ba_code'].unique():
    # tracts that the crosswalk assigns to this ba
    rows_for_ba = ba_tract_crosswalk['ba_code'] == ba
    ba_tract_list = ba_tract_crosswalk.loc[rows_for_ba, 'GISJOIN'].unique()

    # commercial buildings whose tract is one of those
    buildings_in_ba = com_metadata[com_metadata['nhgis_tract_gisjoin'].isin(ba_tract_list)]
    buildings_by_stratum = buildings_in_ba.groupby(strata_columns)

    # 10% of every climate zone / building type group ...
    frac_sample = buildings_by_stratum.sample(frac=0.1, random_state=2022)
    # ... plus exactly one building per group as a fallback
    n_sample = buildings_by_stratum.sample(n=1, random_state=2022)

    # label each sampled row with its "<climate zone>_<building type>" group
    for sampled in (n_sample, frac_sample):
        sampled['match_key'] = sampled[strata_columns].agg('_'.join, axis=1)

    # groups that show up in only one of the two samples
    missing_sample = set(frac_sample['match_key'].unique()) ^ set(n_sample['match_key'].unique())

    # top up the fractional sample with the fallback building of each such group
    if missing_sample:
        fallback_rows = n_sample[n_sample['match_key'].isin(missing_sample)]
        frac_sample = pd.concat([frac_sample, fallback_rows], axis=0)

    # remember which ba the rows were drawn for
    frac_sample['ba_code'] = ba
    commercial_sample.append(frac_sample)

commercial_sample = pd.concat(commercial_sample, axis=0)

commercial_sample

## Take stratified random sample of residential buildings

In [None]:
# category assigned to each value of the ACS building type column
res_building_categories = {
    'Mobile Home': 'MobileHome',
    'Single-Family Attached': 'SingleFamily',
    'Single-Family Detached': 'SingleFamily',
    '2 Unit': 'SmallMultifamily',
    '3 or 4 Unit': 'SmallMultifamily',
    '5 to 9 Unit': 'MediumMultifamily',
    '10 to 19 Unit': 'MediumMultifamily',
    '20 to 49 Unit': 'MediumMultifamily',
    '50 or more Unit': 'LargeMultifamily',
}

# compact building type names that replace the ACS labels
res_building_names = {
    'Mobile Home': 'MobileHome',
    'Single-Family Attached': 'SFAttached',
    'Single-Family Detached': 'SFDetached',
    '2 Unit': 'MF2unit',
    '3 or 4 Unit': 'MF3-4unit',
    '5 to 9 Unit': 'MF5-9unit',
    '10 to 19 Unit': 'MF10-19unit',
    '20 to 49 Unit': 'MF20-49unit',
    '50 or more Unit': 'MF50+unit',
}

# read only the needed columns of the residential metadata file
res_meta_url = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_tmy3_release_1/metadata/metadata.parquet'
res_columns_to_read = ['bldg_id','in.weather_file_city','in.ashrae_iecc_climate_zone_2004','in.geometry_building_type_acs','in.geometry_building_number_units_mf','in.geometry_building_number_units_sfa','in.units_represented','in.state']
res_metadata = pd.read_parquet(res_meta_url, columns=res_columns_to_read)

# unit counts as integers, with the text 'None' counted as 0
units_mf = res_metadata['in.geometry_building_number_units_mf'].replace({'None':0}).astype(int)
units_sfa = res_metadata['in.geometry_building_number_units_sfa'].replace({'None':0}).astype(int)

# (mf units + sfa units) / units represented, with a result of 0 replaced by 1
unit_scaling = ((units_mf + units_sfa) / res_metadata['in.units_represented']).replace({0:1})

res_metadata = (
    res_metadata
    .assign(
        building_category=res_metadata['in.geometry_building_type_acs'].map(res_building_categories),
        scaling_factor=unit_scaling,
    )
    # the raw unit columns were only needed for the scaling factor
    .drop(columns=['in.geometry_building_number_units_mf','in.geometry_building_number_units_sfa','in.units_represented'])
    .rename(columns={
        'in.weather_file_city': 'weather_file_city',
        'in.geometry_building_type_acs': 'building_type',
        'in.state': 'state',
        'in.ashrae_iecc_climate_zone_2004': 'climate_zone',
    })
    # swap the ACS labels for the compact names and tag the sector
    .assign(
        building_type=lambda frame: frame['building_type'].replace(res_building_names),
        building_sector='Residential',
    )
)

res_metadata

In [None]:
# read the table that links each weather file city to a ba
ba_wf_crosswalk_file = '../data/processed/ba_weather_file_crosswalk.csv'
ba_wf_crosswalk = pd.read_csv(ba_wf_crosswalk_file)

# show the table as the cell output
ba_wf_crosswalk

In [None]:
# one sampled frame per ba is collected here and stacked after the loop
residential_sample = []

# every sample is stratified by these two columns
strata_columns = ['climate_zone','building_type']

for ba in ba_wf_crosswalk['ba_code'].unique():
    # weather file cities that the crosswalk assigns to this ba
    rows_for_ba = ba_wf_crosswalk['ba_code'] == ba
    ba_wf_list = ba_wf_crosswalk.loc[rows_for_ba, 'weather_file_city'].unique()

    # residential buildings whose weather file city is one of those
    buildings_in_ba = res_metadata[res_metadata['weather_file_city'].isin(ba_wf_list)]
    buildings_by_stratum = buildings_in_ba.groupby(strata_columns)

    # 10% of every climate zone / building type group ...
    frac_sample = buildings_by_stratum.sample(frac=0.1, random_state=2022)
    # ... plus exactly one building per group as a fallback
    n_sample = buildings_by_stratum.sample(n=1, random_state=2022)

    # label each sampled row with its "<climate zone>_<building type>" group
    for sampled in (n_sample, frac_sample):
        sampled['match_key'] = sampled[strata_columns].agg('_'.join, axis=1)

    # groups that show up in only one of the two samples
    missing_sample = set(frac_sample['match_key'].unique()) ^ set(n_sample['match_key'].unique())

    # top up the fractional sample with the fallback building of each such group
    if missing_sample:
        fallback_rows = n_sample[n_sample['match_key'].isin(missing_sample)]
        frac_sample = pd.concat([frac_sample, fallback_rows], axis=0)

    # remember which ba the rows were drawn for
    frac_sample['ba_code'] = ba
    residential_sample.append(frac_sample)

residential_sample = pd.concat(residential_sample, axis=0)

residential_sample

# Download data

In [None]:

# column that holds total electricity use in each building timeseries file
ENERGY_COLUMN = 'out.electricity.total.energy_consumption'
# common root of the commercial and residential timeseries URLs
EULP_ROOT_URL = 'https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021'


def _load_hourly_profile(url, column_name, bldg_id, scaling_factor=None, attempts=2):
    """
    Read one building timeseries file and return its hourly total electricity use.

    Parameters
    ----------
    url : str
        Location of the building's parquet file.
    column_name : str
        Name given to the single output column.
    bldg_id
        Building identifier, used only in progress/error messages.
    scaling_factor : number, optional
        If given, the rounded hourly values are multiplied by it.
    attempts : int, optional
        How many times to try before giving up on the building.

    Returns
    -------
    pandas.DataFrame or None
        One column named `column_name` indexed by hour, or None when every
        attempt failed (the building is then skipped, not fatal).
    """
    for attempt in range(1, attempts + 1):
        try:
            raw = pd.read_parquet(url, columns=['timestamp', ENERGY_COLUMN])
            # Keep only the energy column once the time index is set. The raw
            # 'timestamp' column would otherwise still be in the frame, and
            # pandas >= 2.0 is expected to raise when asked to sum datetimes.
            hourly = (
                raw.set_index(pd.to_datetime(raw['timestamp']))[[ENERGY_COLUMN]]
                .resample('H', label='left', closed='right')
                .sum()
                .round(2)
            )
            # scaling is applied after rounding
            if scaling_factor is not None:
                hourly[ENERGY_COLUMN] = hourly[ENERGY_COLUMN] * scaling_factor
            return hourly.rename(columns={ENERGY_COLUMN: column_name})
        except Exception as error:
            # `Exception` (not a bare except) so that interrupting the kernel still works
            if attempt < attempts:
                print(f'ERROR DOWNLOADING {bldg_id}. Retrying...')
            else:
                print(f'SKIPPING {bldg_id} ({type(error).__name__}: {error})')
    return None


def _download_sector_profiles(buildings, release_name, scale=False):
    """
    Download the hourly profile of every sampled building of one sector.

    Parameters
    ----------
    buildings : pandas.DataFrame
        Sampled buildings of one ba. The index is used as the building id in
        the file name (assumes the index holds bldg_id - confirm against the
        metadata files). Columns state, climate_zone, building_category and
        building_type are read, plus scaling_factor when `scale` is True.
    release_name : str
        Release folder under the root URL, e.g. 'comstock_tmy3_release_1'.
    scale : bool, optional
        Multiply each profile by the row's scaling_factor.

    Returns
    -------
    list of pandas.DataFrame
        One single-column frame per building that was read successfully.
    """
    profiles = []
    # Ids are tracked per sector: the two releases are separate datasets, so an
    # id seen in one says nothing about the same id in the other.
    downloaded_ids = set()

    for bldg_id, row in buildings.iterrows():
        if bldg_id in downloaded_ids:
            continue
        print(f'  Downloading bldg_id {bldg_id}', end='\r')

        # url of the individual building timeseries file
        url_to_download = (
            f'{EULP_ROOT_URL}/{release_name}/timeseries_individual_buildings/by_state/'
            f"upgrade=0/state={row['state']}/{bldg_id}-0.parquet"
        )
        # column name that describes the building
        column_name = f"{row['climate_zone']}_{row['building_category']}_{row['building_type']}_{bldg_id}"

        profile = _load_hourly_profile(
            url_to_download,
            column_name,
            bldg_id,
            scaling_factor=row['scaling_factor'] if scale else None,
        )
        # only successful downloads are recorded, so a failed id may be tried again
        if profile is not None:
            profiles.append(profile)
            downloaded_ids.add(bldg_id)

    return profiles


# `ba_list` is not assigned anywhere else in the notebook; use every ba that
# has at least one sampled building (edit this line to restrict the run)
ba_list = list(pd.concat([commercial_sample['ba_code'], residential_sample['ba_code']]).unique())

for ba in ba_list:
    output_file = f'../data/processed/nrel_demand/{ba}.csv.zip'

    # check if the data has already been downloaded
    # (`path` is os.path, imported at the top of the notebook)
    if path.exists(output_file):
        print(f'BA {ba} already downloaded.')
        continue

    start_time = time.time()

    com_buildings = commercial_sample.loc[commercial_sample['ba_code'] == ba, :]
    num_com = len(com_buildings)
    print(f'Downloading {num_com} {ba} commercial buildings')
    commercial_data = _download_sector_profiles(com_buildings, 'comstock_tmy3_release_1')

    res_buildings = residential_sample.loc[residential_sample['ba_code'] == ba, :]
    num_res = len(res_buildings)
    print(f'Downloading {num_res} {ba} residential buildings')
    residential_data = _download_sector_profiles(res_buildings, 'resstock_tmy3_release_1', scale=True)

    # pd.concat raises on an empty list, so only combine sectors that returned data
    sector_frames = [
        pd.concat(sector_profiles, axis='columns')
        for sector_profiles in (commercial_data, residential_data)
        if sector_profiles
    ]
    if not sector_frames:
        print(f'  No building data could be downloaded for {ba}; nothing saved.')
        continue

    print('  Constructing dataframe')
    demand_data = pd.concat(sector_frames, axis='columns')

    # drop the timestamp index; the saved file contains only the building columns
    demand_data = demand_data.reset_index(drop=True)

    # save the data as a zipped csv
    demand_data.to_csv(output_file, compression='zip', index=False)

    # release the (potentially large) frame before the next ba
    demand_data = pd.DataFrame()

    elapsed = time.time() - start_time
    # max(..., 1) avoids a division by zero for a ba without sampled buildings
    print(f'  {round(elapsed, 0)} s / {round(elapsed / max(num_com + num_res, 1), 2)} s/bldg')

