In [None]:
%reload_ext autoreload
%autoreload 2

# import packages
import pandas as pd
import numpy as np
#from pathlib import Path
import plotly.express as px
import plotly
from os import path
import missingno
import re
from statsmodels.formula.api import ols


# import project modules
import download_data
import screening

For California, there is a source of facility-level hourly demand data, based on actual AMI readings.
 - LBNL 2025 California Demand Response Potential Study, Phase 2 (https://www.cpuc.ca.gov/General.aspx?id=10622). As part of this project AMI data was collected from the three major IOUs and the aggregated/anonymized data are available for download (https://buildings.lbl.gov/download-page-2025-california-demand-response)

This dataset is useful for two main purposes:  
1. Provide a source of demand data for facility types not included in the DOE dataset (e.g. data centers)  
2. These data can be used to help validate that the DOE simulations are reasonably accurate.

### Description of building type codes
- retail
- office
- refrigerated warehouse
- other
- Petroleum Refining and Related Industries 
- Food Manufacturing, Beverage and Tobacco 
- Chemicals - Industrial Gases 
- Chemicals - Other 
- Computer and Electronic Product Manufacturing 
- Plastics and Rubber Products Manufacturing 
- Primary Metals 
- Agriculture - crops: irrigation pumping primarily
- Water
- Wastewater

In [None]:
# Read the cluster summary file that accompanies the 1-in-2 load shapes and
# peek at its first row to see which columns are available.
metadata_csv = (
    'A:/Research/lbnl-load-enduse-shapes/lbnl-load-enduse-shapes/'
    'anonymized_1in2_actual_actual_2014/'
    'anonymized_1in2_actual_actual_2014_cluster_summary.csv'
)
metadata = pd.read_csv(metadata_csv)

metadata.head(1)

In [None]:
# Count the non-null customer_count entries for every utility / building-type
# pair and arrange them as a utility-by-building-type table.
(
    metadata[['customer_count', 'util', 'slap', 'sector', 'building_type']]
    .groupby(['util', 'building_type'])
    .count()
    .reset_index()
    .pivot(index='util', columns='building_type', values='customer_count')
)

In [None]:
# Count the non-null customer_count entries for every sub-LAP / building-type
# pair and arrange them as a sub-LAP-by-building-type table.
(
    metadata[['customer_count', 'util', 'slap', 'sector', 'building_type']]
    .groupby(['slap', 'building_type'])
    .count()
    .reset_index()
    .pivot(index='slap', columns='building_type', values='customer_count')
)

In [None]:
# Count the non-null customer_count entries for every kW-bin / building-type
# pair and arrange them as a building-type-by-kW-bin table.
(
    metadata[['customer_count', 'util', 'slap', 'sector', 'building_type', 'kw_bin']]
    .groupby(['kw_bin', 'building_type'])
    .count()
    .reset_index()
    .pivot(index='building_type', columns='kw_bin', values='customer_count')
)

# Creating load shapes

To aggregate the data I have a couple of options:  
1. sum together all of the load of a certain building type and use that load shape. However, this weights large facilities more
2. Average together normalized load shapes from each bin

In [None]:
# Filters: each one is either None (keep every value) or a list of accepted
# values for the metadata column of the same name.
util = ['pge']
slap = None
sector = None
building_type = ['office']
care = ['nonCare']
kw_bin = None

# metadata column -> accepted values (None means "do not filter")
column_filters = {
    'util': util,
    'slap': slap,
    'sector': sector,
    'building_type': building_type,
    'care': care,
    'kw_bin': kw_bin,
}

# Narrow a copy of the metadata one column at a time.
filename_list = metadata.copy()
for column, accepted in column_filters.items():
    if accepted is None:
        continue
    filename_list = filename_list[filename_list[column].isin(accepted)]

# Keep only the cluster names of the rows that survived the filters.
filename_list = list(filename_list['cluster'])

len(filename_list)

In [None]:
# we will use 1-in-2 profiles, representing a typical weather year, rather than the 1-in-10 profiles, which represent a "hot" year
lbnl_dir = 'A:/Research/lbnl-load-enduse-shapes/lbnl-load-enduse-shapes/anonymized_1in2_actual_actual_2014/'

# file naming convention:
# f'{utility}-{sector}-{sublap}-{building_type}-{kw_bin}-{care}-{kwh_bin}.csv'

# Read the 'total' column of every selected cluster file and place the
# profiles side by side, one column per cluster. Assembling the frame with a
# single concat avoids re-joining an ever-growing DataFrame inside a loop.
data = pd.concat(
    [
        pd.read_csv(lbnl_dir + f'{filename}.csv', usecols=['total'])
        .rename(columns={'total': filename})
        for filename in filename_list
    ],
    axis=1,
)

# add an hourly index covering 2014; a Timedelta frequency is used instead of
# the 'H' string alias, which newer pandas releases deprecate
data.index = pd.date_range(start='2014-01-01 00:00:00', end='2014-12-31 23:00:00', freq=pd.Timedelta(hours=1))

# NOTE: the profiles are deliberately left un-normalized in this cell. To
# scale every profile by its own peak, run:
#     data = data / data.max()
# (Kept as a comment rather than a bare string literal so that the cell does
# not echo the disabled code as its output.)


In [None]:
# Coefficient of variation (population standard deviation / mean) of every
# loaded profile.
profile_std = data.std(ddof=0)
profile_mean = data.mean()
profile_std / profile_mean

In [None]:
# Box plot of the per-profile coefficient of variation
# (population standard deviation / mean).
coef_of_variation = data.std(ddof=0).div(data.mean())
px.box(coef_of_variation)

In [None]:
# Quick-zoom buttons for the time axis: last day, week, month, or everything.
range_buttons = [
    dict(count=1, label="1d", step="day", stepmode="backward"),
    dict(count=7, label="1w", step="day", stepmode="backward"),
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(step="all"),
]

# Line chart with one trace per loaded profile.
# (rangeslider_visible=True can be passed to update_xaxes to add a slider.)
plot1 = px.line(data, width=1000)
plot1.update_xaxes(rangeselector=dict(buttons=range_buttons))
plot1.show()

In [None]:
from tslearn.clustering import TimeSeriesKMeans

import math

In [None]:

# convert the data to tslearn format: one row per profile, one column per timestamp
ts_data = data.to_numpy().T

# set the number of clusters as the square root of the number of profiles
cluster_count = math.ceil(math.sqrt(len(ts_data)))

# cluster the data using k-means with Euclidean distance; the fixed seed makes
# the cluster assignments reproducible from one run to the next
km = TimeSeriesKMeans(n_clusters=cluster_count, random_state=0)
clusters = km.fit_predict(ts_data)

# assign each building to a cluster
cluster_dict = dict(zip(data.columns, clusters))

# format the data for plotting (long form: one row per timestamp and building)
cluster_plot_data = data.reset_index().melt(id_vars='index', var_name='building', value_name='normalized_demand')
cluster_plot_data['cluster'] = cluster_plot_data['building'].map(cluster_dict)

# Average only the demand column within each cluster and timestamp. Selecting
# the column explicitly keeps the string-valued 'building' column out of the
# mean, which pandas >= 2.0 would otherwise refuse to aggregate.
cluster_plot_data = (
    cluster_plot_data
    .groupby(['cluster', 'index'])['normalized_demand']
    .mean()
    .reset_index()
)

In [None]:
# One panel per cluster, three panels per row, showing the cluster's average
# profile over time.
px.line(
    cluster_plot_data,
    x='index',
    y='normalized_demand',
    facet_col='cluster',
    facet_col_wrap=3,
    width=1200,
    height=800,
)


In [None]:
# display the cluster names that passed the filters and were loaded into `data`
filename_list

In [None]:
# profile to inspect in detail
col_name =  'sce-ind-SCEW-data_center-50_200kW-nonCare-0.0_1.0'

# The profile named above is only present when the filter cell selected it.
# Fall back to the first loaded profile so that the cell still runs (instead
# of raising a KeyError) under a different filter configuration.
if col_name not in data.columns:
    col_name = data.columns[0]

# mean demand for every (month, hour-of-day) pair
mh = data[[col_name]].groupby([data.index.month, data.index.hour]).mean()
mh.index = mh.index.set_names(['month','hour'])
px.line(mh.reset_index(), x='hour', y=col_name, facet_col='month').show()


In [None]:
# Scale every profile by its own peak so that the plotted values really are
# "normalized demand" within [0, 1], matching the column label and the fixed
# y-axis range used below. For a profile whose peak is already 1 the division
# changes nothing.
data_norm = data / data.max()

# mean normalized demand per calendar month, reshaped to long form for plotting
monthly_use = data_norm.groupby(data_norm.index.month).mean()
monthly_use.index = monthly_use.index.rename('month')
monthly_use = monthly_use.reset_index().melt(id_vars='month', var_name='building', value_name='normalized_demand')

# one panel per building; strip the "building=" prefix from the panel titles
(
    px.bar(monthly_use, x='month', y='normalized_demand', facet_col='building', facet_col_wrap=5, height=800, width=1200)
    .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    .update_yaxes(range=[0, 1])
)

In [None]:
# Scale every profile by its own peak so that the plotted values really are
# "normalized demand" within [0, 1], matching the column label and the fixed
# y-axis range used below. For a profile whose peak is already 1 the division
# changes nothing.
data_norm = data / data.max()

# mean normalized demand per day of week, reshaped to long form for plotting
weekly_use = data_norm.groupby(data_norm.index.dayofweek).mean()
weekly_use.index = weekly_use.index.rename('day_of_week')
weekly_use = weekly_use.reset_index().melt(id_vars='day_of_week', var_name='building', value_name='normalized_demand')

# one panel per building; strip the "building=" prefix from the panel titles
(
    px.bar(weekly_use, x='day_of_week', y='normalized_demand', facet_col='building', facet_col_wrap=5, height=500, width=800)
    .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    .update_yaxes(range=[0, 1])
)

In [None]:
# Scale every profile by its own peak so that the plotted values really are
# "normalized demand" within [0, 1], matching the column label and the fixed
# y-axis range used below. For a profile whose peak is already 1 the division
# changes nothing.
data_norm = data / data.max()

# mean normalized demand per hour of day, reshaped to long form for plotting
daily_use = data_norm.groupby(data_norm.index.hour).mean()
daily_use.index = daily_use.index.rename('hour')
daily_use = daily_use.reset_index().melt(id_vars='hour', var_name='building', value_name='normalized_demand')

# one panel per building; strip the "building=" prefix from the panel titles
(
    px.bar(daily_use, x='hour', y='normalized_demand', facet_col='building', facet_col_wrap=5, height=500, width=800)
    .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    .update_yaxes(range=[0, 1])
)

## Load Profiles of interest

- Food/Bev:
    - 'pge-ind-PGSA-food_bev-gt200kW-nonCare-0.0_0.33' - spike from Aug 11 to Sept 11
    - 'pge-ind-PGSA-food_bev-gt200kW-nonCare-0.66_1.0' - spike from July - September
    - 'sce-ind-SCEW-food_bev-gt200kW-nonCare-0.1_0.2' - spike for 2-3 weeks late September, but same overnight baseline use
- Data Center
    - 'pge-ind-PGSB-data_center-gt200kW-nonCare-0.0_1.0' - daily and day of week seasonality, but almost no annual seasonality
- Metals
    - 'sdge-ind-SDG1-metals-50_200kW-nonCare-0.0_0.25' - almost no annual seasonality, strong day/night swing with almost zero overnight use
- Refrigerated Warehouses - tend to have strong seasonal and daily load, correlated with solar
    - 'pge-com-PGF1-ref_wh-lt50kW-nonCare-0.0_1.0' - seems to use most energy in evening and overnight
    - 'sce-com-PGLP-com_other-noKW-nonCare-0.0_1.0' - shows strong midday peak correlating with solar, as well as seasonal peak in summer
- Water
    - 'sce-ind-SCEW-water-gt200kW-nonCare-0.1_0.2' - very little variation within a tight band
- Crop
    - 'sdge-ind-SDG1-crop-lt50kW-nonCare-0.5_0.6' - strong overnight pumping pattern
- Chemical Manufacturing
    - 'sce-ind-SCEN-chemical-50_200kW-nonCare-0.0_1.0' - night shift work, almost zero energy use during day, no annual or DOW seasonality
    - 'sce-ind-SCEC-chemical-50_200kW-nonCare-0.0_1.0' - strong day shift with annual seasonality and almost 0 use over weekend


# Examine Profiles of interest

In [None]:
# cluster files selected for a closer look
interesting_files = ['pge-ind-PGSA-food_bev-gt200kW-nonCare-0.0_0.33',
                     'pge-ind-PGSA-food_bev-gt200kW-nonCare-0.66_1.0',
                     'pge-ind-PGSB-data_center-gt200kW-nonCare-0.0_1.0',
                     'sdge-ind-SDG1-metals-50_200kW-nonCare-0.0_0.25',
                     'pge-com-PGF1-ref_wh-lt50kW-nonCare-0.0_1.0',
                     'sce-com-PGLP-com_other-noKW-nonCare-0.0_1.0',
                     'sce-ind-SCEW-water-gt200kW-nonCare-0.1_0.2',
                     'sdge-ind-SDG1-crop-lt50kW-nonCare-0.5_0.6',
                     'sce-ind-SCEN-chemical-50_200kW-nonCare-0.0_1.0',
                     'sce-ind-SCEC-chemical-50_200kW-nonCare-0.0_1.0'
                     ]

# short, identifier-style labels for the selected profiles
# NOTE(review): 'sce-ind-SCEW-food_bev-gt200kW-nonCare-0.1_0.2' has a label
# here but is not in interesting_files, so it is never loaded - confirm
# whether the omission is intentional.
building_names = {'pge-ind-PGSA-food_bev-gt200kW-nonCare-0.0_0.33':'ag_tree_nut_processor',
                        'pge-ind-PGSA-food_bev-gt200kW-nonCare-0.66_1.0':'ag_tomato_processor',
                        'sce-ind-SCEW-food_bev-gt200kW-nonCare-0.1_0.2':'ag_prune_processor',
                        'pge-ind-PGSB-data_center-gt200kW-nonCare-0.0_1.0':'data_center',
                        'sdge-ind-SDG1-metals-50_200kW-nonCare-0.0_0.25':'metals_day_shift',
                        'pge-com-PGF1-ref_wh-lt50kW-nonCare-0.0_1.0':'warehouse_overnight',
                        'sce-com-PGLP-com_other-noKW-nonCare-0.0_1.0':'warehouse_midday',
                        'sce-ind-SCEW-water-gt200kW-nonCare-0.1_0.2':'water_constant_load',
                        'sdge-ind-SDG1-crop-lt50kW-nonCare-0.5_0.6':'crop_overnight_pumping',
                        'sce-ind-SCEN-chemical-50_200kW-nonCare-0.0_1.0':'chem_night_shift',
                        'sce-ind-SCEC-chemical-50_200kW-nonCare-0.0_1.0':'chem_day_shift'
                        }

# we will use 1-in-2 profiles, representing a typical weather year, rather than the 1-in-10 profiles, which represent a "hot" year
lbnl_dir = 'A:/Research/lbnl-load-enduse-shapes/lbnl-load-enduse-shapes/anonymized_1in2_actual_actual_2014/'

# file naming convention:
# f'{utility}-{sector}-{sublap}-{building_type}-{kw_bin}-{care}-{kwh_bin}.csv'

# Read the 'total' column of every selected file and place the profiles side
# by side, one column per file. Assembling the frame with a single concat
# avoids re-joining an ever-growing DataFrame inside a loop.
interesting = pd.concat(
    [
        pd.read_csv(lbnl_dir + f'{filename}.csv', usecols=['total'])
        .rename(columns={'total': filename})
        for filename in interesting_files
    ],
    axis=1,
)

# add an hourly index covering 2014; a Timedelta frequency is used instead of
# the 'H' string alias, which newer pandas releases deprecate
interesting.index = pd.date_range(start='2014-01-01 00:00:00', end='2014-12-31 23:00:00', freq=pd.Timedelta(hours=1))

# normalize every profile by its own peak value
interesting = interesting / interesting.max()

# swap the long file names for the short building labels
interesting = interesting.rename(columns=building_names)

# line chart with one trace per profile and quick-zoom buttons on the time axis
# (rangeslider_visible=True can be passed to update_xaxes to add a slider)
plot2 = px.line(interesting, width=1000)
plot2.update_xaxes(
    rangeselector=dict(
        buttons=[
            dict(count=1, label="1d", step="day", stepmode="backward"),
            dict(count=7, label="1w", step="day", stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(step="all"),
        ]))
plot2.show()

In [None]:
# Mean normalized demand per calendar month for each profile of interest,
# reshaped to long form (one row per month and building) for plotting.
monthly_use = (
    interesting
    .groupby(interesting.index.month)
    .mean()
    .rename_axis('month')
    .reset_index()
    .melt(id_vars='month', var_name='building', value_name='normalized_demand')
)

# One panel per building; panel titles keep only the text after "=".
monthly_fig = px.bar(
    monthly_use,
    x='month',
    y='normalized_demand',
    facet_col='building',
    facet_col_wrap=5,
    height=500,
    width=800,
)
monthly_fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
monthly_fig.update_yaxes(range=[0,1])
monthly_fig

In [None]:
# Mean normalized demand per day of week for each profile of interest,
# reshaped to long form (one row per day and building) for plotting.
weekly_use = (
    interesting
    .groupby(interesting.index.dayofweek)
    .mean()
    .rename_axis('day_of_week')
    .reset_index()
    .melt(id_vars='day_of_week', var_name='building', value_name='normalized_demand')
)

# One panel per building; panel titles keep only the text after "=".
weekly_fig = px.bar(
    weekly_use,
    x='day_of_week',
    y='normalized_demand',
    facet_col='building',
    facet_col_wrap=5,
    height=500,
    width=800,
)
weekly_fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
weekly_fig.update_yaxes(range=[0,1])
weekly_fig

In [None]:
# Mean normalized demand per hour of day for each profile of interest,
# reshaped to long form (one row per hour and building) for plotting.
daily_use = (
    interesting
    .groupby(interesting.index.hour)
    .mean()
    .rename_axis('hour')
    .reset_index()
    .melt(id_vars='hour', var_name='building', value_name='normalized_demand')
)

# One panel per building; panel titles keep only the text after "=".
daily_fig = px.bar(
    daily_use,
    x='hour',
    y='normalized_demand',
    facet_col='building',
    facet_col_wrap=5,
    height=500,
    width=800,
)
daily_fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
daily_fig.update_yaxes(range=[0,1])
daily_fig

# Seasonal Fixed Effects Model

In [None]:

# candidate fixed-effect specifications, from the most to the least granular
combinations = ['C(year):C(month):C(dayofweek):C(hour)',
    'C(year):C(month):C(hour)',
    'C(year):C(month)',
    'C(year)'
]

# Fit one OLS model per (building, specification) pair. The fit statistics are
# collected as plain records and turned into a table once, after the loops.
r2_records = []

for building in interesting.columns:
    # A dedicated name keeps this per-building frame from overwriting the
    # notebook-level `data` frame that the earlier exploration cells use.
    building_data = interesting[[building]].copy()

    # create new columns for different seasonalities
    building_data['year'] = building_data.index.year
    building_data['month'] = building_data.index.month
    building_data['dayofweek'] = building_data.index.dayofweek
    building_data['hour'] = building_data.index.hour

    # for each combination of regressors, run a linear model and store the results
    for combination in combinations:
        # fit the model, dropping any missing values
        # (the column name is inserted verbatim into the formula, so it has to
        # be a valid Python identifier, as the short building labels are)
        model = ols(f'{building} ~ ' + combination, data=building_data).fit()

        # save the R-squared values
        r2_records.append({
            'building': building,
            'fixed_effect': str(combination),
            'r2': model.rsquared,
            'adj_r2': model.rsquared_adj,
        })

r2 = (
    pd.DataFrame(r2_records, columns=['building', 'fixed_effect', 'r2', 'adj_r2'])
    .set_index(['building', 'fixed_effect'])
)

# reverse the order of the rows
r2 = r2.iloc[::-1]

In [None]:
# Write the R-squared table to the results folder.
r2_csv = '../results/lbnl_r2_values.csv'
r2.to_csv(r2_csv)

In [None]:
# Shorter axis labels for the fixed-effect specifications.
relabels = {
    'C(year):C(month):C(dayofweek):C(hour)': 'year:month:dayofweek:hour',
    'C(year):C(month):C(hour)': 'year:month:hour',
    'C(year):C(month)': 'year:month',
    'C(year)': 'year',
}

# Group the rows by building (leaving the order within a building untouched),
# flatten the index into columns and apply the short labels.
r2_plot_data = (
    r2
    .sort_index(level='building', sort_remaining=False)
    .reset_index()
    .replace(relabels)
)

# One panel per building with R-squared on a fixed 0-1 scale, reference lines
# at 0, 0.5 and 1, and panel titles reduced to the text after "=".
(
    px.scatter(
        r2_plot_data,
        x='fixed_effect',
        y='r2',
        color='r2',
        facet_col='building',
        title='R-squared values',
        template='plotly_white',
        width=1200,
        height=600,
        facet_col_wrap=6,
        color_continuous_scale='Portland_r',
    )
    .update_yaxes(range=[0,1], dtick=0.1)
    .for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))
    .add_hline(y=0.5, line_dash='dot')
    .add_hline(y=1)
    .add_hline(y=0)
    .update_coloraxes(cmin=0, cmax=1)
)
