In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Provenance: CSV export of the EIA workbook published at
# https://www.eia.gov/electricity/gridmonitor/knownissues/xls/PJM.xlsx
# (downloaded 6/3/24).
pjm_csv_path = 'data/pjm_hourly_published_data_20240603.csv'
df = pd.read_csv(pjm_csv_path)

In [3]:
# Summarize the loaded frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78840 entries, 0 to 78839
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   BA                78840 non-null  object 
 1   UTC time          78840 non-null  object 
 2   Local date        78840 non-null  object 
 3   Hour              78840 non-null  int64  
 4   Local time        78840 non-null  object 
 5   Time zone         78840 non-null  object 
 6   Generation only?  78840 non-null  object 
 7   DF                78554 non-null  object 
 8   D                 78646 non-null  object 
 9   NG                78621 non-null  object 
 10  TI                78622 non-null  object 
 11  Imputed D         175 non-null    object 
 12  Imputed NG        196 non-null    object 
 13  Imputed TI        4 non-null      float64
 14  Adjusted D        78813 non-null  object 
 15  Adjusted NG       78813 non-null  object 
 16  Adjusted TI       78622 non-null  object

In [4]:
# Data column descriptions from EIA.
# Maps each column name of the loaded CSV (the same names listed by df.info())
# to a description of that column. The text is kept as-is, including its
# original spacing and wording, so it stays a faithful reference.
# NOTE(review): no later cell visible here reads this dict; it appears to be
# reference documentation only.
column_descr = {
    'BA': '2-4 letter code that identifies the balancing authority',
    'UTC time': 'The end of the hour in Coordinated Universal Time (UTC)',
    'Local date': 'The date (using local time zone) for which data has been reported',
    'Hour': 'The hour number for the day.  Hour 1 corresponds to the time period 12:00 AM - 1:00 AM',
    'Local time': 'The end of the hour in local time',
    'Time zone': 'The local time zone',
    'Generation only?': ' Y indicates the balancing authority is a generation-only BA. Generation-only BAs consist of a power plant or group of power plants and do not directly serve retail customers. Therefore, they only report net generation and interchange and do not report demand or demand forecasts.',
    'DF': 'Demand forecast (DF): Each BA produces a day-ahead electricity demand forecast for every hour of the next day. These forecasts help BAs plan for and coordinate the reliable operation of their electric system on the following day. This column displays the actual data reported to EIA in MWh.',
    'D': 'Demand (D): A calculated value representing the amount of electricity load within the balancing authority’s electric system. A BA derives its demand value by taking the total metered net electricity generation within its electric system and subtracting the total metered net electricity interchange occurring between the BA and its neighboring BAs. This column displays the actual data reported to EIA in MWh.',
    'NG': 'Net generation (NG): the metered output of electric generating units in the balancing authority’s electric system. This generation only includes generating units that are managed by the balancing authority or whose operations are visible to the balancing authority.  This column displays the actual data reported to EIA in MWh.',
    'TI': 'Total Interchange (TI): the net metered tie line flow from one BA to another directly interconnected BA. Total net interchange is the net sum of all interchange occurring between a BA and it\'s directly interconnected neighboring BAs.  Negative interchange values indicate net inflows, and positive interchange values indicate net outflows.  This column displays the actual data reported to EIA in MWh.',
    'Imputed D': 'EIA imputes for anomalous values for total demand (D) if the value is missing or reported as negative, zero, or at least 1.5 times greater than the maximum of past total demand values reported by that BA. This column displays imputed values in MWh when they are made.',
    'Imputed NG': 'EIA imputes for anomalous values for total net generation (NG) if the value is missing or reported as negative, zero, or at least 1.5 times greater than the maximum of past total net generation values reported by that BA. This column displays imputed values in MWh when they are made.',
    'Imputed TI': 'EIA imputes for anomalous values for total interchange (TI) if the value is as at least 1.5 times greater than the maximum of past positive total interchange values reported by that BA or at least 1.5 times less than the minimum of past negative total interchange values reported by that BA. This column displays imputed values in MWh when they are made.',
    'Adjusted D': 'This column displays the demand (D) reported by the balancing authority in MWh unless imputation was required. When imputation was required, this column displays the imputed demand.',
    'Adjusted NG': 'This column displays the net generation (NG) reported by the balancing authority in MWh unless imputation was required. When imputation was required, this column displays the imputed net generation.',
    'Adjusted TI': 'This column displays the total interchange (TI) reported by the balancing authority in MWh unless imputation was required. When imputation was required, this column displays the imputed total interchange.',
}

TODO
- I think the important value columns are 'Adjusted D' and 'DF'. Confirm that 'Adjusted D' matches the demand data we can pull from the API (the API makes no distinction between raw and adjusted values; I'm assuming its values correspond to the adjusted ones).

Drop columns we're ignoring for now

In [5]:
# Keep only the timestamp, time zone, and demand-related columns.
# .copy() makes the result an independent frame, so the column assignments
# performed in later cells do not trigger SettingWithCopyWarning.
df = df[['UTC time', 'Time zone', 'DF', 'D', 'Adjusted D']].copy()

Convert columns to appropriate types

In [6]:
# Peek at the raw 'D' column (as a one-column frame) before type conversion.
df[['D']]

Unnamed: 0,D
0,84024
1,79791
2,76760
3,74931
4,74368
...,...
78835,
78836,
78837,
78838,


In [7]:
# Work on an explicit copy: `df` was produced by column selection, so
# assigning new columns into it directly can raise SettingWithCopyWarning.
df = df.copy()

# Parse the UTC timestamp strings (day, abbreviated month, year, then time,
# per the format string) into timezone-aware datetimes.
df['UTC time'] = pd.to_datetime(df['UTC time'], format='%d%b%Y %H:%M:%S', utc=True)

for col in ['DF', 'D', 'Adjusted D']:
    # Skip columns that are already numeric so re-running this cell is safe;
    # calling .str on already-converted values would otherwise fail or
    # silently turn them into NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Handle commas in string-encoded integers. Assign with df[col] rather
    # than df.loc[:, col]: the .loc form writes into the existing object
    # column in place (pandas >= 2.0), leaving the dtype as object instead
    # of numeric.
    df[col] = pd.to_numeric(df[col].str.replace(',', '', regex=False))

In [8]:
# Show the first five rows to confirm the conversions took effect.
df.head(n=5)

Unnamed: 0,UTC time,Time zone,DF,D,Adjusted D
0,2015-07-01 05:00:00+00:00,Eastern,29415.0,84024.0,84024.0
1,2015-07-01 06:00:00+00:00,Eastern,27687.0,79791.0,79791.0
2,2015-07-01 07:00:00+00:00,Eastern,26574.0,76760.0,76760.0
3,2015-07-01 08:00:00+00:00,Eastern,26029.0,74931.0,74931.0
4,2015-07-01 09:00:00+00:00,Eastern,26220.0,74368.0,74368.0


Let's confirm that the 'Adjusted D' values in this dataset match the demand values available through the API.

In [9]:
import requests

# EIA v2 hourly region-data endpoint, filtered to respondent PJM and the
# D and NG series types, sorted by period ascending.
url = "https://api.eia.gov/v2/electricity/rto/region-data/data/?frequency=hourly&data[0]=value&facets[respondent][]=PJM&facets[type][]=D&facets[type][]=NG&sort[0][column]=period&sort[0][direction]=asc"

# TODO fetch all data via API back to 2015-07-01 in 6-month intervals to confirm
# it matches your bulk-downloaded CSV data.
params = {
    'offset': 0,
    'length': 5000,
    # Read the key from the environment so it is never stored in the notebook.
    'api_key': os.environ['EIA_API_KEY'],
    'start': '2015-07-01T00',
    'end': '2015-07-20T00',
}

# A timeout keeps the notebook from hanging indefinitely if the API stalls.
r = requests.get(url, params=params, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error payload.
r.raise_for_status()

# The final request URL embeds the API key as a query parameter; redact it
# before printing so the secret does not end up in the saved cell output.
print(r.url.replace(params['api_key'], 'REDACTED'))

api_df = pd.DataFrame(r.json()['response']['data'])

https://api.eia.gov/v2/electricity/rto/region-data/data/?frequency=hourly&data%5B0%5D=value&facets%5Brespondent%5D%5B%5D=PJM&facets%5Btype%5D%5B%5D=D&facets%5Btype%5D%5B%5D=NG&sort%5B0%5D%5Bcolumn%5D=period&sort%5B0%5D%5Bdirection%5D=asc&offset=0&length=5000&api_key=REDACTED&start=2015-07-01T00&end=2015-07-20T00


In [10]:
# Add a UTC-aware timestamp column parsed from 'period' and coerce 'value'
# to a numeric dtype.
api_df = api_df.assign(**{
    'UTC period': pd.to_datetime(api_df['period'], utc=True),
    'value': pd.to_numeric(api_df['value']),
})

# Keep only the rows whose series type is labelled 'Demand'.
is_demand = api_df['type-name'].eq('Demand')
api_demand_df = api_df[is_demand]

In [11]:
# Line up CSV demand with API demand by joining on the UTC timestamp
# (default inner join: only timestamps present in both frames survive).
csv_demand = df[['UTC time', 'D', 'Adjusted D']]
api_demand = api_demand_df[['UTC period', 'value']]
merged_df = csv_demand.merge(api_demand, left_on='UTC time', right_on='UTC period')

merged_df.head()

Unnamed: 0,UTC time,D,Adjusted D,UTC period,value
0,2015-07-01 05:00:00+00:00,84024.0,84024.0,2015-07-01 05:00:00+00:00,84024.0
1,2015-07-01 06:00:00+00:00,79791.0,79791.0,2015-07-01 06:00:00+00:00,79791.0
2,2015-07-01 07:00:00+00:00,76760.0,76760.0,2015-07-01 07:00:00+00:00,76760.0
3,2015-07-01 08:00:00+00:00,74931.0,74931.0,2015-07-01 08:00:00+00:00,74931.0
4,2015-07-01 09:00:00+00:00,74368.0,74368.0,2015-07-01 09:00:00+00:00,74368.0


In [16]:
# Count missing 'Adjusted D' values among the merged rows.
pd.isna(merged_df['Adjusted D']).sum()

np.int64(0)

In [14]:
# An inner merge with no overlapping timestamps produces an empty frame, and
# an "all equal" check over zero rows is trivially true. Guard against that
# so the comparison below cannot pass vacuously.
assert not merged_df.empty, "No overlapping timestamps between CSV and API data"

# Rows where the CSV's adjusted demand disagrees with the API value.
# NaN compares as "different", so missing values also fail the check.
diff_mask = merged_df['Adjusted D'] != merged_df['value']
assert not diff_mask.any(), (
    f"{int(diff_mask.sum())} row(s) differ between 'Adjusted D' and the API value:\n"
    f"{merged_df[diff_mask].head()}"
)