# Electric Disturbance Events

## Install Dependencies

In [1]:
%%capture
# Install pinned versions of pandas and numpy into the kernel's environment.
# The %%capture cell magic above keeps pip's output out of the notebook.
%pip install pandas==1.3.5
%pip install numpy==1.21.6

## Import Modules

In [2]:
import os
import pandas as pd
import numpy as np

import warnings
# Hide every FutureWarning so deprecation notices do not clutter cell output
warnings.simplefilter(action='ignore', category=FutureWarning)
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook
pd.options.mode.chained_assignment = None

## Mount Drive

In [3]:
import os

team_name = 'capstone-power-grid-protagonists'
colab_path = f'/content/drive/Shareddrives/{team_name}/project'
studiolab_path = f'/home/studio-lab-user/sagemaker-studiolab-notebooks/{team_name}'

# Resolve the project root and make it the working directory. Three locations
# are tried in order: the Google Colab shared drive, the AWS SageMaker Studio
# Lab folder, and finally the directory the notebook was started from.
try:
    # Try to mount Google Drive and set project path
    from google.colab import drive
    drive.flush_and_unmount()
    drive.mount('/content/drive')
    print('')

    root_path = colab_path
    os.chdir(root_path)

except Exception:
    # `Exception` still covers the ImportError raised outside Colab as well as
    # any mount/chdir failure, but (unlike a bare `except`) lets
    # KeyboardInterrupt and SystemExit through so the cell can be interrupted.
    try:
        # Try to set AWS SageMaker Studio Lab project path
        root_path = studiolab_path
        os.chdir(root_path)

    except OSError:
        # Studio Lab folder is missing or inaccessible: fall back to the
        # current working directory as root path
        root_path = os.getcwd()

        # If the current folder is 'notebooks', move up one level.
        # os.path handles a top-level '/notebooks' folder (parent is '/')
        # and platform-specific separators.
        if os.path.basename(root_path) == 'notebooks':
            root_path = os.path.dirname(root_path)
            os.chdir(root_path)

print('Current working directory is:')
print(os.getcwd())

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive

Current working directory is:
/content/drive/Shareddrives/capstone-power-grid-protagonists/project


## Import Data

In [4]:
# Columns that this notebook adds to the processed files. They are skipped on
# load so that re-running the notebook does not pick up its own earlier output.
exclude = ['weight', 'outage', 'disturbance_prob', 'outage_prob', 'potential_population_affected', 'ba_affected']


def read_processed(path, **read_kwargs):
    """Read a processed CSV, leaving out every column named in `exclude`.

    A callable `usecols` filters the columns while the file is parsed, so the
    header does not have to be read in a separate pass first.

    Parameters
    ----------
    path : str
        CSV location relative to the project root.
    **read_kwargs
        Extra keyword arguments forwarded to `pd.read_csv`.

    Returns
    -------
    pd.DataFrame
        The file's contents without the excluded columns.
    """
    return pd.read_csv(path,
                       low_memory=False,
                       usecols=lambda column: column not in exclude,
                       **read_kwargs)


# Processed data
plants = read_processed('data/processed/hifld/power_plants.csv')
subs = read_processed('data/processed/hifld/substations.csv')
bas = read_processed('data/processed/eia/balancing_authorities.csv')
disturb = read_processed('data/processed/doe/disturbances.csv',
                         dtype={'event_type': 'category'},
                         parse_dates=['date_start', 'date_end'])

# eGRID plant records: keep only rows that name a balancing authority and
# remove exact duplicates of the four selected columns
egrid = pd.read_csv('data/processed/epa/egrid.csv',
                    low_memory=False,
                    usecols=['orispl', 'fipscnty', 'pstatabb', 'bacode']) \
          .dropna(subset=['bacode']) \
          .drop_duplicates()

In [5]:
# Build a lookup from state name to abbreviation (the file has no header row)
state_table = pd.read_csv('data/raw/doe/states.csv',
                          header=None,
                          names=['name', 'abbr'])
states = dict(zip(state_table['name'], state_table['abbr']))

# Load the census columns that are needed and lower-case their names
census_columns = ['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2021']
census = pd.read_csv('data/raw/doe/census.csv',
                     usecols=census_columns,
                     encoding='latin-1')
census = census.rename(columns=str.lower)

# Replace state names with abbreviations; names missing from the lookup become NaN
census['stname'] = census['stname'].map(states)

# Combine the state code with the county code (left-padded with zeros to three
# characters) into one integer key, which is later matched against `countyfips`
county_code = census['county'].astype(str).str.rjust(3, fillchar='0')
census['county'] = (census['state'].astype(str) + county_code).astype('int64')

## Analysis

In [6]:
# Disturbances that specify only a state (no county) get a lower weight
state_only = disturb['county_affected'].isna()
disturb['weight'] = 1
disturb.loc[state_only, 'weight'] = .25

# Flag (1/0) whether the event caused an actual outage, i.e. it reports a
# positive demand loss or a positive number of affected customers
had_outage = disturb['demand_loss_mw'].gt(0) | disturb['num_customers_affected'].gt(0)
disturb['outage'] = had_outage.astype('int64')

#### Population

In [7]:
# Attach the census population estimate to disturbances, power plants and
# substations as `potential_population_affected`.

# Disturbances: match on state abbreviation + county name against the census
# rows with sumlev == 50
county_pop_table = census.loc[census['sumlev'] == 50, ['stname', 'ctyname', 'popestimate2021']]
disturb = disturb.merge(county_pop_table,
                        how='left',
                        left_on=['state_affected', 'county_affected'],
                        right_on=['stname', 'ctyname']) \
                 .rename(columns={'popestimate2021': 'potential_population_affected'}) \
                 .drop(columns=['stname', 'ctyname'])

# Power plants and substations: match on the numeric county code. The census
# key column is suffixed `_census` when its name clashes with the left frame's
# own `county` column, and is dropped again after the join.
fips_pop_table = census[['county', 'popestimate2021']]

plants = plants.merge(fips_pop_table,
                      how='left',
                      left_on=['countyfips'],
                      right_on=['county'],
                      suffixes=[None, '_census']) \
               .rename(columns={'popestimate2021': 'potential_population_affected'}) \
               .drop(columns=['county_census'])

subs = subs.merge(fips_pop_table,
                  how='left',
                  left_on=['countyfips'],
                  right_on=['county'],
                  suffixes=[None, '_census']) \
           .rename(columns={'popestimate2021': 'potential_population_affected'}) \
           .drop(columns=['county_census'])

# Disturbances without a county match fall back to the state population
# (census rows with sumlev == 40), scaled by .25
state_pop_table = census.loc[census['sumlev'] == 40, ['stname', 'popestimate2021']]
disturb = disturb.merge(state_pop_table,
                        how='left',
                        left_on='state_affected',
                        right_on='stname') \
                 .rename(columns={'popestimate2021': 'state_population'}) \
                 .drop(columns=['stname'])

# Assign the filled column back explicitly: an in-place fillna on a column
# selection is chained assignment and is not guaranteed to update the frame.
disturb['potential_population_affected'] = \
    disturb['potential_population_affected'].fillna(disturb['state_population'] * .25)
disturb = disturb.drop(columns=['state_population'])

#### Balancing Authorities

In [8]:
# Count eGRID plants per (county, balancing authority) and keep, for every
# county, the authority with the most plants. A stable sort makes ties
# reproducible: among equal counts the authority that sorts last by code wins.
orispl_count_by_ba = egrid.groupby(['fipscnty','bacode']).orispl.count().reset_index()
orispl_count_by_ba = orispl_count_by_ba.sort_values('orispl', kind='mergesort') \
                                       .drop_duplicates('fipscnty', keep='last')

# Merge the balancing authority that is most likely affected
disturb = disturb.merge(orispl_count_by_ba[['bacode','fipscnty']],
                        how='left',
                        left_on='county_affected_fips',
                        right_on='fipscnty') \
                 .rename(columns={'bacode': 'ba_affected'})

# If a balancing authority was not found by county, match by state
orispl_count_by_ba = egrid.groupby(['pstatabb','bacode']).orispl.count().reset_index()
orispl_count_by_ba = orispl_count_by_ba.sort_values('orispl', kind='mergesort') \
                                       .drop_duplicates('pstatabb', keep='last')
state_majority_ba_dict = dict(zip(orispl_count_by_ba.pstatabb, orispl_count_by_ba.bacode))

# `.map` yields NaN for a state that has no eGRID entry (or is missing)
# instead of raising KeyError, so such events simply stay without an authority
disturb['ba_affected_by_state'] = disturb['state_affected'].map(state_majority_ba_dict)

# Explicit assignment rather than an in-place fillna on a column selection,
# which is chained assignment and not guaranteed to update the frame
disturb['ba_affected'] = disturb['ba_affected'].fillna(disturb['ba_affected_by_state'])

disturb = disturb.reset_index(drop=True)

In [9]:
# Calculate probability of a disturbance, whether there was an actual outage or not.
# Per balancing authority: summed event weights and summed potential population.
ba_totals = disturb.groupby('ba_affected')[['weight', 'potential_population_affected']].sum()
bas = bas.merge(ba_totals,
                how='left',
                left_on='ba_code',
                right_on='ba_affected') \
         .rename(columns={'weight': 'disturbance_count'})

# Weighted count divided by the number of distinct event ids; authorities with
# no matching disturbance get 0. The result is assigned explicitly because an
# in-place fillna on a column selection is chained assignment.
event_count = disturb['event_id'].nunique(dropna=False)
bas['disturbance_prob'] = (bas['disturbance_count'] / event_count).fillna(0)

bas = bas.drop(columns=['disturbance_count'])

In [10]:
# Calculate probability of an actual outage.
# Per balancing authority: summed weights of the events flagged as outages.
outage_totals = disturb[disturb['outage'] == 1].groupby('ba_affected')['weight'].sum()
bas = bas.merge(outage_totals,
                how='left',
                left_on='ba_code',
                right_on='ba_affected') \
         .rename(columns={'weight': 'outage_count'})

# The denominator is the number of distinct event ids over all events, not only
# outages; authorities with no outage get 0. The result is assigned explicitly
# because an in-place fillna on a column selection is chained assignment.
event_count = disturb['event_id'].nunique(dropna=False)
bas['outage_prob'] = (bas['outage_count'] / event_count).fillna(0)

bas = bas.drop(columns=['outage_count'])

#### Power Plants

In [11]:
# Calculate probability of a disturbance, whether there was an actual outage or not.
# Summed event weights per county and per state.
county_weights = disturb.groupby('county_affected_fips')['weight'].sum()
state_weights = disturb.groupby('state_affected')['weight'].sum()

plants = plants.merge(county_weights,
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'disturbance_count'})

plants = plants.merge(state_weights,
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'disturbance_count_state'})

# If county data was not available, fill in data based on the state.
# NOTE(review): the state total sums every event in the state, including
# county-level events with weight 1, so a plant in a county with no recorded
# event receives the whole state's count - confirm this is intended.
# Assigned explicitly: an in-place fillna on a column selection is chained assignment.
plants['disturbance_count'] = plants['disturbance_count'].fillna(plants['disturbance_count_state'])

# Weighted count divided by the number of distinct event ids; 0 when neither
# the county nor the state has any disturbance
event_count = disturb['event_id'].nunique(dropna=False)
plants['disturbance_prob'] = (plants['disturbance_count'] / event_count).fillna(0)

plants = plants.drop(columns=['disturbance_count', 'disturbance_count_state'])

In [12]:
# Calculate probability of an actual outage.
# Summed weights of outage events per county and per state.
outages = disturb[disturb['outage'] == 1]
county_weights = outages.groupby('county_affected_fips')['weight'].sum()
state_weights = outages.groupby('state_affected')['weight'].sum()

plants = plants.merge(county_weights,
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'outage_count'})

plants = plants.merge(state_weights,
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'outage_count_state'})

# Fall back to the state total when the county has no outage on record.
# NOTE(review): the state total includes county-level outages from the whole
# state - confirm this is the intended fallback.
# Assigned explicitly: an in-place fillna on a column selection is chained assignment.
plants['outage_count'] = plants['outage_count'].fillna(plants['outage_count_state'])

# The denominator is the number of distinct event ids over all events, not only outages
event_count = disturb['event_id'].nunique(dropna=False)
plants['outage_prob'] = (plants['outage_count'] / event_count).fillna(0)

plants = plants.drop(columns=['outage_count', 'outage_count_state'])

#### Substations

In [13]:
# Calculate probability of a disturbance, whether there was an actual outage or not.
# Summed event weights per county and per state.
county_weights = disturb.groupby('county_affected_fips')['weight'].sum()
state_weights = disturb.groupby('state_affected')['weight'].sum()

subs = subs.merge(county_weights,
                  how='left',
                  left_on='countyfips',
                  right_on='county_affected_fips') \
           .rename(columns={'weight': 'disturbance_count'})

subs = subs.merge(state_weights,
                  how='left',
                  left_on='state',
                  right_on='state_affected') \
           .rename(columns={'weight': 'disturbance_count_state'})

# If county data was not available, fill in data based on the state.
# NOTE(review): the state total sums every event in the state, including
# county-level events with weight 1, so a substation in a county with no
# recorded event receives the whole state's count - confirm this is intended.
# Assigned explicitly: an in-place fillna on a column selection is chained assignment.
subs['disturbance_count'] = subs['disturbance_count'].fillna(subs['disturbance_count_state'])

# Weighted count divided by the number of distinct event ids; 0 when neither
# the county nor the state has any disturbance
event_count = disturb['event_id'].nunique(dropna=False)
subs['disturbance_prob'] = (subs['disturbance_count'] / event_count).fillna(0)

subs = subs.drop(columns=['disturbance_count', 'disturbance_count_state'])

In [14]:
# Calculate probability of an actual outage.
# Summed weights of outage events per county and per state.
outages = disturb[disturb['outage'] == 1]
county_weights = outages.groupby('county_affected_fips')['weight'].sum()
state_weights = outages.groupby('state_affected')['weight'].sum()

subs = subs.merge(county_weights,
                  how='left',
                  left_on='countyfips',
                  right_on='county_affected_fips') \
           .rename(columns={'weight': 'outage_count'})

subs = subs.merge(state_weights,
                  how='left',
                  left_on='state',
                  right_on='state_affected') \
           .rename(columns={'weight': 'outage_count_state'})

# Fall back to the state total when the county has no outage on record.
# NOTE(review): the state total includes county-level outages from the whole
# state - confirm this is the intended fallback.
# Assigned explicitly: an in-place fillna on a column selection is chained assignment.
subs['outage_count'] = subs['outage_count'].fillna(subs['outage_count_state'])

# The denominator is the number of distinct event ids over all events, not only outages
event_count = disturb['event_id'].nunique(dropna=False)
subs['outage_prob'] = (subs['outage_count'] / event_count).fillna(0)

subs = subs.drop(columns=['outage_count', 'outage_count_state'])

## Enriched Output

In [15]:
# Display three randomly selected rows of the enriched power plant table
# (no random_state is set, so the rows differ between runs)
plants.sample(3)

Unnamed: 0,plant_code,name,address,city,state,zip,telephone,type,status,county,...,net_gen,cap_factor,lines,source_lat,source_lon,connected_sub,connected_ba,potential_population_affected,disturbance_prob,outage_prob
6324,57590,"Minco Wind I, Llc",491 County Road,Minco,OK,73059,(561) 691-7171,Onshore Wind Turbine,Op,Grady,...,281073.0,0.323447,0,35.280278,-97.975556,,SWPP,55508.0,0.011155,0.006751
2752,7869,Glenwood Landing,400 Shore Road,Glenwood Landing,NY,11547,(800) 642-4272,Natural Gas Fired Combustion Turbine; Petroleu...,Op,Nassau,...,80093.0,0.093583,1,40.8275,-73.6478,124150.0,NYIS,1390907.0,0.000783,0.000783
9030,60423,Fairhaven C,197 New Boston Rd,Fairhaven,MA,2719,,Solar Photovoltaic,Op,Bristol,...,2925.0,0.20869,0,41.66303,-70.86323,,ISNE,580164.0,0.009393,0.005969


In [16]:
# Display three randomly selected rows of the enriched substation table
# (no random_state is set, so the rows differ between runs)
subs.sample(3)

Unnamed: 0,sub_code,name,city,state,zip,type,status,county,countyfips,country,...,min_volt,max_infer,min_infer,connected_sub,deg_cent,bet_cent,clust_coef,potential_population_affected,disturbance_prob,outage_prob
64648,208303,Unknown208239,Rocky Boy,MT,0,Substation,In Service,Hill,30041,USA,...,69.0,Y,Y,208302.0,2.2e-05,0.0,0.0,16179.0,0.000391,0.000391
37076,151925,Military Trail,West Palm Beach,FL,33415,Substation,In Service,Palm Beach,12099,USA,...,138.0,Y,Y,,4.4e-05,1.7e-05,0.0,1497987.0,0.000783,0.000783
19244,132616,Aldene,Cranford Twp,NJ,7016,Substation,In Service,Union,34039,USA,...,230.0,Y,Y,133141.0,6.7e-05,4e-05,0.0,572114.0,0.012133,0.008023


In [17]:
# Display three randomly selected rows of the enriched balancing authority table
# (no random_state is set, so the rows differ between runs)
bas.sample(3)

Unnamed: 0,ba_code,ba_name,time_zone,region_country_code,region_country_name,generation_only_ba,demand_by_ba_subregion,us_ba,connected_ba,lat,lon,deg_cent,bet_cent,clust_coef,potential_population_affected,disturbance_prob,outage_prob
56,TIDC,Turlock Irrigation District,Pacific,CAL,California,No,No,Yes,BANC; CISO,37.614709,-120.720719,0.001109,4.515416e-06,0.044944,2211996.0,0.001566,0.000391
12,DOPD,PUD No. 1 of Douglas County,Pacific,NW,Northwest,No,No,Yes,BPAT; CHPD,48.172212,-119.686599,0.000475,9.030831e-07,0.307692,42634.0,0.000391,0.000391
66,HQT,Hydro-Quebec TransEnergie,,CAN,Canada,No,No,No,NBSO,46.0,-75.140082,0.000634,0.0002023378,0.5,,0.0,0.0


In [18]:
# Keep only the columns listed below, in this order; every other column
# (such as the intermediate weight and outage flags) is dropped
output_columns = [
    'event_id',
    'date_start',
    'date_end',
    'state_affected',
    'county_affected',
    'county_affected_fips',
    'ba_affected',
    'nerc_region',
    'event_type',
    'demand_loss_mw',
    'num_customers_affected',
    'potential_population_affected',
]
disturb = disturb.loc[:, output_columns]

disturb.sample(3)

Unnamed: 0,event_id,date_start,date_end,state_affected,county_affected,county_affected_fips,ba_affected,nerc_region,event_type,demand_loss_mw,num_customers_affected,potential_population_affected
3305,1703,2015-05-25 22:45:00,2015-05-28 01:25:00,TX,Fort Bend County,48157.0,ERCO,TRE,severe_weather,,61000.0,858527.0
3099,1564,2016-05-24 08:00:00,1900-12-01 00:00:00,MO,,,SWPP,SERC,vandalism/cyber_attack,0.0,0.0,1542046.75
168,91,2021-11-16 17:37:00,2021-11-16 18:21:00,MD,Baltimore County,24005.0,PJM,RF,system_operations,0.0,0.0,849316.0


In [19]:
# Write the enriched tables back to the processed-data folder without the index.
# These are the same paths the "Import Data" cell reads, so the input files
# are overwritten in place.
enriched_tables = {
    'data/processed/hifld/power_plants.csv': plants,
    'data/processed/hifld/substations.csv': subs,
    'data/processed/eia/balancing_authorities.csv': bas,
    'data/processed/doe/disturbances.csv': disturb,
}
for csv_path, table in enriched_tables.items():
    table.to_csv(csv_path, index=False)