# Electric Disturbance Events

## Import Modules

In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

## Mount Drive

In [2]:
import os

team_name = 'capstone-power-grid-protagonists'
colab_path = f'/content/drive/Shareddrives/{team_name}/project'
studiolab_path = f'/home/studio-lab-user/sagemaker-studiolab-notebooks/{team_name}'

try:
    # Try to mount Google Drive and set project path
    from google.colab import drive
    drive.flush_and_unmount()
    drive.mount('/content/drive')
    print('')

    root_path = colab_path
    os.chdir(root_path)

except:
    try:
        # Try to set AWS SageMaker Studio Lab project path
        root_path = studiolab_path
        os.chdir(root_path)
    
    except:
        # Set current working directory as root path
        root_path = os.getcwd()
        os.chdir(root_path)

        # If the current folder is 'notebooks', move up one level
        if root_path.endswith('/notebooks'):
            root_path = '/'.join(root_path.split('/')[:-1])
            os.chdir(root_path)
        
print('Current working directory is:')
print(os.getcwd())

Mounted at /content/drive

Current working directory is:
/content/drive/Shareddrives/capstone-power-grid-protagonists/project


## Import Data

In [3]:
# Processed data
plants = pd.read_csv('data/processed/hifld/power_plants.csv', low_memory=False)
subs = pd.read_csv('data/processed/hifld/substations.csv', low_memory=False)
bas = pd.read_csv('data/processed/eia/balancing_authorities.csv', low_memory=False)
egrid = pd.read_csv('data/processed/epa/egrid.csv', low_memory=False, usecols=['orispl', 'fipscnty','pstatabb','bacode'])
disturb = pd.read_csv('data/processed/doe/disturbances.csv', low_memory=False)

In [4]:
# Import states and abbreviations
states = pd.read_csv('data/raw/doe/states.csv',
                     header=None,
                     names=['name', 'abbr'])
states = dict(states.values)

# Import census data
census = pd.read_csv('data/raw/doe/census.csv',
                     usecols=['SUMLEV', 'STNAME','CTYNAME','POPESTIMATE2021'],
                     encoding='latin-1')
census.columns = census.columns.str.lower()
census['stname'] = census['stname'].apply(lambda x: states[x] if x in states.keys() else np.NaN)

## Analysis

In [5]:
# Assign a lower weight to those disturbances where there is not county specified and only a state
disturb['weight'] = 1
disturb.loc[disturb['county_affected'].isna(), 'weight'] = .25

# Assign a boolean value as to whether there was an actual outage
disturb['outage'] = 0
disturb.loc[(disturb['demand_loss_mw'] > 0) | (disturb['num_customers_affected'] > 0), 'outage'] = 1

#### Population

In [6]:
# Merge disturbances with census population data
disturb = disturb.merge(census[census['sumlev']==50],
                        how='left',
                        left_on=['state_affected', 'county_affected'],
                        right_on=['stname', 'ctyname']) \
                 .rename(columns={'popestimate2021':'potential_population_affected'})

disturb = disturb.merge(census[census['sumlev']==40],
                        how='left',
                        left_on='state_affected',
                        right_on='stname') \
                 .rename(columns={'popestimate2021':'state_population'})

disturb['potential_population_affected'].fillna(disturb['state_population'], inplace=True)

#### Balancing Authorities

In [7]:
# Merge the balancing authority that is most likely affected
disturb = disturb.merge(egrid,
                        how='left',
                        left_on='county_affected_fips',
                        right_on='fipscnty')

disturb.rename(columns={'bacode': 'ba_affected'}, inplace=True)

# If a balancing authority was not found by county, match by state
orispl_count_by_ba = egrid.groupby(['pstatabb','bacode']).orispl.count().reset_index()
orispl_count_by_ba = orispl_count_by_ba.sort_values('orispl').drop_duplicates('pstatabb', keep='last')
state_majority_ba_dict = dict(zip(orispl_count_by_ba.pstatabb, orispl_count_by_ba.bacode))
disturb['ba_affected_by_state'] = disturb['state_affected'].apply(lambda x: state_majority_ba_dict[x])

disturb['ba_affected'].fillna(disturb['ba_affected_by_state'], inplace=True)

disturb.reset_index(drop=True, inplace=True)

In [8]:
# Calculate probability of a disturbance, whether there was an actual outage or not
bas = bas.merge(disturb.groupby('ba_affected')['weight'].sum(),
                how='left',
                left_on='ba_code',
                right_on='ba_affected') \
         .rename(columns={'weight': 'disturbance_count'})

bas['disturbance_prob'] = bas['disturbance_count']/len(disturb['event_id'].unique())
bas['disturbance_prob'].fillna(0, inplace=True)

bas.drop(columns=['disturbance_count'], inplace=True)

In [9]:
# Calculate probability of an actual outage
bas = bas.merge(disturb[disturb['outage'] == 1].groupby('ba_affected')['weight'].sum(),
                how='left',
                left_on='ba_code',
                right_on='ba_affected') \
         .rename(columns={'weight': 'outage_count'})

bas['outage_prob'] = bas['outage_count']/len(disturb['event_id'].unique())
bas['outage_prob'].fillna(0, inplace=True)

bas.drop(columns=['outage_count'], inplace=True)

#### Power Plants

In [10]:
# Calculate probability of a disturbance, whether there was an actual outage or not
plants = plants.merge(disturb.groupby('county_affected_fips')['weight'].sum(),
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'disturbance_count'})

plants = plants.merge(disturb.groupby('state_affected')['weight'].sum(),
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'disturbance_count_state'})

# If county data was not available, fill in data based on the state which has a lower weight
plants['disturbance_count'].fillna(plants['disturbance_count_state'], inplace=True)

plants['disturbance_prob'] = plants['disturbance_count']/len(disturb['event_id'].unique())
plants['disturbance_prob'].fillna(0, inplace=True)

plants.drop(columns=['disturbance_count', 'disturbance_count_state'], inplace=True)

In [11]:
# Calculate probability of an actual outage
plants = plants.merge(disturb[disturb['outage'] == 1].groupby('county_affected_fips')['weight'].sum(),
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'outage_count'})

plants = plants.merge(disturb[disturb['outage'] == 1].groupby('state_affected')['weight'].sum(),
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'outage_count_state'})

plants['outage_count'].fillna(plants['outage_count_state'], inplace=True)

plants['outage_prob'] = plants['outage_count']/len(disturb['event_id'].unique())
plants['outage_prob'].fillna(0, inplace=True)

plants.drop(columns=['outage_count', 'outage_count_state'], inplace=True)

#### Substations

In [12]:
# Calculate probability of a disturbance, whether there was an actual outage or not
subs = subs.merge(disturb.groupby('county_affected_fips')['weight'].sum(),
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'disturbance_count'})

subs = subs.merge(disturb.groupby('state_affected')['weight'].sum(),
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'disturbance_count_state'})

# If county data was not available, fill in data based on the state which has a lower weight
subs['disturbance_count'].fillna(subs['disturbance_count_state'], inplace=True)

subs['disturbance_prob'] = subs['disturbance_count']/len(disturb['event_id'].unique())
subs['disturbance_prob'].fillna(0, inplace=True)

subs.drop(columns=['disturbance_count', 'disturbance_count_state'], inplace=True)

In [13]:
# Calculate probability of an actual outage
subs = subs.merge(disturb[disturb['outage'] == 1].groupby('county_affected_fips')['weight'].sum(),
                      how='left',
                      left_on='countyfips',
                      right_on='county_affected_fips') \
               .rename(columns={'weight': 'outage_count'})

subs = subs.merge(disturb[disturb['outage'] == 1].groupby('state_affected')['weight'].sum(),
                      how='left',
                      left_on='state',
                      right_on='state_affected') \
               .rename(columns={'weight': 'outage_count_state'})

subs['outage_count'].fillna(subs['outage_count_state'], inplace=True)

subs['outage_prob'] = subs['outage_count']/len(disturb['event_id'].unique())
subs['outage_prob'].fillna(0, inplace=True)

subs.drop(columns=['outage_count', 'outage_count_state'], inplace=True)

## Enriched Output

In [14]:
plants.sample(3)

Unnamed: 0,plant_code,name,address,city,state,zip,telephone,type,status,county,...,oil_used,net_gen,cap_factor,lines,source_lat,source_lon,connected_sub,connected_ba,disturbance_prob,outage_prob
6434,57708,Catalina Solar Llc,1232 Catalina Rd,Rosamond,CA,93560,(888) 903-6926,Solar Photovoltaic,Op,Kern,...,0.0,269368.0,0.279543,0,34.938261,-118.334492,306009.0,CISO,12.079843,4.026614
11932,63852,"Poet Biorefining - North Manchester, Llc",868 E 800 N,North Manchester,IN,46962,,Natural Gas Steam Turbine,Op,Wabash,...,0.0,0.0,0.0,0,40.941967,-85.803585,,MISO,0.081898,0.003033
10533,61977,Cornillie,35342 Kost Trail,North Branch,MN,55056,,Solar Photovoltaic,Op,Chisago,...,0.0,1436.0,0.163927,0,45.46013,-92.89609,,MISO,0.045401,0.123973


In [15]:
subs.sample(3)

Unnamed: 0,sub_code,name,city,state,zip,type,status,county,countyfips,country,...,max_volt,min_volt,max_infer,min_infer,connected_sub,deg_cent,bet_cent,clust_coef,disturbance_prob,outage_prob
44904,159926,Tap159926,Alcester,SD,57001,Tap,In Service,Union,46127,USA,...,69.0,69.0,Y,Y,159924; 159932,6.7e-05,2.695066e-05,0.0,0.025049,0.009589
27886,173965,Unknown173965,Carlyle,IL,62231,Substation,,Clinton,17027,USA,...,,,N,N,173962,4.4e-05,5.522203e-05,0.0,1.214971,0.974462
6006,114796,Unknown114796,Fremont,OH,43420,Substation,In Service,Sandusky,39143,USA,...,,,N,N,147772,4.4e-05,1.233649e-09,0.0,0.20362,0.099511


In [16]:
bas.sample(3)

Unnamed: 0,ba_code,ba_name,time_zone,region_country_code,region_country_name,generation_only_ba,demand_by_ba_subregion,us_ba,connected_ba,lat,lon,deg_cent,bet_cent,clust_coef,disturbance_prob,outage_prob
16,FMPP,Florida Municipal Power Pool,Eastern,FLA,Florida,No,No,Yes,FPC; FPL; JEA; TEC,28.060128,-81.251673,0.002296,1e-05,0.039801,0.120939,0.109198
4,AZPS,Arizona Public Service Company,Arizona,SW,Southwest,No,No,Yes,CISO; IID; LDWP; PACE; PNM; SRP; TEPC; WACM; WALC,33.581322,-112.235108,0.005939,0.000529,0.020246,0.488063,0.375049
63,YAD,"Alcoa Power Generating, Inc. - Yadkin Division",Eastern,CAR,Carolinas,Yes,No,Yes,CPLE; DUK,35.474932,-80.144107,0.000634,2e-06,0.153846,0.0,0.0


In [17]:
# Drop and reorder columns
disturb = disturb[['event_id',
                   'date_start',
                   'date_end',
                   'state_affected',
                   'county_affected',
                   'county_affected_fips',
                   'ba_affected',
                   'nerc_region',
                   'alert_criteria',
                   'event_type',
                   'demand_loss_mw',
                   'num_customers_affected',
                   'potential_population_affected']]

disturb.sample(3)

Unnamed: 0,event_id,date_start,date_end,state_affected,county_affected,county_affected_fips,ba_affected,nerc_region,alert_criteria,event_type,demand_loss_mw,num_customers_affected,potential_population_affected
23388,177,2021-08-24 17:00:00,2021-08-26 14:07:00,MI,Ottawa County,26139.0,MISO,RF,"Loss of electric service to more than 50,000 c...",severe_weather,,84987.0,299157.0
116613,858,2019-10-30 06:32:00,2019-11-01 13:29:00,CA,Kern County,6029.0,CISO,WECC,"Loss of electric service to more than 50,000 c...",severe_weather,285.0,114402.0,917673.0
1971,4,2022-02-24 01:09:00,2022-02-24 02:59:00,CA,Kern County,6029.0,CISO,WECC,Complete loss of monitoring or control capabil...,system_operations,246.0,0.0,917673.0


In [18]:
plants.to_csv('data/processed/hifld/power_plants.csv', index=False)
subs.to_csv('data/processed/hifld/substations.csv', index=False)
bas.to_csv('data/processed/eia/balancing_authorities.csv', index=False)
disturb.to_csv('data/processed/doe/disturbances.csv', index=False)