In [144]:
import pandas as pd 

In [145]:
# Load the stage-2 DOE workbook into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_excel('../data/processed/DOE_Stage_2.xlsx')

In [146]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Month,Time Event Began,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
0,January,2011-01-26 09:25,2011-01-27 17:50,"Carson City, Nevada",WECC,Unknown,Cyber Event,0.0,0.0
1,February,2011-02-03 14:30,2011-02-03 18:00,"Bowie, Maryland",RFC,Unknown,Cyber Event,0.0,0.0
2,February,2011-02-17 13:00,2011-02-23 20:30,"Roseville, California",WECC,Unknown,Cyber Event,0.0,0.0
3,March,2011-03-14 07:30,2011-03-14 14:15,"Baltimore, Maryland",RFC,Unknown,Cyber Event,0.0,0.0
4,April,2011-04-03 20:23,2011-04-05 11:00,Unknown,SERC,Unknown,Cyber Event,0.0,0.0


In [147]:
# Remember the original (mixed-case) headers, then switch every column label
# to lower case for the rest of the notebook.
col = df.columns.to_list()
df = df.rename(columns=str.lower)

In [148]:
# Inspect dtypes and non-null counts before any type conversion or imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3892 entries, 0 to 3891
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   month                         3892 non-null   object 
 1   time event began              3892 non-null   object 
 2   time of restoration           3892 non-null   object 
 3   area affected                 3892 non-null   object 
 4   nerc region                   3892 non-null   object 
 5   alert criteria                3892 non-null   object 
 6   event type                    3892 non-null   object 
 7   demand loss (mw)              2441 non-null   float64
 8   number of customers affected  3157 non-null   float64
dtypes: float64(2), object(7)
memory usage: 273.8+ KB


In [149]:
# Parse both event timestamp columns from 'YYYY-MM-DD HH:MM' text into
# datetime64 values in one column-wise pass.
df[["time event began", "time of restoration"]] = (
    df[["time event began", "time of restoration"]]
    .apply(pd.to_datetime, format='%Y-%m-%d %H:%M')
)

In [150]:
# Force both measurement columns to numeric; anything unparseable becomes NaN
# so the imputation steps below can treat it as missing.
df[['demand loss (mw)', 'number of customers affected']] = (
    df[['demand loss (mw)', 'number of customers affected']]
    .apply(pd.to_numeric, errors='coerce')
)

In [151]:
# Impute missing "demand loss (mw)" in two passes:
#   1. mean of the rows sharing the same event type AND NERC region;
#   2. for anything still missing, mean of the rows sharing the event type.
df['demand loss (mw)'] = (
    df.groupby(['event type', 'nerc region'])['demand loss (mw)']
    .transform(lambda grp: grp.fillna(grp.mean()))
)
# The final values are rounded to two decimals.
df['demand loss (mw)'] = (
    df.groupby(['event type'])['demand loss (mw)']
    .transform(lambda grp: grp.fillna(grp.mean()))
    .round(2)
)

In [152]:
# Impute missing "number of customers affected" in two passes:
#   1. mean of the rows sharing the same event type AND NERC region;
#   2. for anything still missing, mean of the rows sharing the event type.
df['number of customers affected'] = (
    df.groupby(['event type', 'nerc region'])['number of customers affected']
    .transform(lambda grp: grp.fillna(grp.mean()))
)
# Customer counts are whole numbers: round, then store as nullable Int64.
df['number of customers affected'] = (
    df.groupby(['event type'])['number of customers affected']
    .transform(lambda grp: grp.fillna(grp.mean()))
    .round(0)
    .astype('Int64')
)

In [153]:
# Fill any remaining missing "demand loss (mw)" values with the group mean.
# NOTE(review): the earlier wording described this as an event-type-only fill,
# but the code groups by BOTH 'event type' and 'nerc region'. The column was
# already filled at both levels in an earlier cell, so this looks like a
# no-op — confirm and consider removing.
df['demand loss (mw)'] = df.groupby(['event type','nerc region'])['demand loss (mw)'].transform(lambda x: x.fillna(x.mean()))

In [154]:
# Re-check dtypes and non-null counts after parsing and imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3892 entries, 0 to 3891
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   month                         3892 non-null   object        
 1   time event began              3892 non-null   datetime64[ns]
 2   time of restoration           3892 non-null   datetime64[ns]
 3   area affected                 3892 non-null   object        
 4   nerc region                   3892 non-null   object        
 5   alert criteria                3892 non-null   object        
 6   event type                    3892 non-null   object        
 7   demand loss (mw)              3892 non-null   float64       
 8   number of customers affected  3892 non-null   Int64         
dtypes: Int64(1), datetime64[ns](2), float64(1), object(5)
memory usage: 277.6+ KB


In [160]:
# Outage length: restoration timestamp minus start timestamp.
df['duration'] = df['time of restoration'].sub(df['time event began'])

# Keep only the rows whose restoration time precedes the start time.
negative_duration_df = df.loc[df['duration'].lt(pd.Timedelta(0))]

# Plain-text dump of the offending rows.
print(negative_duration_df)

         month    time event began time of restoration  \
9     November 2012-11-15 21:09:00 2012-11-15 15:13:00   
24        June 2021-06-15 11:55:00 2021-06-15 07:30:00   
25        July 2021-07-30 08:02:00 2021-07-30 08:00:00   
27     January 2022-01-18 22:22:00 2022-01-18 15:00:00   
29       March 2022-03-31 22:43:00 2022-03-31 14:53:00   
...        ...                 ...                 ...   
3886      July 2008-07-02 19:16:00 2008-07-02 00:00:00   
3888  November 2008-11-15 09:39:00 2008-11-15 00:00:00   
3889       May 2014-05-14 15:34:00 2014-05-14 00:00:00   
3890       May 2014-05-15 10:43:00 2014-05-15 00:00:00   
3891       May 2014-05-16 10:43:00 2014-05-16 00:00:00   

                                          area affected nerc region  \
9                                        Iowa; Michigan         MRO   
24                                               Texas:         TRE   
25                                           Wisconsin:         MRO   
27                 

In [155]:
# Work on an independent copy with upper-case headers and write it out as the
# "final" workbook (no index column).
df2 = df.copy().rename(columns=str.upper)
df2.to_excel('../data/processed/DOE_final.xlsx', index=False)

In [156]:
# Daily calendar covering the study period: one row per day, at midnight.
date_range = pd.DataFrame({'date': pd.date_range(start='2002-01-02', end='2023-06-30', freq='D')})

# Left-join the events onto the calendar so days without events are kept.
# 'time event began' carries a time of day while the calendar rows sit at
# midnight, so joining on the raw timestamp would only match events that began
# at exactly 00:00. Join on the event's calendar day instead.
df_merged = date_range.merge(
    df.assign(date=df['time event began'].dt.normalize()),
    how='left',
    on='date',
)

# Keep only the reporting columns, in display order; this also drops the raw
# start/restoration timestamps.
df_merged = df_merged[['date', 'month', 'area affected', 'nerc region', 'alert criteria', 'event type', 'demand loss (mw)', 'number of customers affected']]

In [158]:
# Look at the first 100 rows of the cleaned frame.
df.head(100)

Unnamed: 0,month,time event began,time of restoration,area affected,nerc region,alert criteria,event type,demand loss (mw),number of customers affected
0,January,2011-01-26 09:25:00,2011-01-27 17:50:00,"Carson City, Nevada",WECC,Unknown,Cyber Event,0.00,0
1,February,2011-02-03 14:30:00,2011-02-03 18:00:00,"Bowie, Maryland",RFC,Unknown,Cyber Event,0.00,0
2,February,2011-02-17 13:00:00,2011-02-23 20:30:00,"Roseville, California",WECC,Unknown,Cyber Event,0.00,0
3,March,2011-03-14 07:30:00,2011-03-14 14:15:00,"Baltimore, Maryland",RFC,Unknown,Cyber Event,0.00,0
4,April,2011-04-03 20:23:00,2011-04-05 11:00:00,Unknown,SERC,Unknown,Cyber Event,0.00,0
...,...,...,...,...,...,...,...,...,...
95,August,2013-08-05 18:35:00,2013-08-05 00:00:00,"Vancouver, British Columbia",WECC,Unknown,Unkown/Unspecified,724.47,114586
96,August,2013-08-29 14:57:00,2013-08-29 06:06:00,"Ashland, Wisconsin",MRO,Unknown,Unkown/Unspecified,15.00,7000
97,October,2013-10-21 05:18:00,2013-10-21 18:00:00,Location Unknown,WECC,Unknown,Unkown/Unspecified,115.00,433
98,November,2013-11-12 14:04:00,2013-11-12 14:16:00,"Valle, California",WECC,Unknown,Unkown/Unspecified,55.00,48400


In [159]:
# Export the cleaned frame with upper-case headers as the "full" workbook.
# The previous version copied `df` into `df3` and then immediately replaced it
# with a rename of `df2`, so the copy was discarded and the export was built
# from `df2`. Build the export from `df` itself.
df3 = df.copy()
df3 = df3.rename(str.upper, axis='columns')

df3.to_excel('../data/processed/DOE_full.xlsx', index=False)