In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the standardized grid-disruption events and preview the first rows.
source_csv = '../data/Grid_Disruption_00_14_standardized.csv'
data = pd.read_csv(source_csv)
data.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags
0,Severe Weather - Thunderstorms,2014,6/30/2014,8:00 PM,7/2/2014,6:30 PM,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm"
1,Severe Weather - Thunderstorms,2014,6/30/2014,11:20 PM,7/1/2014,5:00 PM,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm"
2,Severe Weather - Thunderstorms,2014,6/30/2014,5:55 PM,7/1/2014,2:53 AM,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm"
3,Fuel Supply Emergency - Coal,2014,6/27/2014,1:21 PM,Unknown,Unknown,We Energies,Wisconsin,MRO,Unknown,Unknown,"fuel supply emergency, coal"
4,Physical Attack - Vandalism,2014,6/24/2014,2:54 PM,6/24/2014,2:55 PM,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical"


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1652, 12)

### Drop Canada rows

In [4]:
# Drop every event whose 'Geographic Areas' text mentions Canada.
# `na=False` makes rows with a missing area count as "no match", so they are
# kept. The former `!= np.nan` guard was always True (NaN never compares equal
# to anything, including itself) and therefore filtered nothing.
mentions_canada = data['Geographic Areas'].str.contains('Canada', na=False)
# .copy() yields an independent frame, so the column assignments in later cells
# operate on real data rather than on a slice of the original frame.
data = data[~mentions_canada].copy()

In [5]:
# Count the rows whose customer count is already missing (NaN).
data['Number of Customers Affected'].isnull().sum()

213

In [6]:
# Frequency of each raw value; used to spot placeholder strings and
# formatting quirks (thousands separators, free text) that need cleaning.
data['Number of Customers Affected'].value_counts()

0                                      184
Unknown                                137
-                                       27
UNK                                     26
50,000                                  22
1                                       14
60,000                                  13
150,000                                 13
200,000                                 11
80,000                                  10
75,000                                  10
70,000                                  10
63,000                                   9
65,000                                   9
100,000                                  8
190,000                                  8
300,000                                  7
105,000                                  7
140,000                                  7
56,000                                   7
130,000                                  7
160,000                                  7
127,000                                  6
175,000    

### Convert 'Unknown' and similar alphabetic strings to NaN

In [7]:
def _alpha_text_to_nan(value):
    """Return NaN for purely alphabetic strings; pass any other value through.

    Only ``str`` values are inspected, so floats (NaN) and integers are left
    untouched and the cell can be re-run after numeric values have been
    written into the column.
    """
    if isinstance(value, str) and value.isalpha():
        return np.nan
    return value


data['Number of Customers Affected'] = data['Number of Customers Affected'].map(_alpha_text_to_nan)

### Convert '-' to NaN

In [8]:
def _dash_to_nan(value):
    """Return NaN when the value is exactly the string '-', else the value itself.

    A plain equality test is safe for every element type (str, float, int), so
    no type gate is required and the cell can be re-run at any point.
    """
    return np.nan if value == '-' else value


data['Number of Customers Affected'] = data['Number of Customers Affected'].map(_dash_to_nan)


In [9]:
# Re-count missing values now that placeholder strings have been set to NaN.
data['Number of Customers Affected'].isnull().sum()

413

### Remove commas

In [10]:
def _strip_commas(value):
    """Remove thousands separators from string values; pass others through.

    Gating on ``str`` (rather than "anything that is not a float") keeps
    integers intact instead of raising AttributeError on ``int.replace``.
    """
    return value.replace(',', '') if isinstance(value, str) else value


data['Number of Customers Affected'] = data['Number of Customers Affected'].map(_strip_commas)


### Remove 'Approx. '

In [11]:
def _strip_approx(value):
    """Remove the literal text 'Approx. ' from string values; pass others through.

    Gating on ``str`` (rather than "anything that is not a float") keeps
    integers intact instead of raising AttributeError on ``int.replace``.
    """
    return value.replace('Approx. ', '') if isinstance(value, str) else value


data['Number of Customers Affected'] = data['Number of Customers Affected'].map(_strip_approx)


In [12]:
# List the rows whose customer count still contains a space, i.e. free-text
# entries that need a manual decision. Missing values count as "no match".
has_space = data['Number of Customers Affected'].str.contains(' ', na=False)
data[has_space]

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags
1191,Electrical System Separation/Load Shedding/ Im...,2007,9/18/2007,5:15 a.m.,9/18/2007,6:30 a.m.,Great River Energy,"Minnesota, North Dakota, Manitoba",MRO,"8,000-10,000",GRE (1900) Total 11175,"severe weather, storm, islanding, load shedding"
1262,HIgh Winds,2006,12/14/2006,12:07 p.m.,12/17/2006,12:00 p.m.,PacifiCorp,State of Oregon Coastal area,WECC,,111000 (peak),"severe weather, wind"
1290,Made Public Appeals,2006,8/7/2006,1:00 p.m.,8/7/2006,6:00 p.m.,American Electric Power,"Tulsa, Oklahoma",RFC,75,Major Industrial Customer Load Reduction,public appeal
1299,Widespread Heat Wave/CAISO Implementation of S...,2006,7/24/2006,2:33 p.m.,7/24/2006,5:33 p.m.,Southern California Edison Company,California,WECC,414,Interruptible Tarriff 1-6 customers,"severe weather, heat"
1301,Severe Storms (3) (Many customers experienced ...,2006,7/19/2006,6:00 p.m.,7/31/2006,8:00 a.m.,Ameren Corporation,Greater St. Louis Metropolitan area (Missouri ...,MRO,1500,700000 (peak) 2500000 (actual),"severe weather, storm"
1323,Load Shed/Made Public Appeals/Rolling Blackouts,2006,4/17/2006,4:20 p.m.,4/17/2006,6:30 p.m.,Austin Energy,State of Texas (all of Austin Energy),ERCOT,37- 40,8000 -10000,"load shedding, public appeal"
1373,Hurricane Katrina,2005,8/29/2005,6:00 a.m.,8/30/2005,6:00 a.m.,Entergy Corporation,"Buras, Louisiana",SPP,,1.1 million and 100000 gas customers,"severe weather, hurricane/tropical storm"
1496,Severe Thunderstorms,2004,5/21/2004,4:00 p.m.,5/24/2004,8:00 p.m.,Detroit Edison,Southeast Michigan,ECAR,630,Greater than 250000,"severe weather, thunderstorm"
1498,High Winds and Heavy Rains,2004,5/21/2004,5:30 a.m.,5/25/2004,12:00 a.m.,Allegheny Power,"Western Pennsylvania, Northern West Virginia, ...",MAAC,"60 at peak, total 162",94366 at peak total 225353,"severe weather, wind, rain"
1501,Strong Thunderstorms,2004,5/11/2004,3:30 p.m.,5/11/2004,6:00 p.m.,CenterPoint Energy,"Houston, Texas and surrounding suburban areas",ERCOT,Approx. 85,62500 at peak,"severe weather, thunderstorm"


### Manually set specific rows

In [13]:
# Hand-curated replacements for free-text customer counts, keyed by row label.
# The trailing comment on each entry is the raw value found in the source data.
manual_customer_counts = {
    1637: 2000000,   # 2 million
    1191: 11175,     # GRE (1900) Total 11175
    1262: 111000,    # 111000 (peak)
    1290: np.nan,    # Major Industrial Customer Load Reduction
    1299: 4,         # Interruptible Tarriff 1-6 customers
    1301: 2500000,   # 700000 (peak) 2500000 (actual)
    1323: 9000,      # 8000 -10000
    1351: np.nan,    # --
    1373: 1100000,   # 1.1 million and 100000 gas customers
    1383: np.nan,    # PG&E
    1412: np.nan,    # PG&E
    1496: 250000,    # Greater than 250000
    1498: 225353,    # 94366 at peak total 225353
    1501: 62500,     # 62500 at peak
    1505: 85000,     # 85000 at peak
    1512: np.nan,    # PG&E
    1529: 6,         # 6 (utilities)
    1530: 200000,    # 200000 (Peak)
    1532: 104195,    # 104195 at 5:23 p.m. 11/13/03
    1540: 108000,    # 108000 (Dist. And Trans. Combined)
    1541: 530000,    # Over 530000 peak on 9/19/03
    1543: 50000,     # Under 50000
    1545: 32000,     # peak 32000 9/18/03 7:00 p.m.
    1547: 1800000,   # 1.8 million
    1548: 4,         # 4 (industrial)
    1550: 133000,    # 93000 at peak 133000 cumulative
    1576: 115739,    # 102842 (Georgia); 12897 (Alabama)
    1582: 340000,    # over 340000
    1587: 1500000,   # 1.5 million
    1595: 1,         # 1 PG&E
    1599: 1,         # 1 PG&E
    1600: 1,         # 1 PG&E
    1609: 36073,     # (residential) 36073
    1612: 127566,    # 70848; 56718
    1615: 50000,     # 25000 per hour
    # Midpoint of the reported range, same rule as row 1323. The earlier value
    # of 425000 was a typo, ten times larger than the range allows.
    1645: 42500,     # 40000-45000
}

# Row labels from the table that are no longer in the frame (for example rows
# removed by an earlier filter). Kept in a variable so they can be inspected.
manual_labels_not_found = []

for row_label, customer_count in manual_customer_counts.items():
    # Assigning through .loc with an unknown label would append a new,
    # mostly-empty row instead of failing, so absent labels are skipped.
    if row_label not in data.index:
        manual_labels_not_found.append(row_label)
        continue
    data.loc[row_label, 'Number of Customers Affected'] = customer_count

In [14]:
def _int_unless_float(value):
    """Leave float values (including NaN) as they are; convert everything else with int()."""
    if type(value) is float:
        return value
    return int(value)


data['Number of Customers Affected'] = data['Number of Customers Affected'].map(_int_unless_float)


In [15]:
# Persist the cleaned frame; index=False leaves the row labels out of the file.
cleaned_csv = '../data/Grid_Disruption_00_14_standardized_cleaned.csv'
data.to_csv(cleaned_csv, index=False)