In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import numpy as np

## Import facility data and NERC labels

In [2]:
# Facility-level generation/fuel data; pandas reads the zipped CSV directly.
facility_zip = 'Facility gen fuels and CO2 2017-05-25.zip'
path = os.path.join('Data storage', facility_zip)
facility_df = pd.read_csv(path, parse_dates=['datetime'])

In [8]:
# Lookup table pairing each plant id with its NERC region.
labels_csv = 'Facility NERC labels.csv'
path = os.path.join('Data storage', labels_csv)
labels = pd.read_csv(path)

In [9]:
# Preview the label table (plant id -> region).
labels.head()

Unnamed: 0,plant id,region
0,2,SERC
1,3,SERC
2,4,SERC
3,7,SERC
4,8,SERC


In [30]:
# Attach each plant's NERC region. A left join keeps plants that are missing
# from the label file (their `region` becomes NaN) instead of silently dropping
# them, so they still count toward state totals and can be located with
# `facility_df['region'].isnull()`.
facility_df = facility_df.merge(labels, on='plant id', how='left')

Filter out data older than 2014 to reduce size

In [31]:
# Columns carried forward into the rest of the analysis.
keep_cols = ['fuel', 'year', 'datetime', 'state', 'plant id', 'region',
             'generation (MWh)', 'elec fuel (mmbtu)']

# Rows from 2014 onward.
recent = facility_df['year'] >= 2014

# The state code is the last two characters of the `geography` string.
facility_df['state'] = facility_df['geography'].str[-2:]
facility_df = facility_df.loc[recent, keep_cols]

In [36]:
# Aggregate fuel categories (keys) and the facility-level fuel codes grouped
# under each one (values). The keys are the same codes later used to filter the
# `type` column of the state-level data. 'HPS' and 'DPV' list no facility fuel
# codes, so no facility row is ever assigned to them.
facility_fuel_cats = {'COW': ['SUB', 'BIT', 'LIG', 'WC', 'SC', 'RC', 'SGC'],
                      'NG': ['NG'],
                      'PEL': ['DFO', 'RFO', 'KER', 'JF',
                              'PG', 'WO', 'SGP'],
                      'PC': ['PC'],
                      'HYC': ['WAT'],
                      'HPS': [],
                      'GEO': ['GEO'],
                      'NUC': ['NUC'],
                      'OOG': ['BFG', 'OG', 'LFG'],
                      'OTH': ['OTH', 'MSN', 'MSW', 'PUR', 'TDF', 'WH'],
                      'SUN': ['SUN'],
                      'DPV': [],
                      'WAS': ['OBL', 'OBS', 'OBG', 'MSB', 'SLW'],
                      'WND': ['WND'],
                      'WWW': ['WDL', 'WDS', 'AB', 'BLQ']
                      }

# Reverse lookup (fuel code -> category), built once so fuel2category does not
# have to scan every list on each call.
_fuel_to_category = {fuel: category
                     for category, fuels in facility_fuel_cats.items()
                     for fuel in fuels}


def fuel2category(fuel):
    """Return the aggregate fuel category for a facility fuel code.

    Parameters
    ----------
    fuel : str
        Facility-level fuel code, e.g. 'SUB' or 'WAT'.

    Returns
    -------
    str or None
        The key of `facility_fuel_cats` whose list contains `fuel`, or None
        when the code is not listed under any category.
    """
    return _fuel_to_category.get(fuel)

In [39]:
# Label every facility row with the aggregate category that lists its fuel
# code; rows whose fuel is in no list keep a missing 'fuel category'.
for category, fuels in facility_fuel_cats.items():
    in_category = facility_df['fuel'].isin(fuels)
    facility_df.loc[in_category, 'fuel category'] = category

In [40]:
# Preview the facility rows with the new 'fuel category' column.
facility_df.head()

Unnamed: 0,fuel,year,datetime,state,plant id,region,generation (MWh),elec fuel (mmbtu),fuel category
0,NG,2017,2017-03-01,FL,10275,FRCC,0.0,0.0,NG
1,NG,2017,2017-02-01,FL,10275,FRCC,0.0,0.0,NG
2,NG,2017,2017-01-01,FL,10275,FRCC,0.0,0.0,NG
3,NG,2016,2016-12-01,FL,10275,FRCC,0.0,0.0,NG
4,NG,2016,2016-11-01,FL,10275,FRCC,0.0,0.0,NG


In [25]:
# Column dtypes of the facility frame.
facility_df.dtypes

fuel                        object
year                         int64
datetime            datetime64[ns]
state                       object
plant id                     int64
region                      object
generation (MWh)           float64
dtype: object

I'm a little surprised that the only plants without NERC region matches are in AK and HI.

In [15]:
# States that appear on rows whose `region` is null, i.e. plants that did not
# get a NERC region from the label table.
facility_df.loc[facility_df['region'].isnull(), 'state'].unique()

array(['AK', 'HI'], dtype=object)

## Import state-level generation data

In [16]:
# Two-letter codes of the 50 states; one '<code> fuels gen.csv' file is read
# from `folder` for each of them.
states = (
    "AL AK AZ AR CA CO CT DE "
    "FL GA HI ID IL IN IA KS "
    "KY LA ME MD MA MI MN MS "
    "MO MT NE NV NH NJ NM NY "
    "NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

# Directory holding the per-state generation files.
folder = os.path.join('Data storage', 'Derived data', 'state gen data')

In [52]:
# Read one '<state> fuels gen.csv' file per state and stack them into a single
# frame with a fresh 0..n-1 index.
state_paths = [os.path.join(folder, '{} fuels gen.csv'.format(state))
               for state in states]
state_list = [pd.read_csv(path, parse_dates=['datetime'])
              for path in state_paths]
state_df = pd.concat(state_list).reset_index(drop=True)

In [53]:
# Column dtypes of the combined state-level frame.
state_df.dtypes

end                            int64
f                             object
geography                     object
last_updated                  object
sector                         int64
series_id                     object
start                          int64
type                          object
units                         object
year                           int64
month                          int64
generation (MWh)             float64
datetime              datetime64[ns]
quarter                        int64
total fuel (mmbtu)           float64
elec fuel (mmbtu)            float64
all fuel CO2 (kg)            float64
elec fuel CO2 (kg)           float64
dtype: object

In [54]:
# Columns carried forward from the state-level data.
keep_cols = ['state', 'type', 'year', 'datetime', 'generation (MWh)',
             'elec fuel (mmbtu)']

# Only keep `type` codes that are also keys of the facility category mapping.
fuel_cats = facility_fuel_cats.keys()

# The state code is the last two characters of the `geography` string.
state_df['state'] = state_df['geography'].str[-2:]

recent = state_df['year'] >= 2014
known_category = state_df['type'].isin(fuel_cats)
state_df = state_df.loc[recent & known_category, keep_cols]

In [55]:
# Fuel categories still present in the state-level data after filtering.
state_df['type'].unique()

array(['COW', 'HYC', 'NUC', 'NG', 'PEL', 'DPV', 'OTH', 'OOG', 'WWW', 'SUN',
       'WAS', 'WND', 'HPS', 'PC', 'GEO'], dtype=object)

## Total generation and fuel consumption for each fuel category

### Annual

In [59]:
# Annual facility totals per state and fuel category. The value columns are
# selected explicitly so the sum does not depend on pandas silently discarding
# the datetime/text columns, and so 'plant id' is never summed in the first
# place. The result is indexed by (year, state, fuel category).
value_cols = ['generation (MWh)', 'elec fuel (mmbtu)']
annual_facility = (facility_df
                   .groupby(['year', 'state', 'fuel category'])[value_cols]
                   .sum())

In [60]:
# Preview the annual facility totals.
annual_facility.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,generation (MWh),elec fuel (mmbtu)
year,state,fuel category,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,AK,COW,558292.181,7216953.0
2014,AK,HYC,1538738.0,14633403.0
2014,AK,NG,3288022.319,32828304.0
2014,AK,OOG,56165.769,546450.0
2014,AK,PEL,445621.447,6927101.0


In [61]:
# Annual state-level totals per state and fuel type. The value columns are
# selected explicitly so the sum does not depend on pandas silently discarding
# the datetime column. The result is indexed by (year, state, type).
value_cols = ['generation (MWh)', 'elec fuel (mmbtu)']
annual_state = (state_df
                .groupby(['year', 'state', 'type'])[value_cols]
                .sum())

In [100]:
# Preview the first 25 rows of the annual state-level totals.
annual_state.head(n=25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,generation (MWh),elec fuel (mmbtu)
year,state,type,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,AK,COW,558292.17,7216950.0
2014,AK,HYC,1538738.0,
2014,AK,NG,3288022.33,32828310.0
2014,AK,OOG,,
2014,AK,OTH,-2312.99,
2014,AK,PEL,445621.46,6927090.0
2014,AK,WAS,62511.68,
2014,AK,WND,151957.0,
2014,AK,WWW,0.0,
2014,AL,COW,47301626.28,488993810.0


In [122]:
# State-level 2016 totals for solar ('SUN') in California.
annual_state.loc[2016, 'CA', 'SUN']

generation (MWh)     19030396.62
elec fuel (mmbtu)            NaN
Name: (2016, CA, SUN), dtype: float64

In [123]:
# Facility-level 2016 totals for solar ('SUN') in California.
annual_facility.loc[2016, 'CA', 'SUN']

generation (MWh)      14343691.0
elec fuel (mmbtu)    133668844.0
Name: (2016, CA, SUN), dtype: float64

How much generation from large sources (hydro, wind, coal, natural gas, nuclear, and solar) is missed by monthly 923 data?

In [124]:
# For each fuel category, report what fraction of the 2016 state-level
# generation is missing from the facility-level totals (summed over all states).
# pd.IndexSlice addresses the (year, state, fuel) row index explicitly and the
# column is given separately, so the key cannot be mistaken for a row/column
# pair. The MultiIndex must be sorted, which the groupby output is.
idx = pd.IndexSlice
for fuel in ['HYC', 'WND', 'COW', 'NG', 'NUC', 'SUN']:
    state_total = annual_state.loc[idx[2016, :, fuel],
                                   'generation (MWh)'].sum()
    facility_total = annual_facility.loc[idx[2016, :, fuel],
                                         'generation (MWh)'].sum()

    # Positive error: the state-level total exceeds the facility total.
    error = (state_total - facility_total) / state_total
    # Parenthesised print works on both Python 2 and Python 3.
    print('{} has an error of {:.2f}%'.format(fuel, error * 100))

HYC has an error of 24.60%
WND has an error of 3.48%
COW has an error of 1.36%
NG has an error of 5.66%
NUC has an error of 0.00%
SUN has an error of 37.47%


#### States that include more than one NERC region

In [79]:
# States that include more than one NERC region.
NERC_states = "WY SD NE OK TX NM LA AR MO MN IL KY VA FL".split()

In [93]:
# For each multi-region state, compute the 2016 relative shortfall of facility
# totals versus state-level totals per fuel category, and collect the rows
# where the generation shortfall exceeds 5%.
error_list = []
col = 'generation (MWh)'
for state in NERC_states:
    state_2016 = annual_state.loc[2016, state]
    facility_2016 = annual_facility.loc[2016, state]

    # (state - facility) / state, aligned on the fuel category index.
    error = (state_2016 - facility_2016) / state_2016
    error['state'] = state

    shortfall = error[col] > 0.05
    if shortfall.any():
        error_list.append(error.loc[shortfall])

The dataframe below shows, for states with more than one NERC region, each fuel category where facility generation is more than 5% below EIA's state-level estimate in 2016.

In [94]:
# Stack the per-state frames collected in error_list into a single table.
pd.concat(error_list)

Unnamed: 0,generation (MWh),elec fuel (mmbtu),state
COW,0.057014,0.058988,WY
HYC,0.095835,,WY
NG,0.413092,0.508736,WY
NG,0.21255,0.233675,SD
HYC,1.0,,NE
NG,0.141895,0.122942,NE
WAS,1.0,,NE
HYC,0.327894,,OK
OTH,0.359396,,OK
PEL,0.098354,0.089782,OK
