# Extract facility generation and fuel use data
This notebook creates dataframes with monthly facility generation and fuel use data, merges them, and exports the results. The code assumes that you have already downloaded an `ELEC.txt` file from [EIA's bulk download website](https://www.eia.gov/opendata/bulkfiles.php).


In [2]:
import json
import pandas as pd
import os
from os.path import join
import numpy as np
from joblib import Parallel, delayed
import sys

# Locate the 'Data storage' folder one level above the notebook's working directory
cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')

In [3]:
%load_ext watermark

In [4]:
%watermark -iv -v

json        2.0.9
yapf        0.16.3
pandas      0.20.2
numpy       1.13.1
CPython 3.6.2
IPython 6.2.1


In [2]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

In [7]:
# add the 'src' directory as one where we can import modules
src_dir = join(os.getcwd(), os.pardir, 'src')
# Only append once so that re-running this cell does not keep growing sys.path
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [9]:
%aimport Data.data_extraction
from Data.data_extraction import facility_line_to_df

%aimport Analysis.index
from Analysis.index import add_datetime, add_quarter

## Read `ELEC.txt` file
Download the most current file from [EIA's bulk download site](https://www.eia.gov/opendata/bulkfiles.php). Save it to `\Data storage\Raw data`. I've tried to do this via the requests library, but the data file often gets corrupted.

In [2]:
# Read the bulk download into memory, one string per line of the file
path = join(data_path, 'Raw data', '2017-08-31 ELEC.txt')
with open(path) as elec_file:
    raw_txt = list(elec_file)

## Filter lines to only include facility generation and fuel use
- Include `ELEC.PLANT` in the `series_id`
- Include "All" as the only allowable prime mover
    - Some facilities have incorrect data at the individual prime mover level
- Do not include "All" as a fuel code
- Only monthly frequency

In [3]:
def _is_facility_series(row, series_prefix):
    """Return True when a raw text line passes the facility-series filter.

    A line is kept when it contains `series_prefix`, the text 'series_id'
    and 'ALL.M', and does not contain 'ALL-'. Per the notes above this cell,
    those substrings select monthly series with "All" as the prime mover and
    exclude "All" as a fuel code.
    """
    return (series_prefix in row
            and 'series_id' in row
            and 'ALL.M' in row
            and 'ALL-' not in row)


# Net generation, total fuel consumption, and fuel consumed for electricity
gen_rows = [row for row in raw_txt
            if _is_facility_series(row, 'ELEC.PLANT.GEN')]
total_fuel_rows = [row for row in raw_txt
                   if _is_facility_series(row, 'ELEC.PLANT.CONS_TOT_BTU')]
eg_fuel_rows = [row for row in raw_txt
                if _is_facility_series(row, 'ELEC.PLANT.CONS_EG_BTU')]

## Combine generation into one large dataframe

In [4]:
if __name__ == '__main__':
    # Not read anywhere in the visible notebook; kept so the cell defines the same names
    exception_list = []

    # Parse every generation line into its own dataframe, using all available cores
    gen_frames = Parallel(n_jobs=-1)(
        delayed(facility_line_to_df)(json.loads(row)) for row in gen_rows)

    # Stack the per-series frames, renumber the rows, and label the value column.
    # `rename(columns=...)` is the supported way to relabel a column; passing a
    # mapping to `rename_axis` for this purpose was deprecated in later pandas releases.
    facility_gen = (pd.concat(gen_frames)
                    .reset_index(drop=True)
                    .rename(columns={'value': 'generation (MWh)'}))

In [5]:
# Convert coordinates to floats and plant ids to integers.
# Reassign the converted frame instead of writing through `.loc[:, col]`: newer
# pandas versions try to set values in place for that form, which can leave the
# column's original dtype unchanged.
facility_gen = facility_gen.astype({'lat': float,
                                    'lon': float,
                                    'plant id': int})

In [6]:
# Preview the last few rows of the generation dataframe
# NOTE(review): the original comment here was only "drop" — presumably this
# inspection cell was meant to be removed; confirm
facility_gen.tail()

Unnamed: 0,f,fuel,geography,last_updated,lat,lon,month,plant id,prime mover,series_id,units,generation (MWh),year
1620516,M,MSW,USA-NY,2016-07-07T17:18:42-04:00,40.7389,-73.5906,5,10642,ALL,ELEC.PLANT.GEN.10642-MSW-ALL.M,megawatthours,42745.0,2001
1620517,M,MSW,USA-NY,2016-07-07T17:18:42-04:00,40.7389,-73.5906,4,10642,ALL,ELEC.PLANT.GEN.10642-MSW-ALL.M,megawatthours,45311.0,2001
1620518,M,MSW,USA-NY,2016-07-07T17:18:42-04:00,40.7389,-73.5906,3,10642,ALL,ELEC.PLANT.GEN.10642-MSW-ALL.M,megawatthours,49284.0,2001
1620519,M,MSW,USA-NY,2016-07-07T17:18:42-04:00,40.7389,-73.5906,2,10642,ALL,ELEC.PLANT.GEN.10642-MSW-ALL.M,megawatthours,40350.0,2001
1620520,M,MSW,USA-NY,2016-07-07T17:18:42-04:00,40.7389,-73.5906,1,10642,ALL,ELEC.PLANT.GEN.10642-MSW-ALL.M,megawatthours,41493.0,2001


## Combine total fuel use into one large dataframe

In [9]:
if __name__ == '__main__':
    # Not read anywhere in the visible notebook; kept so the cell defines the same names
    exception_list = []

    # Parse every total-fuel line into its own dataframe, using all available cores
    all_fuel_frames = Parallel(n_jobs=-1)(
        delayed(facility_line_to_df)(json.loads(row)) for row in total_fuel_rows)

    # Stack the per-series frames, renumber the rows, and label the value column.
    # `rename(columns=...)` is the supported way to relabel a column; passing a
    # mapping to `rename_axis` for this purpose was deprecated in later pandas releases.
    facility_all_fuel = (pd.concat(all_fuel_frames)
                         .reset_index(drop=True)
                         .rename(columns={'value': 'total fuel (mmbtu)'}))

In [10]:
# Convert coordinates to floats and plant ids to integers.
# Reassign the converted frame instead of writing through `.loc[:, col]`: newer
# pandas versions try to set values in place for that form, which can leave the
# column's original dtype unchanged.
facility_all_fuel = facility_all_fuel.astype({'lat': float,
                                              'lon': float,
                                              'plant id': int})

## Combine total fuel use for electricity into one large dataframe

In [13]:
if __name__ == '__main__':
    # Not read anywhere in the visible notebook; kept so the cell defines the same names
    exception_list = []

    # Parse every electricity-fuel line into its own dataframe, using all available cores
    eg_fuel_frames = Parallel(n_jobs=-1)(
        delayed(facility_line_to_df)(json.loads(row)) for row in eg_fuel_rows)

    # Stack the per-series frames, renumber the rows, and label the value column.
    # `rename(columns=...)` is the supported way to relabel a column; passing a
    # mapping to `rename_axis` for this purpose was deprecated in later pandas releases.
    facility_eg_fuel = (pd.concat(eg_fuel_frames)
                        .reset_index(drop=True)
                        .rename(columns={'value': 'elec fuel (mmbtu)'}))

In [14]:
# Convert coordinates to floats and plant ids to integers.
# Reassign the converted frame instead of writing through `.loc[:, col]`: newer
# pandas versions try to set values in place for that form, which can leave the
# column's original dtype unchanged.
facility_eg_fuel = facility_eg_fuel.astype({'lat': float,
                                            'lon': float,
                                            'plant id': int})

## Merge dataframes
Need to be careful here because there are fuel/prime mover combinations that have generation but no fuel use (e.g. the steam cycle of a combined cycle system - but only in some cases). 

In [17]:
# Keys that identify one facility/fuel/month observation in every dataframe
merge_cols = ['fuel', 'month', 'plant id', 'year']

# Generation columns to bring into the merged frame
keep_cols = ['fuel', 'generation (MWh)', 'month', 'plant id', 'prime mover', 'year',
             'geography', 'lat', 'lon', 'last_updated']
gen_subset = facility_gen.loc[:, keep_cols]

# Outer join so rows present in only one of the two frames are kept
gen_total_fuel = pd.merge(facility_all_fuel, gen_subset,
                          how='outer', on=merge_cols)

### Fill in missing values from the first merge

In [19]:
def fill_missing(df):
    """Collapse the `_x`/`_y` column pairs left by a merge into single columns.

    For every column whose name ends in `_x` and that has a matching `_y`
    column, a new column without the suffix is created from the `_x` values,
    with missing entries filled from the `_y` values. Both suffixed columns
    are then dropped.

    The dataframe is modified in place and nothing is returned. Columns that
    merely contain `_x` somewhere in their name, or that have no `_y`
    partner, are left untouched.
    """
    left_suffix, right_suffix = '_x', '_y'

    # Match on the suffix only, and require the partner column to exist, so an
    # unrelated column name cannot produce a bogus base name and a KeyError
    base_names = [col[:-len(left_suffix)] for col in df.columns
                  if isinstance(col, str)
                  and col.endswith(left_suffix)
                  and col[:-len(left_suffix)] + right_suffix in df.columns]

    for base in base_names:
        left_col, right_col = base + left_suffix, base + right_suffix

        # Prefer the _x value; fall back to the _y value where _x is missing
        df[base] = df[left_col].fillna(df[right_col])

        df.drop([left_col, right_col], axis=1, inplace=True)

In [20]:
# Collapse the _x/_y column pairs from the first merge (modifies gen_total_fuel in place)
fill_missing(gen_total_fuel)

In [22]:
# Electricity-fuel columns to bring into the merged frame
keep_cols = ['fuel', 'elec fuel (mmbtu)', 'month', 'plant id', 'prime mover', 'year',
             'geography', 'lat', 'lon', 'last_updated']
eg_fuel_subset = facility_eg_fuel.loc[:, keep_cols]

# Outer join on the same keys as the first merge so no observation is lost
all_facility_data = pd.merge(gen_total_fuel, eg_fuel_subset,
                             how='outer', on=merge_cols)

### Fill in missing values from second merge and drop units/series_id columns

In [23]:
# Collapse the _x/_y column pairs from the second merge (modifies all_facility_data in place)
fill_missing(all_facility_data)

In [25]:
# Drop the columns that only describe the total-fuel series.
# Reassigning with errors='ignore' keeps this cell re-runnable: an in-place drop
# raises a KeyError the second time because the columns are already gone.
all_facility_data = all_facility_data.drop(['units', 'series_id'],
                                           axis=1, errors='ignore')
all_facility_data.head()

Unnamed: 0,f,fuel,month,plant id,total fuel (mmbtu),year,generation (MWh),elec fuel (mmbtu),geography,last_updated,lat,lon,prime mover
0,M,WAT,12,10140,0.0,2016,0.0,0.0,USA-ID,2017-03-06T16:49:40-05:00,44.027444,-112.719439,ALL
1,M,WAT,11,10140,0.0,2016,0.0,0.0,USA-ID,2017-03-06T16:49:40-05:00,44.027444,-112.719439,ALL
2,M,WAT,10,10140,0.0,2016,0.0,0.0,USA-ID,2017-03-06T16:49:40-05:00,44.027444,-112.719439,ALL
3,M,WAT,9,10140,0.0,2016,0.0,0.0,USA-ID,2017-03-06T16:49:40-05:00,44.027444,-112.719439,ALL
4,M,WAT,8,10140,0.0,2016,0.0,0.0,USA-ID,2017-03-06T16:49:40-05:00,44.027444,-112.719439,ALL


### Add datetime and quarter columns

In [None]:
# NOTE(review): only add_quarter is called here although the heading also mentions
# a datetime column and add_datetime is imported above. The return value is
# discarded, so presumably add_quarter modifies all_facility_data in place (and
# may add the datetime column itself) — confirm in Analysis.index
add_quarter(all_facility_data)

## Load emission factors
These are mostly [EIA emission factors](https://www.eia.gov/tools/faqs/faq.cfm?id=76&t=11)

In [29]:
# Load the emission factor table; the first CSV column becomes the index
# (the following cell uses that index as the key for each factor)
path = join(data_path, 'Final emission factors.csv')
ef = pd.read_csv(path, index_col=0)

### Apply factors to facility generation

In [30]:
# Map each entry of the emission factor index to its fossil and total factor
fossil_factors = ef['Fossil Factor'].to_dict()
total_factors = ef['Total Factor'].to_dict()
fossil_factors, total_factors

({'AB': 0.0,
  'BFG': 274.31999999999999,
  'BIT': 93.299999999999997,
  'BLQ': 0.0,
  'DFO': 73.159999999999997,
  'GEO': 7.71,
  'JF': 70.900000000000006,
  'KER': 72.299999999999997,
  'LFG': 0.0,
  'LIG': 97.700000000000003,
  'MSB': 0.0,
  'MSN': 90.700000000000003,
  'MSW': 41.689999999999998,
  'NG': 53.07,
  'NUC': 0.0,
  'OBG': 0.0,
  'OBL': 0.0,
  'OBS': 0.0,
  'OG': 59.0,
  'OTH': 0.0,
  'PC': 102.09999999999999,
  'PG': 63.07,
  'PUR': 0.0,
  'RC': 93.299999999999997,
  'RFO': 78.790000000000006,
  'SC': 93.299999999999997,
  'SGC': 93.299999999999997,
  'SGP': 73.159999999999997,
  'SLW': 0.0,
  'SUB': 97.200000000000003,
  'SUN': 0.0,
  'TDF': 85.969999999999999,
  'WAT': 0.0,
  'WC': 93.299999999999997,
  'WDL': 0.0,
  'WDS': 0.0,
  'WH': 0.0,
  'WND': 0.0,
  'WO': 95.25},
 {'AB': 118.17,
  'BFG': 274.31999999999999,
  'BIT': 93.299999999999997,
  'BLQ': 94.400000000000006,
  'DFO': 73.159999999999997,
  'GEO': 7.71,
  'JF': 70.900000000000006,
  'KER': 72.29999999999999

## Apply emission factors
Fuel emission factor is kg/mmbtu

In [31]:
# Start with 0 emissions in all rows
# For fuels where we have an emission factor, replace the 0 with the calculated value
# Fuel emission factors are in kg/mmbtu, so fuel use (mmbtu) * factor = CO2 (kg).
# Look up every row's factor in one pass instead of re-scanning the whole fuel
# column several times for each fuel code.
fuel_codes = all_facility_data['fuel']
fossil_by_row = fuel_codes.map(fossil_factors)
total_by_row = fuel_codes.map(total_factors)

# Rows whose fuel has no emission factor keep 0 emissions
has_factor = fuel_codes.isin(list(total_factors.keys()))

# (output column, fuel-use column, per-row factor), listed in the order the
# columns should appear in the dataframe
co2_specs = [
    ('all fuel fossil CO2 (kg)', 'total fuel (mmbtu)', fossil_by_row),
    ('elec fuel fossil CO2 (kg)', 'elec fuel (mmbtu)', fossil_by_row),
    ('all fuel total CO2 (kg)', 'total fuel (mmbtu)', total_by_row),
    ('elec fuel total CO2 (kg)', 'elec fuel (mmbtu)', total_by_row),
]
for co2_col, fuel_use_col, factor_by_row in co2_specs:
    emissions = all_facility_data[fuel_use_col] * factor_by_row
    # Missing fuel use still yields NaN here, exactly as a per-fuel assignment would
    all_facility_data[co2_col] = emissions.where(has_factor, 0)

### Set nan and negative emissions to 0
When no fuel was used for electricity production, or when negative fuel is somehow reported by EIA, set the emissions to 0. This is implemented by selecting every value that is *not* greater than or equal to 0 (which catches both NaN and negative values) and replacing it with 0.

In [32]:
# Fossil CO2
# Fossil CO2 columns first, then total CO2 columns
co2_columns = ['all fuel fossil CO2 (kg)',
               'elec fuel fossil CO2 (kg)',
               'all fuel total CO2 (kg)',
               'elec fuel total CO2 (kg)']

for co2_col in co2_columns:
    # "not >= 0" is True for negative values and for NaN alike
    invalid = ~(all_facility_data[co2_col] >= 0)
    all_facility_data.loc[invalid, co2_col] = 0

### Export

In [33]:
# Write the combined facility data to CSV without the row index
# NOTE(review): this file name is dated 2017-08-30 while the ELEC.txt file read
# earlier in the notebook is dated 2017-08-31 — confirm which date is intended
path = join(data_path, 'Facility gen fuels and CO2 2017-08-30.csv')
all_facility_data.to_csv(path, index=False)