In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from os.path import join
import sys

cwd = os.getcwd()

# Set data path
data_path = join(cwd, '..', '..', 'data')

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [3]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

# add the 'src' directory as one where we can import modules
src_dir = join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

In [4]:
%aimport data.clean_import
from data.clean_import import import_epa_emissions, import_plant_capacity, import_plant_generation

## Import data
Use functions that were written based on experience from the `2 - Explore file imports` notebook. These functions will do all the processing work and keep this notebook cleaner.

In [5]:
epa_path = join(data_path, 'external', 'epa_emissions_2016.txt')
cap_path = join(data_path, 'external', '3_1_Generator_Y2016.xlsx')
gen_path = join(data_path, 'external', 'EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx')

In [6]:
epa = import_epa_emissions(epa_path)
cap = import_plant_capacity(cap_path)
gen = import_plant_generation(gen_path)

## View each dataset

In [7]:
epa.head()

Unnamed: 0,state,facility_name,plant_id,month,year,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu
0,AL,AMEA Sylacauga Plant,56018,1,2016,4534.0,0.014,2.229,3101.8,52585.8
1,AL,AMEA Sylacauga Plant,56018,2,2016,792.0,0.002,0.361,542.0,9186.0
2,AL,AMEA Sylacauga Plant,56018,3,2016,1498.0,0.005,0.677,1024.2,17365.5
3,AL,AMEA Sylacauga Plant,56018,4,2016,1405.0,0.005,0.586,884.1,14987.3
4,AL,AMEA Sylacauga Plant,56018,5,2016,1791.0,0.006,0.756,1145.1,19412.8


In [8]:
cap.head()

Unnamed: 0,plant_id,state,utility_id,nameplate_capacity_mw,nameplate_power_factor,summer_capacity_mw,winter_capacity_mw,minimum_load_mw,month_uprate_or_derate_completed,year_uprate_or_derate_completed,operating_month,operating_year,planned_retirement_month,planned_retirement_year,sector,turbines_or_hydrokinetic_buoys,planned_net_summer_capacity_uprate_mw,planned_net_winter_capacity_uprate_mw,planned_uprate_month,planned_uprate_year,planned_net_summer_capacity_derate_mw,planned_net_winter_capacity_derate_mw,planned_derate_month,planned_derate_year,planned_new_nameplate_capacity_mw,planned_repower_month,planned_repower_year,other_modifications_month,other_modifications_year,technology
0,2,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
1,3,AL,1950,2569.5,8.5,2172.5,2274.5,1421.0,0.0,0.0,61,19848,24.0,4038.0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Steam Coal
2,4,AL,585,225.0,2.571,225.9,228.0,225.0,0.0,0.0,23,5901,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
3,7,AL,390,138.0,1.7,130.0,130.0,56.0,0.0,0.0,11,3898,24.0,4038.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Natural Gas Steam Turbine
4,8,AL,585,1166.7,2.55,1053.7,1053.7,780.0,0.0,0.0,21,5886,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Steam Coal


In [9]:
gen.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel
0,2,1,-77.112,HYC
1,2,2,-69.679,HYC
2,2,3,-48.374,HYC
3,2,4,-24.341,HYC
4,2,5,-11.476,HYC


## Join data
Joining (or merging) different datasets based on some common factor is a powerful tool. It's a common SQL operation but difficult to implement in Excel (vlookup anyone?). Methods 2 and 3 are from [this stackoverflow post](https://stackoverflow.com/questions/23668427/pandas-joining-multiple-dataframes-on-columns).

### Method 1: Two independent joins

I'm going to do a "left" merge here, where all values from the left dataframe will be kept. When no corresponding values exist in the right dataframe (epa emissions) Pandas will insert `np.nan`. 

#### Join generation with epa emissions
I'm using the `pd.merge` function to join two dataframes. One is specified as "left" and the other as "right".

In [10]:
gen_epa = pd.merge(left=gen, right=epa, on=['plant_id', 'month'], how='left')

In [11]:
gen_epa.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,state,facility_name,year,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu
0,2,1,-77.112,HYC,,,,,,,,
1,2,2,-69.679,HYC,,,,,,,,
2,2,3,-48.374,HYC,,,,,,,,
3,2,4,-24.341,HYC,,,,,,,,
4,2,5,-11.476,HYC,,,,,,,,


#### Join gen_epa with capacity data
The `merge` function can also be used as a method of the dataframe, which is automatically considered the "left" object in the join.

In [12]:
final1 = gen_epa.merge(right=cap, on='plant_id', how='inner')

In [13]:
final1.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,state_x,facility_name,year,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu,state_y,utility_id,nameplate_capacity_mw,nameplate_power_factor,summer_capacity_mw,winter_capacity_mw,minimum_load_mw,month_uprate_or_derate_completed,year_uprate_or_derate_completed,operating_month,operating_year,planned_retirement_month,planned_retirement_year,sector,turbines_or_hydrokinetic_buoys,planned_net_summer_capacity_uprate_mw,planned_net_winter_capacity_uprate_mw,planned_uprate_month,planned_uprate_year,planned_net_summer_capacity_derate_mw,planned_net_winter_capacity_derate_mw,planned_derate_month,planned_derate_year,planned_new_nameplate_capacity_mw,planned_repower_month,planned_repower_year,other_modifications_month,other_modifications_year,technology
0,2,1,-77.112,HYC,,,,,,,,,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
1,2,2,-69.679,HYC,,,,,,,,,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
2,2,3,-48.374,HYC,,,,,,,,,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
3,2,4,-24.341,HYC,,,,,,,,,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric
4,2,5,-11.476,HYC,,,,,,,,,AL,195,53.9,0.92,56.0,52.0,52.0,0.0,0.0,7,1963,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Conventional Hydroelectric


### Method 2: Chain the `join` method
I'm also going to limit the columns that are kept in some of the dataframes

In [23]:
epa_keep = [
    'plant_id', 'month', 'gross_load_mwh', 'so2_tons',
    'nox_tons', 'co2_short_tons', 'heat_input_mmbtu'
]

cap_keep = [
    'plant_id', 'state', 'nameplate_capacity_mw', 'summer_capacity_mw',
    'winter_capacity_mw', 'minimum_load_mw', 'technology'
]

In [24]:
cap.columns

Index(['plant_id', 'state', 'utility_id', 'nameplate_capacity_mw',
       'nameplate_power_factor', 'summer_capacity_mw', 'winter_capacity_mw',
       'minimum_load_mw', 'month_uprate_or_derate_completed',
       'year_uprate_or_derate_completed', 'operating_month', 'operating_year',
       'planned_retirement_month', 'planned_retirement_year', 'sector',
       'turbines_or_hydrokinetic_buoys',
       'planned_net_summer_capacity_uprate_mw',
       'planned_net_winter_capacity_uprate_mw', 'planned_uprate_month',
       'planned_uprate_year', 'planned_net_summer_capacity_derate_mw',
       'planned_net_winter_capacity_derate_mw', 'planned_derate_month',
       'planned_derate_year', 'planned_new_nameplate_capacity_mw',
       'planned_repower_month', 'planned_repower_year',
       'other_modifications_month', 'other_modifications_year', 'technology'],
      dtype='object')

In [25]:
final2 = (pd.merge(left=gen, right=epa[epa_keep], on=['plant_id', 'month'], how='left')
            .merge(right=cap[cap_keep], on='plant_id', how='inner'))

In [26]:
final2.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,minimum_load_mw,technology
0,2,1,-77.112,HYC,,,,,,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric
1,2,2,-69.679,HYC,,,,,,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric
2,2,3,-48.374,HYC,,,,,,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric
3,2,4,-24.341,HYC,,,,,,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric
4,2,5,-11.476,HYC,,,,,,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric


### Method 3: Reduce function
This might not work well when mixing inner/left/right joins and what columns to join on.

In [18]:
from functools import reduce

In [19]:
df_list = [cap[cap_keep], gen, epa[epa_keep]]
final3 = reduce(lambda left, right: pd.merge(left, right, on='plant_id', how='left'), df_list)

In [20]:
final3.head()

Unnamed: 0,plant_id,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,minimum_load_mw,technology,month_x,net_gen_mwh,primary_gen_fuel,month_y,year,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu
0,2,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric,1.0,-77.112,HYC,,,,,,,
1,2,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric,2.0,-69.679,HYC,,,,,,,
2,2,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric,3.0,-48.374,HYC,,,,,,,
3,2,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric,4.0,-24.341,HYC,,,,,,,
4,2,AL,53.9,56.0,52.0,52.0,Conventional Hydroelectric,5.0,-11.476,HYC,,,,,,,


## Export the combined data

In [21]:
out_path = join(data_path, 'processed', 'facility_gen_cap_emissions.csv')

In [27]:
final2.to_csv(out_path, index=False)