In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join
import sys

# Everything below is located relative to the directory the notebook runs in
cwd = os.getcwd()

# Set data path: the `data` folder two levels above the working directory
up_two_levels = join('..', '..')
data_path = join(cwd, up_two_levels, 'data')

In [2]:
# Let pandas display up to 100 rows and 100 columns before truncating output
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

### Using `autoreload`
[`autoreload` is a useful extension](https://ipython.org/ipython-doc/3/config/extensions/autoreload.html) that reloads imported modules before your code runs. It lets you make edits to functions in a script without having to restart your kernel and rerun everything in the notebook from the start. I discovered it as part of [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/). Even if you don't use the Cookiecutter package, I recommend checking out their suggestions for project structure.

In [3]:
# Load the "autoreload" extension
%load_ext autoreload

# Mode 1: always reload modules marked with "%aimport" before running a cell;
# modules that were not marked are left alone
%autoreload 1

# Add the 'src' directory (two levels above the working directory) as one
# where we can import modules
src_dir = join(os.getcwd(), '..', '..', 'src')

# Only append once: re-running this cell would otherwise keep adding the same
# entry to sys.path
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
# Import functions to load/clean the data files
%aimport data.clean_import
from data.clean_import import import_epa_emissions, import_plant_generation

## Import data
Use functions that were written based on experience from the `2 - Explore file imports` notebook. These functions will do all the processing work and keep this notebook cleaner.

In [6]:
# Both raw input files live in the `external` subfolder of the data directory
external_dir = join(data_path, 'external')

# EPA emissions file and EIA-923 generation workbook
epa_path = join(external_dir, 'epa_emissions_2016.txt')
gen_path = join(external_dir, 'EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx')

In [8]:
# Load each raw file into a dataframe with the project's import helpers.
# The processing itself happens inside `data.clean_import` (not shown here).
epa = import_epa_emissions(epa_path)
gen = import_plant_generation(gen_path)

## View each dataset

In [9]:
# Preview the first rows of the emissions data
epa.head()

Unnamed: 0,state,facility_name,plant_id,month,year,gross_load_mwh,heat_input_mmbtu,so2_kg,nox_kg,co2_kg
0,AL,AMEA Sylacauga Plant,56018,1,2016,4534.0,52585.8,12.700586,2022.114696,2813906.0
1,AL,AMEA Sylacauga Plant,56018,2,2016,792.0,9186.0,1.814369,327.493677,491694.1
2,AL,AMEA Sylacauga Plant,56018,3,2016,1498.0,17365.5,4.535923,614.164042,929138.6
3,AL,AMEA Sylacauga Plant,56018,4,2016,1405.0,14987.3,4.535923,531.610234,802042.0
4,AL,AMEA Sylacauga Plant,56018,5,2016,1791.0,19412.8,5.443108,685.831633,1038817.0


In [10]:
# Preview the first rows of the generation data
gen.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel
0,2,1,-77.112,HYC
1,2,2,-69.679,HYC
2,2,3,-48.374,HYC
3,2,4,-24.341,HYC
4,2,5,-11.476,HYC


## Does each dataset have the same plants?
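No. We have generation data for many more plants than we have emissions data, and (as the set difference below shows) a few plants in the EPA data do not appear in the generation data at all.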

In [25]:
# Number of distinct plant IDs in each dataset (emissions, generation).
# dropna=False counts a missing ID as its own value, like len(unique()) does.
epa['plant_id'].nunique(dropna=False), gen['plant_id'].nunique(dropna=False)

(1227, 8121)

In [20]:
# Plant IDs that appear in the emissions data but not in the generation data
set(epa['plant_id']).difference(gen['plant_id'])

{50,
 330,
 1381,
 1443,
 2709,
 3468,
 3644,
 7258,
 7762,
 7765,
 10380,
 10619,
 10803,
 50202,
 50736,
 54571,
 55192,
 58478,
 70454}

In [22]:
# How many plant IDs appear in the generation data but not in the emissions data
len(set(gen['plant_id']).difference(epa['plant_id']))

6913

## Join data
Joining (or merging) different datasets based on some common factor is a powerful tool. It's a common SQL operation but difficult to implement in Excel (vlookup anyone?).

### Inner joins
Inner joins will only keep rows whose keys appear in both dataframes. Notice that the first value of `plant_id` in the result below is 3: plant 2 appears in `gen` but has no matching rows in `epa`, so it is dropped.

In [11]:
# Keep only the (plant_id, month) combinations present in both dataframes
inner = gen.merge(epa, how='inner', on=['plant_id', 'month'])

In [12]:
# Preview the first rows of the inner join
inner.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,state,facility_name,year,gross_load_mwh,heat_input_mmbtu,so2_kg,nox_kg,co2_kg
0,3,1,1123279.005,NG,AL,Barry,2016,1167143.0,9108381.575,513769.53989,406174.712916,629378600.0
1,3,2,977043.999,NG,AL,Barry,2016,1013372.5,7912099.5,481840.267189,339913.942428,537775200.0
2,3,3,904562.996,NG,AL,Barry,2016,942802.75,7467475.775,224331.35417,263205.12575,524584700.0
3,3,4,788708.0,NG,AL,Barry,2016,809007.5,5759692.3,180525.219376,95520.198617,327528600.0
4,3,5,1028473.0,NG,AL,Barry,2016,1048450.25,8471790.125,540601.341762,373002.597176,614235700.0


### Left/right joins
Left and right joins keep every row of one dataframe (the left or the right one, respectively) and merge in matching values from the other. Because `gen` is the left dataframe here, the results below have kept plant 2 and filled the columns that come from `epa` with `NaN`.

In [13]:
# Keep every row of `gen`; columns from `epa` are NaN where there is no match
left = gen.merge(epa, how='left', on=['plant_id', 'month'])

In [14]:
# Preview the first rows of the left join
left.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,state,facility_name,year,gross_load_mwh,heat_input_mmbtu,so2_kg,nox_kg,co2_kg
0,2,1,-77.112,HYC,,,,,,,,
1,2,2,-69.679,HYC,,,,,,,,
2,2,3,-48.374,HYC,,,,,,,,
3,2,4,-24.341,HYC,,,,,,,,
4,2,5,-11.476,HYC,,,,,,,,


### Outer joins
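Outer joins will keep all rows from both dataframes, filling in missing values with `NaN`. The first few rows below look the same as the left join; the difference is that the outer join also contains rows for plants that appear only in `epa`.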

In [15]:
# Keep every row from both dataframes, matched on (plant_id, month) where possible
outer = gen.merge(epa, how='outer', on=['plant_id', 'month'])

In [16]:
# Preview the first rows of the outer join
outer.head()

Unnamed: 0,plant_id,month,net_gen_mwh,primary_gen_fuel,state,facility_name,year,gross_load_mwh,heat_input_mmbtu,so2_kg,nox_kg,co2_kg
0,2,1,-77.112,HYC,,,,,,,,
1,2,2,-69.679,HYC,,,,,,,,
2,2,3,-48.374,HYC,,,,,,,,
3,2,4,-24.341,HYC,,,,,,,,
4,2,5,-11.476,HYC,,,,,,,,


## Export the combined data
I'm going to export the left join results, which keep every row of the generation data and add emissions data wherever it is available.

In [26]:
# Destination file for the combined data, inside the `processed` data folder
out_path = join(data_path, 'processed', 'facility_gen_emissions.csv')

In [28]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists before writing (a fresh checkout may not have `processed` yet)
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Write the left-join result without the dataframe index
left.to_csv(out_path, index=False)