# Initial exploration of each data file
Pandas makes importing data from files easy. But sometimes the file contents are poorly formatted or can hold hidden surprises. Make sure that the data - and data types - are what you expect them to be before starting your analysis.

In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join

cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'data')

### Don't write absolute paths
An absolute path is something like `/Users/Home/Documents/GitHub/python-data-analysis-class/data/epa_emissions_2016.txt`. 

In [2]:
# Paths to each of the data files (epa emissions, eia capacity by generator, and eia generation)

epa_path = join(data_path, 'epa_epa_2016.txt')
cap_path = join(data_path, '3_1_Generator_Y2016.xlsx')
gen_path = join(data_path, 'EIA923_Schedules_2_3_4_5_M_10_2016.xlsx')

## Load EPA epa data
Lets load the file and see what needs to be done to make sure the data is in good shape and accessible.

In [3]:
epa = pd.read_csv(epa_path)

It looks like the header column is not well aligned with the data. States are showing up as the index and the first column is labeled **State**.

In [7]:
epa.head()

Unnamed: 0,State,Facility Name,Facility ID (ORISPL),Month,Year,Gross Load (MW-h),SO2 (tons),NOx (tons),CO2 (short tons),Heat Input (MMBtu)
AL,AMEA Sylacauga Plant,56018,1,2016,4534.0,0.014,2.229,3101.8,52585.8,
AL,AMEA Sylacauga Plant,56018,2,2016,792.0,0.002,0.361,542.0,9186.0,
AL,AMEA Sylacauga Plant,56018,3,2016,1498.0,0.005,0.677,1024.2,17365.5,
AL,AMEA Sylacauga Plant,56018,4,2016,1405.0,0.005,0.586,884.1,14987.3,
AL,AMEA Sylacauga Plant,56018,5,2016,1791.0,0.006,0.756,1145.1,19412.8,


In [9]:
with open(epa_path) as f:
    f.read()

In [27]:
with open(epa_path) as f:
    head = [next(f) for x in range(5)]

for line in head:
    print(line) 

State, Facility Name, Facility ID (ORISPL), Month, Year, Gross Load (MW-h), SO2 (tons), NOx (tons), CO2 (short tons), Heat Input (MMBtu)

"AL","AMEA Sylacauga Plant","56018","1","2016","4534","0.014","2.229","3101.8","52585.8",

"AL","AMEA Sylacauga Plant","56018","2","2016","792","0.002","0.361","542","9186",

"AL","AMEA Sylacauga Plant","56018","3","2016","1498","0.005","0.677","1024.2","17365.5",

"AL","AMEA Sylacauga Plant","56018","4","2016","1405","0.005","0.586","884.1","14987.3",



In [28]:
epa = pd.read_csv(epa_path, index_col=False)

In [29]:
epa.head()

Unnamed: 0,State,Facility Name,Facility ID (ORISPL),Month,Year,Gross Load (MW-h),SO2 (tons),NOx (tons),CO2 (short tons),Heat Input (MMBtu)
0,AL,AMEA Sylacauga Plant,56018,1,2016,4534.0,0.014,2.229,3101.8,52585.8
1,AL,AMEA Sylacauga Plant,56018,2,2016,792.0,0.002,0.361,542.0,9186.0
2,AL,AMEA Sylacauga Plant,56018,3,2016,1498.0,0.005,0.677,1024.2,17365.5
3,AL,AMEA Sylacauga Plant,56018,4,2016,1405.0,0.005,0.586,884.1,14987.3
4,AL,AMEA Sylacauga Plant,56018,5,2016,1791.0,0.006,0.756,1145.1,19412.8


In [30]:
epa.tail()

Unnamed: 0,State,Facility Name,Facility ID (ORISPL),Month,Year,Gross Load (MW-h),SO2 (tons),NOx (tons),CO2 (short tons),Heat Input (MMBtu)
14476,WY,Wyodak,6101,8,2016,279246.0,233.875,347.809,321484.0,3065249.8
14477,WY,Wyodak,6101,9,2016,267691.0,228.71,334.644,309080.6,2946991.1
14478,WY,Wyodak,6101,10,2016,253110.0,211.495,314.882,290879.0,2773451.5
14479,WY,Wyodak,6101,11,2016,249476.0,207.293,306.374,283543.1,2703507.4
14480,WY,Wyodak,6101,12,2016,252072.0,207.199,304.01,282007.9,2688870.4


### Access parts of the dataframe

Look at the column names

In [32]:
epa.columns

Index(['State', ' Facility Name', ' Facility ID (ORISPL)', ' Month', ' Year',
       ' Gross Load (MW-h)', ' SO2 (tons)', ' NOx (tons)', ' CO2 (short tons)',
       ' Heat Input (MMBtu)'],
      dtype='object')

Notice that most of the columns have a leading space? We need to strip out those leading spaces and it might be nice to do some extra formatting.

In [9]:
epa.columns.str.strip()

Index(['State', 'Facility Name', 'Facility ID (ORISPL)', 'Unit ID',
       'Associated Stacks', 'Month', 'Year', 'Program(s)', 'Operating Time',
       'Gross Load (MW-h)', 'Steam Load (1000lb)', 'SO2 (tons)',
       'Avg. NOx Rate (lb/MMBtu)', 'NOx (tons)', 'CO2 (short tons)',
       'Heat Input (MMBtu)', 'EPA Region', 'NERC Region', 'Unit Type',
       'Fuel Type (Primary)'],
      dtype='object')

In [36]:
epa.columns = epa.columns.str.strip()

In [37]:
epa.columns = (epa.columns.str.lower()
                  .str.replace(' ', '_')
                  .str.replace('.', '')
                  .str.replace('-', '')
                  .str.replace('(', '')
                  .str.replace(')', ''))

### Data types of each column
Numeric columns will either be `int` or `float`. If a column is of type `object` it is either all strings or a mix of types. Watch out for columns that should be numeric but should up as `object`.

In [38]:
epa.dtypes

state                  object
facility_name          object
facility_id_orispl      int64
month                   int64
year                    int64
gross_load_mwh        float64
so2_tons              float64
nox_tons              float64
co2_short_tons        float64
heat_input_mmbtu      float64
dtype: object

## Basic statistics of the data

In [44]:
epa.describe()

Unnamed: 0,facility_id_orispl,month,year,gross_load_mwh,so2_tons,nox_tons,co2_short_tons,heat_input_mmbtu
count,14481.0,14481.0,14481.0,12262.0,12465.0,12700.0,12022.0,12722.0
mean,22422.507769,6.490988,2016.0,197438.3,117.769365,91.650835,158286.2,1743909.0
std,24871.051651,3.442153,0.0,292661.4,369.451131,214.962978,267839.1,2650419.0
min,3.0,1.0,2016.0,0.0,0.0,0.0,0.0,0.0
25%,2399.0,4.0,2016.0,4474.02,0.028,1.55375,4555.913,51533.96
50%,7145.0,6.0,2016.0,56867.4,0.356,9.534,45365.91,549351.2
75%,55238.0,9.0,2016.0,289037.0,16.868,55.8515,179968.9,2454411.0
max,70454.0,12.0,2016.0,2093063.0,5165.046,2394.967,2341848.0,22328830.0


Index into a dataframe using `.loc` or `.iloc` with square brackets and row,column notation

In [41]:
epa.iloc[0:5, :3]

Unnamed: 0,state,facility_name,facility_id_orispl
0,AL,AMEA Sylacauga Plant,56018
1,AL,AMEA Sylacauga Plant,56018
2,AL,AMEA Sylacauga Plant,56018
3,AL,AMEA Sylacauga Plant,56018
4,AL,AMEA Sylacauga Plant,56018


## Load capacity data

In [47]:
capacity = pd.read_excel(cap_path, sheet_name='Operable')

  **kwds)


In [48]:
capacity.head()

Unnamed: 0,"2016 Form EIA-860 Data - Schedule 3, 'Generator Data' (Operable Units Only)",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,Utility ID,Utility Name,Plant Code,Plant Name,State,County,Generator ID,Technology,Prime Mover,Unit Code,...,Planned Energy Source 1,Planned New Nameplate Capacity (MW),Planned Repower Month,Planned Repower Year,Other Planned Modifications?,Other Modifications Month,Other Modifications Year,Multiple Fuels?,Cofire Fuels?,Switch Between Oil and Natural Gas?
1,195,Alabama Power Co,2,Bankhead Dam,AL,Tuscaloosa,1,Conventional Hydroelectric,HY,,...,,,,,,,,N,,
2,195,Alabama Power Co,3,Barry,AL,Mobile,1,Natural Gas Steam Turbine,ST,,...,,,,,,,,N,,
3,195,Alabama Power Co,3,Barry,AL,Mobile,2,Natural Gas Steam Turbine,ST,,...,,,,,,,,N,,
4,195,Alabama Power Co,3,Barry,AL,Mobile,4,Conventional Steam Coal,ST,,...,,,,,,,,N,,


Looks like the first row isn't the column names and the last row is a footnote

In [50]:
capacity.tail()

Unnamed: 0,Utility ID,Utility Name,Plant Code,Plant Name,State,County,Generator ID,Technology,Prime Mover,Unit Code,...,Planned Energy Source 1,Planned New Nameplate Capacity (MW),Planned Repower Month,Planned Repower Year,Other Planned Modifications?,Other Modifications Month,Other Modifications Year,Multiple Fuels?,Cofire Fuels?,Switch Between Oil and Natural Gas?
20720,61053,Mount Sinai Hospital,61416.0,Mount Sinai Hospital,NY,New York,GP8,Petroleum Liquids,IC,,...,,,,,,,,N,,
20721,61053,Mount Sinai Hospital,61416.0,Mount Sinai Hospital,NY,New York,GP9,Petroleum Liquids,IC,,...,,,,,,,,N,,
20722,61053,Mount Sinai Hospital,61416.0,Mount Sinai Hospital,NY,New York,HES13,Petroleum Liquids,IC,,...,,,,,,,,N,,
20723,61053,Mount Sinai Hospital,61416.0,Mount Sinai Hospital,NY,New York,HES14,Petroleum Liquids,IC,,...,,,,,,,,N,,
20724,NOTE: Information on planned capacity changes ...,,,,,,,,,,...,,,,,,,,,,


In [51]:
capacity = pd.read_excel(cap_path, sheet_name='Operable', header=1, skipfooter=1)

In [52]:
capacity.head()

Unnamed: 0,Utility ID,Utility Name,Plant Code,Plant Name,State,County,Generator ID,Technology,Prime Mover,Unit Code,...,Planned Energy Source 1,Planned New Nameplate Capacity (MW),Planned Repower Month,Planned Repower Year,Other Planned Modifications?,Other Modifications Month,Other Modifications Year,Multiple Fuels?,Cofire Fuels?,Switch Between Oil and Natural Gas?
0,195,Alabama Power Co,2,Bankhead Dam,AL,Tuscaloosa,1,Conventional Hydroelectric,HY,,...,,,,,,,,N,,
1,195,Alabama Power Co,3,Barry,AL,Mobile,1,Natural Gas Steam Turbine,ST,,...,,,,,,,,N,,
2,195,Alabama Power Co,3,Barry,AL,Mobile,2,Natural Gas Steam Turbine,ST,,...,,,,,,,,N,,
3,195,Alabama Power Co,3,Barry,AL,Mobile,4,Conventional Steam Coal,ST,,...,,,,,,,,N,,
4,195,Alabama Power Co,3,Barry,AL,Mobile,5,Conventional Steam Coal,ST,,...,,,,,,,,N,,


### Check the column names


In [53]:
capacity.columns

Index(['Utility ID', 'Utility Name', 'Plant Code', 'Plant Name', 'State',
       'County', 'Generator ID', 'Technology', 'Prime Mover', 'Unit Code',
       'Ownership', 'Duct Burners',
       'Can Bypass Heat Recovery Steam Generator?',
       'RTO/ISO LMP Node Designation',
       'RTO/ISO Location Designation for Reporting Wholesale Sales Data to FERC',
       'Nameplate Capacity (MW)', 'Nameplate Power Factor',
       'Summer Capacity (MW)', 'Winter Capacity (MW)', 'Minimum Load (MW)',
       'Uprate or Derate Completed During Year',
       'Month Uprate or Derate Completed', 'Year Uprate or Derate Completed',
       'Status', 'Synchronized to Transmission Grid', 'Operating Month',
       'Operating Year', 'Planned Retirement Month', 'Planned Retirement Year',
       'Associated with Combined Heat and Power System', 'Sector Name',
       'Sector', 'Topping or Bottoming', 'Energy Source 1', 'Energy Source 2',
       'Energy Source 3', 'Energy Source 4', 'Energy Source 5',
       'Ene

In [54]:
capacity.dtypes

Utility ID                                                                   int64
Utility Name                                                                object
Plant Code                                                                   int64
Plant Name                                                                  object
State                                                                       object
County                                                                      object
Generator ID                                                                object
Technology                                                                  object
Prime Mover                                                                 object
Unit Code                                                                   object
Ownership                                                                   object
Duct Burners                                                                object
Can 

In [71]:
capacity.columns = [name.strip() for name in capacity.columns]
capacity.columns

Index([u'Entity ID', u'Entity Name', u'Plant ID', u'Plant Name', u'Sector',
       u'Plant State', u'Generator ID', u'Nameplate Capacity (MW)',
       u'Net Summer Capacity (MW)', u'Technology', u'Energy Source Code',
       u'Prime Mover Code', u'Operating Month', u'Operating Year',
       u'Planned Retirement Month', u'Planned Retirement Year', u'Status',
       u'Planned Derate Year', u'Planned Derate Month',
       u'Planned Derate of Summer Capacity (MW)', u'Planned Uprate Year',
       u'Planned Uprate Month', u'Planned Uprate of Summer Capacity (MW)',
       u'County', u'Latitude', u'Longitude'],
      dtype='object')

## Boolean filtering

In [43]:
PA_cap = capacity.loc[capacity['Plant State']=='PA',:]
PA_cap

Unnamed: 0,Entity ID,Entity Name,Plant ID,Plant Name,Sector,Plant State,Generator ID,Nameplate Capacity (MW),Net Summer Capacity (MW),Technology,...,Status,Planned Derate Year,Planned Derate Month,Planned Derate of Summer Capacity (MW),Planned Uprate Year,Planned Uprate Month,Planned Uprate of Summer Capacity (MW),County,Latitude,Longitude
5223,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,1A,25.5,15,Petroleum Liquids,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5224,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,2A,65.3,46,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5225,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,2B,65.3,48,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5226,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,3,65.3,49,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5227,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,ST4,144.0,101,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5228,17235,NRG REMA LLC,3109,Hamilton (PA),IPP Non-CHP,PA,1,19.6,18,Petroleum Liquids,...,(OP) Operating,,,,,,,Adams,39.9087,-76.9885
5229,17235,NRG REMA LLC,3110,Hunterstown,IPP Non-CHP,PA,1,20.0,18,Petroleum Liquids,...,(OP) Operating,,,,,,,Adams,39.8662,-77.1648
5230,17235,NRG REMA LLC,3110,Hunterstown,IPP Non-CHP,PA,2,20.0,17,Petroleum Liquids,...,(OP) Operating,,,,,,,Adams,39.8662,-77.1648
5231,17235,NRG REMA LLC,3110,Hunterstown,IPP Non-CHP,PA,3,20.0,18,Petroleum Liquids,...,(OP) Operating,,,,,,,Adams,39.8662,-77.1648
5232,17235,NRG REMA LLC,3111,Mountain,IPP Non-CHP,PA,1,27.0,18,Petroleum Liquids,...,(OP) Operating,,,,,,,Cumberland,40.1229,-77.1723


In [45]:
PA_NGCC_cap = capacity.loc[(capacity['Plant State']=='PA') &
             (capacity['Technology']=='Natural Gas Fired Combined Cycle'),:]
PA_NGCC_cap

Unnamed: 0,Entity ID,Entity Name,Plant ID,Plant Name,Sector,Plant State,Generator ID,Nameplate Capacity (MW),Net Summer Capacity (MW),Technology,...,Status,Planned Derate Year,Planned Derate Month,Planned Derate of Summer Capacity (MW),Planned Uprate Year,Planned Uprate Month,Planned Uprate of Summer Capacity (MW),County,Latitude,Longitude
5224,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,2A,65.3,46,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5225,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,2B,65.3,48,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5226,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,3,65.3,49,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5227,14165,NRG Power Midwest LP,3096,Brunot Island,IPP Non-CHP,PA,ST4,144.0,101,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Allegheny,40.4649,-80.0438
5381,19391,UGI Development Co,3176,Hunlock Power Station,IPP Non-CHP,PA,3,49.9,30.1,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Luzerne,41.2006,-76.07
5382,19391,UGI Development Co,3176,Hunlock Power Station,IPP Non-CHP,PA,5,48.0,48.7,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Luzerne,41.2006,-76.07
5383,19391,UGI Development Co,3176,Hunlock Power Station,IPP Non-CHP,PA,6,48.0,48.4,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Luzerne,41.2006,-76.07
11869,2468,Bucknell University,54333,Bucknell University,Commercial CHP,PA,G001,4.7,4.3,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Union,40.955,-76.8788
11870,2468,Bucknell University,54333,Bucknell University,Commercial CHP,PA,G502,1.2,0.5,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,Union,40.955,-76.8788
12320,56516,"Morris Energy Operations Company, LLC",54693,York Generation Company LLC,IPP Non-CHP,PA,GT#1,8.3,46.2,Natural Gas Fired Combined Cycle,...,(OP) Operating,,,,,,,York,39.9856,-76.6762


### Repeat `groupby` and `sum` to get capacity of facilities

In [72]:
cols = ['Plant ID', 'Nameplate Capacity (MW)']
facility_cap = capacity.loc[:,cols].groupby('Plant ID').sum()
facility_cap

Unnamed: 0_level_0,Nameplate Capacity (MW)
Plant ID,Unnamed: 1_level_1
2,53.9
3,2569.5
4,225.0
7,138.0
8,1166.7
9,80.5
10,1288.4
11,72.9
12,46.9
13,100.0


# Load generation data

In [4]:
generation = pd.read_excel(eia923_path, header=5, na_values='.')

In [5]:
generation.columns = ((generation.columns.str.strip()
                             .str.lower()
                             .str.replace('\n', ' ')
                             .str.replace(' ', '_')
                             .str.replace('-', '')
                             .str.replace('(', '')
                             .str.replace(')', '')))

Something weird is going on here. I know there are lots of rows with numeric data that are missing from this `describe` table.

In [8]:
value_cols = [col for col in generation.columns if 'netgen' in col]

In [14]:
df = pd.melt(generation, id_vars=['plant_id'],
        value_vars=value_cols, value_name='net_gen',
        var_name='month').groupby(['plant_id', 'month'], as_index=False).sum()

In [15]:
df['month'] = df.month.str.replace('netgen_', '')

In [16]:
df.head()

Unnamed: 0,plant_id,month,net_gen
0,3,april,788708.0
1,3,august,1291524.003
2,3,december,0.0
3,3,february,977043.999
4,3,january,1123279.005


In [6]:
generation.describe()

Unnamed: 0,plant_id,nuclear_unit_id,operator_id,reserved,naics_code,eia_sector_number,reserved.1,reserved.2,quantity_january,quantity_february,...,netgen_september,netgen_october,netgen_november,netgen_december,total_fuel_consumption_quantity,electric_fuel_consumption_quantity,total_fuel_consumption_mmbtu,elec_fuel_consumption_mmbtu,net_generation_megawatthours,year
count,6846.0,100.0,6846.0,0.0,6846.0,6846.0,0.0,0.0,6638.0,6640.0,...,6665.0,6689.0,0.0,0.0,6846.0,6846.0,6846.0,6846.0,6846.0,6846.0
mean,47395.24759,1.59,45494.41265,,38365.911043,2.550394,,,167812.9,151298.2,...,52767.04,46761.55,,,1735993.0,1523509.0,5046923.0,4736994.0,501836.2,2016.0
std,39659.073362,0.697687,38449.729044,,68095.379303,2.051354,,,675242.3,631019.7,...,147551.9,138284.2,,,6397009.0,5648343.0,14236080.0,14123520.0,1379263.0,0.0
min,3.0,1.0,21.0,,22.0,1.0,,,0.0,0.0,...,-87350.0,-73694.0,,,0.0,0.0,0.0,0.0,-702830.0,2016.0
25%,6001.25,1.0,12492.0,,22.0,1.0,,,0.0,0.0,...,0.0,0.0,,,0.0,0.0,676.25,474.75,180.9423,2016.0
50%,55061.0,1.0,22129.0,,22.0,2.0,,,30.5,7.0,...,3021.806,2042.451,,,2378.5,1470.5,317317.0,208896.0,31276.57,2016.0
75%,99999.0,2.0,99999.0,,99999.0,3.0,,,29858.25,26131.5,...,28045.0,24602.0,,,446972.2,298367.2,2892082.0,2288356.0,292165.2,2016.0
max,99999.0,4.0,99999.0,,562213.0,7.0,,,15535490.0,14529210.0,...,1851736.0,1721639.0,,,153343400.0,120817600.0,150503800.0,150503800.0,15444040.0,2016.0


In [74]:
generation.head()

Unnamed: 0,Plant Id,Combined Heat And Power Plant,Nuclear Unit Id,Plant Name,Operator Name,Operator Id,Plant State,Census Region,NERC Region,Reserved,...,Netgen September,Netgen October,Netgen November,Netgen December,Total Fuel Consumption Quantity,Electric Fuel Consumption Quantity,Total Fuel Consumption MMBtu,Elec Fuel Consumption MMBtu,Net Generation (Megawatthours),YEAR
0,3,N,,Barry,Alabama Power Co,195,AL,ESC,SERC,,...,251839.0,219263.0,.,.,4782719,4782719,4877614,4877614,2386339.0,2016
1,3,N,,Barry,Alabama Power Co,195,AL,ESC,SERC,,...,463646.0,401506.0,.,.,42413079,42413079,43208127,43208127,4486082.0,2016
2,3,N,,Barry,Alabama Power Co,195,AL,ESC,SERC,,...,473794.0,286736.0,.,.,1773704,1773704,37426485,37426485,3706973.3,2016
3,3,N,,Barry,Alabama Power Co,195,AL,ESC,SERC,,...,11246.3,14387.7,.,.,857741,857741,874585,874585,86059.704,2016
4,4,N,,Walter Bouldin Dam,Alabama Power Co,195,AL,ESC,SERC,,...,3380.0,884.0,.,.,0,0,4239930,4239930,454977.0,2016


Turns out that there are lots of dots (.) where it is no value. I'm going to replace these with zeros.

In [75]:
generation.tail()

Unnamed: 0,Plant Id,Combined Heat And Power Plant,Nuclear Unit Id,Plant Name,Operator Name,Operator Id,Plant State,Census Region,NERC Region,Reserved,...,Netgen September,Netgen October,Netgen November,Netgen December,Total Fuel Consumption Quantity,Electric Fuel Consumption Quantity,Total Fuel Consumption MMBtu,Elec Fuel Consumption MMBtu,Net Generation (Megawatthours),YEAR
6841,99999,Y,,State-Fuel Level Increment,State-Fuel Level Increment,99999,WY,MTN,,,...,20.03,20.073,.,.,2471,197,14391,1144,225.055,2016
6842,99999,Y,,State-Fuel Level Increment,State-Fuel Level Increment,99999,WY,MTN,,,...,8371.86,6890.44,.,.,5750996,817170,5662749,804426,84996.866,2016
6843,99999,Y,,State-Fuel Level Increment,State-Fuel Level Increment,99999,WY,MTN,,,...,282.565,112.177,.,.,7449697,152028,1362700,27810,3212.683,2016
6844,99999,Y,,State-Fuel Level Increment,State-Fuel Level Increment,99999,WY,MTN,,,...,0.0,0.0,.,.,0,0,0,0,0.0,2016
6845,99999,Y,,State-Fuel Level Increment,State-Fuel Level Increment,99999,WY,MTN,,,...,10293.0,7668.97,.,.,406340,92966,7110741,1626327,133198.24,2016


In [110]:
generation.replace('.', 0, inplace=True)

There are line breaks in the middle of column names. I don't see any breaks or spaces at the beginning or end of names, but will still `strip` just to be safe.

In [76]:
generation.columns

Index([u'Plant Id', u'Combined Heat And\nPower Plant', u'Nuclear Unit Id',
       u'Plant Name', u'Operator Name', u'Operator Id', u'Plant State',
       u'Census Region', u'NERC Region', u'Reserved', u'NAICS Code',
       u'EIA Sector Number', u'Sector Name', u'Reported\nPrime Mover',
       u'Reported\nFuel Type Code', u'AER\nFuel Type Code', u'Reserved.1',
       u'Reserved.2', u'Physical\nUnit Label', u'Quantity\nJanuary',
       u'Quantity\nFebruary', u'Quantity\nMarch', u'Quantity\nApril',
       u'Quantity\nMay', u'Quantity\nJune', u'Quantity\nJuly',
       u'Quantity\nAugust', u'Quantity\nSeptember', u'Quantity\nOctober',
       u'Quantity\nNovember', u'Quantity\nDecember', u'Elec_Quantity\nJanuary',
       u'Elec_Quantity\nFebruary', u'Elec_Quantity\nMarch',
       u'Elec_Quantity\nApril', u'Elec_Quantity\nMay', u'Elec_Quantity\nJune',
       u'Elec_Quantity\nJuly', u'Elec_Quantity\nAugust',
       u'Elec_Quantity\nSeptember', u'Elec_Quantity\nOctober',
       u'Elec_Quantity\

In [79]:
generation.columns = [name.strip().replace('\n', ' ') for name in generation.columns]
generation.columns

Index([u'Plant Id', u'Combined Heat And Power Plant', u'Nuclear Unit Id',
       u'Plant Name', u'Operator Name', u'Operator Id', u'Plant State',
       u'Census Region', u'NERC Region', u'Reserved', u'NAICS Code',
       u'EIA Sector Number', u'Sector Name', u'Reported Prime Mover',
       u'Reported Fuel Type Code', u'AER Fuel Type Code', u'Reserved.1',
       u'Reserved.2', u'Physical Unit Label', u'Quantity January',
       u'Quantity February', u'Quantity March', u'Quantity April',
       u'Quantity May', u'Quantity June', u'Quantity July', u'Quantity August',
       u'Quantity September', u'Quantity October', u'Quantity November',
       u'Quantity December', u'Elec_Quantity January',
       u'Elec_Quantity February', u'Elec_Quantity March',
       u'Elec_Quantity April', u'Elec_Quantity May', u'Elec_Quantity June',
       u'Elec_Quantity July', u'Elec_Quantity August',
       u'Elec_Quantity September', u'Elec_Quantity October',
       u'Elec_Quantity November', u'Elec_Quantity 

## Stack data by month rather than having multiple columns
Not sure if I'll have time for this section

I'm lazy and want to get a list of month names without typing them all

In [82]:
# could have done this as a list comprehension, but it would have been harder to read
months = []
for name in generation.columns:
    if 'Netgen' in name:
        month = name.split()[-1]
        months.append(month)
months

[u'January',
 u'February',
 u'March',
 u'April',
 u'May',
 u'June',
 u'July',
 u'August',
 u'September',
 u'October',
 u'November',
 u'December']

In [90]:
id_cols = ['Plant Id', 'Plant State', 'NERC Region', 'AER Fuel Type Code']
monthly_cols = []
def find_col_names(cols):
    for col in cols:
        if 'January' in col:
            monthly_cols.append(col.split()[0])

find_col_names(generation.columns)
id_cols + monthly_cols

['Plant Id',
 'Plant State',
 'NERC Region',
 'AER Fuel Type Code',
 u'Quantity',
 u'Elec_Quantity',
 u'MMBtuPer_Unit',
 u'Tot_MMBtu',
 u'Elec_MMBtu',
 u'Netgen']

In [91]:
pd.DataFrame(columns=id_cols + monthly_cols + ['Month'])

Unnamed: 0,Plant Id,Plant State,NERC Region,AER Fuel Type Code,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen


In [113]:
gen_list = []
for month in months:
    gen_df = pd.DataFrame(columns=id_cols + monthly_cols)
    
    # Took me a few tries to figure out that I couldn't use .loc for gen_df
    gen_df[id_cols] = generation.loc[:,id_cols]
    gen_df['Month'] = month
    
    for col in monthly_cols:
        gen_df.loc[:,col] = generation.loc[:,col + ' ' + month]
    
    gen_list.append(gen_df)

In [120]:
gen_stack = pd.concat(gen_list)
gen_stack.describe()

Unnamed: 0,Plant Id,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen
count,82152.0,82152.0,82152.0,82152.0,82152.0,82152.0,82152.0
mean,47395.24759,144666.1,126959.0,2.397057,420577.0,394749.5,41819.68
std,39656.418102,614450.2,548021.8,5.409176,1371208.0,1359613.0,133118.6
min,3.0,0.0,0.0,0.0,0.0,0.0,-116419.0
25%,6001.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,55061.0,0.0,0.0,0.0,2546.5,1822.5,321.7385
75%,99999.0,16222.25,9554.75,1.04,174268.5,127142.2,17208.87
max,99999.0,16503520.0,13567010.0,34.18,19665220.0,19665220.0,1940054.0


### Tag lines as using a combustion fuel or not

In [115]:
gen_stack['AER Fuel Type Code'].unique()

array([u'NG', u'COL', u'HYC', u'DFO', u'NUC', u'WOO', u'HPS', u'SUN',
       u'RFO', u'MLG', u'PC', u'ORW', u'GEO', u'OTH', u'OOG', u'WWW',
       u'WOC', nan, u'WND'], dtype=object)

In [116]:
non_combust = ['HYC', 'NUC', 'SUN', 'GEO', 'WND'] # might be incomplete

In [117]:
def tag_combust(row):
    if row['AER Fuel Type Code'] in non_combust:
        return 0
    else:
        return 1

In [121]:
gen_stack['Combust'] = gen_stack.apply(tag_combust, axis=1)

In [122]:
gen_stack.head()

Unnamed: 0,Plant Id,Plant State,NERC Region,AER Fuel Type Code,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Month,Combust
0,3,AL,SERC,NG,57253,57253,1.017,58226,58226,268797.0,January,1
1,3,AL,SERC,NG,5248798,5248798,1.017,5338028,5338028,511773.0,January,1
2,3,AL,SERC,COL,159951,159951,20.589,3293231,3293231,329513.4,January,1
3,3,AL,SERC,NG,129803,129803,1.016,131880,131880,13195.605,January,1
4,4,AL,SERC,HYC,0,0,0.0,1301314,1301314,139641.0,January,0


## Now group and sum
Only keep data for May

In [149]:
test = gen_stack.loc[gen_stack['Month']=='May',:].groupby(['Plant Id', 'NERC Region']).sum()
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust
Plant Id,NERC Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,SERC,4662196,4662196,24.476,8393587,8393587,1028473.0,4
4,SERC,0,0,0.0,95361,95361,10233.0,0
8,SERC,188233,188233,30.364,4588547,4588547,471836.004,2
10,SERC,116191,116191,2.066,120025,120025,6734.0,7
14,SERC,0,0,0.0,132060,132060,14171.0,0


In [150]:
test.reset_index('NERC Region')

Unnamed: 0_level_0,NERC Region,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust
Plant Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,SERC,4662196,4662196,24.476,8393587,8393587,1028473.000,4
4,SERC,0,0,0.000,95361,95361,10233.000,0
8,SERC,188233,188233,30.364,4588547,4588547,471836.004,2
10,SERC,116191,116191,2.066,120025,120025,6734.000,7
14,SERC,0,0,0.000,132060,132060,14171.000,0
15,SERC,0,0,0.000,219444,219444,23548.000,0
16,SERC,0,0,0.000,145265,145265,15588.000,0
17,SERC,0,0,0.000,193742,193742,20790.000,0
18,SERC,0,0,0.000,54386,54386,5836.000,0
26,SERC,197510,197510,30.196,4577634,4577634,453833.000,4


In [123]:
facility_gen = gen_stack.loc[gen_stack['Month']=='May',:].groupby('Plant Id').sum()

In [124]:
facility_gen.head()

Unnamed: 0_level_0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust
Plant Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,4662196,4662196,24.476,8393587,8393587,1028473.0,4
4,0,0,0.0,95361,95361,10233.0,0
8,188233,188233,30.364,4588547,4588547,471836.004,2
10,116191,116191,2.066,120025,120025,6734.0,7
14,0,0,0.0,132060,132060,14171.0,0


If I want to keep the NERC Region, I can do that in the groupby

In [153]:
facility_gen = gen_stack.loc[gen_stack['Month']=='May',:].groupby(['Plant Id', 'NERC Region']).sum()
facility_gen.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust
Plant Id,NERC Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,SERC,4662196,4662196,24.476,8393587,8393587,1028473.0,4
4,SERC,0,0,0.0,95361,95361,10233.0,0
8,SERC,188233,188233,30.364,4588547,4588547,471836.004,2
10,SERC,116191,116191,2.066,120025,120025,6734.0,7
14,SERC,0,0,0.0,132060,132060,14171.0,0


In [154]:
facility_gen.reset_index('NERC Region', inplace=True)
facility_gen.head()

Unnamed: 0_level_0,NERC Region,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust
Plant Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,SERC,4662196,4662196,24.476,8393587,8393587,1028473.0,4
4,SERC,0,0,0.0,95361,95361,10233.0,0
8,SERC,188233,188233,30.364,4588547,4588547,471836.004,2
10,SERC,116191,116191,2.066,120025,120025,6734.0,7
14,SERC,0,0,0.0,132060,132060,14171.0,0


# Merge data from all three sources

In [155]:
merged = facility_gen.merge(facility_cap, how='inner', left_index=True, right_index=True)

In [156]:
merged.describe()

Unnamed: 0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust,Nameplate Capacity (MW)
count,2204.0,2204.0,2204.0,2204.0,2204.0,2204.0,2204.0,2204.0
mean,434198.0,394022.2,6.088443,1324327.0,1269274.0,134921.7,1.699183,484.658122
std,1116450.0,1004303.0,12.526411,2633492.0,2631382.0,266337.9,1.944481,615.0467
min,0.0,0.0,0.0,0.0,0.0,-63723.0,0.0,1.8
25%,0.0,0.0,0.0,103860.5,98491.25,10001.0,0.0,94.575
50%,376.5,285.0,0.9245,345588.5,306422.5,32209.0,1.0,233.9
75%,254560.5,218256.8,3.066,1264416.0,1035504.0,113805.7,3.0,650.0
max,16221170.0,11247780.0,99.754,25805550.0,25805550.0,2467542.0,11.0,6809.0


Save the non-combustion units, because I'm going to join the merged dataframe with the epa dataframe and want to add back in the non-combustion

In [157]:
non_combust = merged.loc[merged['Combust']==0,:]
non_combust.describe()

Unnamed: 0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust,Nameplate Capacity (MW)
count,933.0,933.0,933.0,933.0,933.0,933.0,933.0,933.0
mean,0.0,0.0,0.0,1090682.0,1090682.0,108846.6,0.0,248.966131
std,0.0,0.0,0.0,3094933.0,3094933.0,297643.1,0.0,463.451807
min,0.0,0.0,0.0,0.0,0.0,-15.0,0.0,1.8
25%,0.0,0.0,0.0,123971.0,123971.0,13303.0,0.0,61.2
50%,0.0,0.0,0.0,251054.0,251054.0,26940.0,0.0,110.0
75%,0.0,0.0,0.0,491335.0,491335.0,52724.0,0.0,200.0
max,0.0,0.0,0.0,25805550.0,25805550.0,2467542.0,0.0,4209.6


In [158]:
non_combust.head()

Unnamed: 0,NERC Region,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust,Nameplate Capacity (MW)
4,SERC,0,0,0.0,95361,95361,10233.0,0,225.0
14,SERC,0,0,0.0,132060,132060,14171.0,0,128.1
15,SERC,0,0,0.0,219444,219444,23548.0,0,177.0
16,SERC,0,0,0.0,145265,145265,15588.0,0,210.6
17,SERC,0,0,0.0,193742,193742,20790.0,0,170.0


In [159]:
merged = merged.merge(facility_emiss, how='inner', left_index=True, right_index=True)
merged.describe()

Unnamed: 0,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust,Nameplate Capacity (MW),Operating Time,Gross Load (MW-h),Steam Load (1000lb),SO2 (tons),Avg. NOx Rate (lb/MMBtu),NOx (tons),CO2 (short tons),Heat Input (MMBtu),EPA Region
count,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,787.0,28.0,804.0,810.0,810.0,782.0,810.0,902.0
mean,802669.9,798409.2,9.428446,1737592.0,1732175.0,188060.1,2.871397,811.238581,825.084268,219861.5,481478.0,107.486081,0.493989,92.464405,168708.9,1937353.0,5.252772
std,1368488.0,1361816.0,12.991945,2442123.0,2440015.0,260528.4,1.620855,651.587349,892.628864,276075.0,534346.1,300.205896,1.316566,187.144645,244870.1,2477358.0,2.177569
min,0.0,0.0,0.0,0.0,0.0,-5056.0,1.0,27.0,0.0,0.0,510.31,0.0,-7.8152,0.0,0.0,3.4,1.0
25%,30703.0,30376.5,1.03125,60534.5,60402.25,4117.25,2.0,349.05,89.235,13838.68,71591.51,0.08675,0.059175,3.705,12737.75,151568.4,4.0
50%,221098.0,221098.0,2.0695,723224.5,714953.5,77540.0,2.0,631.0,579.315,115268.7,302729.5,0.5575,0.17925,12.956,81697.69,1027198.0,5.0
75%,908300.2,908300.2,18.05275,2485223.0,2470881.0,294797.8,4.0,1071.05,1326.4275,332032.2,653281.5,44.93075,0.43505,73.797,209590.7,2694102.0,7.0
max,11247780.0,11247780.0,75.589,16811460.0,16811460.0,1775310.0,11.0,4317.5,5190.81,1652769.0,1783969.0,3639.271,17.0998,1722.928,1607836.0,15670910.0,10.0


In [160]:
merged.head()

Unnamed: 0,NERC Region,Quantity,Elec_Quantity,MMBtuPer_Unit,Tot_MMBtu,Elec_MMBtu,Netgen,Combust,Nameplate Capacity (MW),Operating Time,Gross Load (MW-h),Steam Load (1000lb),SO2 (tons),Avg. NOx Rate (lb/MMBtu),NOx (tons),CO2 (short tons),Heat Input (MMBtu),EPA Region
3,SERC,4662196,4662196,24.476,8393587,8393587,1028473.0,4,2569.5,3844.0,1048450.25,,595.911,0.576,411.165,677078.976,8471790.125,4.0
8,SERC,188233,188233,30.364,4588547,4588547,471836.004,2,1166.7,1837.25,505777.25,,125.662,0.5375,440.348,531418.603,5179513.525,4.0
10,SERC,116191,116191,2.066,120025,120025,6734.0,7,1288.4,188.25,8897.75,,48.579,0.8474,10.162,11340.124,144258.15,4.0
26,SERC,197510,197510,30.196,4577634,4577634,453833.0,4,2034.0,744.0,495791.0,,173.698,0.0646,165.37,524604.9,5113099.4,4.0
47,SERC,58,58,5.8,336,336,62.0,7,1026.0,20.0,177.0,,0.333,4.4379,0.722,192.4,2765.6,4.0


Now concat the two dataframes

In [161]:
final = pd.concat([merged, non_combust])
final

Unnamed: 0,Avg. NOx Rate (lb/MMBtu),CO2 (short tons),Combust,EPA Region,Elec_MMBtu,Elec_Quantity,Gross Load (MW-h),Heat Input (MMBtu),MMBtuPer_Unit,NERC Region,NOx (tons),Nameplate Capacity (MW),Netgen,Operating Time,Quantity,SO2 (tons),Steam Load (1000lb),Tot_MMBtu
3,0.5760,677078.976,4,4.0,8393587,4662196,1048450.25,8471790.125,24.476,SERC,411.165,2569.5,1028473.000,3844.00,4662196,595.911,,8393587
8,0.5375,531418.603,2,4.0,4588547,188233,505777.25,5179513.525,30.364,SERC,440.348,1166.7,471836.004,1837.25,188233,125.662,,4588547
10,0.8474,11340.124,7,4.0,120025,116191,8897.75,144258.150,2.066,SERC,10.162,1288.4,6734.000,188.25,116191,48.579,,120025
26,0.0646,524604.900,4,4.0,4577634,197510,495791.00,5113099.400,30.196,SERC,165.370,2034.0,453833.000,744.00,197510,173.698,,4577634
47,4.4379,192.400,7,4.0,336,58,177.00,2765.600,5.800,SERC,0.722,1026.0,62.000,20.00,58,0.333,,336
50,,,2,4.0,0,0,,,0.000,SERC,,575.0,0.000,0.00,0,,,0
51,0.2080,270128.775,3,6.0,2799148,241664,262392.63,2481213.560,14.902,SPP,241.863,720.7,238020.998,553.48,241664,1081.656,,2799148
54,0.3753,11101.898,2,4.0,189569,177832,15652.65,186799.693,1.066,SERC,2.787,1055.0,15037.000,254.45,177832,0.057,,189569
56,0.5125,129301.129,2,4.0,1098557,49159,112793.47,1260257.488,28.560,SERC,167.878,538.0,97056.000,759.23,49159,63.902,,1098557
60,0.2169,44877.292,2,7.0,404792,24713,36111.00,427905.242,22.206,MRO,39.165,324.3,31448.000,753.95,24713,176.503,,404792


In [162]:
final.index.rename('Plant ID', inplace=True)

In [163]:
final.describe()

Unnamed: 0,Avg. NOx Rate (lb/MMBtu),CO2 (short tons),Combust,EPA Region,Elec_MMBtu,Elec_Quantity,Gross Load (MW-h),Heat Input (MMBtu),MMBtuPer_Unit,NOx (tons),Nameplate Capacity (MW),Netgen,Operating Time,Quantity,SO2 (tons),Steam Load (1000lb),Tot_MMBtu
count,810.0,782.0,1835.0,902.0,1835.0,1835.0,787.0,810.0,1835.0,810.0,1835.0,1835.0,902.0,1835.0,804.0,28.0,1835.0
mean,0.493989,168708.9,1.411444,5.252772,1406010.0,392460.5,219861.5,1937353.0,4.634582,92.464405,525.352916,147784.3,825.084268,394554.9,107.486081,481478.0,1408673.0
std,1.316566,244870.1,1.830965,2.177569,2809884.0,1034649.0,276075.0,2477358.0,10.254386,187.144645,629.910855,282727.4,892.628864,1039785.0,300.205896,534346.1,2811094.0
min,-7.8152,0.0,0.0,1.0,0.0,0.0,0.0,3.4,0.0,0.0,1.8,-5056.0,0.0,0.0,0.0,510.31,0.0
25%,0.059175,12737.75,0.0,4.0,105225.5,0.0,13838.68,151568.4,0.0,3.705,102.0,10724.5,89.235,0.0,0.08675,71591.51,105673.0
50%,0.17925,81697.69,0.0,5.0,332064.0,0.0,115268.7,1027198.0,0.0,12.956,257.0,33894.0,579.315,0.0,0.5575,302729.5,332064.0
75%,0.43505,209590.7,2.0,7.0,1256538.0,205051.5,332032.2,2694102.0,2.064,73.797,696.15,128458.5,1326.4275,205051.5,44.93075,653281.5,1256538.0
max,17.0998,1607836.0,11.0,10.0,25805550.0,11247780.0,1652769.0,15670910.0,75.589,1722.928,4317.5,2467542.0,5190.81,11247780.0,3639.271,1783969.0,25805550.0


In [164]:
final['CO2 (short tons)'].sum() * 2000 * 2.2046 / final['Netgen'].sum()

2145.06392182917