# TODO:

1. Verify that the merge result is as expected.  When I checked the length of the EPA data for a 6-month period, it was longer than the merged data for the same 6-month period.  This implies that the EIA data is missing some facilities that are listed in the EPA data.  I haven't verified this yet.

2. NaN values also need to be replaced with 0's.

3. Numbers are being stored in scientific notation; they should be stored in plain decimal notation.

In [1]:
import pandas as pd
import os
import glob
import re

Loading the EIA Data, the path may need to be updated...
This will take a few minutes to run.

In [2]:
#Iterate through the directory to find all the files to import
#Modified so that it also works on macs (os.path.join picks the separator)
path = os.path.join('EIA Data', '923-No_Header')
full_path = os.path.join(path, '*.*')


eiaNames = os.listdir(path)

#Rename the keys for easier merging later: each known workbook file name
#maps to its reporting year, and the year becomes the key of eiaDict
fileNameMap = {'EIA923 SCHEDULES 2_3_4_5 Final 2010.xls':2010,
                'EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS':2009,
                'eia923December2008.xls':2008,
                'EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx':2011,
                'EIA923_Schedules_2_3_4_5_2012_Final_Release_12.04.2013.xlsx':2012,
                'EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx':2013,
                'EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx':2014,
                'EIA923_Schedules_2_3_4_5_M_12_2015_Final.xlsx':2015,
                'f906920_2007.xls':2007}

#Load the files into data frames, one df per file, keeping only the rows
#whose "NERC Region" is "TRE".
#Directory entries that are not listed in fileNameMap (for example hidden
#files such as .DS_Store) are skipped instead of raising a KeyError.
eiaDict = {}
for fn in eiaNames:
    if fn not in fileNameMap:
        continue
    yearFrame = pd.read_excel(os.path.join(path, fn))
    #Filter straight away so the unfiltered workbook can be released
    eiaDict[fileNameMap[fn]] = yearFrame[yearFrame["NERC Region"] == "TRE"]

The excel documents have different column names so we need to standardize them all

In [3]:
#Dict of values to replace to standardize column names across all dataframes
monthDict = {"JANUARY":"JAN",
           "FEBRUARY":"FEB",
           "MARCH":"MAR",
           "APRIL":"APR",
           "MAY":"MAY",
           "JUNE":"JUN",
           "JULY":"JUL",
           "AUGUST":"AUG",
           "SEPTEMBER":"SEP",
           "OCTOBER":"OCT",
           "NOVEMBER":"NOV",
           "DECEMBER":"DEC"}

replaceDict = {"ELECTRIC":"ELEC",
               "&":"AND",
               "I.D.":"ID",
               "MMBTUPER":"MMBTU_PER"}

#Add "MMBTUMON" : "MMBTU_MON" to be replaced
for month in monthDict.values():
    replaceDict["MMBTU"+month] = "MMBTU_" + month

#Replace the column name
def rename(col):
    """Return the standardized form of the upper-cased column name ``col``.

    Three passes of plain substring replacement are applied in order:

    1. full month names are shortened using ``monthDict``;
    2. the abbreviations in ``replaceDict`` are applied, which also inserts
       the missing underscore in names such as "MMBTUJAN";
    3. a remaining "MMBTUS" is rewritten to "MMBTU".

    The order matters: pass 2 relies on the month already being abbreviated,
    and pass 3 must run last because "MMBTUSEP" contains "MMBTUS" and would
    otherwise be turned into "MMBTUEP".
    """
    #.items() works on both Python 2 and Python 3 (.iteritems() is 2 only)
    for old, new in monthDict.items():
        col = col.replace(old, new)

    for old, new in replaceDict.items():
        col = col.replace(old, new)

    return col.replace("MMBTUS", "MMBTU")
    
#Iterate through each column name of each dataframe to standardize
#(.values() works on both Python 2 and Python 3, unlike .iteritems())
for df in eiaDict.values():
    #Strip before substituting: once spaces and newlines have become
    #underscores, strip() can no longer remove padding around a header
    colNames = [name.strip().replace("\n", "_").replace(" ", "_").upper() for name in df.columns]
    df.columns = [rename(col) for col in colNames]

Define which columns we need to sum, and which columns don't need to be summed, but we still need to keep.

Note: If we don't care about monthly stuff we can delete the second block of code.

In [6]:
#Define the columns that are necessary but are not summable
#allCols is taken from one reference year; list(...) keeps the lookup
#working on Python 3, where dict.values() is not indexable
allCols = eiaDict[list(fileNameMap.values())[0]].columns
nonSumCols = ["PLANT_ID", "PLANT_NAME", "YEAR"]

#Define the columns that contain the year's totals (Used to calc fuel type %)
yearCols = ["TOTAL_FUEL_CONSUMPTION_QUANTITY", "ELEC_FUEL_CONSUMPTION_QUANTITY",
            "TOTAL_FUEL_CONSUMPTION_MMBTU", "ELEC_FUEL_CONSUMPTION_MMBTU",
            "NET_GENERATION_(MEGAWATTHOURS)"]


#Define the columns that are necessary and summable: the yearly totals
#followed by every monthly column that starts with one of these prefixes.
#The prefix is "ELEC_MMBTU_" rather than "ELEC_MMBTUS_" because rename()
#rewrites "MMBTUS" to "MMBTU", so the old spelling can never match.
#"ELEC_QUANTITY_" is currently left out; add it here to include it.
monthlyPrefixes = ("MMBTU_PER_UNIT_", "TOT_MMBTU_", "ELEC_MMBTU_", "NETGEN_")

sumCols = list(yearCols)
for prefix in monthlyPrefixes:
    sumCols.extend([col for col in allCols if col.startswith(prefix)])

Get a list of all the different fuel type codes.  If we don't care about all of them, then just hardcode the list

In [5]:
#Collect every distinct fuel type code that appears in any year's dataframe
fuelTypes = {code
             for yearFrame in eiaDict.values()
             for code in yearFrame["REPORTED_FUEL_TYPE_CODE"].tolist()}

In [None]:
#Display the set of fuel type codes collected from the EIA dataframes
fuelTypes

3 parts to aggregate by facility, and to calculate the % of each type of fuel.  This will take a few minutes to run.

The end result is aggEIADict.

In [7]:
#Actually calculate the % type for each facility grouping
def calcPerc(group, aggGroup, fuelType, col):
    """Return the fraction of ``aggGroup[col]`` contributed by one fuel type.

    group    -- dataframe holding the rows of a single facility
    aggGroup -- series holding the facility totals, indexed by column name
    fuelType -- value to look for in REPORTED_FUEL_TYPE_CODE
    col      -- name of the column whose share is computed

    Returns 0 when the facility has no row for ``fuelType`` or when the
    total is not greater than zero; otherwise the fuel type's sum divided
    by the total.
    """
    fuelRows = group[group["REPORTED_FUEL_TYPE_CODE"] == fuelType]

    #The facility has no record for this fuel type
    if len(fuelRows) == 0:
        return 0

    total = aggGroup[col]
    #Nothing to divide by
    if not total > 0:
        return 0

    #Sum because a facility may have multiple plants with the same fuel type
    return float(fuelRows[col].sum()) / total

#Perform the aggregation on facility level
def aggAndCalcPerc(group):
    """Collapse the rows of one facility into a single series.

    group -- dataframe holding every row of one facility

    The result contains, in this order: the ``nonSumCols`` values taken from
    the facility's first row, the column sums of ``sumCols``, and one
    "<col> %<fuelType>" entry for every combination of ``yearCols`` and
    ``fuelTypes`` as computed by ``calcPerc``.
    """
    idPart = group.iloc[0][nonSumCols]   #Get the non-agg columns
    sumPart = group[sumCols].sum()       #Aggregate the agg columns
    #pd.concat is used because Series.append was removed in pandas 2.0
    aggGroup = pd.concat([idPart, sumPart])
    percCols = {col + " %" + fuelType:calcPerc(group, aggGroup, fuelType, col) for col in yearCols for fuelType in fuelTypes}
    return pd.concat([aggGroup, pd.Series(percCols)])

#Iterate through each dataframe to perform aggregation by facility
#(.items() works on both Python 2 and Python 3, unlike .iteritems())
aggEIADict = dict()
for key, df in eiaDict.items():
    #Iterating the groupby yields each facility's rows directly, which
    #avoids a get_group() lookup per PLANT_ID.
    #facilityRows is a list of pandas series, each representing a facility
    facilityRows = [aggAndCalcPerc(plantRows) for _, plantRows in df.groupby(by="PLANT_ID")]
    aggEIADict[key] = pd.DataFrame(facilityRows)

Loading the EPA Data, the path may need to be updated...

In [8]:
#Read the EPA files into a dataframe per sub-directory
path2 = os.path.join('EPA air markets')
epaNames = os.listdir(path2)
#Sorted list of the .txt files found under each entry of the EPA folder
filePaths = {dn:sorted(glob.glob(os.path.join(path2, dn, "*.txt"))) for dn in epaNames}
#Concatenate all files of an entry into one dataframe. Assigning each file
#to the same key would keep only the last file read. Entries without any
#.txt file are skipped.
epaDict = {key:pd.concat([pd.read_csv(fp, index_col = False) for fp in val], ignore_index = True)
           for key, val in filePaths.items() if val}

First rename the column name so we can merge on that column, then change the datatype of date to a datetime object

In [9]:
#Upper-case the column names, remove the surrounding whitespace and rename
#the facility id column to PLANT_ID so it can be used as the merge key.
#(.values() works on both Python 2 and Python 3, unlike .iteritems())
for df in epaDict.values():
    colNames = [name.upper().strip() for name in df.columns]
    #Raises ValueError if the facility id column is missing
    colNames[colNames.index("FACILITY ID (ORISPL)")] = "PLANT_ID"
    df.columns = colNames

    #Convert to datetime object
    df["DATE"] = pd.to_datetime(df["DATE"])

The DataFrames in `epaDict` contain all power plants in Texas. We can filter on `NERC REGION` so that it only includes ERCOT.

In [9]:
#Display the distinct NERC REGION values in the 2015 July-Dec EPA data
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])

{nan, 'ERCOT', 'SERC', 'SPP', 'WECC'}

In [10]:
#Boolean filter to only keep ERCOT plants
#list(...items()) takes a snapshot so the dict can be reassigned while
#looping, and works on both Python 2 and Python 3
for key, df in list(epaDict.items()):
    #reset_index gives every filtered frame a fresh 0..n-1 index
    epaDict[key] = df[df["NERC REGION"] == "ERCOT"].reset_index(drop = True)
    

In [11]:
#Display the distinct NERC REGION values in the 2015 July-Dec EPA data
set(epaDict['2015 July-Dec'].loc[:,'NERC REGION'])

{'ERCOT'}

Finally join the two data sources

Switch to an inner join?

**No need to join. Can keep them as separate databases, since one is hourly data and the other is annual/monthly** Create a clustering dataframe with index of all plant IDs (from the EPA hourly data), add columns with variables. Calculate the inputs in separate dataframes - example is to calculate ramp rate values in the EPA hourly data, then put the results in the clustering dataframe.

In [12]:
#Join the two data sources on PLANT_ID
#Each EPA frame is merged with the EIA aggregate for the year found in its
#first row (assumes a frame covers a single year -- TODO confirm).
#.iloc[0] reads the first row by position, so it does not depend on the
#index containing the label 0; how="inner" spells out the merge default.
fullData = {key:df.merge(aggEIADict[df["YEAR"].iloc[0]], on="PLANT_ID", how="inner")
            for key, df in epaDict.items()}

In [13]:
#Preview the first merged dataframe; next(iter(...)) picks the first key on
#both Python 2 and Python 3 (dict.keys() is not indexable on Python 3)
fullData[next(iter(fullData))].head()

Unnamed: 0,STATE,FACILITY NAME,PLANT_ID,YEAR_x,DATE,HOUR,GROSS LOAD (MW),STEAM LOAD (1000LB/HR),SO2 (POUNDS),NOX (POUNDS),...,TOT_MMBTU_FEB,TOT_MMBTU_JAN,TOT_MMBTU_JUL,TOT_MMBTU_JUN,TOT_MMBTU_MAR,TOT_MMBTU_MAY,TOT_MMBTU_NOV,TOT_MMBTU_OCT,TOT_MMBTU_SEP,YEAR_y
0,TX,Barney M. Davis,4939,2012,2012-01-01,0,,,,,...,1494820.0,1289560.0,1988450.0,2022120.0,1938440.0,2067860.0,905868,1512420.0,1888220.0,2012
1,TX,Barney M. Davis,4939,2012,2012-01-01,1,,,,,...,1494820.0,1289560.0,1988450.0,2022120.0,1938440.0,2067860.0,905868,1512420.0,1888220.0,2012
2,TX,Barney M. Davis,4939,2012,2012-01-01,2,,,,,...,1494820.0,1289560.0,1988450.0,2022120.0,1938440.0,2067860.0,905868,1512420.0,1888220.0,2012
3,TX,Barney M. Davis,4939,2012,2012-01-01,3,,,,,...,1494820.0,1289560.0,1988450.0,2022120.0,1938440.0,2067860.0,905868,1512420.0,1888220.0,2012
4,TX,Barney M. Davis,4939,2012,2012-01-01,4,,,,,...,1494820.0,1289560.0,1988450.0,2022120.0,1938440.0,2067860.0,905868,1512420.0,1888220.0,2012


BIT, SUB, LIG, NG, DFO, RFO

In [17]:
#List the column names of the first merged dataframe; next(iter(...)) picks
#the first key on both Python 2 and Python 3
list(fullData[next(iter(fullData))].columns)

['STATE',
 'FACILITY NAME',
 'PLANT_ID',
 'YEAR_x',
 'DATE',
 'HOUR',
 'GROSS LOAD (MW)',
 'STEAM LOAD (1000LB/HR)',
 'SO2 (POUNDS)',
 'NOX (POUNDS)',
 'CO2 (SHORT TONS)',
 'HEAT INPUT (MMBTU)',
 'EPA REGION',
 'NERC REGION',
 'COUNTY',
 'FACILITY LATITUDE',
 'FACILITY LONGITUDE',
 u'ELEC_FUEL_CONSUMPTION_MMBTU',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %AB',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %BIT',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %BLQ',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %DFO',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %JF',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %LFG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %LIG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %MWH',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %NG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %NUC',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBL',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OBS',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OG',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %OTH',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %PC',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %PUR',
 u'ELEC_FUEL_CONSUMPTION_MMBTU %RFO',
 u'ELE

## Calculating ramp rate
Calculate rate of change over 1,2,3 hours for positive change. 

# Assumptions
1. Plant capacity changes at the start of the year and is constant for the entire year
2. Same for ramp rate - no changes over the course of the year