In [1]:
#Load dependencies
import os
import pandas as pd

In [2]:
#Locations of the raw input files; all of them live in the same raw-data folder
_rawdir='../Raw Data Files/'
consumptionfile=_rawdir+'EIA Annual Electricity Fuel Consumption by State.xls'
generationfile=_rawdir+'EIA Annual Electricity Generation by State.xls'
coolheatfile=_rawdir+'EIA Cooling-Heating Degree Days by Region.csv'
emissionfile=_rawdir+'EIA Emission Annual Data by State.xls'
greenhousefile=_rawdir+'ghgp_data_by_year.xlsx'

In [3]:
#Read data files into pandas dataframes
#header=4 makes read_csv take the column header from the fifth line of the CSV and skip the lines above it
tempcoolheat=pd.read_csv(coolheatfile,header=4)
#Open each workbook in a with-block so its file handle is closed as soon as the sheet has been parsed
#(the temp* names stay bound afterwards, but refer to closed workbooks)
with pd.ExcelFile(consumptionfile) as tempconsumption:
    consumption=tempconsumption.parse('Consumption_1990 - 2018 Final',skiprows=1,header=0)
with pd.ExcelFile(generationfile) as tempgeneration:
    generation=tempgeneration.parse('Net_Generation_1990-2018 Final',skiprows=1,header=0)
with pd.ExcelFile(emissionfile) as tempemission:
    emission=tempemission.parse('State Emissions',header=0)
with pd.ExcelFile(greenhousefile) as tempgreenhouse:
    greenhouse=tempgreenhouse.parse('Direct Emitters',skiprows=3,header=0)

In [4]:
#Give every dataframe short, consistent column names; names are assigned by position,
#so each list must match the column order of the sheet it was read from
_statecols=['year','state','producer type','energy source']
consumption.columns=_statecols+['consumption']
generation.columns=_statecols+['generation (mwh)']
emission.columns=_statecols+['CO2 (MT)','SO2 (MT)','NOx (MT)']

#Degree-day table: year and the two US totals first, then a heating block and a cooling block
#that list the same regions in the same order
_subregions=['Pacific','Mountain','West South Central','East South Central','South Atlantic',
             'West North Central','East North Central','Middle Atlantic','New England']
tempcoolheat.columns=(['year','US cooling','US heating']
                      +[region+' heating' for region in _subregions]
                      +[region+' cooling' for region in _subregions])

#Greenhouse table: facility descriptors followed by direct emissions per year, 2018 down to 2011
greenhouse.columns=(['facility id','frs id','facility name','city','state','zip','address','county',
                     'latitude','longitude','naics code','industry type (subparts)','industry type (sectors)']
                    +[str(year)+' direct emissions' for year in range(2018,2010,-1)])

In [5]:
#Normalise labels: both spellings of the national total become 'US', and the
#unit suffixes are stripped from the energy source names
_usalias={'US-Total':'US','US-TOTAL':'US'}
_sourcealias={'Natural Gas (Mcf)':'Natural Gas',
              'Coal (Short Tons)':'Coal',
              'Petroleum (Barrels)':'Petroleum',
              'Geothermal (Billion Btu)':'Geothermal',
              'Other Gases (Billion Btu)':'Other Gases',
              'Other Gases (Billion BTU)':'Other Gases'}
consumption=consumption.replace({'state':_usalias,'energy source':_sourcealias})
#Emission data labels its all-fuel rows 'All Sources'; rename them to 'Total'
emission=emission.replace({'state':_usalias,'energy source':{'All Sources':'Total'}})
generation=generation.replace({'state':_usalias})

In [6]:
#Keep only the rows that describe the total electric power industry of each state
_industry='Total Electric Power Industry'
stateconsumption=consumption[consumption['producer type']==_industry]
stategeneration=generation[generation['producer type']==_industry]
stateemission=emission[emission['producer type']==_industry]
#Greenhouse data: keep power plants only, then exclude Guam, Puerto Rico, and the Virgin Islands
temppowergreenhouse=greenhouse[greenhouse['industry type (sectors)']=='Power Plants']
_interritory=temppowergreenhouse['state'].isin(['GU','PR','VI'])
powergreenhouse=temppowergreenhouse[~_interritory]

In [7]:
#Create state greenhouse emissions dataframe: summed power-plant emissions per state and year
_years=[str(year) for year in range(2018,2010,-1)]
#Select the yearly columns with a list: indexing a groupby with a bare tuple of names
#was deprecated in pandas 1.0 and raises an error from pandas 2.0 on
tempgreenhouse=powergreenhouse.groupby(['state'])[[year+' direct emissions' for year in _years]].sum()
tempgreenhouse.columns=_years
#Stack the year columns into rows so each record is one (state, year) total
seriesgreenhouse=tempgreenhouse.stack()
stategreenhouse=pd.DataFrame(seriesgreenhouse).reset_index()
stategreenhouse.columns=['state','year','greenhouse emissions']
#The year labels came from column names (strings); store them as numbers
stategreenhouse['year']=pd.to_numeric(stategreenhouse['year'])

In [8]:
#Combine consumption, generation and emission rows into one state dataframe.
#Outer joins keep every key combination that appears in any of the three inputs.
_mergekeys=['year','state','producer type','energy source']
tempstatedata=stateconsumption.merge(stategeneration,on=_mergekeys,how='outer')
statedata=stateemission.merge(tempstatedata,on=_mergekeys,how='outer')

In [9]:
#Create greenhouse dataframes: one with facility attributes, one with yearly emissions per facility
_years=['2018','2017','2016','2015','2014','2013','2012','2011']
_emissioncols=[year+' direct emissions' for year in _years]
_sitecols=['city','zip','address','county','naics code','industry type (subparts)','industry type (sectors)']
#facility: drop the site details and every yearly emission column
facility=powergreenhouse.drop(columns=_sitecols+_emissioncols)
#tempfacilityemission: drop everything except the facility id and the yearly emission columns
tempfacilityemission=powergreenhouse.drop(columns=['frs id','facility name','state','latitude','longitude']+_sitecols)
tempfacilityemission.columns=['facility id']+_years
#Unpivot the year columns into (facility id, year, greenhouse emissions) rows
facilityemission=tempfacilityemission.melt(id_vars=['facility id'],value_vars=_years,
                                           var_name='year',value_name='greenhouse emissions')

In [10]:
#Create coolheat dataframe: one row per (year, region) with heating and cooling degree days
_regions=['US','Pacific','Mountain','West South Central','East South Central','South Atlantic',
          'West North Central','East North Central','Middle Atlantic','New England']
#Split the wide table into a heating-only and a cooling-only table, both keeping the year column
tempheat=tempcoolheat.drop([region+' cooling' for region in _regions],axis=1)
tempcool=tempcoolheat.drop([region+' heating' for region in _regions],axis=1)
#Strip the heating/cooling suffix so both tables share the same region column names
tempheat.columns=['year']+_regions
tempcool.columns=['year']+_regions
#Unpivot the region columns into rows
heat=pd.melt(tempheat,id_vars=['year'],value_vars=_regions)
cool=pd.melt(tempcool,id_vars=['year'],value_vars=_regions)
heat.columns=['year','region','heating degree days']
cool.columns=['year','region','cooling degree days']
mergecoolheat=pd.merge(heat,cool,on=['year','region'],how='outer')
#Exclude 2019-2021. The comparison is done on numeric years because isin() treats the
#string '2019' and the integer 2019 as different values, so a string list silently matches
#nothing when read_csv parsed the year column as integers. Values that are not numeric
#become NaN here and are therefore kept.
_excluded=pd.to_numeric(mergecoolheat['year'],errors='coerce').isin([2019,2020,2021])
coolheat=mergecoolheat[~_excluded]

In [11]:
#Export dataframes as csv files
_cleandir='../Clean Data Files'
#to_csv does not create missing parent folders, so make sure the output folder exists first
os.makedirs(_cleandir,exist_ok=True)
#Output file name (without extension) paired with the dataframe written to it
_exports=[('state data',statedata),
          ('state greenhouse emissions',stategreenhouse),
          ('facility',facility),
          ('facility emissions',facilityemission),
          ('region degree days',coolheat)]
for _name,_frame in _exports:
    _frame.to_csv(_cleandir+'/'+_name+'.csv',index=False,header=True)