In [210]:
import pandas as pd
import numpy as np
import os

This notebook is used to clean up and format the raw EIA data in ../data/EIA/electricity_power_monthly

# Functions

In [211]:
def format_table_602(df_602, table_label):
    ''' Formats the raw excel dataframe for Table 6.2.A and Table 6.2.B into a workable dataframe
    ==============================================================================================
    Input:
        df_602: pd.read_excel output of Table 6.2.A/B 
        table_label: either 'A' or 'B'
    ==============================================================================================
    '''
    
    # Fix source labels
    if table_label == 'A':
        
        column_headings_602A = ['State', 'Renewable Sources', 'Renewable Sources', 'Fossil Fuels', 'Fossil Fuels', 
                                'Hydroelectric Pumped Storage', 'Hydroelectric Pumped Storage', 'Energy Storage', 
                                'Energy Storage', 'Nuclear', 'Nuclear', 'Other', 'Other', 'All', 'All']
        column_headings_602A[1:] = [x + ' Cap' for x in column_headings_602A[1:]]
        df_602.columns = column_headings_602A
        
        
    elif table_label == 'B':
        
        column_headings_602B = ['State', 'Wind', 'Wind', 'Solar Photovoltaic', 'Solar Photovoltaic', 'Solar Thermal', 
                                       'Solar Thermal', 'Conventional Hydroelectric', 'Conventional Hydroelectric', 'Biomass', 
                                       'Biomass', 'Geothermal', 'Geothermal', 'Renewable Sources', 'Renewable Sources', 
                                       'Est Dist Solar Photovoltaic', 'Est Dist Solar Photovoltaic', 
                                       'Est Tot Solar Photovoltaic', 'Est Tot Solar Photovoltaic', 'Est Tot Solar', 
                                       'Est Tot Solar']
        column_headings_602B[1:] = [x + ' Cap' for x in column_headings_602B[1:]]
        df_602.columns = column_headings_602B
        df_602 = df_602.iloc[1:,:]
        
    else:
        
        raise ValueError('Unknown table label')

    # Transpose table
    df_602 = df_602.iloc[1:-1,:].T.reset_index()

    # Fix transposed column headings
    column_headings = df_602.iloc[0].copy()
    column_headings[0] = 'Source'
    column_headings[1] = 'Date'
    df_602.columns = column_headings 

    # Drop last column (contains extraneous info)
    df_602 = df_602.iloc[1:,:]
    
    # Melt data frame so it is aggregated as Source/Date/State/Capacity
    df_602 = pd.melt(df_602, id_vars = ['Source', 'Date'], value_vars = df_602.columns[2:])
    df_602.columns = ['Source', 'Date', 'State', 'Capacity']
    
    # Fix NA values
    df_602['Capacity'] = df_602['Capacity'].apply(lambda x: np.nan if x == 'NM' else x) 
    
    return df_602

In [212]:
def format_table_13A(df_13A):
    ''' Formats the raw excel dataframe for Table 1.3.A into a workable dataframe
    ==============================================================================================
    Input:
        df_602: pd.read_excel output of Table 1.3.A
    ==============================================================================================
    '''
    
    # Fix source labels
    df_13A.columns = ['State', 'All Sectors Gen', 'All Sectors Gen', 'Percent Change', 
                      'Electricty Power Sector Utility Gen', 'Electricity Power Sector Utility Gen', 
                      'Electricty Power Sector Independent Gen', 'Electricity Power Sector Independent Gen', 
                      'Commercial Sector Gen', 'Commercial Sector Gen', 
                      'Industrial Sector Gen', 'Industrial Sector Gen']

    # Transpose table
    df_13A = df_13A.drop('Percent Change', axis = 1)
    df_13A = df_13A.iloc[4:-1,:].T.reset_index()

    # Fix transposed column headings
    column_headings = df_13A.iloc[0].copy()
    column_headings[0] = 'Source'
    column_headings[1] = 'Date'
    df_13A.columns = column_headings 

    # Drop last column (contains extraneous info)
    df_13A = df_13A.iloc[1:,:]
    
    # Melt data frame so it is aggregated as Source/Date/State/Capacity
    df_13A = pd.melt(df_13A, id_vars = ['Source', 'Date'], value_vars = df_13A.columns[2:])
    df_13A.columns = ['Source', 'Date', 'State', 'Generation']
    
    # Fix NA values
    df_13A['Generation'] = df_13A['Generation'].apply(lambda x: np.nan if x == 'NM' else x) 
    
    return df_13A

# Main

## Import and Prepare Data

In [213]:
folders_monthyear = [x for x in os.walk('../data/EIA/electricity_power_monthly/')][0][1]

### Tables 6.2.A and 6.2.B (Capacity)

In [214]:
# list of Table 6.2 dataframes for each month 
df_602_list = []

# For each month/year (subfolder), import Table 6.2.A and Table 6.2.B data
for subfolder in folders_monthyear:
    
    folder = '../data/EIA/electricity_power_monthly/' + subfolder + '/'
    
    # Import Table 6.2.A and 6.2.B
    df_602A = pd.read_excel('../data/EIA/electricity_power_monthly/' + subfolder + '/Table_6_02_A.xlsx')
    df_602B = pd.read_excel('../data/EIA/electricity_power_monthly/' + subfolder + '/Table_6_02_B.xlsx')
    
    # Format 
    df_602A = format_table_602(df_602A, 'A')
    df_602B = format_table_602(df_602B, 'B')
    
    # Merge and add to list 
    df_602_merged = pd.concat([df_602A, df_602B.query('Source != "Renewable Sources"')])
    df_602_list.append(df_602_merged)

### Table 1.3.A (Net Generation)

In [215]:
# list of Table 1.3.A dataframes for each month 
df_13A_list = []

# For each month/year (subfolder), import Table 1.3.A
for subfolder in folders_monthyear:

    # Import Table 1.3.A
    df_13A = pd.read_excel('../data/EIA/electricity_power_monthly/' + subfolder + '/Table_1_03_A.xlsx')
    
    # Format 
    df_13A = format_table_13A(df_13A)
    
    # Append to list
    df_13A_list.append(df_13A)

## Merge Data

In [216]:
# Concatenate data for each month/year
df_602_concat = pd.concat(df_602_list).reset_index(drop = True)
df_13A_concat = pd.concat(df_13A_list).reset_index(drop = True)

In [217]:
## Pivot columns so index is State/Date and columns describe the value

# Table 1.3.A
df_13A_pivot = df_13A_concat.groupby(['State', 'Date', 'Source'])['Generation'].first().unstack('Source').reset_index()
df_13A_pivot.columns.name = ''

# Tables 6.2.A and 6.2.B
df_602_pivot = df_602_concat.groupby(['State', 'Date', 'Source'])['Capacity'].first().unstack('Source').reset_index()
df_602_pivot.columns.name = ''

In [218]:
# Merge Table 1.3.A and Tables 6.2.A and 6.2.B
df_merged = df_602_pivot.merge(df_13A_pivot, on = ['State', 'Date'])
df_merged.head()

Unnamed: 0,State,Date,All Cap,Biomass Cap,Conventional Hydroelectric Cap,Energy Storage Cap,Est Dist Solar Photovoltaic Cap,Est Tot Solar Cap,Est Tot Solar Photovoltaic Cap,Fossil Fuels Cap,...,Solar Photovoltaic Cap,Solar Thermal Cap,Wind Cap,All Sectors Gen,Commercial Sector Gen,Electricity Power Sector Independent Gen,Electricity Power Sector Utility Gen,Electricty Power Sector Independent Gen,Electricty Power Sector Utility Gen,Industrial Sector Gen
0,Alabama,April 2015,31746.3,615.9,3271.0,0.0,1.9,1.9,1.9,22793.0,...,0.0,0.0,0.0,10323.0,0.0,2569.0,7387.0,,,367.0
1,Alabama,April 2016,30157.4,660.5,3271.0,0.0,1.9,1.9,1.9,21159.5,...,0.0,0.0,0.0,9775.0,0.0,,,2401.0,7055.0,319.0
2,Alabama,August 2015,31746.3,615.9,3271.0,0.0,1.9,1.9,1.9,22793.0,...,0.0,0.0,0.0,14342.0,0.0,3839.0,10128.0,,,376.0
3,Alabama,August 2016,29220.8,668.9,3271.0,0.0,2.3,2.3,2.3,20214.5,...,0.0,0.0,0.0,14213.0,0.0,,,4433.0,9408.0,372.0
4,Alabama,December 2014,31953.3,615.9,3271.0,0.0,1.8,1.8,1.8,23000.0,...,0.0,0.0,0.0,12787.0,0.0,3028.0,9418.0,,,340.0


## Export

In [219]:
df_merged.to_csv('../data/EIA/monthly_state_capacity_generation.csv', index = False)