In [404]:
import pandas as pd
import numpy as np

In [405]:
# Data sources
# Root data folder, given relative to the notebook's working directory.
data_dir = '../data/'

# Sub-folder name per energy source.
# NOTE(review): not referenced by the cells visible here; presumably joined
# onto data_dir elsewhere -- confirm before relying on it.
energy_source_dirs = {
    'coal': 'coal/',
    'naturalgas': 'naturalgas/',
    'solar': 'solar/',
    'wind': 'wind/',
    'oil': 'oil/',
}

## Import Data

### Variables

In [406]:
# Load the three raw 2016 tables from data_dir, in the same order as before:
# net generation, consumption for generation, average fossil-fuel cost.
net_gen_data, con_for_gen_data, avg_cost_data = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in (
        'net_generation_2016.csv',
        'consumption_for_gen_elc_2016.csv',
        'average_cost_fossil_fuels_2016.csv',
    )
)

### Keys

In [407]:
# Locations of the lookup ("key") tables under data_dir
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_ncdc_file_loc = data_dir + 'keys/state_NCDC_codes.csv'

# MSN Codes Key
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS Codes
state_fips_key = pd.read_csv(state_fips_file_loc)
# Not referenced by the cells visible here.
state_fips_indicators = ['State Abbreviation', 'State Name']

# State NCDC Codes, with state names upper-cased (the cleaning cells
# upper-case 'State Name' in the data tables the same way)
state_ncdc_key = pd.read_csv(state_ncdc_file_loc)
state_ncdc_key = state_ncdc_key.assign(
    **{'State Name': state_ncdc_key['State Name'].str.upper()}
)

# Merged State Codes Key
# No `on=` is given, so the join uses every column name the two keys share.
# NOTE(review): state_fips_key is not upper-cased here; if it also carries
# 'State Name' in mixed case the rows will not match -- confirm against the file.
state_codes_key = pd.merge(state_ncdc_key, state_fips_key)

# Month Key: full English month name -> calendar month number (1-12).
# Insertion order is calendar order; the cleaning cells pass the keys to
# melt() as value_vars.
month_key = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

## Clean Data

### Net Generation

In [408]:
# Reshape net generation from one row per (state, fuel, sector) with twelve
# month columns into one row per (state, month) with one column per
# (sector, fuel) combination.

# Normalise the text keys so grouping and the later merge match exactly
net_gen_data['Fuel'] = net_gen_data['Fuel'].str.strip()
net_gen_data['Sector'] = net_gen_data['Sector'].str.strip()
net_gen_data['State Name'] = net_gen_data['State Name'].str.strip().str.upper()

# Melt month columns (melt already returns a new frame, so no .copy() needed)
net_gen_data = net_gen_data.melt(
    id_vars=['State Name', 'Fuel', 'Sector', 'Unit'],
    value_vars=list(month_key),
    var_name='month',
    value_name='net_gen',
)
# Every melted label comes from month_key, so the dict lookup is total
net_gen_data['month'] = net_gen_data['month'].map(month_key)

# Pivot sector column
# Aggregate 'net_gen' only. The displayed result has no Unit columns, so the
# positional rename below relies on the text 'Unit' column not surviving the
# sum. pandas >= 2.0 no longer drops non-numeric columns in groupby
# reductions, which would leave extra columns and break that rename.
net_gen_data = (
    net_gen_data
    .groupby(['State Name', 'Sector', 'Fuel', 'month'])['net_gen']
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names
# Names are assigned by position: this assumes exactly two Sector values whose
# unstacked order is (all sectors, electric sector) -- TODO confirm against
# the raw file.
# NOTE(review): the prefixes are inconsistent ('net_generation_' vs 'net_gen_');
# kept unchanged because they become the exported column names.
net_gen_data.columns = ['State Name', 'Fuel', 'month', 'net_generation_all', 'net_gen_elc']

# Pivot fuel column (again restricted to the two numeric value columns)
net_gen_data = (
    net_gen_data
    .groupby(['State Name', 'month', 'Fuel'])[['net_generation_all', 'net_gen_elc']]
    .sum()
    .unstack('Fuel')
    .reset_index()
)

# Fix column names: flatten (value, fuel) pairs to 'value_fuel'; the key
# columns have an empty second level and keep their plain name
net_gen_data.columns = [
    '%s%s' % (value_name, '_%s' % fuel.lower().replace(' ', '_') if fuel else '')
    for value_name, fuel in net_gen_data.columns
]

In [409]:
# Preview the first five rows of the reshaped net generation table
net_gen_data.head(n=5)

Unnamed: 0,State Name,month,net_generation_all_all,net_generation_all_biomass,net_generation_all_coal,net_generation_all_geothermal,net_generation_all_hydroelectric,net_generation_all_natural_gas,net_generation_all_nuclear,net_generation_all_oil,...,net_gen_elc_all,net_gen_elc_biomass,net_gen_elc_coal,net_gen_elc_geothermal,net_gen_elc_hydroelectric,net_gen_elc_natural_gas,net_gen_elc_nuclear,net_gen_elc_oil,net_gen_elc_solar,net_gen_elc_wind
0,ALABAMA,1,12942.0,280.0,2464.0,0.0,1665.0,4833.0,3689.0,11.0,...,12568.0,19.0,2455.0,0.0,1665.0,4731.0,3689.0,9.0,0.0,0.0
1,ALABAMA,2,11433.0,276.0,2302.0,0.0,1498.0,4185.0,3167.0,4.0,...,11071.0,21.0,2293.0,0.0,1498.0,4090.0,3167.0,3.0,0.0,0.0
2,ALABAMA,3,11060.0,270.0,2369.0,0.0,1040.0,4359.0,3013.0,6.0,...,10703.0,20.0,2361.0,0.0,1040.0,4265.0,3013.0,5.0,0.0,0.0
3,ALABAMA,4,9631.0,219.0,1976.0,0.0,523.0,3685.0,3223.0,2.0,...,9333.0,20.0,1969.0,0.0,523.0,3597.0,3223.0,0.0,0.0,0.0
4,ALABAMA,5,11120.0,291.0,2449.0,0.0,246.0,4638.0,3491.0,3.0,...,10749.0,21.0,2441.0,0.0,246.0,4547.0,3491.0,2.0,0.0,0.0


### Consumption for Generation

In [410]:
# Reshape consumption-for-generation from twelve month columns per
# (state, fuel) row into one row per (state, month) with one column per fuel.
con_for_gen_data = (
    con_for_gen_data
    # Normalise the text keys used for grouping and the later merge
    .assign(**{
        'Fuel': lambda df: df['Fuel'].str.strip(),
        'State Name': lambda df: df['State Name'].str.strip().str.upper(),
    })
    # Melt month columns
    .melt(
        id_vars=['State Name', 'Fuel', 'Unit'],
        value_vars=list(month_key),
        var_name='month',
        value_name='con_for_gen',
    )
    # Month name -> month number (KeyError on an unknown name, as before)
    .assign(month=lambda df: [month_key[name] for name in df['month']])
    # Pivot fuel column. .sum() runs over every non-key column, including the
    # text 'Unit' column (the displayed result carries Unit_* columns).
    .groupby(['State Name', 'month', 'Fuel'])
    .sum()
    .unstack('Fuel')
    .reset_index()
)

# Fix column names: flatten (value, fuel) pairs to 'value_fuel'; the key
# columns have an empty second level and keep their plain name
con_for_gen_data.columns = [
    value_name + ('_' + fuel.lower().replace(' ', '_') if fuel else '')
    for value_name, fuel in con_for_gen_data.columns
]

In [411]:
# Preview the first five rows of the reshaped consumption table
con_for_gen_data.head(n=5)

Unnamed: 0,State Name,month,Unit_coal,Unit_natural_gas,Unit_oil,con_for_gen_coal,con_for_gen_natural_gas,con_for_gen_oil
0,ALABAMA,1,thousand tons,thousand Mcf,thousand barrels,1274,32419,20
1,ALABAMA,2,thousand tons,thousand Mcf,thousand barrels,1179,27911,5
2,ALABAMA,3,thousand tons,thousand Mcf,thousand barrels,1218,29483,9
3,ALABAMA,4,thousand tons,thousand Mcf,thousand barrels,964,25632,1
4,ALABAMA,5,thousand tons,thousand Mcf,thousand barrels,1214,31744,4


### Average Fuel Cost

In [412]:
# Reshape average fuel cost from twelve month columns per (state, fuel) row
# into one row per (state, month) with one cost column per fuel.
avg_cost_data = (
    avg_cost_data
    # Normalise the text keys used for grouping and the later merge
    .assign(**{
        'Fuel': lambda df: df['Fuel'].str.strip(),
        'State Name': lambda df: df['State Name'].str.strip().str.upper(),
    })
    # Melt month columns
    .melt(
        id_vars=['State Name', 'Unit', 'Fuel'],
        value_vars=list(month_key),
        var_name='month',
        value_name='average_cost',
    )
    # Month name -> month number (KeyError on an unknown name, as before)
    .assign(month=lambda df: [month_key[name] for name in df['month']])
    # Drop units
    .drop('Unit', axis='columns')
    # Pivot fuel column
    .groupby(['State Name', 'month', 'Fuel'])
    .sum()
    .unstack('Fuel')
    .reset_index()
)

# Fix column names: flatten (value, fuel) pairs to 'value_fuel'; the key
# columns have an empty second level and keep their plain name
avg_cost_data.columns = [
    value_name + ('_' + fuel.lower().replace(' ', '_') if fuel else '')
    for value_name, fuel in avg_cost_data.columns
]

# Replace 0's with NaN (applied to the whole frame, not just the cost columns)
avg_cost_data = avg_cost_data.replace(0, np.nan)

In [413]:
# Preview the first five rows of the reshaped average cost table
avg_cost_data.head(n=5)

Unnamed: 0,State Name,month,average_cost_coal,average_cost_natural_gas,average_cost_oil
0,ALABAMA,1,47.02,,
1,ALABAMA,2,49.29,,51.45
2,ALABAMA,3,47.62,,52.94
3,ALABAMA,4,52.38,,62.33
4,ALABAMA,5,49.77,,66.58


In [414]:
# Combine the three cleaned tables, then attach the state codes.
# No `on=` / `how=` is given, so each step is pandas' default merge: an inner
# join on every column name the two frames share. Rows without a match on
# both sides are dropped.
cost_and_generation = pd.merge(avg_cost_data, net_gen_data)
fuel_tables = pd.merge(cost_and_generation, con_for_gen_data)
merged_data = pd.merge(fuel_tables, state_codes_key)

## Export

In [415]:
# Write the merged table under data_dir. to_csv is called with its defaults,
# so the row index is written as the first (unnamed) column.
fuel_data_path = data_dir + 'processed/fuel_data.csv'
merged_data.to_csv(fuel_data_path)