In [1]:
import pandas as pd
import numpy as np

In [2]:
# Data locations, relative to this notebook:
#   data_dir - root folder of the data tree
#   elec_dir - electricity sub-folder inside data_dir
data_dir = '../data/'
elec_dir = ''.join([data_dir, 'electricity/'])

# Import Data

## Data

In [3]:
# Processed inputs, read in this order from <data_dir>/processed/:
#   instruments, fuel generation and consumption, IMPLAN
data_instruments, data_fuel_gen_con, data_implan = (
    pd.read_csv(data_dir + 'processed/' + file_name)
    for file_name in ('instruments_data.csv', 'fuel_data.csv', 'implan_data.csv')
)

# Electricity prices come from the electricity folder; the literal string
# 'NA' in that file is parsed as a missing value.
elc_pr_file_loc = elec_dir + 'avg_elec_price.csv'
data_elc_pr = pd.read_csv(
    elc_pr_file_loc,
    na_values=['NA'],
)

## Keys

In [4]:
# Locations of the two lookup tables kept under <data_dir>/keys/
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'

# MSN codes key and state FIPS codes key, loaded in that order
msn_codes_key, state_fips_key = map(
    pd.read_csv,
    (msn_codes_file_loc, state_fips_file_loc),
)

# Columns of the FIPS key that are dropped again once the key has been
# merged into the electricity price table.
state_fips_indicators = ['State Abbreviation', 'State Name']

# Month key: full English month name -> month number (1 through 12)
month_key = {
    name: number
    for number, name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

# Clean Data

In [5]:
# Tidy the electricity price table: go from one column per month to one row
# per (state, month) with one price column per sector.
# NOTE: this cell overwrites data_elc_pr, so it can only be run once per
# fresh load of the raw table.

# Melt month columns. The month columns are selected by header (the keys of
# month_key) instead of by position, so an extra or re-ordered column in the
# CSV cannot silently shift the selection.
missing_months = [name for name in month_key if name not in data_elc_pr.columns]
if missing_months:
    raise KeyError('electricity price table is missing month columns: %s' % missing_months)
data_elc_pr = data_elc_pr.melt(
    id_vars=['State Name', 'Sector', 'units'],
    value_vars=list(month_key),
    var_name='month',
    value_name='elc_price',
)
data_elc_pr['month'] = data_elc_pr['month'].map(month_key)

# Convert from cents per kwh to dollars per thousand kwh
data_elc_pr['elc_price'] = pd.to_numeric(data_elc_pr['elc_price'])*1000/100

# Pivot sector column. Only 'elc_price' is aggregated: summing the whole frame
# would also try to "sum" the string column 'units', which on recent pandas
# versions either raises or yields extra columns and breaks the renaming below.
data_elc_pr['Sector'] = data_elc_pr['Sector'].str.strip()
data_elc_pr = (
    data_elc_pr
    .groupby(['State Name', 'Sector', 'month'])[['elc_price']]
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names (flattens the two-level header left by unstack).
# NOTE(review): this assumes exactly six sector labels whose sorted order is
# all / commercial / industrial / other / residential / transportation -
# confirm against the Sector values in the raw file.
data_elc_pr.columns = ['State Name', 'month', 'elc_price_all', 'elc_price_com', 'elc_price_ind', 'elc_price_oth', 'elc_price_res', 'elc_price_trn']

# Fill 0's with NaNs: sum() returns 0 for a group whose prices are all missing,
# so zeros are turned back into missing values. A genuine price of 0 would be
# blanked as well.
data_elc_pr = data_elc_pr.replace(to_replace = 0, value = np.nan)

In [6]:
# Preview the first five rows of the reshaped price table
data_elc_pr.head(n=5)

Unnamed: 0,State Name,month,elc_price_all,elc_price_com,elc_price_ind,elc_price_oth,elc_price_res,elc_price_trn
0,Alabama,1,89.4,107.9,53.6,,110.0,
1,Alabama,2,90.1,108.8,52.8,,114.5,
2,Alabama,3,91.0,108.4,55.5,,120.7,
3,Alabama,4,92.6,110.5,58.7,,123.7,
4,Alabama,5,93.2,110.2,60.0,,120.5,


# Merge Data

In [7]:
# Attach the columns of the state FIPS key to every price row.
# State names are upper-cased and stripped first - presumably to match the
# formatting used in the FIPS key (TODO confirm against keys/state_FIPS.csv).
# NOTE: 'State Name' is dropped at the end of this cell, so re-running it
# without re-running the cleaning cell raises a KeyError.
data_elc_pr['State Name'] = data_elc_pr['State Name'].str.upper().str.strip()

# The join key and join type are spelled out instead of relying on pandas'
# "all shared columns" default; an inner join drops price rows whose state
# name has no match in the key.
data_elc_pr = (
    data_elc_pr
    .merge(state_fips_key, how='inner', on='State Name')
    .drop(columns=state_fips_indicators)
)

In [8]:
# Outer-join the four tables on (month, State FIPS), starting from the
# instruments table, then discard the two suffixed 'NCDC Code' columns.
# pandas adds the _x/_y suffixes when both sides of a merge carry a non-key
# column with the same name.
data_merged = (
    data_instruments
    .merge(data_elc_pr, on=['month', 'State FIPS'], how='outer')
    .merge(data_fuel_gen_con, on=['month', 'State FIPS'], how='outer')
    .merge(data_implan, on=['month', 'State FIPS'], how='outer')
    .drop(columns=['NCDC Code_x', 'NCDC Code_y'])
)

# Export Data

In [9]:
# Write the merged table to the processed folder, without the row index
data_merged.to_csv(
    path_or_buf=data_dir + 'processed/' + 'merged_data.csv',
    index=False,
)

In [10]:
# Build the text '{gamma_1}*fips_1 + ... + {gamma_99}*fips_99'
# (one '{gamma_i}*fips_i' term for each i from 1 to 99, joined by ' + ')
' + '.join('{gamma_%s}*fips_%s' % (i, i) for i in range(1, 100))


'{gamma_1}*fips_1 + {gamma_2}*fips_2 + {gamma_3}*fips_3 + {gamma_4}*fips_4 + {gamma_5}*fips_5 + {gamma_6}*fips_6 + {gamma_7}*fips_7 + {gamma_8}*fips_8 + {gamma_9}*fips_9 + {gamma_10}*fips_10 + {gamma_11}*fips_11 + {gamma_12}*fips_12 + {gamma_13}*fips_13 + {gamma_14}*fips_14 + {gamma_15}*fips_15 + {gamma_16}*fips_16 + {gamma_17}*fips_17 + {gamma_18}*fips_18 + {gamma_19}*fips_19 + {gamma_20}*fips_20 + {gamma_21}*fips_21 + {gamma_22}*fips_22 + {gamma_23}*fips_23 + {gamma_24}*fips_24 + {gamma_25}*fips_25 + {gamma_26}*fips_26 + {gamma_27}*fips_27 + {gamma_28}*fips_28 + {gamma_29}*fips_29 + {gamma_30}*fips_30 + {gamma_31}*fips_31 + {gamma_32}*fips_32 + {gamma_33}*fips_33 + {gamma_34}*fips_34 + {gamma_35}*fips_35 + {gamma_36}*fips_36 + {gamma_37}*fips_37 + {gamma_38}*fips_38 + {gamma_39}*fips_39 + {gamma_40}*fips_40 + {gamma_41}*fips_41 + {gamma_42}*fips_42 + {gamma_43}*fips_43 + {gamma_44}*fips_44 + {gamma_45}*fips_45 + {gamma_46}*fips_46 + {gamma_47}*fips_47 + {gamma_48}*fips_48 + {gamma_49