In [19]:
import numpy as np
import pandas as pd
import datetime

In [20]:
# Root folder for every input file, plus one subfolder per fuel type.
# Each path keeps its trailing slash so a file name can be appended directly.
data_dir = '../data/'
coal_dir = ''.join((data_dir, 'coal/'))
natgas_dir = ''.join((data_dir, 'naturalgas/'))

## Import Data

### Variables

In [23]:
# Load the four raw 2016 tables: net generation and consumption for
# electricity generation, each for coal and for natural gas.

# Coal Generation Data
coal_gen_file_loc = coal_dir + 'coal_generation_2016.csv'
data_coal_gen = pd.read_csv(coal_gen_file_loc)

# Natural Gas Data
natgas_gen_file_loc = natgas_dir + 'naturalgas_generation_2016.csv'
data_natgas_gen = pd.read_csv(natgas_gen_file_loc)

# Coal Consumption for Electricity Gen Data
coal_con_elc_file_loc = coal_dir + 'coal_consumption_elc_2016.csv'
data_coal_con_elc = pd.read_csv(coal_con_elc_file_loc)

# Natural Gas Consumption for Electricity Gen Data
natgas_con_elc_file_loc = natgas_dir + 'naturalgas_consumption_elc_2016.csv'
# Must read the natural gas path; reading the coal path here would silently
# load the coal consumption table under the natural gas name.
data_natgas_con_elc = pd.read_csv(natgas_con_elc_file_loc)

### Keys

In [24]:
# Locations of the lookup ("key") tables, all under the keys/ folder.
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_ncdc_file_loc = data_dir + 'keys/state_NCDC_codes.csv'

# MSN codes
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS codes, plus the names of the descriptive state columns
state_fips_key = pd.read_csv(state_fips_file_loc)
state_fips_indicators = ['State Abbreviation', 'State Name']

# State NCDC codes, with the state names upper-cased
# (presumably to match the spelling used in the FIPS key -- TODO confirm)
state_ncdc_key = pd.read_csv(state_ncdc_file_loc)
state_ncdc_key = state_ncdc_key.assign(
    **{'State Name': state_ncdc_key['State Name'].str.upper()}
)

# Single state key: NCDC and FIPS tables joined on their shared columns
state_codes_key = pd.merge(state_ncdc_key, state_fips_key)

# Month name -> month number (January = 1, ..., December = 12)
month_key = {
    name: number
    for number, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

## Clean Data

In [30]:
def clean_rawdata_gen_df(rawdata, value_label, unit_conversion_factor, month_lookup=None):
    """Reshape a wide state-by-sector table into one row per state and month.

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Wide table with 'State Name', 'Sector' and 'units' columns and the
        month-name columns at positions 4 through 15. It is not modified.
    value_label : str
        Base name for the value columns of the result.
    unit_conversion_factor : number
        Every monthly value is multiplied by this factor.
    month_lookup : mapping, optional
        Maps month column names to month numbers. Defaults to the
        notebook-level ``month_key``.

    Returns
    -------
    pandas.DataFrame
        Columns 'State Name', 'month', ``value_label + '_all'`` and
        ``value_label + '_elc'``, one row per state and month.

    Raises
    ------
    KeyError
        If a month column name is missing from the lookup.
    ValueError
        If the table does not contain exactly two distinct sectors
        (raised by the column assignment at the end).
    """
    if month_lookup is None:
        month_lookup = month_key

    # Melt month columns
    tidy = rawdata.melt(
        id_vars=['State Name', 'Sector', 'units'],
        value_vars=rawdata.columns[4:16],
        var_name='month',
        value_name=value_label,
    )
    tidy['month'] = tidy['month'].apply(lambda name: month_lookup[name])

    # Convert units
    tidy[value_label] = tidy[value_label] * unit_conversion_factor

    # Pivot sector column. Only the value column is aggregated: summing the
    # whole frame would also "sum" the text 'units' column on pandas versions
    # that no longer drop non-numeric columns, adding extra columns and
    # breaking the renaming below.
    tidy['Sector'] = tidy['Sector'].str.strip()
    by_sector = (
        tidy.groupby(['State Name', 'Sector', 'month'])[value_label]
        .sum()
        .unstack('Sector')
        .reset_index()
    )

    # Fix column names. Names are assigned by position, and unstack orders the
    # sector columns by sorted sector name.
    # NOTE(review): assumes the all-sector total sorts before the electric
    # sector -- confirm against the raw 'Sector' values.
    by_sector.columns = ['State Name', 'month', value_label + '_all', value_label + '_elc']

    return by_sector

In [33]:
# Reshape both raw generation tables to one row per state and month,
# multiplying every value by 1000. Each table gets its own column prefix so
# the coal and natural gas columns stay distinguishable.
data_coal_gen = clean_rawdata_gen_df(data_coal_gen, 'coal_net_gen', 1000)
data_natgas_gen = clean_rawdata_gen_df(data_natgas_gen, 'natgas_net_gen', 1000)

### Coal Generation

In [13]:
# Inline reshape of the coal generation table (the same steps as
# clean_rawdata_gen_df). It only applies to the raw wide layout: once the
# table has been reshaped, the 'Sector' and 'units' columns are gone and melt
# would fail, so the cell is skipped in that case and stays safe to re-run.
if 'Sector' in data_coal_gen.columns:
    # Melt month columns
    data_coal_gen = data_coal_gen.melt(
        id_vars=['State Name', 'Sector', 'units'],
        value_vars=data_coal_gen.columns[4:16],
        var_name='month',
        value_name='coal_net_gen',
    )
    data_coal_gen['month'] = data_coal_gen['month'].apply(lambda x: month_key[x])

    # Convert to thousand kwh
    data_coal_gen['coal_net_gen'] = data_coal_gen['coal_net_gen'] * 1000

    # Pivot sector column. Aggregate only the value column so the text
    # 'units' column is never summed.
    data_coal_gen['Sector'] = data_coal_gen['Sector'].str.strip()
    data_coal_gen = (
        data_coal_gen.groupby(['State Name', 'Sector', 'month'])['coal_net_gen']
        .sum()
        .unstack('Sector')
        .reset_index()
    )

    # Fix column names (assigned by position, in sorted sector-name order)
    data_coal_gen.columns = ['State Name', 'month', 'coal_net_gen_all', 'coal_net_gen_elc']

In [14]:
# Preview the first rows of the reshaped coal generation table.
data_coal_gen.head()

Unnamed: 0,State Name,month,coal_net_gen_all,coal_net_gen_elc
0,Alabama,1,2464000.0,2455000.0
1,Alabama,2,2302000.0,2293000.0
2,Alabama,3,2369000.0,2361000.0
3,Alabama,4,1976000.0,1969000.0
4,Alabama,5,2449000.0,2441000.0


### Natural Gas Generation

In [15]:
# Inline reshape of the natural gas generation table (the same steps as
# clean_rawdata_gen_df). It only applies to the raw wide layout: once the
# table has been reshaped, the 'Sector' and 'units' columns are gone and melt
# would fail, so the cell is skipped in that case and stays safe to re-run.
if 'Sector' in data_natgas_gen.columns:
    # Melt month columns
    data_natgas_gen = data_natgas_gen.melt(
        id_vars=['State Name', 'Sector', 'units'],
        value_vars=data_natgas_gen.columns[4:16],
        var_name='month',
        value_name='natgas_net_gen',
    )
    data_natgas_gen['month'] = data_natgas_gen['month'].apply(lambda x: month_key[x])

    # Convert to thousand kwh
    data_natgas_gen['natgas_net_gen'] = data_natgas_gen['natgas_net_gen'] * 1000

    # Pivot sector column. Aggregate only the value column so the text
    # 'units' column is never summed.
    data_natgas_gen['Sector'] = data_natgas_gen['Sector'].str.strip()
    data_natgas_gen = (
        data_natgas_gen.groupby(['State Name', 'Sector', 'month'])['natgas_net_gen']
        .sum()
        .unstack('Sector')
        .reset_index()
    )

    # Fix column names (assigned by position, in sorted sector-name order)
    data_natgas_gen.columns = ['State Name', 'month', 'natgas_net_gen_all', 'natgas_net_gen_elc']

In [16]:
# Preview the first rows of the reshaped natural gas generation table.
data_natgas_gen.head()

Unnamed: 0,State Name,month,natgas_net_gen_all,natgas_net_gen_elc
0,Alabama,1,4833000.0,4731000.0
1,Alabama,2,4185000.0,4090000.0
2,Alabama,3,4359000.0,4265000.0
3,Alabama,4,3685000.0,3597000.0
4,Alabama,5,4638000.0,4547000.0


In [34]:
# Normalise state names (upper case, no surrounding whitespace) before joining
# them to the state key.
data_coal_gen['State Name'] = data_coal_gen['State Name'].str.upper().str.strip()

# Attach the state codes and show the rows ordered by all-sector coal
# generation, smallest first. The join key is spelled out so the merge cannot
# silently switch to other shared columns; being an inner join, states missing
# from the key are dropped from this view.
(
    data_coal_gen
    .merge(state_codes_key, on='State Name', how='inner')
    .sort_values(by='coal_net_gen_all')
)

Unnamed: 0,State Name,month,coal_net_gen_all,coal_net_gen_elc,NCDC Code,State Abbreviation,State FIPS
74,CONNECTICUT,3,-3000.0,-3000.0,6,CT,9
75,CONNECTICUT,4,-2000.0,-2000.0,6,CT,9
76,CONNECTICUT,5,-2000.0,-2000.0,6,CT,9
77,CONNECTICUT,6,-2000.0,-2000.0,6,CT,9
81,CONNECTICUT,10,-2000.0,-2000.0,6,CT,9
94,DELAWARE,11,-2000.0,-2000.0,7,DE,10
82,CONNECTICUT,11,-1000.0,-1000.0,6,CT,9
516,VERMONT,1,0.0,0.0,43,VT,50
517,VERMONT,2,0.0,0.0,43,VT,50
518,VERMONT,3,0.0,0.0,43,VT,50
