In [379]:
import datetime
import os

import numpy as np
import pandas as pd

In [380]:
# Root folder for all input and output files, plus one sub-folder per data source.
# Every path keeps its trailing slash so file names can be appended directly.
data_dir = '../data/'
solar_dir, elec_dir, tmpr_dir = (
    data_dir + subfolder
    for subfolder in ('solar/', 'electricity/', 'temperature/')
)

## Import Data

### Variables

In [381]:
# Solar Radiation Data
# '-' marks a missing value in this file; any row containing a gap is dropped.
solar_rad_file_loc = solar_dir + 'solar_radiation.csv'
data_solar_rad = pd.read_csv(solar_rad_file_loc, na_values = ['-']).dropna()

# Electricity Prices Data
# 'NA' marks a missing value in this file; rows are kept and cleaned later.
elc_pr_file_loc = elec_dir + 'avg_elec_price.csv'
data_elc_pr = pd.read_csv(elc_pr_file_loc, na_values = ['NA'])

# Solar Generation Data
solar_gen_file_loc = solar_dir + 'solar_generation_2016.csv'
data_solar_gen = pd.read_csv(solar_gen_file_loc)

# Temperature Data
# Fixed-width text files without a header row. Column 0 is forced to text so
# that its leading and trailing characters can be sliced apart during cleaning.
# (The former `index = None` argument is not a read_fwf parameter and had no
# effect, so it has been removed.)
# CDD
cdd_file_loc = tmpr_dir + 'CDD_State.txt'
data_cdd = pd.read_fwf(cdd_file_loc, header = None, converters = {0: str})
# HDD
hdd_file_loc = tmpr_dir + 'HDD_State.txt'
data_hdd = pd.read_fwf(hdd_file_loc, header = None, converters = {0: str})

### Keys

In [382]:
# Lookup table of MSN codes
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# Lookup table of state FIPS codes; the two descriptive columns listed in
# state_fips_indicators are dropped again once a data set has been keyed.
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)
state_fips_indicators = ['State Abbreviation', 'State Name']

# Lookup table of state NCDC codes, with state names upper-cased
state_ncdc_file_loc = data_dir + 'keys/state_NCDC_codes.csv'
state_ncdc_key = pd.read_csv(state_ncdc_file_loc).assign(
    **{'State Name': lambda key: key['State Name'].str.upper()}
)

# Single key combining the NCDC and FIPS tables (joined on their shared columns)
state_codes_key = pd.merge(state_ncdc_key, state_fips_key)

# Month name -> month number (January = 1, ..., December = 12)
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_key = {name: number for number, name in enumerate(month_names, start = 1)}

## Clean Data

### Solar Generation

In [383]:
# Melt month columns: one row per (state, sector, month).
# The month columns are selected by name (the keys of month_key) instead of by
# position, so an extra or reordered column in the CSV cannot shift the selection.
data_solar_gen = data_solar_gen.melt(
    id_vars = ['State Name', 'Sector', 'units'],
    value_vars = list(month_key),
    var_name = 'month',
    value_name = 'solar_net_gen',
)
data_solar_gen['month'] = data_solar_gen['month'].map(month_key)

# Convert to thousand kwh
# NOTE(review): assumes the source values are 1000x larger units (see the
# file's 'units' column) -- confirm.
data_solar_gen['solar_net_gen'] = data_solar_gen['solar_net_gen'] * 1000

# Pivot sector column: one generation column per sector.
# Only the numeric value column is summed. Summing the whole frame would also
# aggregate the text 'units' column on pandas >= 2.0 and break the fixed-length
# column rename below.
data_solar_gen['Sector'] = data_solar_gen['Sector'].str.strip()
data_solar_gen = (
    data_solar_gen
    .groupby(['State Name', 'Sector', 'month'])['solar_net_gen']
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names (sector columns come out of unstack in sorted order)
data_solar_gen.columns = ['State Name', 'month', 'solar_net_gen_all', 'solar_net_gen_elc']

In [384]:
# Preview the first rows of the cleaned solar generation table
data_solar_gen.head()

Unnamed: 0,State Name,month,solar_net_gen_all,solar_net_gen_elc
0,Alabama,1,0.0,0.0
1,Alabama,2,0.0,0.0
2,Alabama,3,0.0,0.0
3,Alabama,4,0.0,0.0
4,Alabama,5,0.0,0.0


### Solar Radiation

In [385]:
# Select the monthly "... Average (kWh/m2/day)" columns, leaving out the one
# whose label starts with 'Ann', and stack them into one row per state and month.
avg_suffix = 'Average (kWh/m2/day)'
data_solar_rad_avg_cols = [
    col for col in data_solar_rad.columns
    if col.endswith(avg_suffix) and not col.startswith('Ann')
]
data_solar_rad = pd.melt(
    data_solar_rad,
    id_vars = 'State',
    value_vars = data_solar_rad_avg_cols,
    var_name = 'month',
    value_name = 'solar_avg_rad',
)

# The text before the first space of each column label is the month name;
# translate it to its month number.
data_solar_rad['month'] = data_solar_rad['month'].apply(
    lambda label: month_key.get(label.partition(' ')[0])
)

# Express average solar radiation in thousands of kwh instead of kwh
data_solar_rad['solar_avg_rad'] = data_solar_rad['solar_avg_rad'].div(1000)

In [386]:
# Preview the first rows of the cleaned solar radiation table
data_solar_rad.head()

Unnamed: 0,State,month,solar_avg_rad
0,Alabama,1,0.00387
1,Arizona,1,0.00653
2,Arkansas,1,0.0035
3,California,1,0.00434
4,Colorado,1,0.00474


### Electricity Prices

In [387]:
# Melt month columns: one row per (state, sector, month).
# The month columns are selected by name (the keys of month_key) instead of by
# position, so an extra or reordered column in the CSV cannot shift the selection.
data_elc_pr = data_elc_pr.melt(
    id_vars = ['State Name', 'Sector', 'units'],
    value_vars = list(month_key),
    var_name = 'month',
    value_name = 'elc_price',
)
data_elc_pr['month'] = data_elc_pr['month'].map(month_key)

# Convert from cents per kwh to dollars per thousand kwh
data_elc_pr['elc_price'] = pd.to_numeric(data_elc_pr['elc_price']) * 1000 / 100

# Pivot sector column: one price column per sector.
# Only the numeric value column is summed. Summing the whole frame would also
# aggregate the text 'units' column on pandas >= 2.0 and break the fixed-length
# column rename below.
data_elc_pr['Sector'] = data_elc_pr['Sector'].str.strip()
data_elc_pr = (
    data_elc_pr
    .groupby(['State Name', 'Sector', 'month'])['elc_price']
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names (sector columns come out of unstack in sorted order)
data_elc_pr.columns = ['State Name', 'month', 'elc_price_all', 'elc_price_com', 'elc_price_ind', 'elc_price_oth', 'elc_price_res', 'elc_price_trn']

# Fill 0's with NaNs: a group whose prices are all missing sums to 0, which
# would otherwise read as a real price of zero.
data_elc_pr = data_elc_pr.replace(to_replace = 0, value = np.nan)

In [388]:
# Preview the first rows of the cleaned electricity price table
data_elc_pr.head()

Unnamed: 0,State Name,month,elc_price_all,elc_price_com,elc_price_ind,elc_price_oth,elc_price_res,elc_price_trn
0,Alabama,1,89.4,107.9,53.6,,110.0,
1,Alabama,2,90.1,108.8,52.8,,114.5,
2,Alabama,3,91.0,108.4,55.5,,120.7,
3,Alabama,4,92.6,110.5,58.7,,123.7,
4,Alabama,5,93.2,110.2,60.0,,120.5,


### Temperature Data

In [389]:
def tidy_degree_days(raw, value_name, year = '2016'):
    """Reshape a raw degree-day table into one row per (NCDC code, month).

    Parameters
    ----------
    raw : pandas.DataFrame
        Table read without a header row. Column 0 holds a record identifier
        whose first three characters are parsed as the integer 'NCDC Code'
        and whose last four characters are the year. After column 0, the
        first twelve columns are treated as the monthly values.
    value_name : str
        Name of the melted value column (for example 'CDD' or 'HDD').
    year : str, default '2016'
        Year to keep, compared as text with the last four characters of the
        record identifier.

    Returns
    -------
    pandas.DataFrame
        Columns 'NCDC Code', 'month' and `value_name`.
    """
    record_id = raw[0].astype(str)

    # Translate first column into its code and year parts
    tidy = raw.assign(**{
        'NCDC Code': record_id.str[:3].astype(int),
        'Year': record_id.str[-4:],
    })

    # Only keep the requested year, then drop the helper columns
    tidy = tidy[tidy['Year'] == year].drop([0, 'Year'], axis = 1)

    # Melt months columns (the first twelve columns left after the drop)
    return tidy.melt(
        id_vars = ['NCDC Code'],
        value_vars = tidy.columns[0:12],
        var_name = 'month',
        value_name = value_name,
    )

# Apply the same reshaping to both temperature tables
data_cdd = tidy_degree_days(data_cdd, 'CDD')
data_hdd = tidy_degree_days(data_hdd, 'HDD')

In [390]:
# Preview the first rows of both reshaped temperature tables
display(data_cdd.head())
display(data_hdd.head())

Unnamed: 0,NCDC Code,month,CDD
0,1,1,4.0
1,2,1,0.0
2,3,1,0.0
3,4,1,0.0
4,5,1,0.0


Unnamed: 0,NCDC Code,month,HDD
0,1,1,713.0
1,2,1,507.0
2,3,1,807.0
3,4,1,507.0
4,5,1,1190.0


## Merge Data

### Add FIPS codes

In [391]:
# Attach the state codes to every data set, then drop the descriptive state
# columns listed in state_fips_indicators so only the codes remain.
# Join keys are spelled out so that an unexpected shared column name can never
# silently change what the merge matches on. All joins are inner joins: rows
# whose key is missing from state_codes_key are dropped.

# For solar generation data
data_solar_gen['State Name'] = data_solar_gen['State Name'].str.upper().str.strip()
data_solar_gen = data_solar_gen.merge(state_codes_key, how = 'inner', on = 'State Name').drop(state_fips_indicators, axis = 1)

# For solar radiation data (its state column is called 'State')
data_solar_rad['State Name'] = data_solar_rad['State'].str.upper().str.strip()
data_solar_rad = data_solar_rad.drop(['State'], axis = 1).merge(state_codes_key, how = 'inner', on = 'State Name').drop(state_fips_indicators, axis = 1)

# For electricity prices
data_elc_pr['State Name'] = data_elc_pr['State Name'].str.upper().str.strip()
data_elc_pr = data_elc_pr.merge(state_codes_key, how = 'inner', on = 'State Name').drop(state_fips_indicators, axis = 1)

# For temperature data (already keyed by NCDC code rather than state name)
data_cdd = data_cdd.merge(state_codes_key, how = 'inner', on = 'NCDC Code').drop(state_fips_indicators, axis = 1)
data_hdd = data_hdd.merge(state_codes_key, how = 'inner', on = 'NCDC Code').drop(state_fips_indicators, axis = 1)

In [403]:
# Combine all data sets into one table with one row per state and month.
# Every table carries the same three key columns at this point; naming them
# explicitly keeps the join from depending on whichever columns happen to match.
merge_keys = ['NCDC Code', 'State FIPS', 'month']
data_solar = (
    data_solar_rad
    .merge(data_solar_gen, on = merge_keys)
    .merge(data_elc_pr, on = merge_keys)
    .merge(data_cdd, on = merge_keys)
    .merge(data_hdd, on = merge_keys)
)
data_solar.head()

Unnamed: 0,month,solar_avg_rad,NCDC Code,State FIPS,solar_net_gen_all,solar_net_gen_elc,elc_price_all,elc_price_com,elc_price_ind,elc_price_oth,elc_price_res,elc_price_trn,CDD,HDD
0,1,0.00387,1,1,0.0,0.0,89.4,107.9,53.6,,110.0,,4.0,713.0
1,2,0.00431,1,1,0.0,0.0,90.1,108.8,52.8,,114.5,,6.0,454.0
2,3,0.00477,1,1,0.0,0.0,91.0,108.4,55.5,,120.7,,50.0,238.0
3,4,0.00506,1,1,0.0,0.0,92.6,110.5,58.7,,123.7,,52.0,108.0
4,5,0.00499,1,1,0.0,0.0,93.2,110.2,60.0,,120.5,,174.0,27.0


## Export

In [404]:
# Write the merged table to disk without the row index.
# The output folder is created first so the export also works on a fresh
# checkout where 'processed/' does not exist yet.
processed_dir = data_dir + 'processed/'
os.makedirs(processed_dir, exist_ok = True)
data_solar.to_csv(processed_dir + 'solar_data.csv', index = False)

## References

* NREL. Solar Summaries. Solar Data. https://www.nrel.gov/gis/assets/docs/solarsummaries/solarsummaries.xlsx
* EIA. Average Price by State by Provider (EIA-861). Electricity Detailed State Data. https://www.eia.gov/electricity/data/state/avgprice_annual.xlsx
* EIA. Solar Energy Consumption. State Energy Data System (SEDS): 2016. https://www.eia.gov/state/seds/sep_fuel/html/csv/fuel_so.csv
* EIA. Codes and Descriptions. https://www.eia.gov/state/seds/CDF/Codes_and_Descriptions.xlsx
* US Census. ANSI Codes for States. https://www.census.gov/geo/reference/ansi_statetables.html