In [288]:
import numpy as np
import pandas as pd
import datetime

In [289]:
# Root data folder; each sub-folder path below is the root with a folder name appended
data_dir = '../data/'
solar_dir, elec_dir, tmpr_dir = (
    data_dir + folder for folder in ('solar/', 'electricity/', 'temperature/')
)

## Import Data

### Variables

In [290]:
# Solar Radiation Data
# '-' is read as a missing value; any row containing a missing value is dropped.
solar_rad_file_loc = solar_dir + 'solar_radiation.csv'
data_solar_rad = pd.read_csv(solar_rad_file_loc, na_values = ['-']).dropna()

# Electricity Prices Data
# 'NA' is read as a missing value; rows are kept.
elc_pr_file_loc = elec_dir + 'state_prices.csv'
data_elc_pr = pd.read_csv(elc_pr_file_loc, na_values = ['NA'])

# Solar Generation Data
solar_gen_file_loc = solar_dir + 'solar_generation_2016.csv'
data_solar_gen = pd.read_csv(solar_gen_file_loc)

## Temperature Data
# Both files are fixed-width text without a header row, so columns get integer labels.
# Column 0 is passed through `str` so it stays text instead of being parsed as a number.
# `index` is not a `read_fwf` parameter (that keyword belongs to `to_csv`), so it is not passed.
# CDD
cdd_file_loc = tmpr_dir + 'CDD_State.txt'
data_cdd = pd.read_fwf(cdd_file_loc, header = None, converters = {0: str})
# HDD
hdd_file_loc = tmpr_dir + 'HDD_State.txt'
data_hdd = pd.read_fwf(hdd_file_loc, header = None, converters = {0: str})

### Keys

In [291]:
# MSN codes lookup table
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS lookup table, plus the names of the two state-identifier columns
# that are dropped again once a data set has been joined to the key
state_fips_indicators = ['State Abbreviation', 'State Name']
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)

# State NCDC lookup table, with state names upper-cased
# (presumably to match the spelling used in the FIPS key -- confirm against the files)
state_ncdc_file_loc = data_dir + 'keys/state_NCDC_codes.csv'
state_ncdc_key = pd.read_csv(state_ncdc_file_loc).assign(
    **{'State Name': lambda key: key['State Name'].str.upper()}
)

# Combined key: NCDC and FIPS tables joined on whichever columns they have in common
state_codes_key = pd.merge(state_ncdc_key, state_fips_key)

## Clean Data

### Solar Generation

In [292]:
# Attach the MSN code descriptions and order the rows by state
data_solar_gen = pd.merge(data_solar_gen, msn_codes_key).sort_values('State')

# Reshape to one row per state and one column per MSN code, holding the 2016 value
data_solar_gen = (
    data_solar_gen
    .rename(columns = {'2016': 'Energy'})
    .pivot(index = 'State', columns = 'MSN', values = 'Energy')
    .reset_index()
)

# Consumption columns: each source series is multiplied by one shared factor.
# NOTE(review): the original notebook describes the result as kWh; the unit of the
# source series is not visible here -- confirm the factor against the SEDS documentation.
consumption_factor = 293.29722
consumption_sources = [
    ('Consumption_com', 'SOCCB'),
    ('Consumption_ind', 'SOICB'),
    ('Consumption_res', 'SORCB'),
    ('Consumption_elc', 'SOEGB'),
    ('Consumption_tot', 'SOTCB'),
]
for target_col, msn_code in consumption_sources:
    data_solar_gen[target_col] = data_solar_gen[msn_code]*consumption_factor

# Net generation columns: copied from their source series unchanged
generation_sources = [
    ('Net_Generation_com', 'SOCCP'),
    ('Net_Generation_ind', 'SOICP'),
    ('Net_Generation_res', 'SOR7P'),
    ('Net_Generation_elc', 'SOEGP'),
    ('Net_Generation_tot', 'SOTGP'),
]
for target_col, msn_code in generation_sources:
    data_solar_gen[target_col] = data_solar_gen[msn_code]

# Keep 'State' and the derived columns: every column whose name starts with "SO" is removed
raw_msn_cols = [name for name in data_solar_gen.columns if name.startswith("SO")]
data_solar_gen = data_solar_gen.drop(columns = raw_msn_cols)

In [293]:
# Preview the first rows of the cleaned solar generation table
data_solar_gen.head()

MSN,State,Consumption_com,Consumption_ind,Consumption_res,Consumption_elc,Consumption_tot,Net_Generation_com,Net_Generation_ind,Net_Generation_res,Net_Generation_elc,Net_Generation_tot
0,AK,1466.486,0.0,2639.675,0.0,4106.161,0.0,0.0,1.0,0.0,1.0
1,AL,14078.27,293.2972,26983.34,84762.9,125824.5,5.0,0.0,2.0,31.0,39.0
2,AR,8212.322,0.0,29623.02,70684.63,108813.3,3.0,0.0,4.0,26.0,33.0
3,AZ,1335089.0,461356.5,3549776.0,10132250.0,15478470.0,493.0,170.0,964.0,3742.0,5369.0
4,CA,4941178.0,3757724.0,19069010.0,50572070.0,78339980.0,1825.0,1388.0,5147.0,18677.0,27037.0


### Solar Radiation

In [294]:
# Keep the state plus every column whose name ends in 'Average (kWh/m2/day)'
data_solar_avg_cols = [name for name in data_solar_rad.columns if name.endswith('Average (kWh/m2/day)')]
data_solar_rad = data_solar_rad.loc[:, ['State'] + data_solar_avg_cols]

# Shorten the names: strip the trailing ' Average (kWh/m2/day)' (21 characters) and append '_Avg_Rad'
short_names = {old: old[:-21] + '_Avg_Rad' for old in data_solar_avg_cols}
data_solar_rad = data_solar_rad.rename(columns = short_names)

# Convert to yearly, millions of kWh: daily value times 365, scaled by 10**-6
rad_cols = [name for name in data_solar_rad.columns if name.endswith("Rad")]
for name in rad_cols:
    data_solar_rad[name] = data_solar_rad[name]*365*(10**-6)

In [295]:
# Preview the first rows of the renamed and rescaled solar radiation table
data_solar_rad.head()

Unnamed: 0,State,Annual_Avg_Rad,January_Avg_Rad,February_Avg_Rad,March_Avg_Rad,April_Avg_Rad,May_Avg_Rad,June_Avg_Rad,July_Avg_Rad,August_Avg_Rad,September_Avg_Rad,October_Avg_Rad,November_Avg_Rad,December_Avg_Rad
0,Alabama,0.001621,0.001413,0.001573,0.001741,0.001847,0.001821,0.001756,0.00165,0.001679,0.001526,0.001591,0.00154,0.001296
2,Arizona,0.002792,0.002383,0.002431,0.002832,0.003092,0.00338,0.003555,0.002774,0.002584,0.002832,0.002767,0.00257,0.002285
3,Arkansas,0.001632,0.001277,0.001358,0.001522,0.001789,0.001686,0.001916,0.002022,0.002037,0.001756,0.001591,0.001347,0.001267
4,California,0.002427,0.001584,0.001672,0.00223,0.002467,0.002909,0.003263,0.003259,0.003095,0.002854,0.002362,0.001836,0.001551
5,Colorado,0.002281,0.00173,0.001803,0.002128,0.002365,0.002606,0.002997,0.002796,0.002489,0.002562,0.002274,0.001953,0.001664


### Electricity Prices

In [296]:
# Get data for 2016 of total electric industry.
# The sector filter is a vectorized column comparison (no row-wise apply), and `.copy()`
# makes the result an independent frame so the column assignments below never write into a slice.
is_total_industry = data_elc_pr['Industry Sector Category'] == "Total Electric Industry"
data_elc_pr = data_elc_pr[is_total_industry].query('Year == 2016').copy()

# Rename columns
# NOTE(review): 'Industial_Price' is misspelled. It is kept as-is because the name ends up
# in the exported data set -- confirm downstream usage before correcting it.
data_elc_pr = data_elc_pr.rename(columns = {'Residential': 'Residential_Price', 'Commercial': 'Commercial_Price', 'Industrial': 'Industial_Price',
                                           'Transportation': 'Transportation_Price', 'Total(cents/kWH)': 'Total_Price'})

# Change cost from cents per kWH to dollars per million kWH
price_cols = [col for col in data_elc_pr.columns if 'Price' in col]
for col in price_cols:
    data_elc_pr[col] = data_elc_pr[col].astype(float)*(1/100)*(10**6)

# Drop unnecessary columns
data_elc_pr = data_elc_pr.drop(['Year', 'Other', 'Industry Sector Category'], axis = 1)

In [297]:
# Preview the first rows of the cleaned electricity price table
data_elc_pr.head()

Unnamed: 0,State,Residential_Price,Commercial_Price,Industial_Price,Transportation_Price,Total_Price
0,AK,203000.0,175600.0,152200.0,0.0,179300.0
1,AL,119900.0,111100.0,60400.0,0.0,95600.0
2,AR,99200.0,82300.0,60800.0,104000.0,81300.0
3,AZ,121500.0,104100.0,60700.0,99300.0,103300.0
4,CA,173900.0,150700.0,119200.0,98000.0,152300.0


### Temperature Data

In [298]:
def _clean_degree_days(raw, month_names, annual_col, year = "2016"):
    """Return one year of a raw degree-day table with readable column names.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table with integer column labels: column 0 holds a record identifier,
        columns 1-12 hold the twelve monthly values.
    month_names : dict
        Maps the integer column labels 1-12 to the output column names.
    annual_col : str
        Name of the added column holding the mean of the twelve monthly columns.
    year : str
        Year to keep; compared with the last four characters of column 0.

    Returns
    -------
    pandas.DataFrame
        The rows for `year`, with the monthly columns renamed, column 0 removed,
        and the columns 'NCDC Code', 'Year' and `annual_col` added.

    Raises
    ------
    KeyError
        If column 0 or any of the monthly columns is missing from `raw`.
    """
    cleaned = raw.copy()

    # Translate first column: first three characters are the NCDC code, last four the year
    record_id = cleaned[0].astype(str)
    cleaned['NCDC Code'] = record_id.str[:3].map(int)
    cleaned['Year'] = record_id.str[-4:]

    # Only keep the requested year, then rename the months and drop the raw identifier
    cleaned = (
        cleaned[cleaned['Year'] == year]
        .rename(columns = month_names)
        .drop(columns = [0])
    )

    # Add annual average, selecting the monthly columns by name rather than by position
    cleaned[annual_col] = cleaned[list(month_names.values())].mean(axis = 1)
    return cleaned

# Column names for the twelve months (month names come from strftime('%B'), which follows the active locale)
months_dict = {i: datetime.date(2016, i, 1).strftime('%B') for i in range(1,13)}
cdd_months_dict = {i: 'CDD_' + month for (i, month) in months_dict.items()}
hdd_months_dict = {i: 'HDD_' + month for (i, month) in months_dict.items()}

# Clean both tables the same way, keeping only 2016 data
data_cdd = _clean_degree_days(data_cdd, cdd_months_dict, 'Annual_Avg_CDD')
data_hdd = _clean_degree_days(data_hdd, hdd_months_dict, 'Annual_Avg_HDD')

In [299]:
# Preview the first rows of the cleaned CDD table, then of the HDD table
display(data_cdd.head(), data_hdd.head())

Unnamed: 0,CDD_January,CDD_February,CDD_March,CDD_April,CDD_May,CDD_June,CDD_July,CDD_August,CDD_September,CDD_October,CDD_November,CDD_December,NCDC Code,Year,Annual_Avg_CDD
121,4.0,6.0,50.0,52.0,174.0,420.0,518.0,505.0,395.0,150.0,22.0,9.0,1,2016,192.083333
245,0.0,31.0,73.0,118.0,236.0,647.0,743.0,599.0,409.0,281.0,43.0,0.0,2,2016,265.0
369,0.0,1.0,27.0,44.0,107.0,410.0,520.0,454.0,316.0,121.0,11.0,0.0,3,2016,167.583333
493,0.0,10.0,6.0,22.0,33.0,201.0,282.0,268.0,145.0,47.0,10.0,0.0,4,2016,85.333333
617,0.0,0.0,0.0,0.0,0.0,109.0,158.0,63.0,27.0,1.0,0.0,0.0,5,2016,29.833333


Unnamed: 0,HDD_January,HDD_February,HDD_March,HDD_April,HDD_May,HDD_June,HDD_July,HDD_August,HDD_September,HDD_October,HDD_November,HDD_December,NCDC Code,Year,Annual_Avg_HDD
121,713.0,454.0,238.0,108.0,27.0,0.0,0.0,0.0,0.0,52.0,253.0,504.0,1,2016,195.75
245,507.0,226.0,164.0,101.0,35.0,0.0,0.0,0.0,4.0,21.0,188.0,408.0,2,2016,137.833333
369,807.0,521.0,338.0,133.0,60.0,0.0,0.0,1.0,5.0,67.0,305.0,723.0,3,2016,246.666667
493,507.0,270.0,336.0,212.0,59.0,4.0,1.0,0.0,21.0,127.0,267.0,524.0,4,2016,194.0
617,1190.0,888.0,842.0,639.0,50.0,0.0,2.0,7.0,70.0,399.0,724.0,244.0,5,2016,421.25


## Merge Data

### Add FIPS codes

In [300]:
def _attach_state_codes(frame):
    """Merge `frame` with `state_codes_key` on their shared columns, then drop the columns named in `state_fips_indicators`."""
    return frame.merge(state_codes_key).drop(columns = state_fips_indicators)

# Solar generation: 'State' is renamed to 'State Abbreviation' before the merge
data_solar_gen = _attach_state_codes(
    data_solar_gen.rename(columns = {'State': 'State Abbreviation'})
)

# Solar radiation: 'State' is renamed to 'State Name' and upper-cased before the merge
data_solar_rad = _attach_state_codes(
    data_solar_rad
    .rename(columns = {'State': 'State Name'})
    .assign(**{'State Name': lambda frame: frame['State Name'].str.upper()})
)

# Electricity prices: 'State' is renamed to 'State Abbreviation' before the merge
data_elc_pr = _attach_state_codes(
    data_elc_pr.rename(columns = {'State': 'State Abbreviation'})
)

# Temperature data: merged as-is on the columns each table shares with the key
data_cdd = _attach_state_codes(data_cdd)
data_hdd = _attach_state_codes(data_hdd)

In [301]:
# Start from the solar radiation table and join each remaining table in turn.
# Every merge uses the default settings, i.e. an inner join on the columns both sides share.
data_solar = data_solar_rad
for next_table in (data_solar_gen, data_elc_pr, data_cdd, data_hdd):
    data_solar = data_solar.merge(next_table)
data_solar.head()

Unnamed: 0,Annual_Avg_Rad,January_Avg_Rad,February_Avg_Rad,March_Avg_Rad,April_Avg_Rad,May_Avg_Rad,June_Avg_Rad,July_Avg_Rad,August_Avg_Rad,September_Avg_Rad,...,HDD_April,HDD_May,HDD_June,HDD_July,HDD_August,HDD_September,HDD_October,HDD_November,HDD_December,Annual_Avg_HDD
0,0.001621,0.001413,0.001573,0.001741,0.001847,0.001821,0.001756,0.00165,0.001679,0.001526,...,108.0,27.0,0.0,0.0,0.0,0.0,52.0,253.0,504.0,195.75
1,0.002792,0.002383,0.002431,0.002832,0.003092,0.00338,0.003555,0.002774,0.002584,0.002832,...,101.0,35.0,0.0,0.0,0.0,4.0,21.0,188.0,408.0,137.833333
2,0.001632,0.001277,0.001358,0.001522,0.001789,0.001686,0.001916,0.002022,0.002037,0.001756,...,133.0,60.0,0.0,0.0,1.0,5.0,67.0,305.0,723.0,246.666667
3,0.002427,0.001584,0.001672,0.00223,0.002467,0.002909,0.003263,0.003259,0.003095,0.002854,...,212.0,59.0,4.0,1.0,0.0,21.0,127.0,267.0,524.0,194.0
4,0.002281,0.00173,0.001803,0.002128,0.002365,0.002606,0.002997,0.002796,0.002489,0.002562,...,639.0,50.0,0.0,2.0,7.0,70.0,399.0,724.0,244.0,421.25


## Export

In [302]:
# Write the merged table to the processed-data folder, without the row index
processed_file_loc = data_dir + 'processed/solar_data.csv'
data_solar.to_csv(processed_file_loc, index = False)

## References

* NREL. Solar Summaries. Solar Data. https://www.nrel.gov/gis/assets/docs/solarsummaries/solarsummaries.xlsx
* EIA. Average Price by State by Provider (EIA-861). Electricity Detailed State Data. https://www.eia.gov/electricity/data/state/avgprice_annual.xlsx
* EIA. Solar Energy Consumption. State Energy Data System (SEDS): 2016. https://www.eia.gov/state/seds/sep_fuel/html/csv/fuel_so.csv
* EIA. Codes and Descriptions. https://www.eia.gov/state/seds/CDF/Codes_and_Descriptions.xlsx
* US Census. ANSI Codes for States. https://www.census.gov/geo/reference/ansi_statetables.html