In [1]:
import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import shapefile
import xarray as xr
from numba import vectorize, float64
from shapely.geometry import shape, Point

In [23]:
# Root of the data tree; every other location in the notebook is derived from it
data_dir = '../data/'

# Sub-directories for the wind, state GIS and solar inputs
wind_dir, gis_dir, solar_dir = (
    data_dir + subdir for subdir in ('wind/', 'state/gis/', 'solar/')
)

## Keys

In [26]:
# MSN codes lookup table
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS codes lookup table.
# state_fips_indicators lists the descriptive columns that are dropped again
# once a dataset has been merged with this key.
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)
state_fips_indicators = ['State Abbreviation', 'State Name']

# State NCDC codes lookup table, with state names upper-cased before merging
state_ncdc_file_loc = data_dir + 'keys/state_NCDC_codes.csv'
state_ncdc_key = pd.read_csv(state_ncdc_file_loc)
state_ncdc_key['State Name'] = state_ncdc_key['State Name'].str.upper()

# NCDC and FIPS keys combined (merged on the columns the two tables share)
state_codes_key = state_ncdc_key.merge(state_fips_key)

# Month name -> month number (January = 1 ... December = 12)
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_key = {name: number for number, name in enumerate(month_names, start=1)}

# Wind Speeds

## Download Data

### Wind Data

In [3]:
# NOAA FTP directory holding the wind speed data, and the names of the
# u-wind / v-wind files that are fetched from it
ftp_loc = 'ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/'
uwind_filename, vwind_filename = 'uwnd.sig995.2016.nc', 'vwnd.sig995.2016.nc'

In [4]:
# Download the u-wind and v-wind files into wind_dir.
# The target directory is created on demand, and files that are already
# present are not fetched again, so re-running the notebook stays cheap.
print('Downloading wind data...')
os.makedirs(wind_dir, exist_ok=True)
for wind_filename in (uwind_filename, vwind_filename):
    wind_file_loc = wind_dir + wind_filename
    if os.path.exists(wind_file_loc):
        continue
    # Fetch under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file that looks like a finished download
    urllib.request.urlretrieve(ftp_loc + wind_filename, wind_file_loc + '.part')
    os.replace(wind_file_loc + '.part', wind_file_loc)
print('Complete')

Downloading wind data...
Complete


### Shapefile Data

In [5]:
# US Census location of the county shapefile archive
url_loc = 'http://www2.census.gov/geo/tiger/GENZ2017/shp/'

# Local folder (below gis_dir) and name of the archive stored in it
shapefile_foldername, shapefile_filename = 'us_county_5m/', 'cb_2017_us_county_5m.zip'

In [6]:
# Download the county shapefile archive into its folder below gis_dir.
# The folder is created on demand and an archive that is already present is
# not fetched again.
print('Downloading shapefile data...')
shapefile_dir = gis_dir + shapefile_foldername
shapefile_zip_loc = shapefile_dir + shapefile_filename
os.makedirs(shapefile_dir, exist_ok=True)
if not os.path.exists(shapefile_zip_loc):
    # Fetch under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated archive behind
    urllib.request.urlretrieve(url_loc + shapefile_filename, shapefile_zip_loc + '.part')
    os.replace(shapefile_zip_loc + '.part', shapefile_zip_loc)
print('Complete')

Downloading shapefile data...
Complete


In [7]:
# Unzip the downloaded archive into the folder it was saved in.
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(gis_dir + shapefile_foldername + shapefile_filename, 'r') as zip_ref:
    zip_ref.extractall(gis_dir + shapefile_foldername)

## Convert .NC to Pandas DF

In [8]:
# Convert nc files to dataframes.
# Each dataset is opened in a context manager so the underlying file is
# closed as soon as its contents have been copied into a DataFrame.
with xr.open_dataset(wind_dir + uwind_filename) as uwind_ds:
    uwind_df = uwind_ds.to_dataframe().reset_index()

with xr.open_dataset(wind_dir + vwind_filename) as vwind_ds:
    vwind_df = vwind_ds.to_dataframe().reset_index()

In [9]:
# Function to do sqrt(a^2 + b^2)
def f_diag(a, b):
    """Return the element-wise Euclidean norm sqrt(a**2 + b**2).

    Parameters
    ----------
    a, b : scalar or array-like
        The two components; arrays are combined element by element
        following NumPy broadcasting rules.

    Returns
    -------
    Scalar or numpy array with the magnitude of each (a, b) pair.
    """
    # np.hypot is NumPy's own ufunc for this: it works on arrays without a
    # JIT compilation step and does not overflow when squaring large inputs
    return np.hypot(a, b)

In [10]:
# Join the u and v components (on the columns both frames share) and derive
# the scalar wind speed from the two of them
data_wind_speed = uwind_df.merge(vwind_df)
u_component = data_wind_speed['uwnd'].values
v_component = data_wind_speed['vwnd'].values
data_wind_speed['wind_speed'] = f_diag(u_component, v_component)

In [11]:
# Convert longitudinal coordinates from [0:360] into [0:180 -180:0]
def convert_lon(x):
    """Shift a longitude above 180 down by 360; return any other value unchanged."""
    return x - 360 if x > 180 else x
    
# Rewrite every longitude in place using convert_lon
data_wind_speed['lon'] = data_wind_speed['lon'].apply(convert_lon)

In [12]:
# Preview the first rows, including the derived wind_speed column and the converted lon
data_wind_speed.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
0,90.0,0.0,0,2016-01-01,-0.449999,1893408.0,-8.449999,8.461973
1,90.0,0.0,0,2016-01-02,-0.699999,1893432.0,-2.049999,2.166216
2,90.0,0.0,0,2016-01-03,2.425002,1893456.0,3.725001,4.444802
3,90.0,0.0,0,2016-01-04,7.625001,1893480.0,1.350002,7.743587
4,90.0,0.0,0,2016-01-05,5.200001,1893504.0,-0.199999,5.203845


## Reverse Geocode

In [13]:
# Load the US Census county shapefile: one geometry and one attribute record per county
county_shp_loc = gis_dir + "us_county_5m/cb_2017_us_county_5m.shp"
r = shapefile.Reader(county_shp_loc)
shapes, records = r.shapes(), r.records()

In [14]:
def reverse_geocode_county(lat, lon, shapes, records, bbox = (-180, -60, 15, 73)):
    ''' Returns the record associated with a given lat and lon,
    including FIPS codes for the state and county

    Parameters:
        lat, lon: coordinates of the point to look up
        shapes:   county geometries, as returned by the shapefile reader
        records:  attribute records, in the same order as shapes
        bbox:     [min lon, max lon, min lat, max lat]; points outside this box
                  are rejected without testing any county geometry

    Returns:
        The record of the first county whose shape contains the point, or a
        list of None with one entry per record field when there is no match
    '''
    # Placeholder with the same length as a real record, so callers can index
    # the result the same way whether or not a county was found
    no_match = [None]*len(records[0])

    # check if point is inside the bounding box before doing any geometry work
    min_lon, max_lon, min_lat, max_lat = bbox[:4]
    if lat < min_lat or lat > max_lat or lon < min_lon or lon > max_lon:
        return no_match

    # The query point is the same for every county, so build it only once
    point = Point(lon, lat)

    for i, county_sh in enumerate(shapes):
        # County of point located
        if shape(county_sh).contains(point):
            return records[i]

    # Point inside the bounding box but not inside any county
    return no_match
        

In [15]:
# Build one row per grid point: keep a single day and a single nbnds slice,
# restricted to points with lat > 15 and lon > -180
unique_point_mask = (
    (data_wind_speed['time'] == '2016-06-01')
    & (data_wind_speed['nbnds'] == 0)
    & (data_wind_speed['lat'] > 15)
    & (data_wind_speed['lon'] > -180)
)
data_wind_speed_temp = data_wind_speed[unique_point_mask].copy()

data_wind_speed_temp.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
152,90.0,0.0,0,2016-06-01,1.125,1897056.0,-2.274999,2.537961
884,90.0,2.5,0,2016-06-01,1.025,1897056.0,-2.299999,2.518059
1616,90.0,5.0,0,2016-06-01,0.925,1897056.0,-2.324999,2.502248
2348,90.0,7.5,0,2016-06-01,0.8,1897056.0,-2.374999,2.506117
3080,90.0,10.0,0,2016-06-01,0.7,1897056.0,-2.399999,2.499999


In [16]:
# Get state of each observation: the first field of the matched county record
# (None when the point is matched to no county)
def _state_fips_of(row):
    return reverse_geocode_county(row.lat, row.lon, shapes, records)[0]

data_wind_speed_temp['state_fips_code'] = data_wind_speed_temp.apply(_state_fips_of, axis = 1)

In [17]:
# Discard rows with missing values (this removes points that were matched to
# no state), then look up the county: the second field of the matched record
data_wind_speed_temp = data_wind_speed_temp.dropna().copy()

def _county_fips_of(row):
    return reverse_geocode_county(row.lat, row.lon, shapes, records)[1]

data_wind_speed_temp['county_fips_code'] = data_wind_speed_temp.apply(_county_fips_of, axis = 1)

In [18]:
# Attach the FIPS codes found for each grid point to the observations of every
# date; the inner merge keeps only the grid points that were geocoded
grid_point_fips = data_wind_speed_temp[['lat', 'lon', 'state_fips_code', 'county_fips_code']]
data_wind_speed = grid_point_fips.merge(data_wind_speed, on = ['lat', 'lon'])

## Clean Data

In [19]:
# Drop unnecessary columns
data_wind_speed = data_wind_speed.drop(['lat', 'lon', 'county_fips_code', 'nbnds', 'time_bnds'], axis = 1)

# Fix State FIPS code
data_wind_speed['state_fips_code'] = pd.to_numeric(data_wind_speed['state_fips_code'])

# Get monthly average wind speed.
# Only wind_speed is aggregated: averaging the whole frame depends on pandas
# silently discarding the datetime 'time' column, which newer pandas versions
# no longer do, so 'time' would end up in the result.
data_wind_speed['month'] = data_wind_speed['time'].dt.month
data_wind_speed = (
    data_wind_speed
    .groupby(['state_fips_code', 'month'])['wind_speed']
    .mean()
    .reset_index()
)

# Rename columns
data_wind_speed = data_wind_speed.rename(columns = {'state_fips_code': 'State FIPS', 'wind_speed': 'Avg_Wind_Speed'})

In [20]:
# Preview the monthly average wind speed per state FIPS code
data_wind_speed.head()

Unnamed: 0,State FIPS,month,Avg_Wind_Speed
0,1,1,4.232283
1,1,2,4.52719
2,1,3,4.669978
3,1,4,3.731362
4,1,5,3.745946


# Solar Radiation

## Import Data

In [74]:
# Solar radiation data: '-' entries are read as missing, and rows containing
# any missing value are discarded
solar_rad_file_loc = solar_dir + 'solar_radiation.csv'
solar_rad_raw = pd.read_csv(solar_rad_file_loc, na_values = ['-'])
data_solar_rad = solar_rad_raw.dropna()

## Clean Data

In [75]:
# Monthly average columns: names ending in 'Average (kWh/m2/day)', excluding
# those that start with 'Ann'
data_solar_rad_avg_cols = [
    col for col in data_solar_rad.columns
    if col.endswith('Average (kWh/m2/day)') and not col.startswith('Ann')
]

# Reshape from one column per month to one row per (State, month)
data_solar_rad = data_solar_rad.melt(
    id_vars = 'State',
    value_vars = data_solar_rad_avg_cols,
    var_name = 'month',
    value_name = 'solar_avg_rad',
)

# The first word of each melted column name is the month name; map it to its number
def _month_number(col_name):
    return month_key.get(col_name.split(' ')[0])

data_solar_rad['month'] = data_solar_rad['month'].apply(_month_number)

# Convert average solar radiation (kwh) to thousands of kwh
data_solar_rad['solar_avg_rad'] = data_solar_rad['solar_avg_rad'] / 1000

In [76]:
# Preview the melted solar radiation data (one row per State and month)
data_solar_rad.head()

Unnamed: 0,State,month,solar_avg_rad
0,Alabama,1,0.00387
1,Arizona,1,0.00653
2,Arkansas,1,0.0035
3,California,1,0.00434
4,Colorado,1,0.00474


# State Factors

In [77]:
# Locations of the state-level population and area tables
pop_data_file_loc = data_dir + 'state/state_population.csv'
area_data_file_loc = data_dir + 'state/state_area.csv'

# Load both tables
pop_data = pd.read_csv(pop_data_file_loc)
area_data = pd.read_csv(area_data_file_loc)

In [78]:
# Convert areas from km2 to m2 in every column whose name contains 'Area'
M2_PER_KM2 = 1000**2
area_cols = [col for col in area_data.columns if 'Area' in col]
for col in area_cols:
    area_data[col] = area_data[col].apply(lambda x: float(x)*M2_PER_KM2)

# Merge Data

In [80]:
# The wind data already carries a 'State FIPS' column, so it needs no key merge.

def _attach_state_fips(df):
    '''Merge df with the state FIPS key (on the columns they share) and drop
    the state name / abbreviation indicator columns.'''
    return df.merge(state_fips_key).drop(state_fips_indicators, axis = 1)

# Solar radiation data: normalise the state name, then merge with the combined state codes key
data_solar_rad['State Name'] = data_solar_rad['State'].str.upper().str.strip()
data_solar_rad = (
    data_solar_rad
    .drop(['State'], axis = 1)
    .merge(state_codes_key)
    .drop(state_fips_indicators, axis = 1)
)

# Population data: the first character of each state name is removed before upper-casing
pop_data = pop_data.rename(columns = {'State': 'State Name'})
pop_data['State Name'] = pop_data['State Name'].apply(lambda x: x[1:]).str.upper()
pop_data = _attach_state_fips(pop_data)

# Area data
area_data['State Name'] = area_data['State Name'].str.upper()
area_data = _attach_state_fips(area_data)

# Combine everything; each merge joins on the columns the two frames have in common
data_merged = data_wind_speed.merge(data_solar_rad)
data_merged = data_merged.merge(area_data)
data_merged = data_merged.merge(pop_data)

# Export Data

In [83]:
# Write the merged table to the processed-data folder, with a header row and without the index
instruments_file_loc = data_dir + 'processed/' + 'instruments_data.csv'
data_merged.to_csv(instruments_file_loc, index = False, header = True)

## References

* Kalnay et al., The NCEP/NCAR 40-year reanalysis project, Bull. Amer. Meteor. Soc., 77, 437-470, 1996. [ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README](ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README)
* US Census, Cartographic Boundary Shapefiles - Counties, 2017. https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html 
* EIA, Electricity Data Browser. https://www.eia.gov/electricity/data/browser/