In [7]:
import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import shapefile
import xarray as xr
from numba import vectorize, float64
from shapely.geometry import shape, Point

In [8]:
# Root of the data tree; the sub-folders below are derived from it
data_dir = '../data/'
wind_dir, gis_dir = data_dir + 'wind/', data_dir + 'state/gis/'

# Wind Speeds

## Download Data

### Wind Data

In [9]:
# NOAA FTP folder holding the NCEP reanalysis daily-average surface files
ftp_loc = 'ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/'
# 2016 files for the u and v wind components
uwind_filename, vwind_filename = 'uwnd.sig995.2016.nc', 'vwnd.sig995.2016.nc'

In [10]:
# Code to download
# urlretrieve does not create missing folders, so make sure the target exists
# (otherwise this cell fails on a fresh checkout).
os.makedirs(wind_dir, exist_ok=True)

print('Downloading wind data...')
for wind_filename in (uwind_filename, vwind_filename):
    # Save each remote file under the same name inside wind_dir
    urllib.request.urlretrieve(ftp_loc + wind_filename, wind_dir + wind_filename)
print('Complete')

Downloading wind data...
Complete


### Shapefile Data

In [11]:
# US Census county boundary shapefile: remote folder, archive name, and the
# local sub-folder (under gis_dir) it is stored in
url_loc = 'http://www2.census.gov/geo/tiger/GENZ2017/shp/'
shapefile_filename, shapefile_foldername = 'cb_2017_us_county_5m.zip', 'us_county_5m/'

In [12]:
# urlretrieve does not create missing folders, so make sure the target exists
# (otherwise this cell fails on a fresh checkout).
os.makedirs(gis_dir + shapefile_foldername, exist_ok=True)

print('Downloading shapefile data...')
# Save the zip archive next to where it will later be extracted
urllib.request.urlretrieve(url_loc + shapefile_filename, gis_dir + shapefile_foldername + shapefile_filename)
print('Complete')

Downloading shapefile data...
Complete


In [13]:
# unzip file
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(gis_dir + shapefile_foldername + shapefile_filename, 'r') as zip_ref:
    # Extract every member into the same folder that holds the archive
    zip_ref.extractall(gis_dir + shapefile_foldername)

## Convert .NC to Pandas DF

In [63]:
# Convert nc files to dataframes
def _nc_to_dataframe(nc_path):
    """Read a NetCDF file into a flat DataFrame and close the file afterwards.

    The dataset is converted with ``to_dataframe()`` and its index is moved
    into ordinary columns with ``reset_index()``. Opening the dataset in a
    ``with`` block releases the underlying file handle once the data has been
    copied into pandas.
    """
    with xr.open_dataset(nc_path) as dataset:
        return dataset.to_dataframe().reset_index()


uwind_df = _nc_to_dataframe(wind_dir + uwind_filename)
vwind_df = _nc_to_dataframe(wind_dir + vwind_filename)

In [64]:
# Element-wise magnitude: sqrt(a^2 + b^2)
@vectorize
def f_diag(a, b):
    """Return the square root of the sum of the squares of ``a`` and ``b``."""
    sum_of_squares = np.power(a, 2) + np.power(b, 2)
    return np.sqrt(sum_of_squares)

In [71]:
# Join the two component tables on their shared columns, then derive the
# wind speed from the u and v components
data_wind_speed = pd.merge(uwind_df, vwind_df)
data_wind_speed['wind_speed'] = f_diag(
    data_wind_speed['uwnd'].values,
    data_wind_speed['vwnd'].values,
)

In [72]:
# Convert longitudinal coordinates from [0:360] into [0:180 -180:0]
def convert_lon(x):
    """Shift a longitude greater than 180 down by 360; return any other value unchanged."""
    return x - 360 if x > 180 else x
    
# Apply the longitude conversion to every row
data_wind_speed['lon'] = data_wind_speed['lon'].apply(convert_lon)

In [73]:
# Preview the first rows of the merged u/v table, including the derived wind_speed column
data_wind_speed.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
0,90.0,0.0,0,2016-01-01,-0.449999,1893408.0,-8.449999,8.461973
1,90.0,0.0,0,2016-01-02,-0.699999,1893432.0,-2.049999,2.166216
2,90.0,0.0,0,2016-01-03,2.425002,1893456.0,3.725001,4.444802
3,90.0,0.0,0,2016-01-04,7.625001,1893480.0,1.350002,7.743587
4,90.0,0.0,0,2016-01-05,5.200001,1893504.0,-0.199999,5.203845


## Reverse Geocode

In [20]:
# Load the US Census county shapefile: geometries and their attribute records
r = shapefile.Reader(gis_dir + "us_county_5m/cb_2017_us_county_5m.shp")
shapes, records = r.shapes(), r.records()

In [21]:
def reverse_geocode_county(lat, lon, shapes, records, bbox = (-180, -60, 15, 73)):
    ''' Return the record of the first shape that contains the point (lon, lat).

    Parameters
    ----------
    lat, lon : number
        Coordinates of the point. ``lon`` is used as the x-coordinate and
        ``lat`` as the y-coordinate of the shapely ``Point``.
    shapes : sequence
        Geometries to test, each convertible with ``shapely.geometry.shape``.
    records : sequence
        Attribute records, indexed in parallel with ``shapes``. The docstring
        of the original notebook states these include the state and county
        FIPS codes.
    bbox : sequence of four numbers
        ``[lon_min, lon_max, lat_min, lat_max]``. Points outside this box are
        rejected without running any geometry test.

    Returns
    -------
    ``records[i]`` for the first ``shapes[i]`` containing the point, or a list
    of ``None`` with the same length as ``records[0]`` when the point lies
    outside ``bbox`` or inside no shape.
    '''
    lon_min, lon_max, lat_min, lat_max = bbox[0], bbox[1], bbox[2], bbox[3]

    # Cheap rejection first: skip the geometry scan for points outside the box
    if lat < lat_min or lat > lat_max or lon < lon_min or lon > lon_max:
        return [None]*len(records[0])

    # Build the point once instead of once per candidate shape
    point = Point(lon, lat)

    for i, county_sh in enumerate(shapes):
        # County of point located
        if shape(county_sh).contains(point):
            return records[i]

    # Inside the bounding box but not inside any shape
    return [None]*len(records[0])
        

In [22]:
# Keep one row per grid point: a single date and a single nbnds value,
# restricted to lat > 15 and lon > -180
data_wind_speed_temp = (
    data_wind_speed[data_wind_speed['time'] == '2016-06-01']
    .query('nbnds == 0 and lat > 15 and lon > -180')
    .copy()
)

data_wind_speed_temp.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
152,90.0,0.0,0,2016-06-01,1.125,1897056.0,-2.274999,2.537961
884,90.0,2.5,0,2016-06-01,1.025,1897056.0,-2.299999,2.518059
1616,90.0,5.0,0,2016-06-01,0.925,1897056.0,-2.324999,2.502248
2348,90.0,7.5,0,2016-06-01,0.8,1897056.0,-2.374999,2.506117
3080,90.0,10.0,0,2016-06-01,0.7,1897056.0,-2.399999,2.499999


In [33]:
# State FIPS code of each grid point: first field of the record returned by
# the reverse geocoder (None when no county matches)
data_wind_speed_temp['state_fips_code'] = data_wind_speed_temp.apply(
    lambda row: reverse_geocode_county(row.lat, row.lon, shapes, records)[0],
    axis = 1,
)

In [34]:
# Drop rows with missing values (grid points whose state lookup returned None),
# then take the county FIPS code from the second field of the matched record
data_wind_speed_temp = data_wind_speed_temp.dropna().copy()
data_wind_speed_temp['county_fips_code'] = data_wind_speed_temp.apply(
    lambda row: reverse_geocode_county(row.lat, row.lon, shapes, records)[1],
    axis = 1,
)

In [74]:
# Attach the FIPS codes found for each grid point to the observations of every
# date; the default inner join keeps only grid points present in the temp table
data_wind_speed = pd.merge(
    data_wind_speed_temp[['lat', 'lon', 'state_fips_code', 'county_fips_code']],
    data_wind_speed,
    on = ['lat', 'lon'],
)

## Clean Data

In [75]:
# Keep only the columns needed for the state-level monthly average
data_wind_speed = data_wind_speed[['state_fips_code', 'time', 'wind_speed']].copy()

# Fix State FIPS code (convert to a numeric dtype)
data_wind_speed['state_fips_code'] = pd.to_numeric(data_wind_speed['state_fips_code'])

# Get monthly average wind speed
# Assumes 'time' is a datetime64 column, as produced by xarray -- TODO confirm
data_wind_speed['month'] = data_wind_speed['time'].dt.month
# Aggregate wind_speed explicitly: a bare groupby(...).mean() depends on pandas
# dropping the datetime 'time' column, which newer pandas versions no longer do.
data_wind_speed = (
    data_wind_speed
    .groupby(['state_fips_code', 'month'])['wind_speed']
    .mean()
    .reset_index()
)

# Rename columns
data_wind_speed = data_wind_speed.rename(columns = {'state_fips_code': 'State FIPS', 'wind_speed': 'Avg_Wind_Speed'})

In [76]:
# Preview the first rows of the per-state, per-month average wind speed table
data_wind_speed.head()

Unnamed: 0,State FIPS,month,Avg_Wind_Speed
0,1,1,4.232283
1,1,2,4.52719
2,1,3,4.669978
3,1,4,3.731362
4,1,5,3.745946


# Wind Energy Generation

## Import Data

In [209]:
# Load the 2016 wind generation CSV from the wind data folder
wind_gen_file_loc = wind_dir + 'wind_generation_2016.csv'
data_wind_gen = pd.read_csv(filepath_or_buffer = wind_gen_file_loc)

In [210]:
# State FIPS key, plus the descriptive columns that are dropped again after merging
state_fips_indicators = ['State Abbreviation', 'State Name']
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)

# MSN codes key
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# Month name -> month number (1-12)
month_key = {
    month_name: month_number
    for month_number, month_name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'],
        start = 1,
    )
}

## Clean Data

In [211]:
# Melt month columns
# Columns 4..15 are taken positionally as the twelve month columns; their
# names must be keys of month_key (presumably 'January'..'December' -- verify
# against the CSV header).
data_wind_gen = data_wind_gen.melt(
    id_vars = ['State Name', 'Sector', 'units'],
    value_vars = data_wind_gen.columns[4:16],
    var_name = 'month',
    value_name = 'wind_net_gen',
)
data_wind_gen['month'] = data_wind_gen['month'].apply(lambda month_name: month_key[month_name])

# Pivot sector column
data_wind_gen['Sector'] = data_wind_gen['Sector'].str.strip()
# Sum wind_net_gen explicitly: a bare groupby(...).sum() depends on pandas
# dropping the string 'units' column, which newer pandas versions no longer do
# (the extra columns would then break the 4-name assignment below).
data_wind_gen = (
    data_wind_gen
    .groupby(['State Name', 'Sector', 'month'])['wind_net_gen']
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names
# Positional rename: assumes exactly two sectors, sorted so that the
# "all" sector precedes the "electric" one -- TODO confirm against the data.
data_wind_gen.columns = ['State Name', 'month', 'wind_net_gen_all', 'wind_net_gen_elc']
data_wind_gen.head()

Unnamed: 0,State Name,month,wind_net_gen_all,wind_net_gen_elc
0,Alabama,1,0.0,0.0
1,Alabama,2,0.0,0.0
2,Alabama,3,0.0,0.0
3,Alabama,4,0.0,0.0
4,Alabama,5,0.0,0.0


## Merge Data

In [212]:
# Normalise state names (upper case, surrounding whitespace removed), join the
# state FIPS key on the shared columns, then drop the descriptive state columns
data_wind_gen['State Name'] = data_wind_gen['State Name'].str.upper().str.strip()
data_wind_gen = pd.merge(data_wind_gen, state_fips_key).drop(state_fips_indicators, axis = 'columns')

# Combine generation and wind speed on their shared columns
data_wind = pd.merge(data_wind_gen, data_wind_speed)

# Export Data

In [213]:
# Write the merged table to the processed-data folder, with a header row and no index column
data_wind.to_csv(
    data_dir + 'processed/' + 'wind_data.csv',
    header = True,
    index = False,
)

## References

* Kalnay et al., The NCEP/NCAR 40-year reanalysis project, Bull. Amer. Meteor. Soc., 77, 437-470, 1996. [ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README](ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README)
* US Census, Cartographic Boundary Shapefiles - Counties, 2017. https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html 
* EIA, Electricity Data Browser. https://www.eia.gov/electricity/data/browser/