In [26]:
import numpy as np
import xarray as xr
import pandas as pd
import urllib.request
from numba import vectorize, float64
import shapefile
from shapely.geometry import shape, Point
import zipfile

In [27]:
data_dir = '../data/'
wind_dir = data_dir + 'wind/'
gis_dir  = data_dir + 'gis/'

# Wind Speeds

## Download Data

### Wind Data

In [3]:
# Get wind speed data from NOAA through FTP server
ftp_loc = 'ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/'
uwind_filename = 'uwnd.sig995.2016.nc'
vwind_filename = 'vwnd.sig995.2016.nc'

In [4]:
# Code to download
print('Downloading wind data...')
urllib.request.urlretrieve(ftp_loc + uwind_filename, wind_dir + uwind_filename)
urllib.request.urlretrieve(ftp_loc + vwind_filename, wind_dir + vwind_filename)
print('Complete')

Downloading wind data...
Complete


### Shapefile Data

In [5]:
# Get US shapefile from US Census
url_loc = 'http://www2.census.gov/geo/tiger/GENZ2017/shp/'
shapefile_filename = 'cb_2017_us_county_5m.zip'
shapefile_foldername = 'us_county_5m/'

In [6]:
print('Downloading shapefile data...')
urllib.request.urlretrieve(url_loc + shapefile_filename, gis_dir + shapefile_foldername + shapefile_filename)
print('Complete')

Downloading shapefile data...
Complete


In [7]:
# unzip file
zip_ref = zipfile.ZipFile(gis_dir + shapefile_foldername + shapefile_filename, 'r')
zip_ref.extractall(gis_dir + shapefile_foldername)
zip_ref.close()

## Convert .NC to Pandas DF

In [28]:
# Convert nc files to dataframes
uwind_df = xr.open_dataset(wind_dir + uwind_filename).to_dataframe().reset_index()
vwind_df = xr.open_dataset(wind_dir + vwind_filename).to_dataframe().reset_index()

In [29]:
# Function to do sqrt(a^2 + b^2)
@vectorize
def f_diag(a,b): 
    return np.sqrt(np.power(a, 2) + np.power(b, 2))

In [30]:
# Find wind speed using u_speed and v_speed
data_wind_speed = uwind_df.merge(vwind_df)
data_wind_speed['wind_speed'] = f_diag(data_wind_speed['uwnd'].values, data_wind_speed['vwnd'].values)

In [31]:
# Convert longitudinal coordinates from [0:365] into [0:180 -180:0]
def convert_lon(x):
    if x > 180:
        return x - 360
    else: 
        return x
    
data_wind_speed['lon'] = data_wind_speed['lon'].apply(lambda x: convert_lon(x))

In [32]:
data_wind_speed.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
0,90.0,0.0,0,2016-01-01,-0.449999,1893408.0,-8.449999,8.461973
1,90.0,0.0,0,2016-01-02,-0.699999,1893432.0,-2.049999,2.166216
2,90.0,0.0,0,2016-01-03,2.425002,1893456.0,3.725001,4.444802
3,90.0,0.0,0,2016-01-04,7.625001,1893480.0,1.350002,7.743587
4,90.0,0.0,0,2016-01-05,5.200001,1893504.0,-0.199999,5.203845


## Reverse Geocode

In [33]:
# Read shapefile of counties from US Census
r = shapefile.Reader("../data/gis/us_county_5m/cb_2017_us_county_5m.shp")
shapes = r.shapes()
records = r.records()

In [34]:
def reverse_geocode_county(lat, lon, shapes, records, bbox = [-180, -60, 15, 73]):
    ''' Returns a list of records associated with a given lat and lon, 
    including FIPS codes for the state and county
    '''
    # check if point is inside the US
    if lat < bbox[2] or lat > bbox[3] or lon < bbox[0] or lon > bbox[1]:
        return [None]*len(records[0])
    
    point_missing = True

    for i in range(0, len(shapes)):

        county_sh = shapes[i]

        # County of point located
        if shape(county_sh).contains(Point(lon, lat)):

            point_missing = False
            break
        
    # Point not in the US
    if point_missing:
        # raise Exception('Point not found')
        return [None]*len(records[0])
    else:
        return records[i]
        

In [35]:
# Filter to get unique observations of each lat and lon in the data (only those in the US)
data_wind_speed_temp = data_wind_speed[data_wind_speed['time'] == '2016-06-01'].copy()
data_wind_speed_temp = data_wind_speed_temp.query('nbnds == 0').query('lat > 15').query('lon > -180').copy()

wind_df_temp.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed,state_fips_code,county_fips_code
901244,70.0,-162.5,0,2016-06-01,-6.724999,1897056.0,-8.274999,10.663077,2,185
901976,70.0,-160.0,0,2016-06-01,-7.1,1897056.0,-7.349998,10.219221,2,185
902708,70.0,-157.5,0,2016-06-01,-7.075,1897056.0,-6.424999,9.556999,2,185
903440,70.0,-155.0,0,2016-06-01,-6.799999,1897056.0,-5.674999,8.856952,2,185
904172,70.0,-152.5,0,2016-06-01,-6.424999,1897056.0,-5.149999,8.234264,2,185


In [36]:
# Get state of each observation
data_wind_speed_temp['state_fips_code'] = data_wind_speed_temp.apply(lambda x: reverse_geocode_county(x.lat, x.lon, shapes, records)[0], axis = 1)

In [37]:
# Get county of each observation
data_wind_speed_temp = data_wind_speed_temp.dropna().copy()
data_wind_speed_temp['county_fips_code'] = data_wind_speed_temp.apply(lambda x: reverse_geocode_county(x.lat, x.lon, shapes, records)[1], axis = 1)

In [93]:
# Merge temp dataframe to get fips codes for all dates
data_wind_speed = data_wind_speed_temp[['lat', 'lon', 'state_fips_code', 'county_fips_code']].merge(wind_df, on = ['lat', 'lon'])

## Clean Data

In [94]:
# Drop unnecessary columns
data_wind_speed = data_wind_speed.drop(['lat', 'lon', 'county_fips_code', 'nbnds', 'time_bnds'], axis = 1)

# Fix State FIPS code
data_wind_speed['state_fips_code'] = pd.to_numeric(data_wind_speed['state_fips_code'])

# Get average wind speed
data_wind_speed = data_wind_speed.groupby('state_fips_code').mean()['wind_speed'].reset_index()

# Rename columns
data_wind_speed = data_wind_speed.rename(columns = {'state_fips_code': 'State FIPS', 'wind_speed': 'Annual_Avg_Wind_Speed'})

In [95]:
data_wind_speed.head()

Unnamed: 0,State FIPS,Annual_Avg_Wind_Speed
0,1,3.585898
1,2,4.619468
2,4,3.323894
3,5,3.905829
4,6,3.12396


# Wind Energy Generation

## Import Data

In [96]:
# Wind Generation Data
wind_gen_file_loc = wind_dir + 'wind_generation_2016.csv'
data_wind_gen = pd.read_csv(wind_gen_file_loc)

In [97]:
# State FIPS Codes
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)
state_fips_indicators = ['State Abbreviation', 'State Name']
                         
# MSN Codes Key
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

## Clean Data

In [98]:
# Add code descriptions to wind consumption data
data_wind_gen = data_wind_gen.merge(msn_codes_key).sort_values('State')

# Select wind energy net generation and consumption columns 
data_wind_gen = data_wind_gen.rename(columns = {'2016': 'Energy'})
data_wind_gen = data_wind_gen.pivot(index = 'State', columns= 'MSN', values = 'Energy').reset_index()

# Compute wind energy consumption in kWh
data_wind_gen['Consumption_com'] = data_wind_gen['WYCCB']*293.29722
data_wind_gen['Consumption_ind'] = data_wind_gen['WYICB']*293.29722
data_wind_gen['Consumption_elc'] = data_wind_gen['WYEGB']*293.29722
data_wind_gen['Consumption_tot'] = data_wind_gen['WYTCB']*293.29722

data_wind_gen['Net_Generation_com'] = data_wind_gen['WYCCP']
data_wind_gen['Net_Generation_ind'] = data_wind_gen['WYICP']
data_wind_gen['Net_Generation_elc'] = data_wind_gen['WYEGP']
data_wind_gen['Net_Generation_tot'] = data_wind_gen['WYTCP']

# Hold relevant columns
data_wind_gen = data_wind_gen.drop([x for x in data_wind_gen.columns if x[0:2] == "WY"], axis = 1)

## Merge Data

In [108]:
# Add FIPS code to wind energy generation data
data_wind_gen = data_wind_gen.rename(columns = {'State': 'State Abbreviation'}).merge(state_fips_key).drop(state_fips_indicators, axis = 1)

data_wind = data_wind_gen.merge(data_wind_speed)

# Export Data

In [110]:
data_wind.to_csv(data_dir + 'processed/' + 'wind_data.csv', index = False, header = True)

## References

* Kalnay et al., The NCEP/NCAR 40-year reanalysis project, Bull. Amer. Meteor. Soc., 77, 437-470, 1996. [ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README](ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README)
* US Census, Cartographic Boundary Shapefiles - Counties, 2017. https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html 