In [10]:
import numpy as np
import xarray as xr
import pandas as pd
import urllib.request
from numba import vectorize, float64
import shapefile
from shapely.geometry import shape, Point
import zipfile
from tqdm import tqdm_notebook as tqdm

In [11]:
# Root of the local data tree; the two working folders below are derived from it.
data_dir = '../data/'
wind_dir, gis_dir = data_dir + 'wind/', data_dir + 'state/gis/'

# Wind Speeds

## Download Data

### Wind Data

In [12]:
# NOAA FTP directory holding the daily-average surface files, plus the names
# of the 2018 uwnd / vwnd files used in the rest of the notebook
ftp_loc = 'ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/'
uwind_filename, vwind_filename = 'uwnd.sig995.2018.nc', 'vwnd.sig995.2018.nc'

In [4]:
# Fetch the uwnd file, then the vwnd file, from the FTP server into wind_dir
print('Downloading wind data...')
for wind_file in (uwind_filename, vwind_filename):
    urllib.request.urlretrieve(ftp_loc + wind_file, wind_dir + wind_file)
print('Complete')

Downloading wind data...
Complete


### Wind Data (All Years)

In [14]:
print('Downloading wind data...')

# The remote filename has to be built from the loop year as well as the local
# one: reusing the fixed 2018 filenames would download the same 2018 file on
# every iteration and merely store it under ten different year labels.
for year in tqdm(range(2010, 2020)):
    for component in ('uwnd', 'vwnd'):
        yearly_filename = '{0}.sig995.{1}.nc'.format(component, year)
        urllib.request.urlretrieve(ftp_loc + yearly_filename, wind_dir + yearly_filename)

print('Complete')

Downloading wind data...


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Complete


### Shapefile Data

In [17]:
# US Census download location, the zipped state shapefile to fetch, and the
# local sub-folder (under gis_dir) it is stored in
url_loc = 'http://www2.census.gov/geo/tiger/GENZ2017/shp/'
shapefile_filename, shapefile_foldername = 'cb_2017_us_state_5m.zip', 'us_state_5m/'

In [18]:
print('Downloading shapefile data...')
# The archive is saved inside its own sub-folder of gis_dir
shapefile_zip_path = gis_dir + shapefile_foldername + shapefile_filename
urllib.request.urlretrieve(url_loc + shapefile_filename, shapefile_zip_path)
print('Complete')

Downloading shapefile data...
Complete


In [19]:
# Unzip the downloaded archive into the folder it was saved in. The context
# manager closes the archive handle even if extraction raises part-way.
with zipfile.ZipFile(gis_dir + shapefile_foldername + shapefile_filename, 'r') as zip_ref:
    zip_ref.extractall(gis_dir + shapefile_foldername)

## Convert .NC to Pandas DF

In [20]:
# Load each NetCDF file with xarray and flatten it into a DataFrame whose
# former index levels become ordinary columns
uwind_df, vwind_df = (
    xr.open_dataset(wind_dir + nc_name).to_dataframe().reset_index()
    for nc_name in (uwind_filename, vwind_filename)
)

In [21]:
# Function to do sqrt(a^2 + b^2)
def f_diag(a, b):
    """Return the element-wise magnitude sqrt(a**2 + b**2).

    Parameters
    ----------
    a, b : scalar or array-like
        The two components; arrays are combined element by element with
        NumPy broadcasting.

    Returns
    -------
    scalar or numpy.ndarray
        The Euclidean norm of each (a, b) pair.
    """
    # np.hypot is NumPy's built-in ufunc for exactly this computation, so no
    # custom JIT-compiled kernel is needed.
    return np.hypot(a, b)

In [22]:
# Join the u and v frames on their shared columns, then combine the two
# components into a single wind-speed magnitude
data_wind_speed = uwind_df.merge(vwind_df)
u_component = data_wind_speed['uwnd'].values
v_component = data_wind_speed['vwnd'].values
data_wind_speed['wind_speed'] = f_diag(u_component, v_component)

In [23]:
# Convert longitudinal coordinates from [0:360] into [0:180 -180:0]
def convert_lon(x):
    """Shift a longitude above 180 down by 360; return any other value unchanged."""
    return x - 360 if x > 180 else x
    
# Re-express every longitude in the frame on the -180..180 scale
data_wind_speed['lon'] = data_wind_speed['lon'].apply(convert_lon)

In [24]:
# Preview the merged u/v frame with the derived wind_speed column
data_wind_speed.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
0,90.0,0.0,0,2018-01-01,-3.799999,1910952.0,-2.649998,4.632762
1,90.0,0.0,0,2018-01-02,-5.899999,1910976.0,-2.799999,6.530695
2,90.0,0.0,0,2018-01-03,-3.799998,1911000.0,-2.874999,4.76504
3,90.0,0.0,0,2018-01-04,-0.524999,1911024.0,-2.999999,3.045589
4,90.0,0.0,0,2018-01-05,1.05,1911048.0,-3.724999,3.870157


## Reverse Geocode

In [25]:
# Read the US Census state-level shapefile (cb_2017_us_state_5m) and pull out
# its geometries and their attribute records
r = shapefile.Reader(gis_dir + "us_state_5m/cb_2017_us_state_5m.shp")
shapes, records = r.shapes(), r.records()

In [26]:
def reverse_geocode_county(lat, lon, shapes, records, bbox = [-180, -60, 15, 73]):
    ''' Return the attribute record of the first shape that contains a point.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point. The point is built as Point(lon, lat).
    shapes : sequence
        Geometries that shapely.geometry.shape can convert; shapes[i]
        corresponds to records[i].
    records : sequence
        Attribute records parallel to shapes. Which fields they hold depends
        on the shapefile that was loaded.
    bbox : sequence of four numbers
        [min_lon, max_lon, min_lat, max_lat]. Points outside this box are
        rejected without testing any shape. The default list is only read,
        never mutated.

    Returns
    -------
    records[i] for the first shape containing the point. Otherwise a list of
    None with one entry per field of records[0], or an empty list when
    records itself is empty.
    '''
    # Placeholder result; guarding on len(records) avoids an IndexError when
    # no records were loaded at all
    n_fields = len(records[0]) if len(records) > 0 else 0
    no_match = [None]*n_fields

    # Cheap rejection of points outside the bounding box
    if lat < bbox[2] or lat > bbox[3] or lon < bbox[0] or lon > bbox[1]:
        return no_match

    # Build the point once instead of once per candidate shape
    point = Point(lon, lat)

    for i, candidate in enumerate(shapes):
        if shape(candidate).contains(point):
            return records[i]

    # Inside the bounding box but not inside any shape
    return no_match
        

In [27]:
# Keep one row per grid point: a single date, a single nbnds slice, and only
# points with lat > 15 and lon > -180 (a coarse pre-filter, not a US mask)
data_wind_speed_temp = (
    data_wind_speed[data_wind_speed['time'] == '2018-06-01']
    .query('nbnds == 0')
    .query('lat > 15')
    .query('lon > -180')
    .copy()
)

data_wind_speed_temp.head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
151,90.0,0.0,0,2018-06-01,0.825002,1914576.0,-3.624998,3.717693
881,90.0,2.5,0,2018-06-01,0.650002,1914576.0,-3.674998,3.732038
1611,90.0,5.0,0,2018-06-01,0.500002,1914576.0,-3.724998,3.758406
2341,90.0,7.5,0,2018-06-01,0.350002,1914576.0,-3.699998,3.716516
3071,90.0,10.0,0,2018-06-01,0.175002,1914576.0,-3.724998,3.729107


In [None]:
# Reverse-geocode every grid point and keep field 0 of the matching record
# (None when the point falls in no shape)
data_wind_speed_temp['state_fips_code'] = data_wind_speed_temp.apply(
    lambda row: reverse_geocode_county(row.lat, row.lon, shapes, records)[0],
    axis = 1,
)

In [34]:
# Drop rows containing any missing value, then store field 1 of each matching record.
# NOTE(review): the shapefile loaded in this notebook is state-level, so
# confirm that field 1 really is a county code.
data_wind_speed_temp = data_wind_speed_temp.dropna().copy()
data_wind_speed_temp['county_fips_code'] = data_wind_speed_temp.apply(
    lambda row: reverse_geocode_county(row.lat, row.lon, shapes, records)[1],
    axis = 1,
)

In [74]:
# Attach the per-grid-point codes to every row of the full frame; the inner
# merge on lat/lon also discards grid points that received no code
fips_lookup = data_wind_speed_temp[['lat', 'lon', 'state_fips_code', 'county_fips_code']]
data_wind_speed = fips_lookup.merge(data_wind_speed, on = ['lat', 'lon'])

## Monthly averages

In [55]:
def reverse_geocode_state(lat, lon, shapes, records, bbox = [-180, -60, 15, 73]):
    ''' Look up the attribute record of the shape containing (lat, lon).

    lat, lon : coordinates of the point, tested as Point(lon, lat).
    shapes   : geometries convertible by shapely.geometry.shape, parallel
               to records.
    records  : attribute records; records[i] describes shapes[i].
    bbox     : [min_lon, max_lon, min_lat, max_lat]; a point outside it is
               rejected before any geometry test. Only read, never mutated.

    Returns records[i] for the first containing shape. If nothing matches,
    returns a list of None as long as records[0] (an empty list when records
    is empty).

    NOTE(review): this performs the same lookup as reverse_geocode_county;
    consider keeping a single implementation.
    '''
    # One None per attribute field; tolerate an empty record list instead of
    # failing on records[0]
    if len(records) > 0:
        empty_record = [None]*len(records[0])
    else:
        empty_record = []

    # Point lies outside the bounding box
    if lat < bbox[2] or lat > bbox[3] or lon < bbox[0] or lon > bbox[1]:
        return empty_record

    # The query point does not change inside the loop, so create it once
    query_point = Point(lon, lat)

    for position, geometry in enumerate(shapes):
        if shape(geometry).contains(query_point):
            return records[position]

    # No shape contains the point
    return empty_record
        

In [78]:
# NOTE(review): `coordinates` is not assigned anywhere in the visible notebook;
# this cell presumably relies on leftover kernel state — confirm or remove.
coordinates

((-37.81, 144.96), (31.76, 35.21))

In [64]:
# Sanity check: lat=-20, lon=90 lies outside the default bounding box, so a
# list of None (one per record field) comes back
reverse_geocode_state(-20,90, shapes, records)

[None, None, None, None, None, None, None, None, None]

In [70]:
# NOTE(review): the geocoding functions build points as Point(lon, lat), so
# this tests lon=-20, lat=90 — confirm that is the point intended here.
shape(shapes[0]).contains(Point(-20,90))

False

In [49]:
# Work on an independent copy restricted to the nbnds == 0 rows
data_wind_speed_m = data_wind_speed.loc[data_wind_speed['nbnds'] == 0].copy()

In [51]:
# Create date variables with the vectorized .dt accessor rather than a
# row-by-row Python lambda (assumes 'time' is a datetime64 column, as the
# Timestamp attribute access used previously implies)
data_wind_speed_m['month'] = data_wind_speed_m['time'].dt.month
data_wind_speed_m['year']  = data_wind_speed_m['time'].dt.year

# Aggregate by month: mean u and v component per (month, year, grid point)
data_wind_speed_m = data_wind_speed_m.groupby(['month', 'year', 'lat', 'lon'])[['uwnd', 'vwnd']].mean().reset_index()

data_wind_speed_m.head()

Unnamed: 0,month,year,lat,lon,uwnd,vwnd
0,1,2018,-90.0,-177.5,1.050001,2.808065
1,1,2018,-90.0,-175.0,0.937098,2.854033
2,1,2018,-90.0,-172.5,0.806453,2.885485
3,1,2018,-90.0,-170.0,0.683065,2.919356
4,1,2018,-90.0,-167.5,0.55484,2.944356


In [86]:
# Spot-check the reverse geocoder on 100 randomly sampled rows; the result is
# only displayed, nothing is written back to the frame
data_wind_speed_m.sample(n=100).apply(
    lambda row: reverse_geocode_county(row.lat, row.lon, shapes, records)[0],
    axis = 1,
)

82219     None
94093     None
90622     None
19476     None
53563     None
80371     None
54281     None
9502      None
11695     None
26769     None
46197     None
74161     None
109479    None
58683     None
30730     None
96673     None
81879     None
94264     None
91277     None
32757     None
81214     None
76545     None
41830     None
95889     None
33668     None
119507    None
67333     None
50817     None
62354     None
72363     None
          ... 
95453     None
5571      None
119279    None
3892      None
59951     None
75102     None
41879     None
85352     None
117196    None
105614    None
112364    None
66286     None
123988    None
88911     None
28441     None
18457       30
121265    None
122384    None
33383     None
118538    None
20691     None
45009     None
56666     None
93140     None
56912     None
11182     None
42772     None
71000     None
25634     None
85211     None
Length: 100, dtype: object


 66%|██████▋   | 665/1000 [00:21<00:03, 107.62it/s][A

In [None]:
# Preview the monthly-aggregated frame
data_wind_speed_m.head()

## Clean Data

In [75]:
# NOTE(review): this cell overwrites data_wind_speed with the aggregated
# table, so it cannot be re-run as-is, and any cell that still needs the
# lat / lon / nbnds columns has to run before it.

# Drop unnecessary columns
data_wind_speed = data_wind_speed.drop(['lat', 'lon', 'county_fips_code', 'nbnds', 'time_bnds'], axis = 1)

# Fix State FIPS code
data_wind_speed['state_fips_code'] = pd.to_numeric(data_wind_speed['state_fips_code'])

# Get monthly average wind speed. Only wind_speed is aggregated explicitly:
# a bare .mean() over the whole frame relies on pandas silently discarding
# columns it cannot or should not average (such as 'time'), which is
# version-dependent behaviour.
data_wind_speed['month'] = data_wind_speed['time'].dt.month
data_wind_speed = (
    data_wind_speed
    .groupby(['state_fips_code', 'month'])['wind_speed']
    .mean()
    .reset_index()
    # Rename columns
    .rename(columns = {'state_fips_code': 'State FIPS', 'wind_speed': 'Avg_Wind_Speed'})
)

In [76]:
# Preview the per-state, per-month average wind speed table
data_wind_speed.head()

Unnamed: 0,State FIPS,month,Avg_Wind_Speed
0,1,1,4.232283
1,1,2,4.52719
2,1,3,4.669978
3,1,4,3.731362
4,1,5,3.745946


## Texas Wind Speed Data

In [30]:
# Wind speed in given location (largest wind farm)
# NOTE(review): the query needs the lat / lon / nbnds columns, so this cell
# must run on data_wind_speed before the "Clean Data" cell drops them.
data_wind_speed_loc = data_wind_speed.query('abs(lat - 32.4) < 0.5 & abs(lon + 97.44) < 0.6 & nbnds == 0').copy()

# Month/day/year label without zero padding
data_wind_speed_loc['date'] = data_wind_speed_loc['time'].apply(
    lambda ts: '{1}/{2}/{0}'.format(ts.year, ts.month, ts.day))

# Give the speed the sign of uwnd*vwnd (negative when the components have
# opposite signs, zero when either component is zero)
direction_sign = np.sign(data_wind_speed_loc['uwnd']*data_wind_speed_loc['vwnd'])
data_wind_speed_loc['wind_speed'] = np.multiply(data_wind_speed_loc['wind_speed'], direction_sign)

data_wind_speed_loc = data_wind_speed_loc[['date', 'wind_speed', 'uwnd', 'vwnd']]
data_wind_speed_loc.head()

Unnamed: 0,date,wind_speed,uwnd,vwnd
2494410,1/1/2018,5.520471,-0.474999,-5.499998
2494411,1/2/2018,2.334522,-1.599999,-1.699999
2494412,1/3/2018,-4.726059,4.400002,-1.724999
2494413,1/4/2018,0.951313,-0.049999,-0.949999
2494414,1/5/2018,5.01261,1.525,4.775001


In [31]:
# Same kind of lookup for a second longitude (-100.53); the result is only displayed.
# NOTE(review): like the cell above, this needs the lat / lon / nbnds columns
# that the "Clean Data" cell removes.
data_wind_speed.query('abs(lat - 32.4) < 0.5 & abs(lon + 100.53) < 0.6 & nbnds == 0').copy().head()

Unnamed: 0,lat,lon,nbnds,time,uwnd,time_bnds,vwnd,wind_speed
2493680,32.5,-100.0,0,2018-01-01,-1.574999,1910952.0,-5.399998,5.624998
2493681,32.5,-100.0,0,2018-01-02,-2.474999,1910976.0,0.650001,2.55893
2493682,32.5,-100.0,0,2018-01-03,4.550002,1911000.0,-1.424999,4.767928
2493683,32.5,-100.0,0,2018-01-04,0.850001,1911024.0,2.250001,2.405205
2493684,32.5,-100.0,0,2018-01-05,1.575,1911048.0,5.925001,6.130764


In [32]:
# Export the single-location series to CSV without the DataFrame index
location_csv_path = wind_dir + 'wind_austin_tx_2018.csv'
data_wind_speed_loc.to_csv(location_csv_path, index = False)

# Wind Energy Generation

## Import Data

In [218]:
# Wind Generation Data
# NOTE(review): this file is labelled 2016 while the wind-speed files loaded
# earlier are for 2018 — confirm the two years are meant to be combined.
wind_gen_file_loc = wind_dir + 'wind_generation_2016.csv'
data_wind_gen = pd.read_csv(wind_gen_file_loc)

In [219]:
# State FIPS Codes
state_fips_file_loc = data_dir + 'keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)
# Columns of the key that are dropped again once the FIPS code has been merged in
state_fips_indicators = ['State Abbreviation', 'State Name']

# MSN Codes Key
msn_codes_file_loc = data_dir + 'keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# Month Key: month name -> month number (1-12)
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_key = {name: number for number, name in enumerate(month_names, start = 1)}

## Clean Data

In [220]:
# Melt month columns: one row per (state, sector, month). The month columns
# are taken by position (columns 4-15 of the raw file).
data_wind_gen = data_wind_gen.melt(id_vars = ['State Name', 'Sector', 'units'], value_vars = data_wind_gen.columns[4:16], var_name='month', value_name='wind_net_gen')
data_wind_gen['month'] = data_wind_gen['month'].apply(lambda x: month_key[x])

# Scale by 1000 (original note: "convert to thousand kwh")
# NOTE(review): confirm the source units so the direction of this conversion is right.
data_wind_gen['wind_net_gen'] = data_wind_gen['wind_net_gen']*1000

# Pivot sector column. Only wind_net_gen is summed: summing the whole frame
# would also try to aggregate the text 'units' column, which changes the
# number of output columns and breaks the rename below.
data_wind_gen['Sector'] = data_wind_gen['Sector'].str.strip()
data_wind_gen = (
    data_wind_gen
    .groupby(['State Name', 'Sector', 'month'])['wind_net_gen']
    .sum()
    .unstack('Sector')
    .reset_index()
)

# Fix column names (expects exactly two sectors after the pivot)
data_wind_gen.columns = ['State Name', 'month', 'wind_net_gen_all', 'wind_net_gen_elc']

In [220]:
# Preview the per-state, per-month generation table
data_wind_gen.head()

Unnamed: 0,State Name,month,wind_net_gen_all,wind_net_gen_elc
0,Alabama,1,0.0,0.0
1,Alabama,2,0.0,0.0
2,Alabama,3,0.0,0.0
3,Alabama,4,0.0,0.0
4,Alabama,5,0.0,0.0


## Merge Data

In [221]:
# Add FIPS code to wind energy generation data: normalise the state names,
# merge with the key on the columns the two frames share, then drop the
# name / abbreviation columns listed in state_fips_indicators
data_wind_gen['State Name'] = data_wind_gen['State Name'].str.upper().str.strip()
gen_with_fips = data_wind_gen.merge(state_fips_key)
data_wind_gen = gen_with_fips.drop(state_fips_indicators, axis = 1)

# Combine generation with the wind-speed table on their shared columns
data_wind = data_wind_gen.merge(data_wind_speed)

# Export Data

In [222]:
# Write the merged table, with a header row and without the index, to the processed folder
processed_csv_path = data_dir + 'processed/' + 'wind_data.csv'
data_wind.to_csv(processed_csv_path, index = False, header = True)

## References

* Kalnay et al., The NCEP/NCAR 40-year reanalysis project, Bull. Amer. Meteor. Soc., 77, 437-470, 1996. [ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README](ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis.dailyavgs/surface/README)
* US Census, Cartographic Boundary Shapefiles - States, 2017. https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html
* EIA, Electricity Data Browser. https://www.eia.gov/electricity/data/browser/