# Grid Creation

Requires a local Nominatim server covering at least the continental United States. Because hundreds of thousands of reverse geocoding queries must be made and then organized, the code below runs many operations asynchronously. See https://ipython-books.github.io/59-distributing-python-code-across-multiple-cores-with-ipython/ and https://ipython-books.github.io/510-interacting-with-asynchronous-parallel-tasks-in-ipython/ for setup and more information.

In [7]:
import time
import ipyparallel
import ipywidgets
import os.path
import pickle
import urllib
import numpy as np
import pandas as pd

from IPython.display import clear_output, display
from geopy import Point, distance
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderServiceError, GeocoderTimedOut

In [None]:
# Using the progress bar is optional and will lock the kernel for the
# duration of the task.
def progress_bar(ar):
    """
    Displays a progress widget and keeps it in sync with an asynchronous
    result until that result is ready.
    ar: asynchronous result exposing msg_ids, progress and ready()
    """
    bar = ipywidgets.IntProgress()
    bar.max = len(ar.msg_ids)
    display(bar)
    # poll about ten times a second; this loop is what blocks the kernel
    while True:
        if ar.ready():
            break
        bar.value = ar.progress
        time.sleep(.1)
    # fill the bar once everything has returned
    bar.value = bar.max

## Coordinate to County

In [None]:
def coord_to_county(coord):
    """
    Returns county from entered coordinate.
    coord: tuple consisting of (latitude, longitude)

    The result is always a (coord, value) tuple where value is
      - the county name when the coordinate is in the US,
      - None when the coordinate is outside the US or the geocoder has
        no result for it at all,
      - the exception object when the lookup failed or the response had
        no country code / county, so that it can be investigated manually.
    """
    try:
        # pause briefly before every query
        time.sleep(.01)
        location = geolocator.reverse(coord)
        # reverse() may return None instead of a location when nothing is
        # found; reading .raw from it would raise an AttributeError that
        # is not handled below
        if location is None:
            return (coord, None)
        address = location.raw['address']
        if address['country_code'] != 'us':
            return (coord, None)
        return (coord, address['county'])
    # investigate errors manually
    except (GeocoderServiceError, GeocoderTimedOut, KeyError) as e:
        return (coord, e)

In [None]:
def coordinate_generator(min_lat, max_lat, min_lon, max_lon, dist):
    """
    Yields (latitude, longitude) tuples forming a grid bounded by the
    given minimum and maximum latitudes and longitudes.

    The grid starts at the north-west corner (max_lat, min_lon). Each row
    is walked eastwards until max_lon is passed, then the next row starts
    dist miles further south, until a row would start south of min_lat.
    Only points inside the bounds are yielded.

    dist: spacing in miles between neighbouring points, used both
          east-west and north-south; must be positive

    Raises ValueError (when iteration starts) if dist is not positive,
    because the walk would never leave the starting point.
    """
    if dist <= 0:
        raise ValueError('dist must be a positive number of miles')

    step = distance.distance(miles=dist)
    origin = Point(max_lat, min_lon)
    row_start = origin
    row = 0

    # southmost bound reached once a row would start below min_lat
    while row_start.latitude >= min_lat:
        point = row_start
        # eastmost bound reached once a point lies beyond max_lon
        while point.longitude <= max_lon:
            yield (point.latitude, point.longitude)
            # move east
            point = step.destination(point=point, bearing=90)
        # go back to the starting meridian and move one more row south;
        # every row is measured from the origin and spaced by dist
        row += 1
        row_start = distance.distance(miles=dist * row).destination(
            point=origin, bearing=180)

In [None]:
# reverse geocoder used by coord_to_county; queries the Nominatim server
# at http://localhost:8080
geolocator = Nominatim(domain='localhost:8080', scheme='http')

In [None]:
# bounding box of the grid; the None values are placeholders and must be
# replaced with numbers before this cell is run
min_lat = None
max_lat= None
min_lon = None
max_lon = None
# spacing between grid points in miles
dist = 5
# materialize the whole grid so it can be handed to map_async
coords = list(coordinate_generator(min_lat, max_lat, min_lon, max_lon, dist))

In [None]:
# connect to the ipyparallel cluster; tasks go through the load-balanced view
rc = ipyparallel.Client()
view = rc.load_balanced_view()

# import time on every engine as well as locally
with rc[:].sync_imports():
    import time

# copy the names coord_to_county looks up into every engine's namespace
rc[:].push({
    'geolocator': geolocator,
    'GeocoderServiceError': GeocoderServiceError,
    'GeocoderTimedOut': GeocoderTimedOut,
})

In [None]:
# submit one coord_to_county task per grid coordinate; ar is the
# asynchronous result that the following cells poll and iterate over
ar = view.map_async(coord_to_county, coords)

In [None]:
# can check progress without locking kernel with this cell
# (a notebook cell only displays the value of its last expression)
ar.elapsed # time
ar.progress # number of coordinates that have returned

In [None]:
# the progress bar will lock the kernel
# progress_bar(ar)

In [None]:
# dict from coordinate to county in US, None if not in US, or an error object
# results of earlier runs are reused when the pickle exists; otherwise
# start from an empty dict
try:
    with open('pickles/coord_county_dict.pkl', 'rb') as file:
        coord_county_dict = pickle.load(file)
except FileNotFoundError:
    coord_county_dict = dict()

In [None]:
# merge the new results; a coordinate that already has an entry keeps it
for coord, maybe_county in ar:
    coord_county_dict.setdefault(coord, maybe_county)

In [None]:
# the loading cell tolerates a missing pickles/ directory, so create it
# here instead of failing after the geocoding has already been done
os.makedirs('pickles', exist_ok=True)
with open('pickles/coord_county_dict.pkl', 'wb') as file:
    pickle.dump(coord_county_dict, file)

## County Coordinates

In [None]:
# this loop is fast and need not be pickled
# dict from county name to list of coordinates within said county
# NOTE(review): keyed by county name only, so counties that share a name
# end up in the same entry - confirm this is intended
county_coords_dict = dict()
# (coord, value) pairs whose value is not a county name, i.e. None or an
# error object
skipped = list()
# items() is required: iterating the dict itself yields only the
# coordinate keys, which would unpack into latitude and longitude
for coord, county in coord_county_dict.items():
    if not isinstance(county, str):
        skipped.append((coord, county))
        continue
    county_coords_dict.setdefault(county, []).append(coord)

In [None]:
# analyze skipped coords, potentially rerunning them manually
# each entry is (coord, None) or (coord, error object)
skipped

## County Stations

In [None]:
# local copy of the GHCN-Daily station list; downloaded on first use
stations_path = 'weather_data/stations.txt'
if not os.path.isfile(stations_path):
    # `import urllib` alone does not guarantee that the request submodule
    # has been loaded
    import urllib.request
    url = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
    os.makedirs('weather_data', exist_ok=True)
    urllib.request.urlretrieve(url, stations_path)

# the file is read from the same path it was downloaded to, whether or not
# the download just happened
stations_df = pd.read_fwf(stations_path,
   colspecs=[(0,11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71), (72, 75), (76, 79), (80, 85)],
   header=None,
   names=['ID', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'NAME', 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID'])

In [None]:
# only includes HCN stations in the continental United States
# (keeps rows whose ID starts with 'US' and whose HCN/CRN flag is 'HCN')
stations_df = stations_df.loc[
    stations_df['ID'].str.startswith('US')
    & stations_df['HCN/CRN FLAG'].eq('HCN')
].reset_index(drop=True)

In [None]:
def add_stations_to_county(county, max_miles=50):
    """
    Returns (county, station IDs) for the stations that lie within
    max_miles of at least one grid coordinate of the county.
    county: key of county_coords
    max_miles: largest station-to-coordinate distance, in miles, that
               still counts as a match; defaults to 50

    Meant to run on the ipyparallel engines: county_coords, stations_df,
    distance and time are looked up as globals there.
    """
    coords = county_coords[county]
    county_stations = list()
    time.sleep(.01)
    # iterate the three needed columns directly instead of building a
    # Series for every row with iterrows()
    station_rows = zip(stations_df['ID'],
                       stations_df['LATITUDE'],
                       stations_df['LONGITUDE'])
    for station_id, latitude, longitude in station_rows:
        station_coord = (latitude, longitude)
        # NOTE(review): this pause runs once per station for every county
        # and adds up quickly - confirm it is still needed
        time.sleep(.01)
        # any() stops at the first coordinate that is close enough
        if any(distance(station_coord, coord).miles <= max_miles
               for coord in coords):
            county_stations.append(station_id)
    return (county, county_stations)

In [None]:
# county names to distribute across the engines; the county -> coordinates
# dict is named county_coords_dict (county_coords is not defined here)
counties = list(county_coords_dict)

In [None]:
# connect to the ipyparallel cluster; tasks go through the load-balanced view
rc = ipyparallel.Client()
view = rc.load_balanced_view()

# import time on every engine as well as locally
with rc[:].sync_imports():
    import time

# names add_stations_to_county looks up on the engines; the local dict is
# county_coords_dict and is published there as county_coords, while
# distance is the geopy distance callable rather than the module
rc[:].push(dict(
    distance=distance.distance,
    county_coords=county_coords_dict,
    stations_df=stations_df
))

In [None]:
# submit one add_stations_to_county task per county; ar is the
# asynchronous result that the following cells poll and iterate over
ar = view.map_async(add_stations_to_county, counties)

In [None]:
# can check progress without locking kernel with this cell
# (a notebook cell only displays the value of its last expression)
ar.elapsed # time
ar.progress # number of counties that have returned

In [None]:
# the progress bar will lock the kernel
# progress_bar(ar)

In [None]:
# dict from county to the list of station IDs returned for it by
# add_stations_to_county
# results of earlier runs are reused when the pickle exists; otherwise
# start from an empty dict
try:
    with open('pickles/county_stations_dict.pkl', 'rb') as file:
        county_stations_dict = pickle.load(file)
except FileNotFoundError:
    county_stations_dict = dict()

In [None]:
# merge the new results; a county that already has an entry keeps it
for county, stations in ar:
    county_stations_dict.setdefault(county, stations)

In [None]:
# the loading cell tolerates a missing pickles/ directory, so create it
# here instead of failing after the results have already been computed
os.makedirs('pickles', exist_ok=True)
with open('pickles/county_stations_dict.pkl', 'wb') as file:
    pickle.dump(county_stations_dict, file)

In [None]:
# set index to ID for fast lookup
# NOTE(review): modifies stations_df in place, so 'ID' stops being a
# column; re-running this cell is expected to fail - confirm before rerun
stations_df.set_index('ID', inplace=True)

In [None]:
# this loop is fairly fast so I haven't pickled it
# dict from county to dict of station to average inverse distance, i.e.
# the mean over the county's grid coordinates of 1 / distance in miles
area_weighted_inverse_distances = dict()
# the dicts built earlier are county_stations_dict and county_coords_dict
for i, (county, stations) in enumerate(county_stations_dict.items()):
    for j, station in enumerate(stations):
        print(f'county: {i+1}/{len(county_stations_dict)}\nstations: {j+1}/{len(stations)}')
        clear_output(wait=True)
        # stations_df is indexed by station ID at this point
        station_coord = (stations_df.loc[station, 'LATITUDE'],
                         stations_df.loc[station, 'LONGITUDE'])
        distances = np.array(
            [distance.distance(station_coord, coord).miles
             for coord in county_coords_dict[county]],
            dtype=float)
        # NOTE(review): a station lying exactly on a grid coordinate has
        # distance 0, whose reciprocal is inf - confirm this cannot happen
        inverse_distances = np.reciprocal(distances)
        # counties without any station get no entry at all
        area_weighted_inverse_distances.setdefault(county, dict())[station] = np.mean(inverse_distances)