# Attach nearest network nodes to CHTS homes and workplaces

Run this once for `graph_type='directed'` and once for `'undirected'`.

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree

# which network variant to process: run the notebook once with 'directed'
# and once with 'undirected'
graph_type = 'undirected'

In [2]:
# counties to keep, as {county name: 3-digit county FIPS code}
bayarea = {'San Francisco': '075'}

# 5-digit state+county FIPS codes: state code '06' followed by each county code
state_counties = [f'06{county_fips}' for county_fips in bayarea.values()]

## Load persons' home and work data

In [3]:
# read the person records; WBLOCK is read as text (presumably so that
# leading zeros in the block id survive)
persons = pd.read_csv('data/chts/LookUp_PER.csv', dtype={'WBLOCK': str})

# keep only the person identifiers and the workplace location columns
persons = persons.loc[:, ['SAMPN', 'PERNO', 'WXCORD', 'WYCORD', 'WBLOCK']]

# retain persons whose workplace block id begins with one of the selected
# 5-digit state+county codes (rows with a missing WBLOCK are dropped too)
mask = persons['WBLOCK'].str[:5].isin(state_counties)
persons = persons.loc[mask]

In [4]:
# read the home records; HBLOCK is read as text (presumably so that
# leading zeros in the block id survive)
homes = pd.read_csv('data/chts/LookUp_Home.csv', dtype={'HBLOCK': str})

# keep only the SAMPN key and the home location columns
homes = homes.loc[:, ['SAMPN', 'HXCORD', 'HYCORD', 'HBLOCK']]

# retain homes whose block id begins with one of the selected
# 5-digit state+county codes (rows with a missing HBLOCK are dropped too)
mask = homes['HBLOCK'].str[:5].isin(state_counties)
homes = homes.loc[mask]

In [5]:
# attach the matching home record to each person's workplace record
# (inner join on SAMPN, the key shared by both tables), then drop every
# row that still has a missing value in any column
df = pd.merge(persons, homes, how='inner', on='SAMPN').dropna()

# index by (SAMPN, PERNO); verify_integrity makes pandas raise a ValueError
# on duplicate keys, so the uniqueness check still runs when assert
# statements are disabled (python -O)
df = df.set_index(['SAMPN', 'PERNO'], verify_integrity=True)

## Find nearest network node to home and work locations

For fast nearest-neighbor search, use a ball tree with the haversine distance metric. This will be more accurate than using Euclidean distance on unprojected coordinates at this spatial scale, and faster than first projecting the data to a metric coordinate system and then using Euclidean distance. If minor inaccuracy is acceptable in the name of greater speed, just use scipy's basic cKDTree and Euclidean distance instead.

In [6]:
# load the node list of the network for the selected graph type
nodes = pd.read_csv(f'data/network/sf-{graph_type}-no-fwy-nodelist.csv')

# index the nodes by their osmid; verify_integrity makes pandas raise a
# ValueError on duplicate ids, so the uniqueness check still runs when
# assert statements are disabled (python -O)
nodes = nodes.set_index('osmid', verify_integrity=True)

In [7]:
# the haversine metric works on (lat, lng) pairs expressed in radians, so
# select the y column before the x column and convert from degrees
nodes_rad = np.radians(nodes[['y', 'x']])
homes_rad = np.radians(df[['HYCORD', 'HXCORD']])
works_rad = np.radians(df[['WYCORD', 'WXCORD']])

In [8]:
# index the node coordinates (in radians) in a ball tree that measures
# haversine distance, for fast nearest-neighbor queries
tree = BallTree(
    nodes_rad,
    metric='haversine',
)

In [9]:
# find the single closest node to every home; the result holds positional
# row numbers into the node table, one column per requested neighbor
idx = tree.query(homes_rad, k=1, return_distance=False)

# translate those row positions into node ids (the index of `nodes`)
df['HNODE'] = nodes.index[idx[:, 0]]

In [10]:
# find the single closest node to every workplace; the result holds
# positional row numbers into the node table, one column per neighbor
idx = tree.query(works_rad, k=1, return_distance=False)

# translate those row positions into node ids (the index of `nodes`)
df['WNODE'] = nodes.index[idx[:, 0]]

## Save to disk

In [11]:
# number of persons that have both a home node and a workplace node
len(df.index)

797

In [12]:
# count the persons whose home and workplace are close enough that both
# snapped to the same network node
int((df['HNODE'] == df['WNODE']).sum())

3

In [13]:
# outbound trips: home node is the origin, workplace node the destination
df_save = df.rename(columns={'HNODE': 'orig', 'WNODE': 'dest'})[['orig', 'dest']]

# return trips: the same pairs reversed (workplace -> home)
df_return = df.rename(columns={'WNODE': 'orig', 'HNODE': 'dest'})[['orig', 'dest']]

# stack outbound and return trips into one origin-destination table
df_save = pd.concat((df_save, df_return))

In [14]:
# total number of OD pairs (outbound plus return trips)
len(df_save.index)

1594

In [15]:
# write the OD pairs for this graph type to disk, keeping the
# (SAMPN, PERNO) index as leading columns of the CSV
df_save.to_csv(
    f'data/od-{graph_type}-no-fwy.csv',
    index=True,
    encoding='utf-8',
)