# Attach block ID and nearest network node to each parcel

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from sklearn.neighbors import BallTree

# target CRS for the parcels: WGS84 lat-long (EPSG:4326).
# use the authority string instead of the {'init': 'epsg:4326'} proj4 dict, which
# pyproj 2+ flags as deprecated syntax
crs = 'EPSG:4326'

## Load the input data

In [2]:
%%time
# read the parcels shapefile, discard rows with no geometry, then reproject to `crs`
parcels = (
    gpd.read_file('data/Parcels/')
    .dropna(subset=['geometry'])
    .to_crs(crs)
)

# sanity check: every parcel row must have its own index label
assert parcels.index.is_unique
print(len(parcels))

1956207
Wall time: 7min 14s


In [3]:
# read the parcel -> census block lookup table, indexed by parcel ID.
# block GEOIDs are read as strings (presumably to keep leading zeros intact -- confirm)
parcels_blocks = (
    pd.read_csv('data/parcels_joined_blocks.csv', dtype={'block_geoid': str})
    .set_index('parcel_id')
)

# sanity check: one lookup row per parcel ID
assert parcels_blocks.index.is_unique
print(len(parcels_blocks))

1956207


In [4]:
# read the network nodes table, indexed by OSM node ID
nodes = (
    pd.read_csv('data/network/bay_area_full_strongly_nodes.csv')
    .set_index('osmid')
)

# sanity check: node IDs must not repeat
assert nodes.index.is_unique
print(len(nodes))

223328


## Add census block ID to each parcel

In [5]:
# columns that hold integer values and should be stored with an int dtype
to_int = ['PARCEL_ID', 'DEVELOPMEN', 'COUNTY_ID', 'ZONE_ID', 'PROPORTION', 'TAX_EXEMPT', 'ID']

# cast those columns, discard the unused ones, and index the parcels by PARCEL_ID
parcels = (
    parcels
    .assign(**{col: parcels[col].astype(int) for col in to_int})
    .drop(columns=['CENTROID', 'X', 'Y'])
    .set_index('PARCEL_ID')
)

In [6]:
# left-join the block lookup onto the parcels by index (parcel ID),
# so every parcel row is kept whether or not it has a matching block
parcels = pd.merge(
    parcels,
    parcels_blocks,
    left_index=True,
    right_index=True,
    how='left',
)

## Find network node nearest to each parcel

In [7]:
# store each parcel polygon's centroid in its own column
# (computed on the active geometry, which is in the lat-long `crs` at this point)
parcels['centroid'] = parcels.geometry.centroid

In [8]:
# temporarily make the centroid column the active geometry so its coordinates
# can be read through the .geometry accessor
parcels = parcels.set_geometry('centroid')
for axis in ('x', 'y'):
    parcels[axis] = getattr(parcels.geometry, axis)

# switch the active geometry back to the parcel polygons
parcels = parcels.set_geometry('geometry')

### scipy euclidean method

Faster, but less accurate: Euclidean distance treats the unprojected longitude/latitude degrees as if they were planar coordinates.

In [9]:
# index the node x/y coordinates in a k-d tree for nearest-neighbor lookups
tree = cKDTree(nodes[['x', 'y']].values, compact_nodes=True, balanced_tree=True)

In [10]:
%%time
# look up the single closest node (k=1) for every parcel centroid;
# `idx` holds positional row numbers into `nodes`, `dist` the matching distances
dist, idx = tree.query(parcels[['x', 'y']], k=1)

# translate positional matches into node IDs taken from the nodes index
parcels['node'] = nodes.index[idx]

Wall time: 2.88 s


### scikit-learn haversine method

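Slower, but more accurate with unprojected coordinates: haversine distance measures great-circle distance directly from latitude/longitude. The result overwrites the `node` column assigned by the Euclidean method above.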

In [11]:
# the haversine metric expects [lat, lng] column order with values in radians,
# so select y before x and convert from degrees
nodes_rad = np.radians(nodes[['y', 'x']])
parcels_rad = np.radians(parcels[['y', 'x']])

In [12]:
# build the tree for nearest-neighbor search: a ball tree over the node
# coordinates in radians ([lat, lng] order), using the haversine metric.
# note this rebinds `tree`, replacing the cKDTree built for the euclidean method
tree = BallTree(nodes_rad, metric='haversine')

In [13]:
%%time
# find the single closest node (k=1) for every parcel centroid; only the
# positional matches are requested, returned as an array with one column
idx = tree.query(parcels_rad, k=1, return_distance=False)

# flatten that column and translate positions into node IDs from the nodes index
parcels['node'] = nodes.index[idx.ravel()]

Wall time: 10min


## Save parcels data to disk

In [14]:
# write a lightweight CSV holding only the parcel, block, and node identifiers.
# the mapping drives both the column selection (keys) and the output names (values)
csv_columns = {'PARCEL_ID': 'parcel_id',
               'block_geoid': 'block_id',
               'node': 'node_id'}
df_save = parcels.reset_index()[list(csv_columns)].rename(columns=csv_columns)
df_save.to_csv('data/parcels_blocks_nodes.csv', index=False, encoding='utf-8')

In [15]:
%%time
# write the full parcels layer (geometry plus attributes) as a shapefile.
# move PARCEL_ID back into a column and discard the helper centroid/x/y columns
parcels = parcels.reset_index().drop(columns=['centroid', 'x', 'y'])

# store node IDs as strings, since they can be larger than a 32-bit integer
parcels = parcels.assign(node=parcels['node'].astype(str))

# NOTE(review): shapefile field names are limited to 10 characters, so the
# 11-character 'block_geoid' column is presumably truncated on write -- confirm
# that downstream readers expect the shortened name
parcels.to_file('data/parcels_blocks_nodes')

Wall time: 27min 32s
