# Investigate Finding Nearest Building with OSMNX

## Resources
1. https://autogis-site.readthedocs.io/en/latest/notebooks/L3/04_nearest-neighbour.html
2. https://geopandas.org/en/stable/docs/reference/api/geopandas.tools.reverse_geocode.html 
   1. We are not doing this at the moment, but it could potentially be helpful. It doesn't seem like all buildings in OSMnx have addresses, though.
3. 

In [1]:
!pip install geopy
# have to re-clone this because I can't pull changes in the /repos folder. Don't have write access...
!git clone https://github.com/mschrader15/amazon-routing-challenge.git

Cloning into 'amazon-routing-challenge'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 67 (delta 23), reused 52 (delta 13), pack-reused 0[K
Receiving objects: 100% (67/67), 2.42 MiB | 7.55 MiB/s, done.
Resolving deltas: 100% (23/23), done.


In [2]:
import osmnx as ox
from geopandas import points_from_xy, GeoDataFrame
import pandas as pd
import numpy as np
from shapely.geometry import Point
from geopy import distance
import pyproj

import sys
import pathlib
import os


In [28]:
import json

# `sys.path` entries are used verbatim: "~" is NOT expanded by the import
# system, so the home-relative path has to be resolved explicitly.
sys.path.append(os.path.expanduser("~/amazon-routing-challenge"))
from almrcc_tools.notebook_setup import GLMFileHandler, MapboxPlot, ORSClient

In [29]:
# Location of the raw OSMnx data for Boston (presumably a path inside the staging
# store read through GLMFileHandler — TODO confirm). Not used by the cells shown here.
data_path = "research-data-staging/osmnx-cities/raw/Boston, USA"

## Create the Route DataFrame


In [30]:
# Build the path to the training route file, then parse the JSON straight
# from the file-like stream handed back by GLMFileHandler.
route_data_path = os.path.join(
    "research-data-staging/almrrc2021-raw/data/almrrc2021-data-training/model_build_inputs",
    "route_data.json",
)
route_data = json.load(GLMFileHandler.get_file_stream(route_data_path))


In [31]:
# Flatten the nested route JSON into one record per stop. Route-level fields
# are repeated on every stop of that route; the stop's own fields (**stop) are
# merged in last, so they take precedence on any key collision.
stop_records = (
    {
        "stop_id": stop_id,
        "route_id": route_id,
        "station_code": route["station_code"],
        "departure_datetime": route["date_YYYY_MM_DD"]
        + " "
        + route["departure_time_utc"],
        "executor_capacity_cm3": route["executor_capacity_cm3"],
        "route_score": route["route_score"],
        **stop,
    }
    for route_id, route in route_data.items()
    for stop_id, stop in route["stops"].items()
)
route_df = pd.DataFrame.from_records(stop_records)


In [32]:
# Restrict the analysis to Boston for now: keep stations whose code contains "BO"
is_boston_station = route_df["station_code"].str.contains("BO")
route_df = route_df.loc[is_boston_station]

# One point per distinct stop coordinate (x = longitude, y = latitude, WGS84)
lat_lon = route_df[['lat', 'lng']].drop_duplicates()
stop_points = points_from_xy(x=lat_lon['lng'], y=lat_lon['lat'], crs="EPSG:4326")
r_gdf = GeoDataFrame(geometry=stop_points)

In [33]:
from sklearn.neighbors import BallTree


#  below comes from https://autogis-site.readthedocs.io/en/latest/notebooks/L3/06_nearest-neighbor-faster.html
def get_nearest(src_points, candidates, k_neighbors=1):
    """Return, for every source point, the index of and distance to its nearest candidate.

    Parameters
    ----------
    src_points : array-like of shape (n_src, 2)
        Query coordinates, passed unchanged to ``BallTree.query``.
    candidates : array-like of shape (n_candidates, 2)
        Coordinates the tree is built from.
    k_neighbors : int, default 1
        Number of neighbours requested from the tree. Only the closest one
        is returned, whatever value is given.

    Returns
    -------
    tuple
        ``(closest, closest_dist)``: per source point, the row position of
        the nearest candidate and its distance as reported by the tree's
        'haversine' metric.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Both result arrays have shape (n_src, k_neighbors), nearest match first
    dist_matrix, index_matrix = ball_tree.query(src_points, k=k_neighbors)

    # Column 0 is the nearest neighbour (column 1 would be the second nearest, etc.)
    nearest_index = index_matrix[:, 0]
    nearest_dist = dist_matrix[:, 0]

    return (nearest_index, nearest_dist)


def nearest_neighbor(left_gdf, right_gdf, left_col=None, right_col=None, return_dist=False):
    """
    For each point in left_gdf, find the closest point in right_gdf and return its row.

    NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).

    Parameters
    ----------
    left_gdf : GeoDataFrame
        Source points; exactly one result row is produced per row.
    right_gdf : GeoDataFrame
        Candidate points to match against. It is copied, never modified.
    left_col, right_col : str, optional
        Name of the point column to use in each frame. Defaults to the
        frame's active geometry column.
    return_dist : bool, default False
        If True, add a 'distance' column with the distance to the match
        in meters.

    Returns
    -------
    Same type as right_gdf
        The matched rows of right_gdf (all columns), in the order of
        left_gdf and carrying left_gdf's index, so that
        ``left_gdf.join(result)`` lines up row for row.
    """

    left_geom_col = left_col or left_gdf.geometry.name
    right_geom_col = right_col or right_gdf.geometry.name

    # Ensure that index in right gdf is formed of sequential numbers, so the
    # positions returned by the tree can be used as labels
    right = right_gdf.copy().reset_index(drop=True)

    # Parse coordinates from points and insert them into a numpy array as RADIANS
    # Notice: should be in Lat/Lon format, i.e. (y, x)
    left_radians = np.array(left_gdf[left_geom_col].apply(lambda geom: (geom.y * np.pi / 180, geom.x * np.pi / 180)).to_list())
    right_radians = np.array(right[right_geom_col].apply(lambda geom: (geom.y * np.pi / 180, geom.x * np.pi / 180)).to_list())

    # Find the nearest points
    # -----------------------
    # closest ==> index in right_gdf that corresponds to the closest point
    # dist ==> distance between the nearest neighbors (in radians; converted below)
    closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)

    # Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
    closest_points = right.loc[closest]

    # Carry over the index of left_gdf. A plain reset to 0..n-1 only matches
    # when left_gdf has a default RangeIndex and silently misaligns a later
    # join otherwise.
    closest_points.index = left_gdf.index

    # Add distance if requested
    if return_dist:
        # Convert to meters from radians
        earth_radius = 6371000  # meters
        closest_points['distance'] = dist * earth_radius

    return closest_points

In [34]:
# Show the de-duplicated Boston stop points built above.
r_gdf

Unnamed: 0,geometry
0,POINT (-71.21844 42.23546)
1,POINT (-71.27056 42.22431)
2,POINT (-71.29745 42.25284)
3,POINT (-71.30727 42.23086)
4,POINT (-71.30994 42.22282)
...,...
120391,POINT (-71.15088 42.34571)
120392,POINT (-71.14803 42.34285)
120393,POINT (-71.15429 42.35058)
120394,POINT (-71.14532 42.34302)


## Read in Boston Data

In [3]:
import osmnx as ox

In [12]:
# Load the pickled building table for Boston (presumably OSM features fetched earlier with osmnx — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted source.
# NOTE(review): the path is hard-coded to a local Desktop folder; the notebook will not run elsewhere without changing it.
df = pd.read_pickle("~/Desktop/osmnx-cities/raw/Boston, USA/building.pkl")
# Wrap the table as a GeoDataFrame with its CRS declared as EPSG:4326 (lon/lat).
gdf = GeoDataFrame(df, crs="EPSG:4326")

In [22]:
# Persist only the 'geometry' and 'building' columns.
# NOTE(review): this writes a *pickle* (protocol 4) even though the file name ends in ".parquet";
# it must be read back with pd.read_pickle, not read_parquet. Confirm whether the extension or the writer is the mistake.
gdf[['geometry', 'building']].to_pickle("~/Desktop/osmnx-cities/raw/Boston, USA/building.parquet", protocol=4)

In [36]:
# Re-project the footprints to EPSG:32619 (https://epsg.io/32619) before doing geometry math
geometry_projected = gdf['geometry'].to_crs(epsg=32619)
# A tiny buffer is applied to every feature (presumably so point features become polygons too — TODO confirm intent)
gdf['geometry_utm'] = geometry_projected.buffer(0.0001)

# Take the centroid in the projected CRS, then convert it back to lon/lat (EPSG:4326)
centroid_projected = gdf['geometry_utm'].centroid
gdf['centroid'] = centroid_projected.to_crs(epsg=4326)

In [37]:
# Preview the first rows, including the new 'geometry_utm' and 'centroid' columns.
gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amenity,geometry,height,highway,access,entrance,description,name,man_made,surveillance,...,gnis:ST_num,gnis:id,import_uuid,phone:mnemonic,polling_station,townhall:type,contact:livestream,service_times,geometry_utm,centroid
element_type,osmid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,367777158,,POINT (-71.16728 42.33386),,,,,,Merkert Chemistry Center,,,...,,,,,,,,,"POLYGON ((321448.647 4689120.510, 321448.647 4...",POINT (-71.16728 42.33386)
node,367777165,,POINT (-71.10391 42.34019),,,,,,Ansin Wing,,,...,,,,,,,,,"POLYGON ((326686.299 4689691.834, 326686.299 4...",POINT (-71.10391 42.34019)
node,367777167,theatre,POINT (-71.08734 42.34699),,,,main,,Berklee Performance Center,,,...,,,,,,,,,"POLYGON ((328070.291 4690413.014, 328070.291 4...",POINT (-71.08734 42.34699)
node,367777172,,POINT (-71.06896 42.37489),,,,,,Building B,,,...,,,,,,,,,"POLYGON ((329659.956 4693474.968, 329659.956 4...",POINT (-71.06896 42.37489)
node,367777173,,POINT (-71.06897 42.37434),,,,,,Building A,,,...,,,,,,,,,"POLYGON ((329657.419 4693413.065, 329657.419 4...",POINT (-71.06897 42.37434)


## Test Using Nearest Neighbors to Map the Amazon Lat/Lon to the Nearest Building in OSMNX

This matches each stop to the building with the *nearest* centroid, which isn't always the best match IMO.

In [38]:
# For every stop, fetch the building whose centroid is nearest, plus the distance in meters.
test_df = nearest_neighbor(r_gdf, gdf, right_col='centroid', return_dist=True)
# The building footprint would otherwise collide with the stop's own 'geometry' column in the join.
test_df = test_df.rename(columns={'geometry': 'closest_home'})
# Join onto the stop geometry only. On the first run this is the whole frame;
# on a re-run it drops the columns added by the previous join, which would
# otherwise make `join` raise on overlapping column names.
r_gdf = r_gdf[['geometry']].join(test_df)

In [39]:
# How many stops were matched to each OSM `building` tag value
r_gdf["building"].value_counts()

yes                   115417
shed                    2371
university              1331
apartments               666
house                    163
detached                 138
residential               83
semidetached_house        55
industrial                47
retail                    32
garage                    13
bunker                    13
commercial                10
terrace                    7
dormitory                  6
roof                       6
office                     5
hotel                      5
construction               4
boathouse                  3
school                     3
canopy                     2
garages                    2
train_station              2
warehouse                  2
hut                        1
government                 1
church                     1
public                     1
convent                    1
hospital                   1
parking                    1
transportation             1
civic                      1
college       

In [40]:
# Stops matched to an "apartments" building, closest matches first
apartment_matches = r_gdf.loc[r_gdf["building"] == "apartments"]
apartment_matches.sort_values(by="distance").head()

Unnamed: 0,geometry,amenity,closest_home,height,highway,access,entrance,description,name,man_made,...,gnis:id,import_uuid,phone:mnemonic,polling_station,townhall:type,contact:livestream,service_times,geometry_utm,centroid,distance
81107,POINT (-71.11171 42.33173),,"POLYGON ((-71.11182 42.33171, -71.11171 42.331...",,,,,,,,...,,,,,,,,"POLYGON ((326011.318 4688766.668, 326011.318 4...",POINT (-71.11169 42.33170),3.159184
56871,POINT (-71.05653 42.36381),,"POLYGON ((-71.05656 42.36370, -71.05660 42.363...",13.6,,,,,,,...,,,,,,,,"POLYGON ((330650.308 4692207.563, 330650.308 4...",POINT (-71.05656 42.36376),5.79327
72555,POINT (-71.06679 42.35968),,"POLYGON ((-71.06666 42.35962, -71.06678 42.359...",,,,,,,,...,,,,,,,,"POLYGON ((329807.983 4691774.296, 329807.983 4...",POINT (-71.06672 42.35967),6.118513
56874,POINT (-71.05599 42.36419),,"POLYGON ((-71.05598 42.36412, -71.05601 42.364...",9.3,,,,,,,...,,,,,,,,"POLYGON ((330699.316 4692252.776, 330699.316 4...",POINT (-71.05607 42.36417),6.315931
19149,POINT (-71.05523 42.33173),,"POLYGON ((-71.05534 42.33167, -71.05540 42.331...",,,,,,,,...,,,,,,,,"POLYGON ((330665.345 4688648.200, 330665.345 4...",POINT (-71.05531 42.33175),6.95907
