In [1]:
import geopandas as gpd
import os
import pickle
from shapely import from_wkb
from shapely.geometry import MultiPoint
from tqdm import tqdm
import plotly.express as px
import contextily as cx

from data.constants import (DATA_FOLDER, LOCAL_CRS, WORLD_CRS, WEB_CRS)

In [2]:
bike_rides_in = os.path.join(DATA_FOLDER, "deprecated", "bike_rides.geoparquet")

bike_rides_out = os.path.join(DATA_FOLDER, "deprecated", "bike_rides_v2.geoparquet")

In [3]:
# TOLERANCE = typical total street width = 2 sidewalks + 2 parking lanes + 2 traffic lanes
#  source: https://www.chicago.gov/dam/city/depts/cdot/StreetandSitePlanDesignStandards407.pdf
TOLERANCE = 66  # ft

# We were using a much larger tolerance in the previous notebook, but if we're
# doing this purely spatially, I think basically "across the street" is as far
# as we want to merge things. City block ~= 0.1mi is another option, but there
# are bus stops that are only one block apart so I can imagine divvies being 
# that close.

# Pipeline In

In [4]:
bike_rides = gpd.read_parquet(bike_rides_in)

In [5]:
bike_rides = bike_rides.assign(
    station_cluster_centroid = gpd.GeoSeries(bike_rides.station_cluster_centroid.apply(from_wkb), crs=WORLD_CRS))

In [6]:
# Save for assertions later
nobs = len(bike_rides)
total_rides = bike_rides.filter(like='ride').sum()

# Check

In [7]:
multi_clusters = bike_rides.groupby('station_cluster_id')['geometry'].transform('nunique') > 1
iso_clusters = bike_rides.groupby('station_cluster_id')['geometry'].transform('nunique') == 1

# Spatial Clustering

So far we've tried to reduce the data by different primary key sets.
But maybe we should try directly spatially aggregating.

## Via buffer union

In [8]:
# Takes ~2m
lpoints = (bike_rides[['station_cluster_id','station_cluster_centroid','geometry']]
            .set_geometry('station_cluster_centroid', crs=WORLD_CRS)
            .drop(columns=['geometry'])
            .rename_geometry('geometry')
            .drop_duplicates()
            .to_crs(LOCAL_CRS))

 # set radius as half tolerance ==> points unioned if distance <= 2 radii = 1 tol
buffered = lpoints.buffer(TOLERANCE / 2)
supercluster = buffered.union_all()

In [9]:
assert supercluster.geom_type == 'MultiPolygon'
supercluster = gpd.GeoSeries(supercluster.geoms, crs=LOCAL_CRS)

Note:

This first step in the spatial merge will merge points that have different station names.
The station names seem pretty good at separating what we should consider a station.
The problem is that sometimes a station name has points really far away from it too.
This is why station name -> centroid is still 1:m after attribute clustering.
This next spatial clustering step runs pretty fast.
If we add more checks in this loop it might make it slower.
Instead we'll add checks in the next block when we actually compute the super centroids.

In [10]:
# Takes ~5m
supercluster_file = os.path.join(DATA_FOLDER, "deprecated", 'superclusters.pickle')
if not os.path.exists(supercluster_file):
    cluster_to_supercluster = {}
    sindex = supercluster.sindex
    for i,pt in tqdm(lpoints.itertuples(index=False), total=len(lpoints)):
        candidate_idx = sindex.query(pt, predicate='dwithin', distance=TOLERANCE*2)
        candidates = supercluster[candidate_idx]
        match_idx = candidate_idx[candidates.contains(pt)]
        cluster_to_supercluster[i] = match_idx
    with open(supercluster_file, 'wb') as f:
        pickle.dump(cluster_to_supercluster, f, pickle.HIGHEST_PROTOCOL)
else:
    with open(supercluster_file, 'rb') as f:
        cluster_to_supercluster = pickle.load(f)


In [11]:
assert all(map(lambda x: len(x)==1, cluster_to_supercluster.values()))
supercluster_to_cluster = {}
for k,v in cluster_to_supercluster.items():
    supercluster_to_cluster.setdefault(int(v[0]), []).append(k)

In [12]:
# Takes ~5m
cluster_to_super_centroid = {}
for sc, ids in tqdm(supercluster_to_cluster.items()):
    sc_rides = bike_rides[bike_rides.station_cluster_id.isin(ids)]
    # Further break down by station name because the data imprecision >> true station variance
    for name, name_rides in sc_rides.groupby('station_name'):
        name_ids = name_rides['station_cluster_id']
        name_pts = name_rides['station_cluster_centroid']
        super_centroid = MultiPoint(name_pts.to_crs(LOCAL_CRS).values).centroid
        cluster_to_super_centroid |= {x: super_centroid for x in name_ids}
bike_rides = bike_rides.assign(super_centroid = bike_rides.station_cluster_id.map(cluster_to_super_centroid))
bike_rides = bike_rides.assign(super_centroid = gpd.GeoSeries(bike_rides.super_centroid, crs=LOCAL_CRS).to_crs(WORLD_CRS))

100%|██████████| 3918/3918 [01:59<00:00, 32.90it/s]


# Pipeline Out

In [13]:
bike_rides.to_parquet(bike_rides_out, index=False)