In [1]:
import geopandas as gpd
import os
import pickle
from shapely import from_wkb
from shapely.geometry import MultiPoint
from tqdm import tqdm
from pyproj import Transformer

from data.constants import (DATA_FOLDER, LOCAL_CRS, WORLD_CRS, WEB_CRS)

In [2]:
bike_rides_in = os.path.join(DATA_FOLDER, "interim", "bike_rides.geoparquet")

bike_rides_out = os.path.join(DATA_FOLDER, "interim", "bike_rides_v2.geoparquet")

In [3]:
# TOLERANCE = typical total street width = 2 sidewalks + 2 parking lanes + 2 traffic lanes
#  source: https://www.chicago.gov/dam/city/depts/cdot/StreetandSitePlanDesignStandards407.pdf
TOLERANCE = 66  # ft

# We were using a much larger tolerance in the previous notebook, but if we're
# doing this purely spatially, I think basically "across the street" is as far
# as we want to merge things. City block ~= 0.1mi is another option, but there
# are bus stops that are only one block apart so I can imagine divvies being 
# that close.

# Pipeline In

In [4]:
bike_rides = gpd.read_parquet(bike_rides_in)

In [5]:
bike_rides = bike_rides.assign(
    station_cluster_centroid = gpd.GeoSeries(bike_rides.station_cluster_centroid.apply(from_wkb), crs=WORLD_CRS))

In [6]:
# Save for assertions later
nobs = len(bike_rides)
total_rides = bike_rides.filter(like='ride').sum()

# Check

In [7]:
multi_clusters = bike_rides.groupby('station_cluster_id')['geometry'].transform('nunique') > 1
iso_clusters = bike_rides.groupby('station_cluster_id')['geometry'].transform('nunique') == 1

# Spatial Clustering

So far we've tried to reduce the data by different primary key sets.
But maybe we should try directly spatially aggregating.

## Via buffer union

In [8]:
lpoints = (bike_rides[['station_cluster_id','station_cluster_centroid','geometry']]
            .set_geometry('station_cluster_centroid', crs=WORLD_CRS)
            .drop(columns=['geometry'])
            .rename_geometry('geometry')
            .drop_duplicates()
            .to_crs(LOCAL_CRS))

 # set radius as half tolerance ==> points unioned if distance <= 2 radii = 1 tol
buffered = lpoints.buffer(TOLERANCE / 2)
supercluster = buffered.union_all()

In [9]:
assert supercluster.geom_type == 'MultiPolygon'
supercluster = gpd.GeoSeries(supercluster.geoms, crs=LOCAL_CRS)

Note:

This first step in the spatial merge will merge points that have different station names.
The station names seem pretty good at separating what we should consider a station.
The problem is that sometimes a station name has points really far away from it too.
This is why station name -> centroid is still 1:m after attribute clustering.
This next spatial clustering step runs pretty fast.
If we add more checks in this loop it might make it slower.
Instead we'll add checks in the next block when we actually compute the super centroids.

In [10]:
# Takes ~5m
supercluster_file = os.path.join(DATA_FOLDER, "interim", 'superclusters.pickle')
if not os.path.exists(supercluster_file):
    cluster_to_supercluster = {}
    sindex = supercluster.sindex
    for i,pt in tqdm(lpoints.itertuples(index=False), total=len(lpoints)):
        candidate_idx = sindex.query(pt, predicate='dwithin', distance=TOLERANCE*2)
        candidates = supercluster[candidate_idx]
        match_idx = candidate_idx[candidates.contains(pt)]
        cluster_to_supercluster[i] = match_idx
    with open(supercluster_file, 'wb') as f:
        pickle.dump(cluster_to_supercluster, f, pickle.HIGHEST_PROTOCOL)
else:
    with open(supercluster_file, 'rb') as f:
        cluster_to_supercluster = pickle.load(f)


In [11]:
assert all(map(lambda x: len(x)==1, cluster_to_supercluster.values()))
supercluster_to_cluster = {}
for k,v in cluster_to_supercluster.items():
    supercluster_to_cluster.setdefault(int(v[0]), []).append(k)

In [13]:
cluster_to_super_centroid = {}
for sc, ids in tqdm(supercluster_to_cluster.items()):
    sc_rides = bike_rides[bike_rides.station_cluster_id.isin(ids)]
    # Further break down by station name because the data imprecision >> true station variance
    for name, name_rides in sc_rides.groupby('station_name'):
        name_ids = name_rides['station_cluster_id']
        name_pts = name_rides['station_cluster_centroid']
        super_centroid = MultiPoint(name_pts.to_crs(LOCAL_CRS).values).centroid
        cluster_to_super_centroid |= {x: super_centroid for x in name_ids}
bike_rides = bike_rides.assign(super_centroid = bike_rides.station_cluster_id.map(cluster_to_super_centroid))
bike_rides = bike_rides.assign(super_centroid = gpd.GeoSeries(bike_rides.super_centroid, crs=LOCAL_CRS).to_crs(WORLD_CRS))

100%|██████████| 3918/3918 [01:58<00:00, 33.05it/s]


In [None]:
import pandas as pd
station_info = pd.read_json('https://gbfs.lyft.com/gbfs/2.3/chi/en/station_information.json')
# TODO: Um this might cover 99% of our rides with stable geometries.

In [22]:
station_info = pd.DataFrame.from_records(station_info['data']['stations'])

In [29]:
bike_rides.station_id.head()

0    021320
1    021320
2    021320
3    021320
4    021320
Name: station_id, dtype: object

In [28]:
station_info.station_id.head()

0    1943244520961310788
1    1958859639339458778
2    1936582560225114270
3    1963672425916463636
4    1936553212713581016
Name: station_id, dtype: object

In [40]:
station_info[(station_info.name.str.contains('Narragansett'))]

Unnamed: 0,short_name,lat,capacity,station_id,lon,name,rental_uris,region_id,address
187,24308.0,41.9095,12,2012672652900879218,-87.78491,Narragansett Ave & North Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,
488,21314.0,41.92729,15,1945087711550943880,-87.78523,Narragansett Ave & Wrightwood Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,
531,24364.0,41.959978,16,1978529864027224986,-87.786928,Narragansett Ave & Montrose Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,
793,21309.0,41.9159,15,1939172360789937026,-87.78511,Narragansett Ave & McLean Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,
1008,,41.952614,9,1571105068000485406,-87.785383,Narragansett & Irving Park,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,"Merrimac Park Fieldhouse, 6343, West Irving Pa..."
1025,,41.923739,1,1695777015314520896,-87.785835,Public Rack - Fullerton Ave & Narragansett Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,"Burger King, 6400, West Fullerton Avenue, Beat..."
1590,,41.92378,2,1674190527309818772,-87.7844,Public Rack - Narragansett Ave & Fullerton Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,"6333, West Fullerton Avenue, Beat 2512, Belmon..."
1614,,41.92258,1,1674190523014851470,-87.78498,Public Rack - N Narragansett Ave & Grand Ave,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,"6352-6358, West Grand Avenue, Beat 2512, Belmo..."
1793,,41.940128,9,1571105050820616196,-87.786481,Narragansett Ave & School St,"{'ios': 'https://chi.lft.to/lastmile_qr_scan',...",,"6465, West School Street, Dunning, Chicago, Je..."


In [39]:
bike_rides[~bike_rides.station_name.isin(station_info.name)][['station_id','station_name']].drop_duplicates()

Unnamed: 0,station_id,station_name
0,021320,MTV Hubbard St
23823,20252.0,W Oakdale Ave & N Broadway
24181,20256.0,N Sheffield Ave & W Wellington Ave
26185,309,Narragansett & McLean
26218,314,Narragansett & Wrightwood
26282,322,Cicero Ave & Grace St
26331,326,Harlem & Irving Park
26631,331.0,Pulaski Rd & 21st St
26682,339,Sacramento Ave & Pershing Rd
26687,340,Pulaski & Ann Lurie Pl


In [35]:
bike_rides[~bike_rides.station_id.isin(station_info.short_name)][['station_id','station_name']].drop_duplicates()

Unnamed: 0,station_id,station_name
0,021320,MTV Hubbard St
89,1011,Public Rack - Fullerton Ave & Narragansett Ave
99,1012,Public Rack - Langley Ave & 49th St
111,1018,Public Rack - Kostner Ave & Wrightwood Ave
113,1019,Public Rack - Kostner Ave & Diversey Ave
...,...,...
1063960,896,Public Rack - Troy & 111th St
1063994,923,Public Rack - Torrence Ave & 106th St
1064163,961,Public Rack - Pulaski Rd & Lexington St
1064177,970,Public Rack - Roscoe St & Osceola Ave


# Check

In [None]:
# (bike_rides['station_cluster_centroid']==bike_rides['super_centroid']).mean()
# (bike_rides.groupby('station_cluster_id')['super_centroid'].transform('nunique') == 1).all()
# (bike_rides.groupby('super_centroid')['station_cluster_id'].transform('nunique') == 1).mean()
# (bike_rides.groupby('super_centroid')['station_cluster_id'].nunique() == 1).mean()

Very few centroids are exactly equal to their super cluster.

Station cluster -> super cluster is at least m:1.

Station cluster -> super cluster is 1:1 in 15% of rows 

Station cluster -> super cluster is 1:1 in 78% of super clusters 

In [None]:
# TODO: Check if we like these results and want to incorporate super clusters
#       maybe smaller than a certain size. 
#       There's some outliers here like one super cluster with 30k points in it
#       But maybe those points are all a few cm apart? Map it out!
# There's 
import plotly.express as px
many = 20000
is_one_to_many = bike_rides.groupby('super_centroid')['station_cluster_id'].transform('nunique') <= many
px.box(bike_rides[~is_one_to_many].groupby('super_centroid')['station_cluster_id'].nunique())

In [None]:
import contextily as cx
many = 20000
is_one_to_many = bike_rides.groupby('super_centroid')['station_cluster_id'].transform('nunique') <= many

fig = bike_rides[is_one_to_many]['station_cluster_centroid'].to_crs(WEB_CRS).plot(figsize=(8,8), color='red', alpha=.5)
bike_rides[is_one_to_many]['super_centroid'].to_crs(WEB_CRS).plot(color='blue', ax=fig)
cx.add_basemap(ax=fig, attribution=False)


In [None]:

fig = bike_rides[is_one_to_many].to_crs(WEB_CRS).plot("station_name", figsize=(8,8))
cx.add_basemap(ax=fig, attribution=False)
# Yeah this data is trash! These points are all over the place, in buildings, etc.
# Maybe it's actually the GPS location of the app when they check it in? ugh.

# Pipeline Out

In [None]:
# bike_rides.to_parquet(bike_rides_out, index=False)