The denormalized ride start/stop points are horribly imprecise. I had assumed
they were the actual station locations, but now I wonder whether they
are the users' locations when they start/stop a ride in the app. The per-station
location imprecision is as large as the between-station distances.

So far I haven't trusted the over-time stability of station ids, names, and locations.
But in this notebook we'll use the GBFS station info feed as our reference.

In [1]:
import geopandas as gpd
import pandas as pd
import requests
import json

from data.constants import (WORLD_CRS,
                            DIVVY_STATIONS_GBFS)

In [2]:
# Relative paths for the two artifacts this notebook writes: the raw GBFS
# feed snapshot (JSON) and the cleaned station table (GeoParquet).
bike_stations_in, bike_stations_out = (
    "../data/raw/bike_stations_gbfs.json",
    "../data/interim/bike_stations_gbfs.geoparquet",
)

# Pipeline in

In [3]:
# Fetch the live GBFS station feed. The timeout keeps a stalled connection
# from hanging the notebook indefinitely, and raise_for_status() surfaces an
# HTTP error here instead of as a confusing JSON/KeyError further down.
gbfs_response = requests.get(DIVVY_STATIONS_GBFS, timeout=30)
gbfs_response.raise_for_status()
bike_stations_raw = gbfs_response.json()

# One row per entry of the feed's data -> stations list.
bike_stations = pd.DataFrame.from_records(bike_stations_raw['data']['stations'])

# Preprocess

In [4]:
# Columns removed from the feed. The feed's own 'station_id' is dropped so
# that 'short_name' can take over that column name in the rename below;
# 'lon'/'lat' are dropped once they have been folded into the geometry.
unused_cols = [
    'station_id',
    'capacity',
    'rental_uris',
    'region_id',
    'address',
    'lon',
    'lat',
]

bike_stations = gpd.GeoDataFrame(
    bike_stations
    # Build point geometries from the lon/lat columns in WORLD_CRS.
    .assign(geometry=lambda stations: gpd.points_from_xy(stations['lon'],
                                                         stations['lat'],
                                                         crs=WORLD_CRS))
    .drop(columns=unused_cols)
    .rename(columns={'short_name': 'station_id',
                     'name': 'station_name'})
)


In [5]:
# id,name is the composite PK: every (id, name) pair occurs exactly once,
# with null ids kept as their own group (dropna=False).
assert bike_stations.groupby(['station_id','station_name'], dropna=False).size().eq(1).all()
# id to name is at least m:1
assert bike_stations.groupby(['station_id']).station_name.nunique().eq(1).all()
# name to id is 1:Null or 1:1
# Rows repeating an earlier (name, geometry) pair are removed first;
# duplicated() flags every occurrence after the first one.
# NOTE(review): the surviving first occurrence is not necessarily the row
# that carries an id -- confirm this is the row we want to keep.
mask = bike_stations.duplicated(subset=['station_name','geometry'])
print(f"Dropping {mask.sum()} stations with non-PK id,name")
bike_stations = bike_stations[~mask]
assert all(bike_stations.groupby(['station_name']).station_id.agg(lambda x: x.isna().all() or x.dropna().nunique() == 1))
# id is Null or unique
assert not bike_stations['station_id'].dropna().duplicated().any()
# name is unique given id not null
# Only station_id is null-checked: a bare dropna() would also discard rows
# with a null in any other column and silently weaken this check.
assert not bike_stations.dropna(subset=['station_id'])['station_name'].duplicated().any()
# name is the more complete id
assert bike_stations.station_name.nunique() > bike_stations['station_id'].nunique()
assert bike_stations.station_name.notna().mean() > bike_stations.station_id.notna().mean()

Dropping 1 stations with non-PK id,name


In [6]:
# A name whose rows disagree on geometry ("unstable") is only tolerated when
# exactly one non-null id exists among those rows; the id-bearing row then
# wins, which makes name -> geometry at least m:1.
geos_per_name = bike_stations.groupby('station_name').geometry.transform('nunique')
stable_geos = geos_per_name == 1

# Every unstable name must resolve to exactly one non-null id.
ids_per_unstable_name = (bike_stations[~stable_geos]
                         .groupby('station_name')['station_id']
                         .nunique())
assert ids_per_unstable_name.eq(1).all()

# Keep rows that are either geometrically stable or carry an id.
keep_rows = stable_geos | bike_stations.station_id.notna()
print("Dropping {} duplicate rows.".format((~keep_rows).sum()))
bike_stations = bike_stations[keep_rows]

# Now name is unique
assert not bike_stations['station_name'].duplicated().any()

Dropping 2 duplicate rows.


# Pipeline out

In [7]:
# Snapshot the raw feed exactly as fetched, next to the cleaned output, so
# the interim table can be traced back to the payload it was built from.
with open(bike_stations_in, "w") as raw_file:
    raw_file.write(json.dumps(bike_stations_raw))

# Persist the cleaned stations without the positional index.
bike_stations.to_parquet(bike_stations_out, index=False)