In [1]:
import geopandas as gpd

from data.geo import wkb_geom
from data.util import coalesce

In [2]:
# Pipeline paths, relative to this notebook's directory.
bike_rides_in = "../data/interim/bike_rides.geoparquet"      # input: interim bike-rides table
bike_rides_out = "../data/interim/bike_rides_v2.geoparquet"  # output: simplified/aggregated table

# Pipeline In

In [3]:
# Read the interim table, then hand it to wkb_geom together with the
# 'geometry_imputed' column name.
# NOTE(review): presumably wkb_geom decodes that column from WKB into geometry
# objects — confirm against data.geo.
bike_rides_raw = gpd.read_parquet(bike_rides_in)
bike_rides = wkb_geom(bike_rides_raw, 'geometry_imputed')

# Simplify schema

Use the imputed id, name, and geom, which have lower cardinality.

In [4]:
# For each of id / name / geometry, combine the imputed column with the raw one
# and store the result under the raw column's name.
# NOTE(review): assumes coalesce(df, first, second, out) prefers `first` and
# falls back to `second` — confirm against data.util.
bike_rides = (
    bike_rides
    .pipe(coalesce, 'station_id_imputed', 'station_id', 'station_id')
    .pipe(coalesce, 'station_name_imputed', 'station_name', 'station_name')
    .pipe(coalesce, 'geometry_imputed', 'geometry', 'geometry')
)

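To reduce the risk of future confusion, I'll explicitly drop the station_id and vintage columns;
any other leftover columns (e.g. the imputed geometry) are dropped implicitly by the aggregation
below, which keeps only the grouping keys and the ride counts.
The station name is useful for identifying clusters and as a descriptive label, so we keep it,
even though it is not a PK.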

In [5]:
# Remove the columns we no longer want to carry forward.
unused_columns = ['station_id', 'vintage']
bike_rides = bike_rides.drop(unused_columns, axis='columns')

In [6]:
# Collapse to one row per (geometry, station_name, date), summing the ride counts.
# The built-in "sum" aggregation skips NaN and yields 0 for an all-NaN group
# (min_count=0), so for numeric columns it produces the same totals as the
# previous per-group `lambda x: x.fillna(0).sum()` while avoiding a Python
# function call for every group and column.
# NOTE(review): groupby discards rows whose key is null (dropna=True by default) —
# confirm no rides have a missing geometry / station_name / date.
# NOTE(review): verify the CRS is still set on the result after it is re-wrapped
# as a GeoDataFrame.
bike_rides = (bike_rides
              .groupby(['geometry','station_name','date'], as_index=False)
              .agg({"start_rides": "sum",
                    "end_rides": "sum",
                    "rides": "sum"})
              # the aggregated frame is re-wrapped with 'geometry' as the active geometry column
              .pipe(gpd.GeoDataFrame, geometry='geometry'))

In [7]:
# Sanity check: every station name maps to exactly one geometry.
geoms_per_station = bike_rides.groupby('station_name')['geometry'].nunique()
assert geoms_per_station.eq(1).all()

In [8]:
# The reverse does not hold: a few geometries are shared by more than one station name.
# In the recorded output, two of the three cases are spelling/punctuation variants;
# the third pairs two different names (Forest Glen Station vs Peterson Park).
shares_geometry = bike_rides.groupby('geometry')['station_name'].transform('nunique') > 1
bike_rides.loc[shares_geometry, ['geometry', 'station_name']].drop_duplicates()

Unnamed: 0,geometry,station_name
15306,POINT (-87.75552 41.97871),Public Rack - Forest Glen Station
15339,POINT (-87.75552 41.97871),Public Rack - Peterson Park
116097,POINT (-87.6527 41.93242),Wilton Ave & Diversey Pkwy
116341,POINT (-87.6527 41.93242),Wilton Ave & Diversey Pkwy*
167726,POINT (-87.66208 41.80135),Public Rack - Laflin St & 51st St
167740,POINT (-87.66208 41.80135),Public Rack - Laflin St &51st ST


# Pipeline Out

In [9]:
# Persist the aggregated table for the next pipeline stage; the row index is not written.
bike_rides.to_parquet(path=bike_rides_out, index=False)