In [1]:
from enum import Enum
import geopandas as gpd
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
class AnalysisArea(Enum):
    """Identifies which metro area an AnalysisConfig describes."""

    # Used by CONFIG_SF.
    SAN_FRANCISCO = 1
    # Used by CONFIG_DC.
    WASHINGTON_DC = 2
    
class ExportMode(Enum):
    """Selects how aggressively stations are filtered before export.

    The mode also decides which of the two configured output CSV paths is
    written.
    """

    # No geofence and no shapefile filter; written to csv_output_area_path.
    AREA = 1
    # Trips are geofenced and stations are spatially joined against the
    # configured shapefile; written to csv_output_city_path.
    CITY = 2

class AnalysisConfig:
    """Per-area settings consumed by the station-extraction cells.

    Attributes:
        area: AnalysisArea member this configuration describes.
        csv_input_path: Trip-data CSV that is read.
        csv_output_area_path: Destination CSV for ExportMode.AREA.
        csv_output_city_path: Destination CSV for ExportMode.CITY.
        lat_min, lat_max: Latitude bounds of the geofence (used inclusively).
        lng_min, lng_max: Longitude bounds of the geofence (used inclusively).
        shapefile_path: Shapefile read for the spatial station filter.
    """

    def __init__(
        self,
        area=None,
        csv_input_path='',
        csv_output_area_path='',
        csv_output_city_path='',
        lat_min=0,
        lat_max=0,
        lng_min=0,
        lng_max=0,
        shapefile_path='',
    ):
        self.area = area

        # File locations: one input, one output per export mode, one shapefile.
        self.csv_input_path = csv_input_path
        self.csv_output_area_path, self.csv_output_city_path = (
            csv_output_area_path,
            csv_output_city_path,
        )
        self.shapefile_path = shapefile_path

        # Bounding box stored as (min, max) pairs per axis.
        self.lat_min, self.lat_max = lat_min, lat_max
        self.lng_min, self.lng_max = lng_min, lng_max

In [3]:
# San Francisco: reads the 202102 Bay Wheels trip export.
CONFIG_SF = AnalysisConfig(
    area=AnalysisArea.SAN_FRANCISCO,
    # Input and output files.
    csv_input_path='../data/raw/202102-baywheels-tripdata.csv',
    csv_output_area_path='../data/exports/stations_san_francisco_area.csv',
    csv_output_city_path='../data/exports/stations_san_francisco.csv',
    shapefile_path='../data/shapefiles/zipcodes/san_francisco.shp',
    # Geofence bounding box in decimal degrees.
    lat_min=37.705262390821154,
    lat_max=37.81066405821323,
    lng_min=-122.5245991704898,
    lng_max=-122.35292764505348,
)

# Washington, DC: reads the 202102 Capital Bikeshare trip export.
CONFIG_DC = AnalysisConfig(
    area=AnalysisArea.WASHINGTON_DC,
    # Input and output files.
    csv_input_path='../data/raw/202102-capitalbikeshare-tripdata.csv',
    csv_output_area_path='../data/exports/stations_washington_dc_area.csv',
    csv_output_city_path='../data/exports/stations_washington_dc.csv',
    shapefile_path='../data/shapefiles/zipcodes/washington_dc.shp',
    # Geofence bounding box in decimal degrees.
    lat_min=38.806229986847676,
    lat_max=38.999466837911626,
    lng_min=-77.12409668146095,
    lng_max=-76.90252862586087,
)

In [4]:
# SET LOCATION TO CALCULATE STATIONS FOR HERE ##################################
# `config` selects the input data, geofence, shapefile and output paths;
# `export_mode` selects whether the city-level filters are applied and which
# output path is written.
# NOTE(review): the saved cell outputs in this notebook show Washington, DC
# rows (lat ~38.9, lng ~-77.0), so they were presumably produced with
# CONFIG_DC rather than the CONFIG_SF selected here. Restart and re-run all
# cells after changing these values.
config = CONFIG_SF
export_mode = ExportMode.CITY

In [5]:
def summarize_df(df):
    """Show the first rows of ``df`` followed by its total row count."""
    row_count = len(df)
    display(df.head())
    print(f'Number of rows: {row_count}')

In [6]:
# Load the raw trip export selected by `config`.
# - Station ids are read with the pandas 'string' dtype instead of letting
#   pandas infer a type for them.
# - Malformed lines are skipped. `on_bad_lines='skip'` is the replacement for
#   `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
#   pandas 2.0 (requires pandas >= 1.3).
bikes = pd.read_csv(
    config.csv_input_path,
    on_bad_lines='skip',
    dtype={'start_station_id': 'string', 'end_station_id': 'string'},
)
summarize_df(bikes)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0F961E4450F8544E,classic_bike,2021-02-20 14:03:25,2021-02-20 14:14:17,21st St & Pennsylvania Ave NW,31252,New York Ave & 15th St NW,31222,38.901539,-77.046564,38.899032,-77.033354,casual
1,DFD528B4F2B3CA6A,classic_bike,2021-02-15 09:54:23,2021-02-15 11:21:02,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
2,2398431BB0EB78BE,classic_bike,2021-02-15 09:53:12,2021-02-15 09:53:34,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
3,6E32C58697957443,classic_bike,2021-02-24 14:50:17,2021-02-24 15:29:01,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
4,2DCACE8B26B0A50A,classic_bike,2021-02-05 16:39:10,2021-02-05 16:39:13,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member


Number of rows: 77505


In [7]:
def get_non_null_rows(df, column):
    """Return the rows of ``df`` whose value in ``column`` is not null."""
    is_present = df[column].notnull()
    return df[is_present]


# Every trip must carry both endpoints' coordinates and station ids.
REQUIRED_COLUMNS = [
    'start_lat',
    'start_lng',
    'end_lng',
    'end_lat',
    'start_station_id',
    'end_station_id',
]

# Filter one column at a time; a row survives only if none of the required
# columns is null.
for column in REQUIRED_COLUMNS:
    bikes = get_non_null_rows(bikes, column)

summarize_df(bikes)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0F961E4450F8544E,classic_bike,2021-02-20 14:03:25,2021-02-20 14:14:17,21st St & Pennsylvania Ave NW,31252,New York Ave & 15th St NW,31222,38.901539,-77.046564,38.899032,-77.033354,casual
1,DFD528B4F2B3CA6A,classic_bike,2021-02-15 09:54:23,2021-02-15 11:21:02,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
2,2398431BB0EB78BE,classic_bike,2021-02-15 09:53:12,2021-02-15 09:53:34,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
3,6E32C58697957443,classic_bike,2021-02-24 14:50:17,2021-02-24 15:29:01,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
4,2DCACE8B26B0A50A,classic_bike,2021-02-05 16:39:10,2021-02-05 16:39:13,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member


Number of rows: 65289


In [8]:
def get_rows_in_range(df, column, min_value, max_value):
    """Return the rows of ``df`` with ``min_value <= df[column] <= max_value``.

    Both bounds are inclusive.
    """
    values = df[column]
    not_below = values >= min_value
    not_above = values <= max_value
    return df[not_below & not_above]


def get_geofenced_rows(df, geofence=None):
    """Keep only the trips that both start and end inside a bounding box.

    Args:
        df: Trip DataFrame with 'start_lat', 'start_lng', 'end_lat' and
            'end_lng' columns.
        geofence: Object exposing ``lat_min``, ``lat_max``, ``lng_min`` and
            ``lng_max`` attributes. When omitted, the notebook-level
            ``config`` is used, which is what the function always did before
            this parameter existed.

    Returns:
        The rows of ``df`` whose four coordinates all pass
        ``get_rows_in_range`` for the corresponding axis bounds.
    """
    if geofence is None:
        # Fall back to the globally selected configuration.
        geofence = config

    lat_bounds = (geofence.lat_min, geofence.lat_max)
    lng_bounds = (geofence.lng_min, geofence.lng_max)
    bounds_by_column = (
        ('start_lat', lat_bounds),
        ('start_lng', lng_bounds),
        ('end_lat', lat_bounds),
        ('end_lng', lng_bounds),
    )

    fenced = df
    for column, (low, high) in bounds_by_column:
        fenced = get_rows_in_range(fenced, column, low, high)
    return fenced


# Only CITY exports are restricted to the configured bounding box; in AREA
# mode every remaining trip is kept.
if export_mode is ExportMode.CITY:
    bikes = get_geofenced_rows(bikes)

summarize_df(bikes)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0F961E4450F8544E,classic_bike,2021-02-20 14:03:25,2021-02-20 14:14:17,21st St & Pennsylvania Ave NW,31252,New York Ave & 15th St NW,31222,38.901539,-77.046564,38.899032,-77.033354,casual
1,DFD528B4F2B3CA6A,classic_bike,2021-02-15 09:54:23,2021-02-15 11:21:02,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
2,2398431BB0EB78BE,classic_bike,2021-02-15 09:53:12,2021-02-15 09:53:34,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
3,6E32C58697957443,classic_bike,2021-02-24 14:50:17,2021-02-24 15:29:01,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
4,2DCACE8B26B0A50A,classic_bike,2021-02-05 16:39:10,2021-02-05 16:39:13,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member


Number of rows: 63869


In [9]:
# Find average lat/lng values for each unique station.


def get_stations(df, column_station_id, column_lat, column_lng):
    """Average the observed coordinates of every station id.

    Args:
        df: Trip DataFrame.
        column_station_id: Name of the column holding the station id.
        column_lat: Name of the column holding the latitude.
        column_lng: Name of the column holding the longitude.

    Returns:
        DataFrame with columns 'station_id', 'lat' and 'lng', one row per
        distinct station id, where 'lat'/'lng' are the mean of that station's
        values in ``df``.
    """
    # Insertion order of this mapping fixes the column order of the selection.
    canonical_names = {
        column_station_id: 'station_id',
        column_lat: 'lat',
        column_lng: 'lng',
    }
    observations = df[list(canonical_names)].rename(columns=canonical_names)

    per_station = observations.groupby(by=['station_id'], as_index=False)
    return per_station.agg(dict.fromkeys(['lat', 'lng'], 'mean'))


def get_first_numeric_value(value1, value2):
    """Return ``value1`` unless it is NaN, in which case return ``value2``."""
    return value2 if np.isnan(value1) else value1


# Some station ids are present as end stations but not start stations. Collect
# from both.
start_stations = get_stations(bikes, 'start_station_id', 'start_lat',
                              'start_lng')
end_stations = get_stations(bikes, 'end_station_id', 'end_lat', 'end_lng')

# Outer-merge so a station seen on only one side of a trip is still kept; the
# missing side's coordinates are NaN after the merge.
stations = start_stations.merge(
    end_stations,
    on='station_id',
    how='outer',
    suffixes=('_start', '_end'),
)

# Prefer the start-side coordinate and fall back to the end-side one where the
# start side is NaN. Series.fillna does this for the whole column at once
# instead of calling a Python function once per row.
stations['lat'] = stations['lat_start'].fillna(stations['lat_end'])
stations['lng'] = stations['lng_start'].fillna(stations['lng_end'])
stations = stations[['station_id', 'lat', 'lng']]

summarize_df(stations)

Unnamed: 0,station_id,lat,lng
0,31000,38.85897,-77.053239
1,31001,38.85725,-77.053402
2,31002,38.856408,-77.049262
3,31003,38.860856,-77.049425
4,31004,38.857866,-77.05949


Number of rows: 494


In [18]:
def get_stations_as_gdf(df):
    """Wrap a station table in a GeoDataFrame of points tagged EPSG:4326.

    Args:
        df: DataFrame with 'lng' and 'lat' columns.

    Returns:
        GeoDataFrame built from ``df`` whose geometry is one point per row
        (x = 'lng', y = 'lat') with its CRS set to 'EPSG:4326'.
    """
    station_points = gpd.points_from_xy(df['lng'], df['lat'])
    stations_geo = gpd.GeoDataFrame(df, geometry=station_points)
    return stations_geo.set_crs('EPSG:4326')

def get_stations_within_city_bounds(gdf):
    """Spatially join stations against the polygons of the configured shapefile.

    Args:
        gdf: Station GeoDataFrame with 'station_id', 'lat' and 'lng' columns
            plus point geometry.

    Returns:
        The 'station_id', 'lat' and 'lng' columns of the join result. The join
        is a right join, so shapefile rows without a matching station come
        back with missing station values; the caller removes them with
        ``dropna()``.
    """
    zips_gdf = gpd.read_file(config.shapefile_path)
    # NOTE(review): this assumes the shapefile uses the same CRS as `gdf`
    # (EPSG:4326 in this notebook) - confirm. geopandas reports a CRS mismatch
    # only as a warning, and warnings are silenced at the top of the notebook.
    #
    # `predicate=` is the replacement for the `op=` keyword, which was
    # deprecated in geopandas 0.10 and removed in 1.0 (requires geopandas
    # >= 0.10).
    joined_gdf = gpd.sjoin(gdf, zips_gdf, predicate='within', how='right')
    return joined_gdf[['station_id', 'lat', 'lng']]
    
# CITY exports keep only the stations matched by the shapefile join.
if export_mode is ExportMode.CITY:
    stations_gdf = get_stations_as_gdf(stations)
    stations = (
        get_stations_within_city_bounds(stations_gdf)
        # Drop join rows that carry no station values.
        .dropna()
        .reset_index(drop=True)
    )
    summarize_df(stations)
    
    

Unnamed: 0,station_id,lat,lng
0,31267,38.908147,-77.03837
1,31250,38.903552,-77.044822
2,31224,38.903723,-77.04244
3,31221,38.905062,-77.041768
4,31212,38.905726,-77.04729


Number of rows: 324


In [19]:
# AREA mode writes to the area-level file; any other mode writes to the
# city-level file.
if export_mode == ExportMode.AREA:
    csv_output_path = config.csv_output_area_path
else:
    csv_output_path = config.csv_output_city_path

stations.to_csv(csv_output_path, index=False)