In [1]:
from enum import Enum
import geopandas as gpd
import numpy as np
import pandas as pd

In [11]:
class AnalysisArea(Enum):
    """Areas for which an AnalysisConfig is defined in this notebook."""
    SAN_FRANCISCO = 1
    WASHINGTON_DC = 2


class AnalysisConfig:
    """Settings for one analysis run: area, CSV paths and a lat/lng box.

    Each constructor argument is stored unchanged as an attribute of the same
    name.
    """

    def __init__(self,
                 area=None,
                 csv_input_path='',
                 csv_output_path='',
                 lat_min=0,
                 lat_max=0,
                 lng_min=0,
                 lng_max=0):
        """Store the given settings on the instance without validating them.

        Args:
            area: The area being analysed; the configs defined in this
                notebook pass an AnalysisArea member.
            csv_input_path: Path of the trip-data CSV passed to pd.read_csv.
            csv_output_path: Path the resulting stations CSV is written to.
            lat_min: Lower latitude bound applied by get_geofenced_rows.
            lat_max: Upper latitude bound applied by get_geofenced_rows.
            lng_min: Lower longitude bound applied by get_geofenced_rows.
            lng_max: Upper longitude bound applied by get_geofenced_rows.
        """
        self.area = area
        self.csv_input_path = csv_input_path
        self.csv_output_path = csv_output_path
        self.lat_min = lat_min
        self.lat_max = lat_max
        self.lng_min = lng_min
        self.lng_max = lng_max

In [12]:
# San Francisco: the bounding box only includes stations within San Francisco
# itself (e.g. Oakland stations are excluded).
CONFIG_SF = AnalysisConfig(
    area=AnalysisArea.SAN_FRANCISCO,
    csv_input_path='../data/raw/202102-baywheels-tripdata.csv',
    csv_output_path='../data/exports/stations_san_francisco.csv',
    lat_min=37.705262390821154,
    lat_max=37.81066405821323,
    lng_min=-122.5245991704898,
    lng_max=-122.35292764505348,
)

# Washington, DC: reads the Capital Bikeshare trip export.
CONFIG_DC = AnalysisConfig(
    area=AnalysisArea.WASHINGTON_DC,
    csv_input_path='../data/raw/202102-capitalbikeshare-tripdata.csv',
    csv_output_path='../data/exports/stations_washington_dc.csv',
    lat_min=38.806229986847676,
    lat_max=38.999466837911626,
    lng_min=-77.12409668146095,
    lng_max=-76.90252862586087,
)

In [13]:
# SET LOCATION TO CALCULATE STATIONS FOR HERE ##################################
# The loading, geofencing and export cells all read `config`, so re-run every
# cell below after switching between CONFIG_SF and CONFIG_DC.
# NOTE(review): the saved outputs further down mix DC rows (load cell) with SF
# station ids (later cells), so they appear to come from different runs.
config = CONFIG_DC

In [14]:
def summarize_df(df):
    """Display the first rows of `df`, then print its total row count."""
    row_count = len(df)
    display(df.head())
    print('Number of rows: {}'.format(row_count))

In [16]:
# Load the raw trip export selected by `config`. Station ids are read with the
# pandas 'string' dtype so they are not parsed as numbers.
bikes = pd.read_csv(
    config.csv_input_path,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # 'warn' keeps the previous behaviour: malformed lines are reported and
    # skipped instead of aborting the load.
    on_bad_lines='warn',
    dtype={'start_station_id': 'string', 'end_station_id': 'string'},
)
summarize_df(bikes)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0F961E4450F8544E,classic_bike,2021-02-20 14:03:25,2021-02-20 14:14:17,21st St & Pennsylvania Ave NW,31252,New York Ave & 15th St NW,31222,38.901539,-77.046564,38.899032,-77.033354,casual
1,DFD528B4F2B3CA6A,classic_bike,2021-02-15 09:54:23,2021-02-15 11:21:02,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
2,2398431BB0EB78BE,classic_bike,2021-02-15 09:53:12,2021-02-15 09:53:34,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
3,6E32C58697957443,classic_bike,2021-02-24 14:50:17,2021-02-24 15:29:01,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member
4,2DCACE8B26B0A50A,classic_bike,2021-02-05 16:39:10,2021-02-05 16:39:13,Hains Point/Buckeye & Ohio Dr SW,31273,Hains Point/Buckeye & Ohio Dr SW,31273,38.878433,-77.03023,38.878433,-77.03023,member


Number of rows: 77505


In [6]:
def get_non_null_rows(df, column):
    """Return the rows of `df` whose value in `column` is not null."""
    has_value = df[column].notnull()
    return df.loc[has_value]


# Columns that must be populated for a trip to be kept.
REQUIRED_COLUMNS = [
    'start_lat',
    'start_lng',
    'end_lng',
    'end_lat',
    'start_station_id',
    'end_station_id',
]

# Remove all rows with missing geographic information: a row is dropped when
# any of the required columns is null.
bikes = bikes.dropna(subset=REQUIRED_COLUMNS)

summarize_df(bikes)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
4029,BD19F002C21065E0,classic_bike,2021-02-26 15:29:27,2021-02-26 15:31:14,Vine St at Shattuck Ave,BK-A7,Vine St at Shattuck Ave,BK-A7,37.880222,-122.269592,37.880222,-122.269592,casual
4052,6D4ED519944CC5E5,classic_bike,2021-02-24 15:06:14,2021-02-24 15:39:29,Vine St at Shattuck Ave,BK-A7,Vine St at Shattuck Ave,BK-A7,37.880222,-122.269592,37.880222,-122.269592,casual
10561,072879C2ED883CF1,classic_bike,2021-02-12 19:08:22,2021-02-12 19:15:02,Townsend St at 5th St,SF-K28,Mississippi St at 17th St,SF-N29,37.775235,-122.397437,37.764794,-122.39476,member
10562,25D1A5D72C59C493,classic_bike,2021-02-28 10:34:39,2021-02-28 11:00:01,19th St at Florida St,SF-O25-1,2nd St at Folsom St,SF-H29,37.760447,-122.410807,37.785453,-122.396512,member
10563,5D7DCF49E7C54D43,electric_bike,2021-02-14 13:44:57,2021-02-14 13:51:55,19th St at Florida St,SF-O25-1,Bryant St at 15th St,SF-M25,37.760488,-122.410707,37.767114,-122.410699,member


Number of rows: 69555


In [7]:
def get_rows_in_range(df, column, min_value, max_value):
    """Return the rows of `df` with min_value <= df[column] <= max_value."""
    values = df[column]
    not_below = values >= min_value
    not_above = values <= max_value
    return df[not_below & not_above]


def get_geofenced_rows(df):
    """Return the rows of `df` whose start and end points lie in the geofence.

    The bounds are read from the module-level `config`; all four coordinate
    columns must fall within them (inclusive) for a row to be kept.
    """
    lat_bounds = (config.lat_min, config.lat_max)
    lng_bounds = (config.lng_min, config.lng_max)
    checks = (
        ('start_lat', lat_bounds),
        ('start_lng', lng_bounds),
        ('end_lat', lat_bounds),
        ('end_lng', lng_bounds),
    )

    fenced = df
    for coord_column, (lower, upper) in checks:
        fenced = get_rows_in_range(fenced, coord_column, lower, upper)
    return fenced


# Keep only trips whose start and end coordinates both fall inside the
# bounding box of the active `config`.
bikes = get_geofenced_rows(bikes)
summarize_df(bikes)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
10561,072879C2ED883CF1,classic_bike,2021-02-12 19:08:22,2021-02-12 19:15:02,Townsend St at 5th St,SF-K28,Mississippi St at 17th St,SF-N29,37.775235,-122.397437,37.764794,-122.39476,member
10562,25D1A5D72C59C493,classic_bike,2021-02-28 10:34:39,2021-02-28 11:00:01,19th St at Florida St,SF-O25-1,2nd St at Folsom St,SF-H29,37.760447,-122.410807,37.785453,-122.396512,member
10563,5D7DCF49E7C54D43,electric_bike,2021-02-14 13:44:57,2021-02-14 13:51:55,19th St at Florida St,SF-O25-1,Bryant St at 15th St,SF-M25,37.760488,-122.410707,37.767114,-122.410699,member
10564,EF8B274C5CD4559E,electric_bike,2021-02-22 18:24:57,2021-02-22 18:50:10,Utah St at 24th St,SF-Q26,24th St at Bartlett St,SF-Q23-3,37.753217,-122.405449,37.752137,-122.420002,member
10565,A896A2BE15365157,classic_bike,2021-02-22 13:16:00,2021-02-22 13:21:37,19th St at Florida St,SF-O25-1,16th St Mission BART Station 2,SF-N22-1B,37.760447,-122.410807,37.764839,-122.420141,member


Number of rows: 60815


In [8]:
# Find average lat/lng values for each unique station.


def get_stations(df, column_station_id, column_lat, column_lng):
    """Return one row per station id with its mean latitude and longitude.

    Args:
        df: DataFrame containing the three named columns.
        column_station_id: Name of the column holding station ids.
        column_lat: Name of the latitude column to average.
        column_lng: Name of the longitude column to average.

    Returns:
        A DataFrame with the columns 'station_id', 'lat' and 'lng'.
    """
    new_names = {
        column_station_id: 'station_id',
        column_lat: 'lat',
        column_lng: 'lng',
    }
    selected = df[[column_station_id, column_lat, column_lng]]
    renamed = selected.rename(columns=new_names)

    mean_of_each = dict.fromkeys(('lat', 'lng'), 'mean')
    grouped = renamed.groupby(by=['station_id'], as_index=False)
    return grouped.agg(mean_of_each)


def get_first_numeric_value(value1, value2):
    """Return `value1` unless it is NaN, in which case return `value2`."""
    return value2 if np.isnan(value1) else value1


# Some station ids are present as end stations but not start stations. Collect
# from both.
start_stations = get_stations(bikes, 'start_station_id', 'start_lat',
                              'start_lng')
end_stations = get_stations(bikes, 'end_station_id', 'end_lat', 'end_lng')

# The outer merge keeps stations seen on either side; the start-side averages
# land in lat_x/lng_x and the end-side averages in lat_y/lng_y.
stations_merged = start_stations.merge(end_stations,
                                       on='station_id',
                                       how='outer')

# Take the start-side value and fall back to the end-side value where it is
# NaN. This is done column-wise with fillna instead of a row-wise apply: the
# result per row is the same, but it also works when the merged frame is empty
# (apply(axis=1) returns a DataFrame there, which breaks column assignment).
stations = stations_merged.assign(
    lat=stations_merged['lat_x'].fillna(stations_merged['lat_y']),
    lng=stations_merged['lng_x'].fillna(stations_merged['lng_y']),
)[['station_id', 'lat', 'lng']]

summarize_df(stations)

Unnamed: 0,station_id,lat,lng
0,SF I29-1,37.783375,-122.393276
1,SF-A19,37.805339,-122.43725
2,SF-A20,37.804438,-122.433527
3,SF-A27,37.804778,-122.403261
4,SF-AA15,37.72057,-122.447685


Number of rows: 253


In [9]:
# Write the per-station coordinates to the configured export path, omitting
# the DataFrame index column.
stations.to_csv(config.csv_output_path, index=False)