This notebook takes a before and after date range and exports citibike stations that were present the entire time

In [94]:
import geopandas as gpd
import math
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [95]:
DATA_DIR = '../data'
EXPORTS_DIR = f'{DATA_DIR}/citibike/exports'
CRS_NYC = 'EPSG:2263'
CRS_LATLON = 'EPSG:4326'

In [96]:
site_1 = {
    'name': "Williamsburg Bridge",
    'before_start' : '2016-11-21',
    'before_end' : '2017-11-21',
    'after_start' : '2017-11-22',
    'after_end' : '2018-11-22'
}

In [97]:
site = site_1

In [98]:
zone = site['name']
before_start = site['before_start']
before_end = site['before_end']
after_start = site['after_start']
after_end = site['after_end']

In [99]:
stations_df = pd.read_csv(f'{EXPORTS_DIR}/station_coords.csv')
stations_df['date'] = stations_df['date'].astype(str)
station_ids = stations_df['id'].unique()
len(station_ids)

3167

In [100]:
nyc_gdf = gpd.read_file(f'{DATA_DIR}/geo/census_tract/nyc_ct.shp')
nyc_gdf = nyc_gdf[['ct2010', 'boro_code', 'geometry']]

In [101]:
stations_df.head()

Unnamed: 0,id,date,lat,lon
0,116,201310,40.741776,-74.001497
1,116,202004,40.743,-74.001
2,119,201310,40.696089,-73.978034
3,120,201310,40.686768,-73.959282
4,127,201310,40.731724,-74.006744


In [102]:
def clean_stations_df(stations_gdf):
    """Filters stations to only those within NYC."""
    gdf = gpd.sjoin(stations_gdf, nyc_gdf, how='inner', op='within')
    gdf = gdf.drop(columns=['index_right'])
    return gdf

In [103]:
def get_stations_for_date(date_string):
    """Returns a dataframe with the most recent station positions prior to the
    given YYYYMM date string."""
    # Select all stations prior to the given date string.
    stations_before_df = stations_df[stations_df['date'] < date_string]
    # Select the most recent of those stations.
    stations_recent_df = stations_before_df.groupby(by=['id']).last().reset_index()
    return stations_recent_df

In [104]:
def get_stations_gdf_for_date(date_string):
    df = get_stations_for_date(date_string)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']), crs=CRS_LATLON)
    gdf = clean_stations_df(gdf)
    return gdf

In [105]:
df_stations_before = get_stations_gdf_for_date(before_start)
df_stations_after = get_stations_gdf_for_date(after_end)

In [106]:
df_stations_after

Unnamed: 0,id,date,lat,lon,geometry,ct2010,boro_code
0,116,201310,40.741776,-74.001497,POINT (-74.00150 40.74178),008300,1
38,212,201310,40.743349,-74.006818,POINT (-74.00682 40.74335),008300,1
507,3428,201609,40.740983,-74.001702,POINT (-74.00170 40.74098),008300,1
549,3472,201706,40.742754,-74.007474,POINT (-74.00747 40.74275),008300,1
789,434,201310,40.743174,-74.003664,POINT (-74.00366 40.74317),008300,1
...,...,...,...,...,...,...,...
879,527,201411,40.744023,-73.976056,POINT (-73.97606 40.74402),007000,1
880,528,201310,40.742909,-73.977061,POINT (-73.97706 40.74291),007000,1
887,536,201310,40.741444,-73.975361,POINT (-73.97536 40.74144),007000,1
884,532,201310,40.710451,-73.960876,POINT (-73.96088 40.71045),052300,3


In [107]:
# merge the stations from teh two points in time to keep only the ones that were unchanged
left_df = df_stations_before
right_df = df_stations_after

stations_in_timeperiod = pd.merge(left_df, right_df, on='geometry' ,how='inner')

In [108]:
print(len(df_stations_before))
print(len(df_stations_after))
print(len(stations_in_timeperiod)) #The length of the merged stations should not exceed either of the inputs

492
851
481


In [109]:
#drop the righthand columns - they are duplicates
drop_columns = (len(right_df.columns) -1)

stations_in_timeperiod = stations_in_timeperiod.iloc[:,:-drop_columns].copy()

In [110]:
#trim off the '_x' suffix from the merge
columns = []
for col in stations_in_timeperiod.columns:
    if col[-2:] == '_x' :
        col = col[:-2]
    else:
        col = col
    columns.append(col)
    
stations_in_timeperiod.columns = columns

In [111]:
stations_in_timeperiod

Unnamed: 0,id,date,lat,lon,geometry,ct2010,boro_code
0,116,201310,40.741776,-74.001497,POINT (-74.00150 40.74178),008300,1
1,212,201310,40.743349,-74.006818,POINT (-74.00682 40.74335),008300,1
2,434,201310,40.743174,-74.003664,POINT (-74.00366 40.74317),008300,1
3,463,201310,40.742065,-74.004432,POINT (-74.00443 40.74207),008300,1
4,119,201310,40.696089,-73.978034,POINT (-73.97803 40.69609),002901,3
...,...,...,...,...,...,...,...
476,527,201411,40.744023,-73.976056,POINT (-73.97606 40.74402),007000,1
477,528,201310,40.742909,-73.977061,POINT (-73.97706 40.74291),007000,1
478,536,201310,40.741444,-73.975361,POINT (-73.97536 40.74144),007000,1
479,532,201310,40.710451,-73.960876,POINT (-73.96088 40.71045),052300,3


In [113]:
# Export subset of stations
stations_in_timeperiod.to_csv(f'{EXPORTS_DIR}/station_snapshots/{zone}/stations_in_timeperiod.csv')
