# Collect Stops

#### Primary Author
Chris Carey

#### Description:
This notebook collects all Citi Bike station locations along with reported dates.

#### Inputs:
```
data/citibike/date_prefixes.npy
data/citibike/csv/{date_prefix}-citibike-tripdata.csv
```
 
#### Outputs:
```
data/citibike/exports/station_coords.csv
```

In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

import warnings
# NOTE(review): with no category or message given, this filter silences every
# warning for the rest of the session, including any raised while the trip
# CSVs are parsed. Consider narrowing it to the specific warnings that are
# known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Root of the data tree, relative to the working directory.
DATA_DIR = '../data'
# Folder holding the Citi Bike trip-data CSV files read by this notebook.
CITIBIKE_CSV_DIR = '/'.join([DATA_DIR, 'citibike', 'csv'])
# Folder that receives the station-coordinate export.
EXPORTS_DIR = '/'.join([DATA_DIR, 'citibike', 'exports'])

In [3]:
# File-name prefixes of the trip CSVs to process; each entry is later
# substituted into '<prefix>-citibike-tripdata.csv'.
date_prefixes_path = f'{DATA_DIR}/citibike/date_prefixes.npy'
DATE_PREFIXES = np.load(date_prefixes_path)

In [12]:
def peek(df):
    """Show the first three rows of ``df`` and print its total row count.

    Relies on the ``display`` function that IPython/Jupyter injects into the
    notebook namespace.
    """
    display(df.head(3))
    print(len(df))
    
def hash_station_id(station_id):
    """Normalise a station ID to a canonical string.

    Despite the name, nothing is hashed: the value is converted so that
    different spellings of the same ID compare equal.

    * Values that parse as a whole number (``72``, ``72.0``, ``'72.0'``)
      become the integer string (``'72'``).
    * Values that parse as a non-integral number (``'5329.03'``) become the
      string form of the float (``'5329.03'``).
    * Anything that cannot be converted (``'SYS033'``, ``None``, NaN,
      infinity) is returned as ``str(station_id)``, so a missing ID (NaN)
      becomes the literal string ``'nan'``.

    Parameters
    ----------
    station_id : object
        Raw station ID as read from a trip CSV.

    Returns
    -------
    str
        The normalised ID.
    """
    try:
        float_id = float(station_id)
        # int() raises ValueError for NaN and OverflowError for +/-inf.
        int_id = int(float_id)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return str(station_id)
    if int_id == float_id:
        return str(int_id)
    return str(float_id)

## Collect Citi Bike Stops with Monthly Granularity

In [27]:
# Maps normalised station ID -> {(lat, lon): date_prefix in effect when that
# coordinate pair was first recorded}.
station_dict = {}


def add_row_to_station_dict(row):
    """Record a station's coordinates the first time they are encountered.

    Parameters
    ----------
    row : tuple
        An ``(index, values)`` pair as produced by ``DataFrame.iterrows()``;
        ``values`` must unpack into exactly ``(station_id, lat, lon)``.

    Notes
    -----
    Reads the module-level name ``date_prefix`` (the loop variable of the
    loading cell) at call time, so it only works while that name is bound.
    Mutates the module-level ``station_dict`` and returns nothing.
    """
    raw_id, lat, lon = tuple(row[1])
    key = hash_station_id(raw_id)
    coords = (lat, lon)
    if key not in station_dict:
        station_dict[key] = {}
    known_coords = station_dict[key]
    if coords in known_coords:
        # Already recorded; keep the existing date.
        return
    known_coords[coords] = date_prefix

In [29]:
# Maps every header spelling handled by this notebook onto one snake_case
# name, so all loaded files expose the same column names afterwards.
# NOTE(review): 'started_at' is mapped to 'start_time', but there is no entry
# for an end-timestamp column in that naming style. Confirm whether one is
# needed.
COLUMN_RENAMES = {
    'Bike ID': 'bike_id',
    'Birth Year': 'birth_year',
    'End Station ID': 'end_station_id',
    'End Station Latitude': 'end_lat',
    'End Station Longitude': 'end_lon',
    'End Station Name': 'end_station_name',
    'Gender': 'gender',
    'Start Station ID': 'start_station_id',
    'Start Station Latitude': 'start_lat',
    'Start Station Longitude': 'start_lon',
    'Start Station Name': 'start_station_name',
    'Start Time': 'start_time',
    'Stop Time': 'stop_time',
    'User Type': 'user_type',
    'Trip Duration': 'trip_duration',
    'bikeid': 'bike_id',
    'birth year': 'birth_year',
    'end station id': 'end_station_id',
    'end station latitude': 'end_lat',
    'end station longitude': 'end_lon',
    'end station name': 'end_station_name',
    'end_lng': 'end_lon',
    'start station id': 'start_station_id',
    'start station latitude': 'start_lat',
    'start station longitude': 'start_lon',
    'start station name': 'start_station_name',
    'start_lng': 'start_lon',
    'started_at': 'start_time',
    'starttime': 'start_time',
    'stoptime': 'stop_time',
    'tripduration': 'trip_duration',
    'usertype': 'user_type',
}

# One renamed trip frame per entry of DATE_PREFIXES. Every full frame is kept
# in memory because the per-trip-granularity section iterates over this list.
trip_dfs = []

# The loop variable must stay named `date_prefix`: add_row_to_station_dict
# reads it as a global when it records a new coordinate pair.
for date_prefix in tqdm(DATE_PREFIXES):
    csv_path = f'{CITIBIKE_CSV_DIR}/{date_prefix}-citibike-tripdata.csv'
    df = pd.read_csv(csv_path).rename(columns=COLUMN_RENAMES)
    trip_dfs.append(df)

    # Distinct (id, lat, lon) combinations at each end of the trips.
    start_df = df[['start_station_id', 'start_lat', 'start_lon']].drop_duplicates()
    end_df = df[['end_station_id', 'end_lat', 'end_lon']].drop_duplicates()

    # Start stations are registered before end stations.
    for endpoint_df in (start_df, end_df):
        for row in endpoint_df.iterrows():
            add_row_to_station_dict(row)

100%|██████████| 97/97 [08:58<00:00,  5.55s/it]


In [8]:
# Flatten station_dict ({id: {(lat, lon): date}}) into one record per
# (station, coordinate pair), then split the records into column lists.
flat_rows = [
    (station_id, date, pos[0], pos[1])
    for station_id, pos_dict in station_dict.items()
    for pos, date in pos_dict.items()
]
df_dict = {
    'id': [record[0] for record in flat_rows],
    'date': [record[1] for record in flat_rows],
    'lat': [record[2] for record in flat_rows],
    'lon': [record[3] for record in flat_rows],
}

In [None]:
# Build the station table, order it by station and date, and discard
# unusable rows: hash_station_id turns a missing ID into the string 'nan',
# and dropna() removes rows with a missing value in any column.
station_df = (
    pd.DataFrame(df_dict)
    .sort_values(by=['id', 'date'])
    .loc[lambda frame: frame['id'] != 'nan']
    .dropna()
    .reset_index(drop=True)
)
station_df

In [None]:
# Write the cleaned station table without the index column. EXPORTS_DIR must
# already exist; to_csv does not create missing folders.
station_coords_path = f'{EXPORTS_DIR}/station_coords.csv'
station_df.to_csv(station_coords_path, index=False)

## Collect Citi Bike Stops with Perfect Granularity

In [30]:
# Maps normalised station ID -> {(lat, lon): date first recorded}.
station_dict = {}
# Receives one row per newly seen (id, lat, lon) combination.
station_df = pd.DataFrame({'id': [], 'date': [], 'lat': [], 'lon': []})

def add_row_to_station_dict(row):
    """Record a station's coordinates the first time they are encountered.

    This definition shadows the one in the monthly-granularity section.

    Fixes relative to the previous body:

    * The per-station dict is no longer replaced by a bare ``(lat, lon)``
      tuple, which broke every later membership test for that station.
    * The new row is actually stored. ``DataFrame.append`` returned a new
      frame that was discarded, was handed the raw ``(index, Series)`` tuple,
      and no longer exists in pandas 2.0.
    * Rows carrying a fourth value (a date) are accepted instead of failing
      to unpack.

    Parameters
    ----------
    row : tuple
        An ``(index, values)`` pair as produced by ``DataFrame.iterrows()``.
        ``values`` must start with ``(station_id, lat, lon)`` and may carry a
        date as a fourth value.

    Notes
    -----
    Mutates the module-level ``station_dict`` and appends to the
    module-level ``station_df`` in place. Returns nothing.
    """
    station_id, lat, lon, *extra = tuple(row[1])
    # Rows with a fourth value carry their own date. Three-value rows fall
    # back to the loop-level ``date_prefix`` global, as the monthly
    # definition does (None when that name is not bound).
    date = extra[0] if extra else globals().get('date_prefix')
    station_id = hash_station_id(station_id)
    seen_coords = station_dict.setdefault(station_id, {})
    if (lat, lon) not in seen_coords:
        seen_coords[(lat, lon)] = date
        # Values follow station_df's column order: id, date, lat, lon.
        station_df.loc[len(station_df)] = [station_id, date, lat, lon]

In [33]:
def _first_seen_per_location(trips, endpoint):
    """Return one row per (id, lat, lon) at the given trip endpoint.

    ``endpoint`` is ``'start'`` or ``'end'``. The ``date`` column holds the
    ``start_time`` of the first row, in file order, of each group.

    NOTE(review): the trip's ``start_time`` is used for end stations as well,
    and 'first' is the earliest time only if the file is sorted by start
    time. Confirm both are intended.
    """
    key_columns = [f'{endpoint}_station_id', f'{endpoint}_lat', f'{endpoint}_lon']
    first_seen = trips.groupby(by=key_columns).agg({'start_time': 'first'}).reset_index()
    return first_seen.rename(columns={
        key_columns[0]: 'id',
        key_columns[1]: 'lat',
        key_columns[2]: 'lon',
        'start_time': 'date',
    })


# One frame per loaded trip file, holding each distinct (id, lat, lon) and the
# date attributed to it.
# NOTE(review): unlike the monthly section, station IDs are not passed through
# hash_station_id here. Confirm whether they should be.
station_dfs = []

for df in tqdm(trip_dfs):
    start_df = _first_seen_per_location(df, 'start')
    end_df = _first_seen_per_location(df, 'end')

    # Start rows come first, so when a location appears at both ends the
    # date from the start side is the one kept.
    all_df = (
        pd.concat([start_df, end_df])[['id', 'lat', 'lon', 'date']]
        .groupby(by=['id', 'lat', 'lon'])
        .agg({'date': 'first'})
        .reset_index()
    )
    station_dfs.append(all_df)

100%|██████████| 97/97 [00:26<00:00,  3.67it/s]


In [44]:
# Stack the per-file station frames, parse the date strings into timestamps,
# and order the rows chronologically so the earliest sighting comes first.
stations_df = (
    pd.concat(station_dfs)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by=['date'], ascending=True)
)

In [45]:
# The rows are already in date order, so 'first' keeps the earliest date for
# each (id, lat, lon) combination.
earliest_per_location = stations_df.groupby(by=['id', 'lat', 'lon']).agg({'date': 'first'})
stations_df = earliest_per_location.reset_index().sort_values(by=['date'], ascending=True)

In [46]:
# Display the per-trip-granularity station table.
# NOTE(review): the preceding cell sorts by 'date', yet the saved output of
# this cell is ordered by id and has an 'Unnamed: 0' column, so it appears to
# come from a different run. Re-run to confirm.
stations_df

Unnamed: 0,id,lat,lon,date
0,72,40.767000,-73.995000,2020-03-10 21:23:55.618
1,72,40.767272,-73.993929,2013-10-01 00:40:09.000
2,79,40.719000,-74.007000,2020-02-21 19:47:27.787
3,79,40.719116,-74.006667,2013-10-01 00:17:44.000
4,82,40.710000,-74.001000,2020-03-05 18:01:06.295
...,...,...,...,...
7658,SYS033,40.728487,-74.011693,2021-02-05 10:11:47.000
7659,SYS035,40.728660,-74.011980,2021-02-27 14:22:51.000
7660,SYS035,40.728660,-74.011980,2021-02-17 15:04:43.000
7661,SYS037,40.716878,-73.983755,2021-07-24 15:03:38.000
