# Collect Stops

#### Primary Author
Chris Carey

#### Description:
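This notebook collects every Citi Bike station location (latitude/longitude) that appears in the trip data, along with the date prefix of the first trip file in which each station–location pair was reported.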

#### Inputs:
```
data/citibike/date_prefixes.npy
data/citibike/csv/{date_prefix}-citibike-tripdata.csv
```
 
#### Outputs:
```
data/citibike/exports/station_coords.csv
```

In [53]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

import warnings
# Suppresses every warning for the rest of the session, including any that
# pandas may raise while parsing the trip CSVs.
# NOTE(review): consider narrowing this to specific warning categories so
# data-quality warnings are not hidden -- confirm which ones were noisy.
warnings.filterwarnings('ignore')

In [88]:
# Root of the data tree, given relative to the notebook's working directory.
DATA_DIR = '../data'
# Directory holding the '<date_prefix>-citibike-tripdata.csv' files read below.
CITIBIKE_CSV_DIR = f'{DATA_DIR}/citibike/csv'
# Directory that receives the exported station_coords.csv.
EXPORTS_DIR = f'{DATA_DIR}/citibike/exports'

In [6]:
# Prefixes identifying the trip files to scan, one file per prefix
# ('<prefix>-citibike-tripdata.csv'). The files are processed in this array's
# order, which determines the date stored for each station coordinate.
DATE_PREFIXES = np.load(f'{DATA_DIR}/citibike/date_prefixes.npy')

In [81]:
def is_integer_num(n):
    """Report whether ``n`` is a number without a fractional part.

    Returns True for ``int`` instances and for ``float`` instances whose
    value is whole (e.g. ``72.0``). Every other input -- fractional floats,
    NaN, infinities, strings, ``None`` -- yields False.
    """
    if isinstance(n, float):
        # float.is_integer() is False for NaN and +/-inf, so those fall out here.
        return n.is_integer()
    return isinstance(n, int)

def hash_station_id(station_id):
    """Return the canonical string form of a station ID.

    Whole-number IDs are cast to ``int`` before stringifying, so ``72`` and
    ``72.0`` both become ``'72'``. Anything else is passed through ``str()``
    unchanged (a NaN therefore becomes ``'nan'``).
    """
    if not is_integer_num(station_id):
        return str(station_id)
    return str(int(station_id))

In [82]:
# station ID string -> {(lat, lon): date prefix in effect when first recorded}
station_dict = {}


def add_row_to_station_dict(row):
    """Record a station's coordinates in ``station_dict`` if they are new.

    ``row`` is an ``(index, values)`` pair such as ``DataFrame.iterrows()``
    yields; ``values`` must unpack to exactly ``(station_id, lat, lon)``.

    The date stored for a new coordinate pair is read from the module-level
    variable ``date_prefix``, which this function does not define: the caller
    must have bound it (the ingest loop does so via its loop variable).
    An existing (lat, lon) entry is never overwritten, so the first date wins.
    """
    station_id, lat, lon = row[1]
    positions = station_dict.setdefault(hash_station_id(station_id), {})
    coords = (lat, lon)
    if coords not in positions:
        positions[coords] = date_prefix

In [83]:
# Maps the header spellings handled by this notebook (lowercase with spaces,
# title case, and the `*_lng` variant) onto one schema. Built once instead of
# on every loop iteration.
STATION_COLUMN_RENAMES = {
    'start station id': 'start_station_id',
    'start station latitude': 'start_lat',
    'start station longitude': 'start_lon',
    'end station id': 'end_station_id',
    'end station latitude': 'end_lat',
    'end station longitude': 'end_lon',
    'Start Station ID': 'start_station_id',
    'Start Station Latitude': 'start_lat',
    'Start Station Longitude': 'start_lon',
    'End Station ID': 'end_station_id',
    'End Station Latitude': 'end_lat',
    'End Station Longitude': 'end_lon',
    'start_lng': 'start_lon',
    'end_lng': 'end_lon',
}

# Every header that can supply one of the six station columns: all source
# spellings plus the already-normalized names.
STATION_COLUMNS = set(STATION_COLUMN_RENAMES) | set(STATION_COLUMN_RENAMES.values())

# NOTE: the loop variable must stay named `date_prefix`;
# add_row_to_station_dict() reads it as a global when storing a new coordinate.
for date_prefix in tqdm(DATE_PREFIXES):
    # Only the station columns are parsed; the remaining trip columns are
    # never used here, so skipping them cuts read time and memory.
    df = pd.read_csv(
        f'{CITIBIKE_CSV_DIR}/{date_prefix}-citibike-tripdata.csv',
        usecols=lambda name: name in STATION_COLUMNS,
    )
    df = df.rename(columns=STATION_COLUMN_RENAMES)

    # De-duplicate first so the Python-level loops below touch each distinct
    # (id, lat, lon) combination once per file rather than once per trip.
    start_df = df[['start_station_id', 'start_lat', 'start_lon']].drop_duplicates()
    end_df = df[['end_station_id', 'end_lat', 'end_lon']].drop_duplicates()

    for row in start_df.iterrows():
        add_row_to_station_dict(row)
    for row in end_df.iterrows():
        add_row_to_station_dict(row)

100%|██████████| 97/97 [04:27<00:00,  2.76s/it]


In [84]:
# Flatten the nested station_dict into one (id, date, lat, lon) record per
# station/coordinate pair, preserving dict iteration order.
station_records = [
    (station_id, first_seen, lat, lon)
    for station_id, pos_dict in station_dict.items()
    for (lat, lon), first_seen in pos_dict.items()
]

# Column-oriented form, ready to be passed to pd.DataFrame.
df_dict = {
    column: [record[position] for record in station_records]
    for position, column in enumerate(('id', 'date', 'lat', 'lon'))
}

In [86]:
# Build the final table: order by station then date, discard rows whose ID is
# the string 'nan' (a stringified missing ID) or that contain missing values,
# and renumber the index.
station_df = (
    pd.DataFrame(df_dict)
    .sort_values(by=['id', 'date'])
    .loc[lambda frame: frame['id'] != 'nan']
    .dropna()
    .reset_index(drop=True)
)
station_df

Unnamed: 0,id,date,lat,lon
0,116,201310,40.741776,-74.001497
1,116,202004,40.743000,-74.001000
2,119,201310,40.696089,-73.978034
3,120,201310,40.686768,-73.959282
4,127,201310,40.731724,-74.006744
...,...,...,...,...
4990,SYS033,202102,40.728487,-74.011693
4991,SYS035,202102,40.728660,-74.011980
4992,SYS035,202102,40.728660,-74.011980
4993,SYS037,202107,40.716878,-73.983755


In [91]:
# Persist the station table; the DataFrame index is not written to the file.
station_coords_path = f'{EXPORTS_DIR}/station_coords.csv'
station_df.to_csv(station_coords_path, index=False)