In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from pathlib import Path

from src.station_cleaning import (get_words_to_search_with, standardize_stations, create_station_dictionary)

In [None]:
# Folder holding the raw trip CSV exports, relative to the notebook's working directory.
BAYWHEELS_PATH = Path("./BAYWHEELS_DATA_FOLDER")

# Path.glob yields entries in arbitrary filesystem order; sorting makes the row
# order of the concatenated frame reproducible from run to run. Materialising a
# list (instead of keeping a one-shot generator) also lets `files` be inspected
# or reused after the concat.
files = sorted(BAYWHEELS_PATH.glob('*.csv'))

# Fail early with a readable message instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not files:
    raise FileNotFoundError(f"No CSV files found in {BAYWHEELS_PATH.resolve()}")

# Read every file and stack them into one frame with a fresh 0..n-1 index.
baywheels = pd.concat(
    (pd.read_csv(file, engine = 'pyarrow') for file in files),
    ignore_index = True
)

In [None]:
# The concatenated frame carries two names for each of these fields (presumably
# from different export formats - verify against the raw CSVs). Wherever the
# column on the left is missing a value, take it from the column on the right.
fallback_columns = {
    'started_at': 'start_time',
    'ended_at': 'end_time',
    'start_lat': 'start_station_latitude',
    'start_lng': 'start_station_longitude',
    'end_lat': 'end_station_latitude',
    'end_lng': 'end_station_longitude',
}

for primary_column, fallback_column in fallback_columns.items():
    baywheels[primary_column] = baywheels[primary_column].fillna(baywheels[fallback_column])

In [None]:
# Columns not used by the rest of the notebook. This includes the fallback
# columns whose values were already folded into their counterparts.
columns_to_drop = [
    'duration_sec',
    'rental_access_method',
    'bike_share_for_all_trip',
    'bike_id',
    'user_type',
    'member_casual',
    'start_time',
    'end_time',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
    'ride_id',
    'rideable_type',
]

baywheels = baywheels.drop(columns = columns_to_drop)

In [None]:
# Discard trips that cannot be tied to a station or an end location:
#   1. neither a start station id nor a start station name is present,
#   2. neither an end station id nor an end station name is present,
#   3. either end coordinate is missing.
baywheels = (
    baywheels
    .dropna(subset = ['start_station_id', 'start_station_name'], how = 'all')
    .dropna(subset = ['end_station_id', 'end_station_name'], how = 'all')
    .dropna(subset = ['end_lat', 'end_lng'], how = 'any')
)

In [None]:
# Upper-case station names so later comparisons are case-insensitive.
for name_column in ('start_station_name', 'end_station_name'):
    baywheels[name_column] = baywheels[name_column].str.upper()

# Parse timestamps; format='mixed' lets each value be parsed with its own format.
for timestamp_column in ('started_at', 'ended_at'):
    baywheels[timestamp_column] = pd.to_datetime(baywheels[timestamp_column], format = 'mixed')

# Coordinates are stored as floats.
for coordinate_column in ('start_lat', 'start_lng', 'end_lat', 'end_lng'):
    baywheels[coordinate_column] = baywheels[coordinate_column].astype(float)

In [None]:
# Bounding box as [min, max] pairs, used to keep trips by their START coordinates.
SF_LONGITUDE_RANGE = [-123.173825, -122.28178]
SF_LATITUDE_RANGE = [37.63983, 37.929824]

min_longitude, max_longitude = SF_LONGITUDE_RANGE
min_latitude, max_latitude = SF_LATITUDE_RANGE

# Series.between is inclusive on both ends by default.
start_within_longitude = baywheels['start_lng'].between(min_longitude, max_longitude)
start_within_latitude = baywheels['start_lat'].between(min_latitude, max_latitude)
SF_STATIONS_ONLY = start_within_longitude & start_within_latitude

# Independent copy, so later edits never write back into `baywheels`.
baywheels_sf = baywheels.loc[SF_STATIONS_ONLY].copy()

# Station-id prefixes to exclude; a trip is dropped when either its start or
# its end station id begins with one of these.
remove_weird_station_id_prefixes = (
    'OK', 'BK', 'EM', 'SpecialEventTempStation', 'San Jose Depot', 'SF Depot',
    'EB Test Station', 'SF-Outside Lands-Temp', '16th Depot Bike Station', 'HS',
    'LW1.6', 'GGP-L1.5', 'L1', 'OutsideTempStation', '4040 3rd St Depot Station',
    '16th St Depot', 'GGP-L1.7', 'SJ',
)

# na = False: ids that cannot be prefix-tested count as "not weird" and are kept.
start_id_is_weird = baywheels_sf['start_station_id'].str.startswith(remove_weird_station_id_prefixes, na = False)
end_id_is_weird = baywheels_sf['end_station_id'].str.startswith(remove_weird_station_id_prefixes, na = False)

baywheels_sf = baywheels_sf[~(start_id_is_weird | end_id_is_weird)].copy()

In [None]:
station_name_columns = ['start_station_name', 'end_station_name']

# Replace each of ( ) : / * - in a station name with a single space. Runs of
# spaces left behind are not collapsed.
punctuation_pattern = r'[():/*\-]'
names_without_punctuation = baywheels_sf[station_name_columns].replace(
    to_replace = punctuation_pattern, value = ' ', regex = True
)
baywheels_sf.loc[:, station_name_columns] = names_without_punctuation

In [None]:
# Distinct station names on each side of the trip, in order of first appearance.
end_station_name_list = baywheels_sf['end_station_name'].unique().tolist()
start_station_name_list = baywheels_sf['start_station_name'].unique().tolist()

# Hand the names to the project helper (src.station_cleaning). What it returns
# is not visible here; the next cell only uses element [0] of each result.
end_station_keywords = get_words_to_search_with(end_station_name_list)
start_station_keywords = get_words_to_search_with(start_station_name_list)

In [None]:
# End stations: standardise, then build a lookup from the first element of the
# helper's result. The mapping is presumably station name -> station id
# (verify in src.station_cleaning).
standardized_end_station = standardize_stations(
    end_station_keywords[0],
    'end_station_name',
    'end_station_id',
    'ended_at',
    'end_lat',
    'end_lng',
    baywheels_sf,
)
end_station_key = create_station_dictionary(
    standardized_end_station[0],
    'end_station_name',
    'end_station_id',
)

# Start stations: same procedure with the start-side columns.
standardized_start_station = standardize_stations(
    start_station_keywords[0],
    'start_station_name',
    'start_station_id',
    'started_at',
    'start_lat',
    'start_lng',
    baywheels_sf,
)
start_station_key = create_station_dictionary(
    standardized_start_station[0],
    'start_station_name',
    'start_station_id',
)

In [None]:
# NOTE: despite its name this is a set, not a dict. The names are written as
# they appear after punctuation was replaced by spaces, so the repeated and
# trailing spaces are significant.
weird_station_names_dictionary = {
    'HOWARD INTERNAL MONOLITH',
    'FIRMWARE TEST CHARGING INTERNAL HOWARD ',
    'FIRMWARE TEST INTERNAL HOWARD ',
    'MINNESOTA ST DEPOT   MONOLITH ',
    'HOWARD INTERNAL MONOLITH   IOT',
    'X DEV MTL 01',
    'EB TEST STATION',
    'MTL ECO5 01',
    'HOWARD WORKSHOP   STATION IN A BOX',
    'PROTOTYPE LAB',
    'LAB   HOWARD',
    'OLD HASTINGS MONOLITH',
    'LAB   NYC   M',
    'OCEAN 2E',
}

# Rebuild both lookups without any entry keyed by one of the names above.
end_station_key = {
    station_name: station_id
    for station_name, station_id in end_station_key.items()
    if station_name not in weird_station_names_dictionary
}
start_station_key = {
    station_name: station_id
    for station_name, station_id in start_station_key.items()
    if station_name not in weird_station_names_dictionary
}

In [None]:
# Look up an id for every end station name. Only rows whose name is found in
# the key are overwritten (the assignment aligns on the index); all other rows
# keep their current id.
avoid_chain_assignment_end_station = baywheels_sf['end_station_name'].map(end_station_key)
end_name_found_in_key = avoid_chain_assignment_end_station.notna()
baywheels_sf.loc[end_name_found_in_key, 'end_station_id'] = avoid_chain_assignment_end_station

# Same for the start side.
avoid_chain_assignment_start_station = baywheels_sf['start_station_name'].map(start_station_key)
start_name_found_in_key = avoid_chain_assignment_start_station.notna()
baywheels_sf.loc[start_name_found_in_key, 'start_station_id'] = avoid_chain_assignment_start_station

# Flag trips whose start OR end station name is on the exclusion list.
weird_station_names_list = list(weird_station_names_dictionary)
avoid_chain_assignment_baywheels = (
    baywheels_sf[station_name_columns]
    .isin(weird_station_names_list)
    .any(axis = 1)
)

# Keep the unflagged trips, then discard every row that still has a missing
# value in any column.
baywheels_sf = baywheels_sf.loc[~avoid_chain_assignment_baywheels].dropna()

In [None]:
# A station id that parses as a number is treated as invalid; rows carrying
# one on the start side are removed first, then rows carrying one on the end side.
start_ids_as_numbers = pd.to_numeric(baywheels_sf['start_station_id'], errors = 'coerce')
numeric_start_station_ids = start_ids_as_numbers.notna()
baywheels_sf = baywheels_sf.loc[~numeric_start_station_ids].copy()

end_ids_as_numbers = pd.to_numeric(baywheels_sf['end_station_id'], errors = 'coerce')
numeric_end_station_ids = end_ids_as_numbers.notna()
baywheels_sf = baywheels_sf.loc[~numeric_end_station_ids].copy()

In [None]:
# Every Balboa Park name variant listed here is assigned the same station id.
BALBOA_PARK_DICTIONARY = dict.fromkeys(
    (
        'BALBOA PARK BART',
        'BALBOA PARK BART 2',
        'BALBOA PARK',
        'BALBOA PARK  SAN JOSE AVE AT SGT. JOHN V. YOUNG LN ',
        'BALBOA PARK BART STATION',
        'BALBOA PARK BART 2  GEVENA AVE ',
        'BALBOA PARK BART STATION 2',
    ),
    'SF-AA15',
)

# Overwrite the id only on rows whose station name is one of the variants.
balboa_start_stations = baywheels_sf['start_station_name'].isin(BALBOA_PARK_DICTIONARY)
balboa_start_ids = baywheels_sf.loc[balboa_start_stations, 'start_station_name'].map(BALBOA_PARK_DICTIONARY)
baywheels_sf.loc[balboa_start_stations, 'start_station_id'] = balboa_start_ids

balboa_end_stations = baywheels_sf['end_station_name'].isin(BALBOA_PARK_DICTIONARY)
balboa_end_ids = baywheels_sf.loc[balboa_end_stations, 'end_station_name'].map(BALBOA_PARK_DICTIONARY)
baywheels_sf.loc[balboa_end_stations, 'end_station_id'] = balboa_end_ids

In [None]:
# Station id to force for each of these station names.
mismatched_station_ids = {
    '22ND ST CALTRAIN STATION': 'SF-P30',
    'POWELL ST BART NORTH': 'SF-G27',
    'SFSU PARK WEST': 'SF-Y8',
    'WEST CRISSY FIELD': 'SF-A11'
}

# Station names (left) to rewrite to the name on the right before the id fix.
rename_mismatched_station_names = {
    'NORTH POINT ST AT POWELL ST': 'POWELL ST BART NORTH',
    'SFSU UNIVERSITY PARK NORTH WEST': 'SFSU PARK WEST'
}

# The start and end sides are independent, so each is handled in turn: rename
# first, then overwrite the id wherever the (renamed) name has a forced id.
station_column_pairs = (
    ('start_station_name', 'start_station_id'),
    ('end_station_name', 'end_station_id'),
)

for name_column, id_column in station_column_pairs:
    baywheels_sf[name_column] = baywheels_sf[name_column].replace(rename_mismatched_station_names)

    forced_ids = baywheels_sf[name_column].map(mismatched_station_ids)
    has_forced_id = forced_ids.notna()
    baywheels_sf.loc[has_forced_id, id_column] = forced_ids[has_forced_id]

In [None]:
# Write the cleaned trips to the working directory; the index is left out of the file.
baywheels_sf.to_csv(path_or_buf = 'baywheels_sf.csv', index = False)