# Bluebikes data parser
*Finnley Autumn Rogers* | 2024-08-27

A function that parses the Bluebikes trip-data CSV files in each yearly folder and combines them into a single CSV per year.

In [4]:
import pandas as pd
from pathlib import Path
import numpy as np


In [3]:
# Folder prefix for the raw trip data; a year string is appended to it to get that year's folder
path_stub = "data/bb_"
# Years to process, as strings so they can be concatenated straight onto path_stub
years = [str(y) for y in range(2021, 2025)]

At some point in 2023 the column names were changed, so the old names need to be mapped onto the new ones for the concat to line up correctly.

In [28]:
def bb_csv_parser(year, input_stub=None):
    '''
    # bb_csv_parser

    Combine every bluebikes trip-data csv in one year's folder into a single csv.

    Each file is read, legacy column names are renamed to the current ones,
    only the shared trip columns are kept, and a `file_name` column records
    which source file each row came from.

    ## Inputs

    - year (str): year of the trip data to be parsed; selects the input folder
      `<stub><year>` and names the output file
    - input_stub (str, optional): folder prefix to read from. When omitted, the
      notebook-level `path_stub` is used.

    ## Returns

    - filename (str): path to the created combined csv (`data/bb<year>_rideData.csv`)

    ## Raises

    - ValueError: if the folder contains no csv files
    - KeyError: if a csv is missing one of the expected columns
    '''

    year = str(year)  # tolerate an int year; it is only ever used to build paths
    stub = path_stub if input_stub is None else input_stub

    # legacy column name -> current column name
    map_cols = {
        'starttime': 'started_at',
        'stoptime': 'ended_at',
        'start station id': 'start_station_id',
        'end station id': 'end_station_id',
        'start station name': 'start_station_name',
        'end station name': 'end_station_name',
        'start station latitude': 'start_lat',
        'end station latitude': 'end_lat',
        'start station longitude': 'start_lng',
        'end station longitude': 'end_lng',
    }

    keep_cols = list(map_cols.values())

    # glob order is filesystem-dependent; sort so the combined row order is reproducible
    available_files = sorted(Path(stub + year).glob("*.csv"))

    if not available_files:
        # fail with a message naming the folder instead of pandas' "No objects to concatenate"
        raise ValueError(f"No CSV files found in '{stub + year}'")

    imported_dfs = []

    for f in available_files:

        # read in initial file and map legacy names onto the current ones
        dat = pd.read_csv(f).rename(columns=map_cols)

        # renaming can produce duplicate labels if a file carries both the old
        # and the new name for a column; keep the first occurrence
        dat = dat.loc[:, ~dat.columns.duplicated()]

        missing = [c for c in keep_cols if c not in dat.columns]
        if missing:
            raise KeyError(f"{f.name} is missing expected columns: {missing}")

        # add file name column; assign() returns a new frame, so nothing is
        # written into a slice of `dat` (avoids SettingWithCopyWarning)
        imported_dfs.append(dat[keep_cols].assign(file_name=f.stem))

    filename = "data/bb" + year + "_rideData.csv"

    pd.concat(imported_dfs, ignore_index=True).to_csv(filename, index=False)

    return filename

In [26]:
# 2024 has the fewest files, test on that first
bb_2024 = bb_csv_parser('2024')

In [29]:
# concat for all years
for year in years:
    fn = bb_csv_parser(year)

    print(fn)

data/bb2021_rideData.csv
data/bb2022_rideData.csv
data/bb2023_rideData.csv
data/bb2024_rideData.csv
