In [2]:
def cleanData(csv_path):
    '''
    cleans up san francisco traffic data for api processing

    Params
    csv_path: str of path to csv file
    '''
    import pandas as pd

    df = pd.read_csv(csv_path)

    # Group data on count of routes
    df_grouped = df.groupby(['start_station_name', 'end_station_name'])['started_at','ended_at'].agg('count')
    df_grouped = df_grouped.reset_index()

    # Remove routes where bike did not change stations
    df_grouped = df_grouped[df_grouped['start_station_name'] != df_grouped['end_station_name']]

    # cleanup
    df_grouped['count'] = df_grouped['started_at']
    df_grouped = df_grouped[df_grouped['count'] > 5]
    df_grouped = df_grouped.drop(columns=['started_at','ended_at'])

    # Create Metatada Dataset. All station names with the respective coordinates
    # Each ride with have start and end coordinates, however, for the same station there will multiple corrdinate values that differ for a different in meters (minute)
    # To avoid issues in the groupBy statements we average all coordinates values for each locations's latitute and longitude.
    df_stations = pd.read_csv(csv_path)[['start_station_name', 'end_station_name',  'start_lat', 'start_lng', 'end_lat', 'end_lng']]
    df_stations_start = df_stations[['start_station_name','start_lat','start_lng']]
    df_stations_end = df_stations[['end_station_name','end_lat','end_lng']]

    # Averaging coordinate values for start locations and end locations
    df_stations_start = df_stations_start.groupby(['start_station_name'])['start_lat','start_lng'].agg('mean')
    df_stations_end = df_stations_end.groupby(['end_station_name'])['end_lat','end_lng'].agg('mean')

    df_stations_start.rename(columns = {'start_lat':'lat','start_lng':'lng'},inplace=True)
    df_stations_start.reset_index()

    df_stations_end.rename(columns = {'end_lat':'lat','end_lng':'lng'},inplace=True)
    df_stations_end.reset_index()

    df_stations_metadata = pd.concat([df_stations_start,df_stations_end])
    df_stations_metadata = df_stations_metadata.reset_index()

    # If there are duplicated entries for start and end stations (there will be), we drop one to have just one calculated coordinates field per stations.
    # Doesnt matter which one we drop because coordinates in the duplicated pair will be fairly consistent.
    df_stations_metadata = df_stations_metadata.drop_duplicates(subset='index')
    df_1 = df_grouped.merge(df_stations_metadata,how='right', left_on='start_station_name',right_on='index')
    df_2 = df_1.merge(df_stations_metadata,how='right', left_on='end_station_name',right_on='index')

    df_2.to_csv('../processed_datasets/ny_processed.csv')

cleanData('../datasets/JC-202209-citibike-tripdata.csv')

  df_grouped = df.groupby(['start_station_name', 'end_station_name'])['started_at','ended_at'].agg('count')
  df_stations_start = df_stations_start.groupby(['start_station_name'])['start_lat','start_lng'].agg('mean')
  df_stations_end = df_stations_end.groupby(['end_station_name'])['end_lat','end_lng'].agg('mean')


In [3]:
def points(start_lat,start_lng,end_lat,end_lng):
    '''
    makes an api call to route and directions to get a list of coordinates of the shortest path between two geographical coordinates

    Params
    start_lat: the origin lattitude
    start_lng: the origin longitude
    end_lat: the destination lattitude
    end_lng: the destination longitude
    '''
    import requests
    
    url = "https://route-and-directions.p.rapidapi.com/v1/routing"
    querystring = {"waypoints":f"{start_lat},{start_lng}|{end_lat},{end_lng}","mode":"bicycle"}
    headers = {
        "X-RapidAPI-Key": "INSERT-YOUR-API-KEY-HERE",
        "X-RapidAPI-Host": "route-and-directions.p.rapidapi.com"
    }
    response = requests.request("GET", url, headers=headers, params=querystring)

    return response

import pandas as pd

def getCoordinates(csv_file):
    '''
    reads a csv file of starting and ending locations, creates a csv file of points of all the paths traveled

    Params
    csv_file- str to csv file location
    '''
    df = pd.read_csv(csv_file)
    df_tester = df.drop(columns='Unnamed: 0')

    points_list = []
    for i in range(len(df_tester)):
        print(i*100/(len(df_tester)), '% Complete')

        start_lat = str(df_tester['lat_x'].loc[i])
        start_lng = str(df_tester['lng_x'].loc[i])
        end_lat = str(df_tester['lat_y'].loc[i])
        end_lng = str(df_tester['lng_y'].loc[i])


        coordinates = points(start_lat,start_lng,end_lat,end_lng)
        if coordinates.status_code == 200:
            points_json = coordinates.json()
            points_list.append(points_json)
        else:
            points_list.append("error")

    df_tester['coordinates'] = points_list
    df_tester.to_csv('../processed_datasets/ny_final.csv')


getCoordinates('../processed_datasets/ny_processed.csv')

0.0 % Complete
0.035211267605633804 % Complete
0.07042253521126761 % Complete
0.1056338028169014 % Complete
0.14084507042253522 % Complete
0.176056338028169 % Complete
0.2112676056338028 % Complete
0.24647887323943662 % Complete
0.28169014084507044 % Complete
0.31690140845070425 % Complete
0.352112676056338 % Complete
0.3873239436619718 % Complete
0.4225352112676056 % Complete
0.45774647887323944 % Complete
0.49295774647887325 % Complete
0.528169014084507 % Complete
0.5633802816901409 % Complete
0.5985915492957746 % Complete
0.6338028169014085 % Complete
0.6690140845070423 % Complete
0.704225352112676 % Complete
0.7394366197183099 % Complete
0.7746478873239436 % Complete
0.8098591549295775 % Complete
0.8450704225352113 % Complete
0.8802816901408451 % Complete
0.9154929577464789 % Complete
0.9507042253521126 % Complete
0.9859154929577465 % Complete
1.0211267605633803 % Complete
1.056338028169014 % Complete
1.091549295774648 % Complete
1.1267605633802817 % Complete
1.1619718309859155 % C