In [4]:
def cleanData(csv_path, output_path='../processed_datasets/sd_cleaned.csv'):
    '''
    Cleans up San Diego traffic data for API processing.

    Keeps only the rows whose 'count' is greater than 5 and whose start
    ('lat_x', 'lon_x') and end ('lat_y', 'lon_y') coordinates are all present,
    renumbers the remaining rows from 0 and writes them to a csv file.

    Params
    csv_path: str of path to csv file (must end in '.csv')
    output_path: str of path the cleaned csv is written to
    '''
    assert isinstance(csv_path, str)
    assert csv_path[-4:] == '.csv'
    import pandas as pd

    df = pd.read_csv(csv_path)
    df = df[df['count'] > 5]
    df = df.dropna(subset=['lat_x', 'lon_x', 'lat_y', 'lon_y'])
    # reset_index returns a new frame; the result has to be kept, otherwise the
    # index written below still carries the row labels from before filtering.
    df = df.reset_index(drop=True)
    df.to_csv(output_path)

# Run the cleaning step on the processed San Diego dataset; this writes sd_cleaned.csv.
cleanData('../processed_datasets/sd_data_processed.csv')

In [9]:
def points(start_lat, start_lng, end_lat, end_lng, api_key="INSERT-YOUR-KEY-HERE", timeout=30):
    '''
    Makes an api call to route and directions (RapidAPI) asking for the driving
    route between two geographical coordinates.

    Params
    start_lat: str, the origin latitude
    start_lng: str, the origin longitude
    end_lat: str, the destination latitude
    end_lng: str, the destination longitude
    api_key: str, RapidAPI key sent in the X-RapidAPI-Key header
    timeout: seconds to wait for the server before giving up

    Returns
    the raw requests response; callers check `.ok` and decode it with `.json()`

    Raises
    AssertionError if any coordinate is not a str
    requests.exceptions.Timeout if the server does not answer within `timeout`
    '''
    assert isinstance(start_lat, str)
    assert isinstance(start_lng, str)
    assert isinstance(end_lat, str)
    assert isinstance(end_lng, str)

    import requests

    url = "https://route-and-directions.p.rapidapi.com/v1/routing"
    querystring = {
        "waypoints": f"{start_lat},{start_lng}|{end_lat},{end_lng}",
        "mode": "drive",
    }
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "route-and-directions.p.rapidapi.com",
    }

    # requests has no default timeout: without one a stalled connection would
    # block the calling loop forever.
    return requests.get(url, headers=headers, params=querystring, timeout=timeout)


def getCoordinates(csv_file, output_path='../processed_datasets/sd_final.csv'):
    '''
    Reads a csv file of starting and ending locations, requests the route for
    every row via points() and writes a csv file holding the rows whose route
    request succeeded, with the decoded response in a 'coordinates' column.

    Params
    csv_file: str to csv file location; needs 'lat_x', 'lon_x', 'lat_y', 'lon_y' columns
    output_path: str of path the resulting csv is written to
    '''
    assert isinstance(csv_file, str)

    import pandas as pd
    df_tester = pd.read_csv(csv_file)
    total = len(df_tester)

    points_list = []
    succeeded = []
    trips = zip(df_tester['lat_x'], df_tester['lon_x'], df_tester['lat_y'], df_tester['lon_y'])
    for i, (start_lat, start_lng, end_lat, end_lng) in enumerate(trips):
        print(i*100/total, '% Complete')

        #makes an api call to get a list of coordinates as a JSON request response
        response = points(str(start_lat), str(start_lng), str(end_lat), str(end_lng))

        payload = response.json() if response.ok else None
        # An error can also arrive inside the decoded body ("No path could be
        # found for input"). The payload is a dict, so it has to be inspected
        # as one; comparing it against the printed text never matches.
        is_error_body = isinstance(payload, dict) and payload.get('statusCode') == 400
        if payload is None or is_error_body:
            points_list.append("nope")
            succeeded.append(False)
        else:
            points_list.append(payload)
            succeeded.append(True)

    #adds the coordinates to the df
    df_tester['coordinates'] = points_list

    #cleans the data of error responses from the api
    df_tester = df_tester.loc[succeeded]
    df_tester.to_csv(output_path)


# Request a route for every cleaned trip; this writes sd_final.csv and prints progress per row.
getCoordinates('../processed_datasets/sd_cleaned.csv')

0.0 % Complete
0.9900990099009901 % Complete
1.9801980198019802 % Complete
2.9702970297029703 % Complete
3.9603960396039604 % Complete
4.9504950495049505 % Complete
5.9405940594059405 % Complete
6.930693069306931 % Complete
7.920792079207921 % Complete
8.910891089108912 % Complete
9.900990099009901 % Complete
10.891089108910892 % Complete
11.881188118811881 % Complete
12.871287128712872 % Complete
13.861386138613861 % Complete
14.851485148514852 % Complete
15.841584158415841 % Complete
16.831683168316832 % Complete
17.821782178217823 % Complete
18.81188118811881 % Complete
19.801980198019802 % Complete
20.792079207920793 % Complete
21.782178217821784 % Complete
22.77227722772277 % Complete
23.762376237623762 % Complete
24.752475247524753 % Complete
25.742574257425744 % Complete
26.73267326732673 % Complete
27.722772277227723 % Complete
28.712871287128714 % Complete
29.702970297029704 % Complete
30.693069306930692 % Complete
31.683168316831683 % Complete
32.67326732673267 % Complete
33.

In [None]:
def extract_location_coordinates(input_path='../processed_datasets/sd_cleaned.csv',
                                 output_path='../processed_datasets/sd_location_coordinates.csv'):
    '''
    Creating a dataset with all the location names and their respective coordinates.
    Requires the cleaned dataset sd_cleaned.csv to be present in the processed_datasets folder.

    Coordinates are averaged per location name, separately for its appearances
    as a start point and as an end point. When a location occurs as both, the
    start-point average is the one that is kept.

    Params
    input_path: str of path to the cleaned csv ('start_loc', 'end_loc',
                'lat_x', 'lon_x', 'lat_y', 'lon_y' columns)
    output_path: str of path the csv with 'index' (location name), 'lat' and
                 'lng' columns is written to
    '''
    import pandas as pd
    df = pd.read_csv(input_path)

    roles = (
        ('start_loc', 'lat_x', 'lon_x'),
        ('end_loc', 'lat_y', 'lon_y'),
    )
    averaged = []
    for loc_col, lat_col, lng_col in roles:
        # Averaging coordinate values per location name
        means = df.groupby(loc_col)[[lat_col, lng_col]].mean()
        means = means.rename(columns={lat_col: 'lat', lng_col: 'lng'})
        # Name the axis explicitly so the location column in the output is
        # called 'index', which is what the hotspot step merges on.
        averaged.append(means.rename_axis('index'))

    # Start locations come first, so drop_duplicates keeps their coordinates.
    df_locs_metadata = pd.concat(averaged).reset_index()
    df_locs_metadata = df_locs_metadata.drop_duplicates(subset='index')
    df_locs_metadata.to_csv(output_path)


def extract_total_counts_per_loc(final_path='../processed_datasets/sd_final.csv',
                                 coordinates_path='../processed_datasets/sd_location_coordinates.csv',
                                 output_path='../processed_datasets/sd_hotspots.csv'):
    '''
    Finding total counts (sum of counts of rides where it was start or end point) for each location. This is used for hotspot visualization.
    Requires sd_final.csv and sd_location_coordinates.csv to be present in the processed_datasets folder.

    Params
    final_path: str of path to the csv of rides ('start_loc', 'end_loc', 'count' columns)
    coordinates_path: str of path to the csv of location coordinates ('index', 'lat', 'lng' columns)
    output_path: str of path the hotspot csv ('loc', 'total_count', 'lat', 'lng', 'type') is written to
    '''
    import pandas as pd
    data = pd.read_csv(final_path)
    data = data.drop(columns='Unnamed: 0', errors='ignore')

    # Aggregate counts by start and end locations
    start_locs = data.groupby('start_loc')[["count"]].sum().reset_index()
    end_locs = data.groupby('end_loc')[["count"]].sum().reset_index()

    locs = start_locs.merge(end_locs, left_on='start_loc', right_on='end_loc', how='outer', indicator=True)

    # After the outer merge a location missing on one side has NaN there, so
    # the name is whichever side is present and the total treats the missing
    # count as 0.
    locs["loc"] = locs["start_loc"].fillna(locs["end_loc"])
    locs["total_count"] = locs["count_x"].add(locs["count_y"], fill_value=0).astype(float)

    locs = locs.drop(columns=['start_loc', 'end_loc', 'count_x', 'count_y'])

    # Mark locations on the basis of whether they are ride start points only, ride end points only, or both
    locs = locs.rename(columns={'_merge': 'type'})
    # rename_categories no longer accepts inplace=True (removed in pandas 2.0),
    # so the renamed column is assigned back.
    locs["type"] = locs["type"].cat.rename_categories({'left_only': 'start_only', 'right_only': 'end_only'})

    lat_lngs = pd.read_csv(coordinates_path)
    lat_lngs = lat_lngs.drop(columns='Unnamed: 0', errors='ignore')
    locs = locs.merge(lat_lngs, left_on='loc', right_on='index', how='left')
    locs = locs.drop(columns=['index'])
    locs = locs.sort_values(by='total_count', ascending=False)
    locs = locs[['loc', 'total_count', 'lat', 'lng', 'type']]
    # Locations without known coordinates cannot be plotted and are dropped.
    locs = locs.dropna()
    locs = locs.reset_index(drop=True)

    locs.to_csv(output_path)

# Order matters: the hotspot step reads sd_location_coordinates.csv, which the first call writes.
extract_location_coordinates()
extract_total_counts_per_loc()