In [3]:

import pandas as pd
import numpy as np
import polyline
import gpxpy
import folium
import ast
from collections import defaultdict, Counter
from math import radians, cos, sin, asin, sqrt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import scale
from scipy.cluster import  hierarchy
from math import radians, cos, sin, asin, sqrt

In [4]:
df = pd.read_csv('Sample_Data/run_data_11-27-2018.csv')

In [5]:
# Keep every column from position 1 onward, i.e. drop the first column of the CSV.
# NOTE(review): this reassigns `df`, so re-running the cell drops one more column each time.
df = df.iloc[:,1::]

In [6]:
len(df)

19731

In [11]:
def haversine(lat1, lon1, lat2, lon2, radius=3956):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Paraphrased from
    https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 3956 (earth radius in miles). Use 6371 for km.

    Returns
    -------
    float
        Great circle distance between the two points.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return c * radius

In [12]:
def find_distances(coordinate1, coordinate2):
    """Return the haversine distance between two (lat, lon) pairs.

    Each argument is unpacked into exactly one latitude and one longitude,
    which are then forwarded to `haversine`.
    """
    (lat_a, lon_a), (lat_b, lon_b) = coordinate1, coordinate2
    return haversine(lat_a, lon_a, lat_b, lon_b)


In [13]:
def make_floats(tup):
    """Convert a two-item sequence into a tuple of two floats."""
    first, second = tup
    as_floats = float(first), float(second)
    return as_floats

In [25]:
def get_distances(df, start):
    '''Prepare activity coordinates and measure how far each activity starts from `start`.

    Rows missing either 'start_latlng' or 'end_latlng' are dropped. In the remaining rows both
    columns are converted from comma-separated strings into tuples of two floats, and a new
    'distance_away' column holds find_distances(start, start_latlng) for each row.
    The input dataframe is not modified; a processed copy is returned.'''

    has_both_points = df['start_latlng'].notna() & df['end_latlng'].notna()
    df_starts = df[has_both_points].copy()
    for column in ('start_latlng', 'end_latlng'):
        # "lat,lng" text -> list of pieces -> tuple -> tuple of floats
        df_starts.loc[:, column] = df_starts.loc[:, column].apply(
            lambda text: make_floats(tuple(text.split(","))))
    df_starts.loc[:, 'distance_away'] = df_starts.loc[:, 'start_latlng'].apply(
        lambda origin: find_distances(start, origin))
    return df_starts
    

### Start Here With Your Location

In [26]:
start = (47.529832, -121.987695)

In [27]:
df_starts = get_distances(df, start)

In [28]:
df_starts.head(1)

Unnamed: 0,upload_id,average_speed,distance,elapsed_time,total_elevation_gain,type,start_date_local,start_latlng,end_latlng,map,miles_converted,distance_away
0,1919166000.0,3.238,4429.7,0:37:28,31.0,Run,2018-08-21T12:21:27,"(47.61, -122.35)","(47.6, -122.33)","{'id': 'a1788404266', 'polyline': None, 'summa...",2.752488,17.762133


In [29]:
def standardize_inputs(user_input, df):
    '''Z-score a user's [elevation, distance] request against the dataframe.

    user_input[0] is standardized with the mean and std of 'total_elevation_gain',
    user_input[1] with the mean and std of 'miles_converted'. Returns a numpy array
    [standardized elevation, standardized distance] for use with cosine similarity.'''
    elevation, distance = user_input[0], user_input[1]
    elevation_col = df['total_elevation_gain']
    z_elevation = (elevation - elevation_col.mean()) / elevation_col.std()
    miles_col = df['miles_converted']
    z_distance = (distance - miles_col.mean()) / miles_col.std()
    return np.array([z_elevation, z_distance])

In [32]:
def recommend_runs(request, dist, df, top_n=20):
    '''Recommend the activities that best match a requested elevation gain and distance.

    Parameters
    ----------
    request : sequence of two numbers
        [elevation gain, miles to run] requested by the user (elevation gain in meters).
    dist : number
        Largest allowed 'distance_away' value; activities farther away are excluded.
    df : DataFrame
        Activities with 'distance_away', 'total_elevation_gain', 'miles_converted'
        and 'map' columns. It is not modified.
    top_n : int, optional
        Maximum number of recommendations to return (default 20).

    Returns
    -------
    (dict, list)
        A dict {index label: 'map' value} for the recommended rows, and the list of
        integer positions of those rows, best match first.
        NOTE: the positions refer to the distance-filtered dataframe built inside this
        function, not to `df`; the dict keys are the index labels.
        Both are empty when no activity lies within `dist`.
    '''
    # requires sklearn's scale and cosine_similarity
    # Filter first, then copy, so the new columns are written to an independent frame.
    nearby = df[df['distance_away'] <= dist].copy()
    if nearby.empty:
        # scale() rejects empty input; report "no recommendations" instead of raising.
        return {}, []

    nearby['elevation_std'] = scale(nearby['total_elevation_gain'].values.reshape(-1, 1)).ravel()
    nearby['miles_std'] = scale(nearby['miles_converted'].values.reshape(-1, 1)).ravel()
    similarity_df = nearby.loc[:, ['elevation_std', 'miles_std']]

    # Standardize the request against the same filtered activities, as a single row.
    user_input = standardize_inputs(request, nearby).reshape(1, -1)
    similarities = cosine_similarity(similarity_df, user_input)

    # argsort is ascending, so reverse it to put the most similar activities first.
    ranked = np.argsort(similarities, axis=None)[::-1]
    recommend_indices = list(ranked[:top_n])
    recommendations = nearby.iloc[recommend_indices, :]
    return dict(recommendations['map']), recommend_indices

### Get Route Recommendations Here

In [38]:
# [100, 5] is the request: elevation gain and miles to run (see the recommend_runs docstring).
# 3 is the largest 'distance_away' from `start` that a recommended run may have.
recommend_dict, recommend_indices = recommend_runs([100, 5], 3, df_starts)

In [39]:
recommend_indices

[328,
 255,
 402,
 299,
 156,
 28,
 166,
 416,
 40,
 1,
 296,
 334,
 290,
 179,
 301,
 370,
 294,
 308,
 171,
 400]

In [None]:
def make_polyline_dict(recommend_dict):
    '''Take in a dictionary of map objects and return a dictionary of polylines {index: polyline}
    and the indices for the polylines as a list.

    Each value of `recommend_dict` may be the string repr of a dict (parsed with
    ast.literal_eval) or an already-parsed dict. Entries that are not dicts, or whose
    'summary_polyline' is missing, None or empty, are skipped.'''
    polylines = {}
    for k, v in recommend_dict.items():
        map_info = ast.literal_eval(v) if isinstance(v, str) else v
        if not isinstance(map_info, dict):
            continue  # e.g. a missing (NaN) map cell
        summary = map_info.get('summary_polyline')
        if summary:  # an empty polyline would decode to a route with no points
            polylines[k] = summary
    indices = list(polylines.keys())
    return polylines, indices

In [None]:
polylines,indices = make_polyline_dict(recommend_dict)

In [None]:
indices  #use these to later go back into dataframe for run stats

In [None]:
# decode every polyline in the dictionary into its list of coordinates
map_coordinates = [polyline.decode(encoded) for encoded in polylines.values()]

In [None]:
def find_centroids(coordinate_lst):
    """Return one centroid per route.

    `coordinate_lst` is a list of routes, each a sequence of points whose first
    item is used as latitude and second item as longitude. Each centroid is the
    tuple (mean latitude, mean longitude), both rounded to 3 decimals.
    """
    def _centroid(route):
        mean_lat = np.mean([point[0] for point in route])
        mean_long = np.mean([point[1] for point in route])
        return (round(mean_lat, 3), round(mean_long, 3))

    return [_centroid(route) for route in coordinate_lst]

In [None]:
def make_comparison_df(coordinate_lst, df, indices):
    """Build a dataframe with one row per suggested route for comparing routes.

    Columns: 'lats' and 'longs' are the centroid of each route in `coordinate_lst`
    (from find_centroids); 'elevation' is the 'total_elevation_gain' of the row of
    `df` whose index label is the matching entry of `indices`.
    """
    centroids = find_centroids(coordinate_lst)
    centroid_lats = [centroid[0] for centroid in centroids]
    centroid_longs = [centroid[1] for centroid in centroids]
    # label-based lookup of the elevation for each run in the suggestion list
    elevations = [df.loc[label]['total_elevation_gain'] for label in indices]
    return pd.DataFrame({'lats': centroid_lats, 'longs': centroid_longs, 'elevation': elevations})

In [None]:
# `working_df` is never defined in this notebook. `indices` holds index labels that
# originate from `df_starts` (via recommend_runs), so the rows are looked up there.
comparison_df = make_comparison_df(map_coordinates, df_starts, indices)

In [None]:
comparison_array = comparison_df.values
column_std = np.std(comparison_array, axis=0)
# A constant column has zero spread; dividing by it would give NaN and break
# cosine_similarity. Divide by 1 instead so that column standardizes to 0.
column_std = np.where(column_std == 0, 1, column_std)
comparison_array_std = (comparison_array - np.mean(comparison_array, axis=0)) / column_std
#make comparisons with all the datapoints in the comparison array
cosine_sim_arr = cosine_similarity(comparison_array_std)

In [None]:
# Distance cutoff handed to fcluster (criterion="distance") when forming flat clusters.
threshold = 0.05
# NOTE(review): linkage is given the 2-D similarity matrix itself, which SciPy treats as
# observation vectors (one per row) compared with the cosine metric - confirm this is intended.
Z = hierarchy.linkage(cosine_sim_arr, 'average', metric="cosine")
# C holds one cluster label per row of cosine_sim_arr.
C = hierarchy.fcluster(Z, threshold, criterion="distance")
# NOTE(review): `ids` is not referenced by any later cell shown here, and the hard-coded 20
# only matches when exactly 20 routes have a polyline.
ids = list(range(20)) #make a list of numbers 0-19 to use as indices for a cluster groups dictionary
C

In [None]:
# Group row positions by cluster label: {cluster label: [positions in C with that label]}
cluster_groups = defaultdict(list)
for idx, grouping in enumerate(C):
    cluster_groups[grouping].append(idx)

In [None]:
def get_indices(groups):
    '''Pick one representative index per cluster group.

    `groups` is a dictionary of cluster groupings {cluster: [indices]}. The groups are
    ordered by ascending size and then reversed, and the first index of every non-empty
    group is returned as the list of indices to use for route suggestions.'''
    smallest_first = sorted(list(groups.values()), key=len)
    return [members[0] for members in reversed(smallest_first) if len(members) >= 1]

In [None]:
indices_to_use = get_indices(cluster_groups)
indices_to_use

In [None]:
def map_indices(indices_to_use, indices):
    '''Map chosen route positions back to their indices in the larger dataframe.

    `indices` lists the dataframe indices of the suggested routes; `indices_to_use` holds
    positions within that list. Returns {position: indices[position]} for every position
    found in `indices_to_use`.
    Use: to retrieve stats for suggested routes'''
    return {
        position: frame_index
        for position, frame_index in enumerate(indices)
        if position in indices_to_use
    }
        

In [None]:
mapping_dict = map_indices(indices_to_use, indices)


In [None]:
mapping_dict

In [None]:
def return_route_stats(mapping_dict, indices_to_use, df, n_routes=5):
    '''Returns the elevation gain and miles for the routes that will be returned.

    Parameters
    ----------
    mapping_dict : dict
        {position among the suggested routes: index label of that route in `df`}.
    indices_to_use : list
        Positions of the routes to report, in display order.
    df : DataFrame
        Activities containing 'total_elevation_gain' and 'miles_converted'.
    n_routes : int, optional
        How many of the leading `indices_to_use` entries to report (default 5).

    Returns
    -------
    DataFrame
        One row per reported route; the original index label is kept as a column
        by reset_index().
    '''
    labels = [mapping_dict[i] for i in indices_to_use[:n_routes]]
    # The mapped values are index labels, so select by label. Positional .iloc only
    # gives the same rows when the dataframe still has its default 0..n-1 index.
    slice_df = df.loc[labels]
    return slice_df.loc[:, ['total_elevation_gain', 'miles_converted']].reset_index()
        

In [None]:
return_route_stats(mapping_dict, indices_to_use, df)

In [None]:
unique_coordinates = [map_coordinates[i] for i in indices_to_use]

In [None]:
# Center the map on the first point of the first decoded route.
lat, long = map_coordinates[0][0]
m = folium.Map(location=[lat, long], zoom_start=12.2)

In [None]:
# One color per drawn route, in the same order as the legend entries.
# zip stops at the shorter sequence, so at most len(colors) routes are drawn.
colors = ['blue','green','red','orange','purple']
for color, route in zip(colors, unique_coordinates):
    folium.PolyLine(
            route,
            weight=2,
            color=color
        ).add_to(m)

In [None]:
# Fixed-position legend for the five route colors; every entry ends with <br>
# so that each route is placed on its own line.
legend_html = '''<div style= "position: fixed; 
     bottom: 50px; left: 50px; width: 100px; height: 180px; 
     border:2px solid grey; z-index:9999; font-size:14px;
     ">&nbsp; Routes <br>
     &nbsp; Route_0 &nbsp; <i class="fa fa-square fa-2x"
                  style="color:blue"></i><br>
     &nbsp; Route_1 &nbsp; <i class="fa fa-square fa-2x"
                  style="color:green"></i><br>
     &nbsp; Route_2 &nbsp; <i class="fa fa-square fa-2x"
                  style="color:red"></i><br>
     &nbsp; Route_3 &nbsp; <i class="fa fa-square fa-2x"
                  style="color:orange"></i><br>
     &nbsp; Route_4 &nbsp; <i class="fa fa-square fa-2x"
                  style="color:purple"></i>
    </div>'''

# Attach the legend markup to the map's root HTML.
m.get_root().html.add_child(folium.Element(legend_html))

In [None]:

m