In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine

In [2]:
def custom_dist(u, v):
    """Pairwise distance that keeps one user's labels out of the same cluster.

    Both arguments are indexable rows laid out as (lat, lng, user_id).

    Returns ``sys.float_info.max`` when the two rows carry the same user id,
    so no distance threshold can ever merge them; otherwise returns the
    haversine distance between the two (lat, lng) points.
    """
    same_user = u[2] == v[2]
    return sys.float_info.max if same_user else haversine([u[0], u[1]], [v[0], v[1]])

# Distance cut-off per label type, used as the `t` argument when the cluster tree is cut.
# NOTE(review): distances come from haversine, so these are presumably kilometres
# (0.0075 -> 7.5 m, 0.01 -> 10 m) -- confirm.
thresholds = {
    'CurbRamp': 0.0075,
    'NoCurbRamp': 0.0075,
    'SurfaceProblem': 0.01,
    'Obstacle': 0.01,
    'NoSidewalk': 0.01,
    'Crosswalk': 0.01,
    'Signal': 0.01,
    'Occlusion': 0.01,
    'Other': 0.01,
    'Problem': 0.01,
}

def cluster(labels, label_type, thresholds):
    """Assign a cluster id to every label of a single label type.

    Labels are grouped with complete-linkage hierarchical clustering over the
    pairwise distances produced by ``custom_dist``, which returns a huge
    distance for two labels from the same user so that one user's labels are
    never clustered together.

    Parameters
    ----------
    labels : DataFrame
        Labels of one type. Must provide 'lat', 'lng' and 'user_id' columns
        whenever it holds two or more rows.
    label_type : str
        Key used to look up the distance cut-off in ``thresholds``.
    thresholds : dict
        Maps a label type to the maximum distance at which the cluster tree
        is cut (fcluster's ``t`` with ``criterion='distance'``).

    Returns
    -------
    DataFrame
        A copy of ``labels`` with a 'cluster_id' column added. The input
        frame is left untouched.
    """
    # Work on a copy so the caller's dataframe never gains the extra column.
    labels_copy = labels.copy()

    # Hierarchical clustering needs at least two observations: with 0 or 1 rows pdist yields an
    # empty distance matrix and linkage() raises ValueError. A lone label is its own cluster, so
    # give it id 1 (fcluster numbers clusters from 1); an empty frame just gets an empty column.
    if len(labels_copy) < 2:
        labels_copy['cluster_id'] = np.ones(len(labels_copy), dtype=np.int32)
        return labels_copy

    # custom_dist is a special distance function for multi-user clustering that prevents the same
    # user's labels from being clustered together.
    dist_matrix = pdist(np.array(labels[['lat', 'lng', 'user_id']].values), custom_dist)
    link = linkage(dist_matrix, method='complete')

    # Cuts the tree so that only labels less than the label type's threshold apart are clustered.
    labels_copy.loc[:, 'cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    return labels_copy


# Pick which label types should be included in clustering, and which should be included in the
# "Problem" type. Order matters: cluster_label_type_at_index() selects a type by its position here.
label_types = [
    'CurbRamp',
    'NoSidewalk',
    'Occlusion',
    'SurfaceProblem',
    'Obstacle',
    'Other',
    'NoCurbRamp',
    'Crosswalk',
    'Signal',
]

# These are the columns required in the POST requests for the labels.
label_cols = [
    'label_id',
    'label_type',
    'cluster_id',
]

def cluster_label_type_at_index(label_data, i):
    """Cluster the labels whose type is ``label_types[i]``.

    Parameters
    ----------
    label_data : DataFrame
        All labels; must have a 'label_type' column plus the columns that
        ``cluster`` reads.
    i : int
        Position of the wanted label type in the module-level ``label_types``.

    Returns
    -------
    tuple
        ``(label_type, labels)`` where ``labels`` is a copy of the rows of
        that type with a 'cluster_id' column added.
    """
    label_type = label_types[i]
    type_data = label_data[label_data.label_type == label_type]

    # If there are >1 labels, we can do clustering. Otherwise just copy the 1 (or 0) labels:
    # a cluster tree cannot be built from fewer than two observations, so a lone label simply
    # becomes cluster 1 and an empty selection stays empty.
    if type_data.shape[0] > 1:
        labels_for_type_i = cluster(type_data, label_type, thresholds)
    else:
        labels_for_type_i = type_data.assign(cluster_id=1)

    return (label_type, labels_for_type_i)

In [3]:
# Read the labels from the GeoJSON file into `labels` (the path is relative to the
# notebook's working directory).
labels = gpd.read_file('data/seattle_labels.geojson')

In [4]:
# Display the loaded labels table to check its columns and row count.
labels

Unnamed: 0,label_id,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,label_type,severity,user_id,lat,lng,geometry
0,85053,3sAn6u8bQPVW3hTDfNUP1w,1,107.750000,-15.625000,270.675171,0.168884,CurbRamp,1.0,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618618,-122.333054,POINT (-122.33305 47.61862)
1,85004,CpU83mS7vz17EnE02JGOkg,1,41.750000,-10.375000,129.873657,0.026001,CurbRamp,1.0,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618134,-122.340515,POINT (-122.34052 47.61813)
2,85024,RDV4HddwNqwfBZndZRqqkA,1,100.437500,-12.625000,128.847916,0.042198,CurbRamp,1.0,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.615307,-122.337608,POINT (-122.33761 47.61531)
3,85025,NpkOF1LfE024Ks0XIhCttw,1,136.062500,-11.500000,350.971405,1.353600,CurbRamp,1.0,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.615089,-122.337662,POINT (-122.33766 47.61509)
4,85034,-l9SjmKbZmUfr3JV3PuTVg,1,180.312500,-15.562500,171.933517,-0.698708,CurbRamp,1.0,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.614845,-122.337921,POINT (-122.33792 47.61485)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195538,96388,Ek8nn67kKNgCFjZkKf9ISA,2,348.645081,-3.973214,318.706665,0.226311,NoSidewalk,3.0,f2a1a9cc-466f-4ee2-83d4-2d8ff46a31fe,47.658714,-122.389786,POINT (-122.38979 47.65871)
195539,122316,6U9DDI4UppbHDTyp8_J3JA,1,3.171875,-32.000000,186.167953,0.891960,NoCurbRamp,5.0,7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.644711,-122.387703,POINT (-122.38770 47.64471)
195540,122337,lydaO-3xUm3YPy2sNPyPoQ,1,288.500000,-20.250000,181.292786,5.152237,SurfaceProblem,2.0,7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.641743,-122.387810,POINT (-122.38781 47.64174)
195541,122341,ubr500pdt0vImqg71_dHsQ,1,198.312500,-23.562500,180.803787,3.150978,CurbRamp,1.0,7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.641102,-122.387764,POINT (-122.38776 47.64110)


In [7]:
# Cluster the curb ramp labels. The type is looked up by name rather than by a hard-coded
# position, so reordering `label_types` cannot silently cluster a different label type.
# Element [1] of the returned tuple is the dataframe carrying the new cluster_id column.
curb_ramp = cluster_label_type_at_index(labels, label_types.index('CurbRamp'))[1]

: 

: 