In [1]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
import pandas as pd
import geopandas as gpd

In [2]:
def custom_dist(u, v):
    if u[2] == v[2]:
        return sys.float_info.max
    else:
        return haversine([u[0], u[1]], [v[0], v[1]])

thresholds = {'CurbRamp': 0.0075,
                      'NoCurbRamp': 0.0075,
                      'SurfaceProblem': 0.01,
                      'Obstacle': 0.01,
                      'NoSidewalk': 0.01,
                      'Crosswalk': 0.01,
                      'Signal': 0.01,
                      'Occlusion': 0.01,
                      'Other': 0.01,
                      'Problem': 0.01}

def cluster(labels, label_type, thresholds):

    # Makes a normal dist matrix for a single user, but uses special dist function for multi-user clustering that
    # prevents the same user's attributes from being clustered together.
    dist_matrix = pdist(np.array(labels[['lat', 'lng', 'user_id']].values), custom_dist)
    link = linkage(dist_matrix, method='complete')
    curr_type = labels['label_type'].values

    # Copies the labels dataframe and adds a column to it for the cluster id each label is in.
    labelsCopy = labels.copy()
    labelsCopy.loc[:,'cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    # Cuts tree so that only labels less than clust_threth kilometers apart are clustered.
    clusters = labelsCopy.groupby('cluster_id')

    # Computes the center of each cluster and assigns temporariness and severity.
    cluster_list = [] # list of tuples (label_type, cluster_num, lat, lng, severity, temporary).
    for clust_num, clust in clusters:
        #ave_pos = np.mean(clust['coords'].tolist(), axis=0) # use ave pos of clusters.
        #ave_sev = None if pd.isnull(clust['severity']).all() else int(round(np.median(clust['severity'][~np.isnan(clust['severity'])])))
        #ave_temp = None if pd.isnull(clust['temporary']).all() else bool(round(np.mean(clust['temporary'])))

        cluster_list.append((curr_type, clust_num))

    cluster_df = pd.DataFrame(cluster_list, columns=['label_type', 'cluster_id'])

    return (cluster_df, labelsCopy)


# Pick which label types should be included in clustering, and which should be included in the "Problem" type.
label_types = ['CurbRamp', 'NoSidewalk', 'Occlusion', 'SurfaceProblem', 'Obstacle', 'Other', 'NoCurbRamp', 'Crosswalk', 'Signal']
# problem_types = ['SurfaceProblem', 'Obstacle']

# These are the columns required in the POST requests for the labels and clusters, respectively.
label_cols = ['label_id', 'label_type', 'cluster_id']
cluster_cols = ['label_type', 'cluster_id', 'lat', 'lng', 'severity', 'temporary']


def cluster_label_type_at_index(label_data,i):
        clusters_for_type_i = pd.DataFrame(columns=cluster_cols)
        labels_for_type_i = pd.DataFrame(columns=label_cols)

        label_type = label_types[i]
        # if label_type == 'Problem':
        #     type_data = label_data[label_data.label_type.isin(problem_types)]
        # else:
        type_data = label_data[label_data.label_type == label_type]

        # If there are >1 labels, we can do clustering. Otherwise just copy the 1 (or 0) labels.
        if type_data.shape[0] > 1:
            (clusters_for_type_i, labels_for_type_i) = cluster(type_data, label_type, thresholds)
        elif type_data.shape[0] == 1:
            labels_for_type_i = type_data.copy()
            labels_for_type_i.loc[:,'cluster_id'] = 1 # Gives the single cluster a cluster_id of 1.
            labels_for_type_i.loc[:,'label_type'] = label_type # Gives Problem type if needed.
            clusters_for_type_i = labels_for_type_i.filter(items=cluster_cols)

        return (label_type, clusters_for_type_i, labels_for_type_i)

In [18]:
df = gpd.read_file('data/attributesWithLabels_q04.json')

In [19]:
df

Unnamed: 0,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,severity,is_temporary,label_id,gsv_panorama_id,heading,...,image_date,label_date,label_severity,label_is_temporary,agree_count,disagree_count,notsure_count,label_description,user_id,geometry
0,68003757,SurfaceProblem,23785,428097040,Industrial District,1.0,False,875,3NCbFOAsTaiCFw19YXDO9A,198.881699,...,2018-09,2019-04-05 18:23:27.967,1.0,False,7,1,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38724 47.59493)
1,68003760,SurfaceProblem,23785,428097040,Industrial District,1.0,False,877,Et9Zm3Esp8JS9UByHyDGww,240.941971,...,2018-09,2019-04-05 18:27:33.201,1.0,False,5,1,1,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38400 47.59418)
2,68003755,SurfaceProblem,23785,428097040,Industrial District,1.0,False,878,7wNqOVKuDt4h6sFedMXJKg,214.857315,...,2018-09,2019-04-05 18:28:07.482,1.0,False,4,1,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38381 47.59393)
3,68003755,SurfaceProblem,23785,428097040,Industrial District,1.0,False,879,uj5JioLmsHq0HYFYdc2RDA,228.428741,...,2018-09,2019-04-05 18:28:14.348,1.0,False,7,3,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38381 47.59393)
4,68003564,CurbRamp,23785,428097040,Industrial District,2.0,False,880,uj5JioLmsHq0HYFYdc2RDA,216.821594,...,2018-09,2019-04-05 18:28:21.307,2.0,False,3,0,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38378 47.59389)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8777,67947694,CurbRamp,4647,6422865,Georgetown,1.0,False,232977,DdMLY0rSxtk-a2xwxQSRbg,269.000000,...,2022-10,2023-02-03 22:03:39.712,1.0,False,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32972 47.55182)
8778,67947693,CurbRamp,4646,6422865,Georgetown,1.0,False,232979,DdMLY0rSxtk-a2xwxQSRbg,74.937500,...,2022-10,2023-02-03 22:03:56.991,1.0,False,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32941 47.55192)
8779,67947746,CurbRamp,4647,6422865,Georgetown,2.0,False,232981,Q6Ap_5f7laP3YUUVhBu2bg,285.937500,...,2019-06,2023-02-03 22:04:24.392,2.0,False,1,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32977 47.55195)
8780,67948295,NoSidewalk,770,6347553,Georgetown,5.0,False,232993,ZZH-7bLX12xfNHWH2QHevQ,299.875000,...,2021-08,2023-02-03 22:13:53.467,5.0,False,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.33105 47.54451)


In [20]:
# break geometry into lat and lng
df['lat'] = df.geometry.y
df['lng'] = df.geometry.x

In [21]:
df0 = cluster_label_type_at_index(df,0)[2]

In [22]:
df0 = cluster_label_type_at_index(df,0)[1]

In [24]:
a = cluster_label_type_at_index(df,0)

In [27]:
a[1]

Unnamed: 0,label_type,cluster_id
0,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",1
1,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",2
2,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",3
3,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",4
4,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",5
...,...,...
2284,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",2285
2285,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",2286
2286,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",2287
2287,"[CurbRamp, CurbRamp, CurbRamp, CurbRamp, CurbR...",2288


In [7]:
df1 = cluster_label_type_at_index(df,1)[2]

In [8]:
df2 = cluster_label_type_at_index(df,2)[2]

In [9]:
df3 = cluster_label_type_at_index(df,3)[2]

In [10]:
df4 = cluster_label_type_at_index(df,4)[2]

In [11]:
df5 = cluster_label_type_at_index(df,5)[2]

In [12]:
df6 = cluster_label_type_at_index(df,6)[2]

In [13]:
df7 = cluster_label_type_at_index(df,7)[2]

In [14]:
df8 = cluster_label_type_at_index(df,8)[2]

In [15]:
df0["clustered"] = df0.duplicated(subset="cluster_id", keep=False)
df1["clustered"] = df1.duplicated(subset="cluster_id", keep=False)
df2["clustered"] = df2.duplicated(subset="cluster_id", keep=False)
df3["clustered"] = df3.duplicated(subset="cluster_id", keep=False)
df4["clustered"] = df4.duplicated(subset="cluster_id", keep=False)
df5["clustered"] = df5.duplicated(subset="cluster_id", keep=False)
df6["clustered"] = df6.duplicated(subset="cluster_id", keep=False)
df7["clustered"] = df7.duplicated(subset="cluster_id", keep=False)
df8["clustered"] = df8.duplicated(subset="cluster_id", keep=False)

In [16]:
df0

Unnamed: 0,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,severity,is_temporary,label_id,gsv_panorama_id,heading,...,agree_count,disagree_count,notsure_count,label_description,user_id,geometry,lat,lng,cluster_id,clustered
4,68003564,CurbRamp,23785,428097040,Industrial District,2.0,False,880,uj5JioLmsHq0HYFYdc2RDA,216.821594,...,3,0,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38378 47.59389),47.593887,-122.383781,1896,False
5,68003615,CurbRamp,23785,428097040,Industrial District,2.0,False,881,Vpv8nU0VhkZ2EzOref_OIQ,221.384094,...,3,0,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38374 47.59380),47.593803,-122.383736,284,False
6,67918432,CurbRamp,26879,620595956,Central Business District,3.0,False,888,b_oAXeqgQXNskG-vg5PiVQ,316.281250,...,1,0,0,Has a pole on the curb ramp,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33254 47.61062),47.610622,-122.332542,1325,False
7,67918711,CurbRamp,6542,6453040,Central Business District,3.0,False,890,b_oAXeqgQXNskG-vg5PiVQ,348.671875,...,2,0,0,,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33240 47.61064),47.610641,-122.332397,320,True
9,67918754,CurbRamp,24262,428248106,Central Business District,2.0,False,898,2BcncBnwUvviwVbBqMVaVQ,288.345978,...,1,0,0,Has a pole,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33259 47.61066),47.610661,-122.332588,345,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8774,67947806,CurbRamp,19192,248713564,Georgetown,1.0,False,232916,EPe0szjx3cutwSOAJxpRnw,88.482140,...,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32960 47.54378),47.543781,-122.329597,36,False
8776,67947760,CurbRamp,26584,568805388,Georgetown,1.0,False,232923,W2wicma7yY8aB00I0F6xZg,299.178558,...,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32956 47.54408),47.544083,-122.329559,26,False
8777,67947694,CurbRamp,4647,6422865,Georgetown,1.0,False,232977,DdMLY0rSxtk-a2xwxQSRbg,269.000000,...,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32972 47.55182),47.551823,-122.329720,22,True
8778,67947693,CurbRamp,4646,6422865,Georgetown,1.0,False,232979,DdMLY0rSxtk-a2xwxQSRbg,74.937500,...,0,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32941 47.55192),47.551918,-122.329414,12,False


In [17]:
#concatenate all dataframes
clusters_q4 = pd.concat([df0, df1, df2, df3, df4, df5, df6, df7, df8])

In [18]:
clusters_q4

Unnamed: 0,attribute_id,label_type,street_edge_id,osm_street_id,neighborhood,severity,is_temporary,label_id,gsv_panorama_id,heading,...,agree_count,disagree_count,notsure_count,label_description,user_id,geometry,lat,lng,cluster_id,clustered
4,68003564,CurbRamp,23785,428097040,Industrial District,2.0,False,880,uj5JioLmsHq0HYFYdc2RDA,216.821594,...,3,0,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38378 47.59389),47.593887,-122.383781,1896,False
5,68003615,CurbRamp,23785,428097040,Industrial District,2.0,False,881,Vpv8nU0VhkZ2EzOref_OIQ,221.384094,...,3,0,0,,e4ff325c-309b-4ea6-8555-8f973e662362,POINT (-122.38374 47.59380),47.593803,-122.383736,284,False
6,67918432,CurbRamp,26879,620595956,Central Business District,3.0,False,888,b_oAXeqgQXNskG-vg5PiVQ,316.281250,...,1,0,0,Has a pole on the curb ramp,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33254 47.61062),47.610622,-122.332542,1325,False
7,67918711,CurbRamp,6542,6453040,Central Business District,3.0,False,890,b_oAXeqgQXNskG-vg5PiVQ,348.671875,...,2,0,0,,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33240 47.61064),47.610641,-122.332397,320,True
9,67918754,CurbRamp,24262,428248106,Central Business District,2.0,False,898,2BcncBnwUvviwVbBqMVaVQ,288.345978,...,1,0,0,Has a pole,87833d72-b357-4e2c-81cd-23f58ff04c59,POINT (-122.33259 47.61066),47.610661,-122.332588,345,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,67949815,Crosswalk,26584,568805388,Georgetown,1.0,False,232920,W2wicma7yY8aB00I0F6xZg,329.178558,...,1,0,0,,dc9c7b1a-e2bd-4e7f-9d7e-dd239d7c7062,POINT (-122.32954 47.54410),47.544102,-122.329536,2,False
7513,67949881,Signal,23248,428017301,Georgetown,,False,200228,29XIqUaN72IMZdCRAODHRA,306.000000,...,7,0,0,,b98e5db3-e835-427a-a5fd-2d8fa43a166c,POINT (-122.33447 47.54905),47.549049,-122.334465,1,False
7609,67949877,Signal,23281,428019212,Georgetown,,False,200069,sfsTWWAxqByQHZIKHgJB3g,272.093750,...,4,1,0,,b98e5db3-e835-427a-a5fd-2d8fa43a166c,POINT (-122.33705 47.55349),47.553486,-122.337051,2,False
8629,67949883,Signal,3550,6398557,Georgetown,,False,200556,QLvIv-d6zujBLdz3qd8PIA,270.000000,...,8,2,0,,b98e5db3-e835-427a-a5fd-2d8fa43a166c,POINT (-122.32980 47.55333),47.553329,-122.329803,3,False


In [20]:
#export to geojson
clusters_q4.to_file("data/clusters_q4.geojson", driver='GeoJSON')