In [1]:
import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
import pandas as pd
import geopandas as gpd

In [2]:
def custom_dist(u, v):
    """Distance between two labels given as ``(lat, lng, user_id)`` sequences.

    Two labels placed by the same user are assigned the largest representable
    float, so that distance-thresholded clustering never groups them. For
    labels from different users the haversine distance between their
    (lat, lng) coordinates is returned (in whatever unit ``haversine`` uses by
    default -- kilometers according to the package docs).
    """
    placed_by_same_user = u[2] == v[2]
    if placed_by_same_user:
        return sys.float_info.max

    point_u = [u[0], u[1]]
    point_v = [v[0], v[1]]
    return haversine(point_u, point_v)

# Per-label-type distance cut-off handed to `fcluster` in `cluster`: labels
# further apart than this are not clustered together. Values are in the unit
# returned by `custom_dist` (kilometers, assuming haversine's default unit).
thresholds = {
    'CurbRamp': 0.0035,
    'NoCurbRamp': 0.0035,
    'SurfaceProblem': 0.01,
    'Obstacle': 0.01,
    'NoSidewalk': 0.01,
    'Crosswalk': 0.01,
    'Signal': 0.01,
    'Occlusion': 0.01,
    'Other': 0.01,
    'Problem': 0.01,
}

def cluster(labels, label_type, thresholds):
    """Cluster the labels of one label type with complete-linkage clustering.

    Pairwise distances are computed with ``custom_dist``, which returns the
    haversine distance between two labels, or ``sys.float_info.max`` when both
    labels come from the same user. Complete linkage scores a merge by the
    largest pairwise distance, so a cluster containing two labels from the
    same user can never fall below the distance threshold.

    Parameters
    ----------
    labels : pandas.DataFrame
        Labels of a single type. Must provide the 'lat', 'lng' and 'user_id'
        columns. The caller in this notebook only passes frames with more
        than one row.
    label_type : str
        Name of the label type; used as the key into ``thresholds`` and
        recorded in the returned cluster table.
    thresholds : dict
        Maps label type to the distance cut-off used to cut the cluster tree
        (same unit as ``custom_dist``).

    Returns
    -------
    tuple
        ``(cluster_df, labelsCopy)``: ``cluster_df`` has one row per cluster
        with columns 'label_type' and 'cluster_id'; ``labelsCopy`` is a copy
        of ``labels`` with an added 'cluster_id' column.

    Raises
    ------
    KeyError
        If ``label_type`` is not a key of ``thresholds``.
    """
    # Distance matrix using the special dist function that prevents the same
    # user's labels from being clustered together.
    dist_matrix = pdist(np.array(labels[['lat', 'lng', 'user_id']].values), custom_dist)
    link = linkage(dist_matrix, method='complete')

    # Copies the labels dataframe and adds a column for the cluster id each label is in.
    # The tree is cut so that only labels closer than the threshold for this
    # label type end up in the same cluster.
    labelsCopy = labels.copy()
    labelsCopy.loc[:, 'cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    # One (label_type, cluster_id) row per cluster. The label type is the scalar
    # passed in by the caller; previously the whole 'label_type' column array was
    # stored in every row. Cluster position, severity and temporariness are not
    # computed in this notebook.
    cluster_list = [(label_type, clust_num) for clust_num, _ in labelsCopy.groupby('cluster_id')]
    cluster_df = pd.DataFrame(cluster_list, columns=['label_type', 'cluster_id'])

    return (cluster_df, labelsCopy)


# Label types that are clustered. `cluster_label_type_at_index(label_data, i)`
# processes `label_types[i]`, so the order here defines those indices.
label_types = [
    'CurbRamp',
    'NoSidewalk',
    'Occlusion',
    'SurfaceProblem',
    'Obstacle',
    'Other',
    'NoCurbRamp',
    'Crosswalk',
    'Signal',
]
# Label types that would be merged into the combined "Problem" type (currently disabled).
# problem_types = ['SurfaceProblem', 'Obstacle']

# These are the columns required in the POST requests for the labels and clusters, respectively.
label_cols = ['label_id', 'label_type', 'cluster_id']
cluster_cols = ['label_type', 'cluster_id', 'lat', 'lng', 'severity', 'temporary']


def cluster_label_type_at_index(label_data,i):
    """Cluster all labels in ``label_data`` whose type is ``label_types[i]``.

    Parameters
    ----------
    label_data : pandas.DataFrame
        Labels of every type; must have a 'label_type' column plus whatever
        columns ``cluster`` needs.
    i : int
        Index into the module-level ``label_types`` list.

    Returns
    -------
    tuple
        ``(label_type, clusters_for_type_i, labels_for_type_i)``.
        With more than one label the two frames come from ``cluster``. With
        exactly one label it is copied and given ``cluster_id`` 1. With no
        labels both frames are empty and only carry the ``cluster_cols`` /
        ``label_cols`` columns.
    """
    label_type = label_types[i]
    # if label_type == 'Problem':
    #     type_data = label_data[label_data.label_type.isin(problem_types)]
    # else:
    type_data = label_data[label_data.label_type == label_type]
    n_labels = type_data.shape[0]

    if n_labels > 1:
        # Enough labels to actually run the clustering.
        clusters_for_type_i, labels_for_type_i = cluster(type_data, label_type, thresholds)
    elif n_labels == 1:
        # A lone label forms its own cluster.
        labels_for_type_i = type_data.copy()
        labels_for_type_i.loc[:, 'cluster_id'] = 1  # Gives the single cluster a cluster_id of 1.
        labels_for_type_i.loc[:, 'label_type'] = label_type  # Gives Problem type if needed.
        clusters_for_type_i = labels_for_type_i.filter(items=cluster_cols)
    else:
        # Nothing to cluster: hand back empty frames with the expected columns.
        clusters_for_type_i = pd.DataFrame(columns=cluster_cols)
        labels_for_type_i = pd.DataFrame(columns=label_cols)

    return (label_type, clusters_for_type_i, labels_for_type_i)

In [3]:
# Load the labels to cluster from the q2 shapefile into a GeoDataFrame.
df = gpd.read_file('../data/processed-labels/clusters/q2.shp')

In [4]:
# Preview the loaded labels (the notebook renders only the first and last rows).
df

Unnamed: 0,label_id,label_type,severity,gsv_panora,zoom,heading,pitch,photograph,photogra_1,user_id,lat,lng,geometry
0,86461,CurbRamp,1.0,ZTCzCkmt9uG-TFqJEttrtg,2,323.580353,-27.500000,272.483002,6.449974,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.697720,-122.320244,POINT (-122.32024 47.69772)
1,86493,NoSidewalk,3.0,pnnr_18wU_FheB_bM8evqw,3,178.453125,-2.015625,180.672836,0.060585,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.696251,-122.314857,POINT (-122.31486 47.69625)
2,86499,SurfaceProblem,3.0,IcSKPywxdm93Mz0xscBDBw,2,359.122772,-18.285715,91.630432,0.155014,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.695873,-122.314407,POINT (-122.31441 47.69587)
3,86500,SurfaceProblem,3.0,IcSKPywxdm93Mz0xscBDBw,2,54.301338,-11.232142,91.630432,0.155014,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.695877,-122.314316,POINT (-122.31432 47.69588)
4,86502,NoCurbRamp,5.0,kr9rQp1FwEPtYVnYUmSw5Q,2,314.479919,-13.035714,89.654411,0.920044,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.695854,-122.312233,POINT (-122.31223 47.69585)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30301,230494,CurbRamp,1.0,4yWbasgNDhyu_KtsW9DY0Q,1,334.125000,-25.312500,157.506607,1.039170,78da7c35-4512-49df-a8f3-d5142b71df03,47.691288,-122.309616,POINT (-122.30962 47.69129)
30302,230504,Crosswalk,1.0,cNqDPX6_cDuFREOhNkWlag,1,69.187500,-35.000000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695889,-122.322945,POINT (-122.32294 47.69589)
30303,230508,CurbRamp,1.0,cNqDPX6_cDuFREOhNkWlag,1,117.375000,-35.000000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695843,-122.322922,POINT (-122.32292 47.69584)
30304,230506,CurbRamp,2.0,cNqDPX6_cDuFREOhNkWlag,1,50.437500,-26.750000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695919,-122.322922,POINT (-122.32292 47.69592)


In [5]:
# Cluster the labels of type label_types[0] ('CurbRamp'). Element [2] of the
# returned tuple is the per-label frame with the added 'cluster_id' column.
df0 = cluster_label_type_at_index(label_data=df, i=0)[2]

In [6]:
# Inspect the clustered labels of the first label type: note the new 'cluster_id' column.
df0

Unnamed: 0,label_id,label_type,severity,gsv_panora,zoom,heading,pitch,photograph,photogra_1,user_id,lat,lng,geometry,cluster_id
0,86461,CurbRamp,1.0,ZTCzCkmt9uG-TFqJEttrtg,2,323.580353,-27.50000,272.483002,6.449974,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.697720,-122.320244,POINT (-122.32024 47.69772),196
8,86624,CurbRamp,1.0,Eo-oKwHk5rBFeIHsJa_OQQ,3,311.968750,-10.84375,90.119415,0.630173,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.712048,-122.290810,POINT (-122.29081 47.71205),2358
15,86767,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,174.437500,-5.87500,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700146,-122.283333,POINT (-122.28333 47.70015),6961
16,86768,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,128.500000,-13.00000,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700150,-122.283104,POINT (-122.28310 47.70015),295
24,86808,CurbRamp,1.0,WfExyzpW559Rn4aEeuYMng,1,98.000000,-16.43750,282.873047,4.660347,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.693405,-122.276970,POINT (-122.27697 47.69341),2734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30296,84725,CurbRamp,1.0,18XWC5r0jwf4eFkEzU7cEQ,1,161.187500,-17.37500,358.921356,-0.705276,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.681194,-122.306633,POINT (-122.30663 47.68119),34
30301,230494,CurbRamp,1.0,4yWbasgNDhyu_KtsW9DY0Q,1,334.125000,-25.31250,157.506607,1.039170,78da7c35-4512-49df-a8f3-d5142b71df03,47.691288,-122.309616,POINT (-122.30962 47.69129),30
30303,230508,CurbRamp,1.0,cNqDPX6_cDuFREOhNkWlag,1,117.375000,-35.00000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695843,-122.322922,POINT (-122.32292 47.69584),20
30304,230506,CurbRamp,2.0,cNqDPX6_cDuFREOhNkWlag,1,50.437500,-26.75000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695919,-122.322922,POINT (-122.32292 47.69592),12


In [7]:
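# Optional visual check (uncomment to run): flag the labels that share a cluster with at least one other label and plot them on an interactive map.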
# df0["clustered"] = df0.duplicated(subset="cluster_id", keep=False)
# df0.explore(tiles="cartodbpositron",column="clustered", categorical=True)

In [8]:
# Cluster the remaining label types (label_types[1] .. label_types[8]) and keep
# the per-label frame (element [2] of each returned tuple) for every type.
df1, df2, df3, df4, df5, df6, df7, df8 = [
    cluster_label_type_at_index(df, type_index)[2]
    for type_index in range(1, 9)
]

In [16]:
# Flag every label whose cluster_id occurs more than once within its own label
# type, i.e. labels that were grouped with at least one other label.
for labels_for_type in (df0, df1, df2, df3, df4, df5, df6, df7, df8):
    labels_for_type["clustered"] = labels_for_type.duplicated(subset="cluster_id", keep=False)

In [17]:
# Inspect the first label type again: note the new boolean 'clustered' column.
df0

Unnamed: 0,label_id,label_type,severity,gsv_panora,zoom,heading,pitch,photograph,photogra_1,user_id,lat,lng,geometry,cluster_id,clustered
0,86461,CurbRamp,1.0,ZTCzCkmt9uG-TFqJEttrtg,2,323.580353,-27.50000,272.483002,6.449974,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.697720,-122.320244,POINT (-122.32024 47.69772),196,True
8,86624,CurbRamp,1.0,Eo-oKwHk5rBFeIHsJa_OQQ,3,311.968750,-10.84375,90.119415,0.630173,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.712048,-122.290810,POINT (-122.29081 47.71205),2358,False
15,86767,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,174.437500,-5.87500,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700146,-122.283333,POINT (-122.28333 47.70015),6961,False
16,86768,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,128.500000,-13.00000,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700150,-122.283104,POINT (-122.28310 47.70015),295,False
24,86808,CurbRamp,1.0,WfExyzpW559Rn4aEeuYMng,1,98.000000,-16.43750,282.873047,4.660347,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.693405,-122.276970,POINT (-122.27697 47.69341),2734,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30296,84725,CurbRamp,1.0,18XWC5r0jwf4eFkEzU7cEQ,1,161.187500,-17.37500,358.921356,-0.705276,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.681194,-122.306633,POINT (-122.30663 47.68119),34,False
30301,230494,CurbRamp,1.0,4yWbasgNDhyu_KtsW9DY0Q,1,334.125000,-25.31250,157.506607,1.039170,78da7c35-4512-49df-a8f3-d5142b71df03,47.691288,-122.309616,POINT (-122.30962 47.69129),30,True
30303,230508,CurbRamp,1.0,cNqDPX6_cDuFREOhNkWlag,1,117.375000,-35.00000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695843,-122.322922,POINT (-122.32292 47.69584),20,True
30304,230506,CurbRamp,2.0,cNqDPX6_cDuFREOhNkWlag,1,50.437500,-26.75000,0.581519,0.443954,78da7c35-4512-49df-a8f3-d5142b71df03,47.695919,-122.322922,POINT (-122.32292 47.69592),12,True


In [18]:
# Concatenate the per-type label frames into one frame.
# NOTE: cluster ids are assigned independently for each label type, so
# 'cluster_id' only identifies a cluster together with 'label_type'.
clusters_q2 = pd.concat(
    [df0, df1, df2, df3, df4, df5, df6, df7, df8]
)

In [19]:
# Preview the combined frame holding the clustered labels of all label types.
clusters_q2

Unnamed: 0,label_id,label_type,severity,gsv_panora,zoom,heading,pitch,photograph,photogra_1,user_id,lat,lng,geometry,cluster_id,clustered
0,86461,CurbRamp,1.0,ZTCzCkmt9uG-TFqJEttrtg,2,323.580353,-27.50000,272.483002,6.449974,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.697720,-122.320244,POINT (-122.32024 47.69772),196,True
8,86624,CurbRamp,1.0,Eo-oKwHk5rBFeIHsJa_OQQ,3,311.968750,-10.84375,90.119415,0.630173,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.712048,-122.290810,POINT (-122.29081 47.71205),2358,False
15,86767,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,174.437500,-5.87500,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700146,-122.283333,POINT (-122.28333 47.70015),6961,False
16,86768,CurbRamp,1.0,Z1u8iq5TEuS8-n_kZKRc8A,1,128.500000,-13.00000,156.979935,-0.590858,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.700150,-122.283104,POINT (-122.28310 47.70015),295,False
24,86808,CurbRamp,1.0,WfExyzpW559Rn4aEeuYMng,1,98.000000,-16.43750,282.873047,4.660347,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.693405,-122.276970,POINT (-122.27697 47.69341),2734,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15879,231609,Signal,,jVLp7QRXH9TyDjbUXpXTTw,1,358.750000,-8.12500,289.983826,-0.280980,c11b44ac-3fa8-45f8-acf0-de27da3e1428,47.694157,-122.320427,POINT (-122.32043 47.69416),9,False
26260,210508,Signal,,GQSjiaAqrHylEWnSH_auzw,1,43.125000,-22.43750,171.160873,0.807304,4cac48cc-452d-40c1-8f16-9f54d6da1a6d,47.692154,-122.317436,POINT (-122.31744 47.69215),4,False
26545,231604,Signal,,jVLp7QRXH9TyDjbUXpXTTw,1,358.000000,-8.12500,289.983826,-0.280980,c11b44ac-3fa8-45f8-acf0-de27da3e1428,47.694210,-122.320419,POINT (-122.32042 47.69421),10,False
26546,231605,Signal,,jVLp7QRXH9TyDjbUXpXTTw,1,358.000000,-8.12500,289.983826,-0.280980,c11b44ac-3fa8-45f8-acf0-de27da3e1428,47.694252,-122.320320,POINT (-122.32032 47.69425),2,False


In [20]:
# Export the combined labels, including 'cluster_id' and 'clustered', to a shapefile.
clusters_q2.to_file("../data/processed-labels/clusters/clusters_q2.shp")