In [1]:
import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
import pandas as pd
import geopandas as gpd

In [2]:
city = 'newberg'

In [3]:
city_crs = {'seattle': 2285, '' + city + '': 3424, 'chicago': 3435, 'newberg':2269}

In [4]:
def custom_dist(u, v):
    if u[2] == v[2]:
        return sys.float_info.max
    else:
        return haversine([u[0], u[1]], [v[0], v[1]])

thresholds = {'CurbRamp': 0.0075,
                      'NoCurbRamp': 0.0075,
                      'SurfaceProblem': 0.01,
                      'Obstacle': 0.01,
                      'NoSidewalk': 0.01,
                      'Crosswalk': 0.01,
                      'Signal': 0.01,
                      'Occlusion': 0.01,
                      'Other': 0.01,
                      'Problem': 0.01}

def cluster(labels, label_type, thresholds):

    # Makes a normal dist matrix for a single user, but uses special dist function for multi-user clustering that
    # prevents the same user's attributes from being clustered together.
    dist_matrix = pdist(np.array(labels[['lat', 'lng', 'user_id']].values), custom_dist)
    link = linkage(dist_matrix, method='complete')
    curr_type = labels['label_type'].values

    # Copies the labels dataframe and adds a column to it for the cluster id each label is in.
    labelsCopy = labels.copy()
    labelsCopy.loc[:,'cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    # Cuts tree so that only labels less than clust_threth kilometers apart are clustered.
    clusters = labelsCopy.groupby('cluster_id')

    # Computes the center of each cluster and assigns temporariness and severity.
    cluster_list = [] # list of tuples (label_type, cluster_num, lat, lng, severity, temporary).
    for clust_num, clust in clusters:
        #ave_pos = np.mean(clust['coords'].tolist(), axis=0) # use ave pos of clusters.
        #ave_sev = None if pd.isnull(clust['severity']).all() else int(round(np.median(clust['severity'][~np.isnan(clust['severity'])])))
        #ave_temp = None if pd.isnull(clust['temporary']).all() else bool(round(np.mean(clust['temporary'])))

        cluster_list.append((curr_type, clust_num))

    cluster_df = pd.DataFrame(cluster_list, columns=['label_type', 'cluster_id'])

    return (cluster_df, labelsCopy)


# Pick which label types should be included in clustering, and which should be included in the "Problem" type.
label_types = ['CurbRamp', 'NoSidewalk', 'Occlusion', 'SurfaceProblem', 'Obstacle', 'Other', 'NoCurbRamp', 'Crosswalk', 'Signal']
# problem_types = ['SurfaceProblem', 'Obstacle']

# These are the columns required in the POST requests for the labels and clusters, respectively.
label_cols = ['label_id', 'label_type', 'cluster_id']
cluster_cols = ['label_type', 'cluster_id', 'lat', 'lng', 'severity', 'temporary']


def cluster_label_type_at_index(label_data,i):
        clusters_for_type_i = pd.DataFrame(columns=cluster_cols)
        labels_for_type_i = pd.DataFrame(columns=label_cols)

        label_type = label_types[i]
        # if label_type == 'Problem':
        #     type_data = label_data[label_data.label_type.isin(problem_types)]
        # else:
        type_data = label_data[label_data.label_type == label_type]

        # If there are >1 labels, we can do clustering. Otherwise just copy the 1 (or 0) labels.
        if type_data.shape[0] > 1:
            (clusters_for_type_i, labels_for_type_i) = cluster(type_data, label_type, thresholds)
        elif type_data.shape[0] == 1:
            labels_for_type_i = type_data.copy()
            labels_for_type_i.loc[:,'cluster_id'] = 1 # Gives the single cluster a cluster_id of 1.
            labels_for_type_i.loc[:,'label_type'] = label_type # Gives Problem type if needed.
            clusters_for_type_i = labels_for_type_i.filter(items=cluster_cols)

        return (label_type, clusters_for_type_i, labels_for_type_i)

In [6]:
df = gpd.read_file('../../../data/{}/processed-labels/labels_raw_{}/labels_raw_{}.shp'.format(city,city,city))
df

Unnamed: 0,label_id,gsv_panora,label_type,severity,correct,has_valida,expired,high_quali,gsv_pano_1,pano_width,...,pano_y,zoom,heading,pitch,camera_hea,camera_pit,user_id,lat,lng,geometry
0,145,0R2JTLibSCNsNb7aGNUzWg,NoSidewalk,,,0,1,1,0R2JTLibSCNsNb7aGNUzWg,13312.0,...,3863,1,319.375000,-19.062500,269.813477,0.678093,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317650,-122.952904,POINT (-122.95290 45.31765)
1,18134,Z6Ja9-4wJxpQZPEnn8KSXA,NoSidewalk,4.0,,0,0,1,Z6Ja9-4wJxpQZPEnn8KSXA,13312.0,...,3744,3,326.437500,-10.906250,323.408020,1.379303,bb78d1ab-85d6-4620-a1de-396a5785080a,45.313820,-122.964005,POINT (-122.96400 45.31382)
2,151,oqg9ULHZul23xfU6DwTRNg,NoSidewalk,,,0,0,1,oqg9ULHZul23xfU6DwTRNg,13312.0,...,3776,2,285.258942,-11.508928,268.211182,0.926697,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317616,-122.953598,POINT (-122.95360 45.31762)
3,153,WWRq8hx_qzBmTiSkSBjEqQ,NoSidewalk,,,0,1,1,WWRq8hx_qzBmTiSkSBjEqQ,13312.0,...,3709,2,223.651779,-8.741072,268.896179,-0.547432,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317421,-122.954025,POINT (-122.95403 45.31742)
4,154,WWRq8hx_qzBmTiSkSBjEqQ,NoSidewalk,,,0,1,1,WWRq8hx_qzBmTiSkSBjEqQ,13312.0,...,3600,2,279.008942,-9.455358,268.896179,-0.547432,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317600,-122.954185,POINT (-122.95419 45.31760)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16842,8222,5KKnsHQgODUBbIRZYiVbqg,NoSidewalk,,,0,0,1,5KKnsHQgODUBbIRZYiVbqg,13312.0,...,3757,2,225.142853,-19.375000,22.380564,-3.511421,fc36440f-38b7-4060-892a-65fedd77f651,45.303329,-122.925797,POINT (-122.92580 45.30333)
16843,8272,mdbgEoi-XTfZGiKqcWMlqw,NoSidewalk,,,0,1,1,mdbgEoi-XTfZGiKqcWMlqw,3328.0,...,986,1,114.982140,-15.312500,359.096985,-9.832024,fc36440f-38b7-4060-892a-65fedd77f651,45.299858,-122.925591,POINT (-122.92559 45.29986)
16844,8281,BtyhcErvfI4z31FisBjeQg,NoSidewalk,,,0,0,1,BtyhcErvfI4z31FisBjeQg,13312.0,...,3633,1,43.812500,-1.875000,46.910660,-2.319252,fc36440f-38b7-4060-892a-65fedd77f651,45.305443,-122.926605,POINT (-122.92661 45.30544)
16845,16595,--gG93FA53Lkv0H31-9Ruw,CurbRamp,,1,1,0,1,--gG93FA53Lkv0H31-9Ruw,13312.0,...,3983,1,142.982147,-11.625000,270.257080,-1.495285,fc36440f-38b7-4060-892a-65fedd77f651,45.299377,-122.956367,POINT (-122.95637 45.29938)


In [7]:
df0 = cluster_label_type_at_index(df,0)[2]

In [8]:
df1 = cluster_label_type_at_index(df,1)[2]

In [9]:
df2 = cluster_label_type_at_index(df,2)[2]

In [10]:
df3 = cluster_label_type_at_index(df,3)[2]

In [11]:
df4 = cluster_label_type_at_index(df,4)[2]

In [12]:
df5 = cluster_label_type_at_index(df,5)[2]

In [13]:
df6 = cluster_label_type_at_index(df,6)[2]

In [14]:
df7 = cluster_label_type_at_index(df,7)[2]

In [15]:
df8 = cluster_label_type_at_index(df,8)[2]

In [16]:
df0["clustered"] = df0.duplicated(subset="cluster_id", keep=False)
df1["clustered"] = df1.duplicated(subset="cluster_id", keep=False)
df2["clustered"] = df2.duplicated(subset="cluster_id", keep=False)
df3["clustered"] = df3.duplicated(subset="cluster_id", keep=False)
df4["clustered"] = df4.duplicated(subset="cluster_id", keep=False)
df5["clustered"] = df5.duplicated(subset="cluster_id", keep=False)
df6["clustered"] = df6.duplicated(subset="cluster_id", keep=False)
df7["clustered"] = df7.duplicated(subset="cluster_id", keep=False)
df8["clustered"] = df8.duplicated(subset="cluster_id", keep=False)

In [17]:
df0

Unnamed: 0,label_id,gsv_panora,label_type,severity,correct,has_valida,expired,high_quali,gsv_pano_1,pano_width,...,heading,pitch,camera_hea,camera_pit,user_id,lat,lng,geometry,cluster_id,clustered
16,146,0R2JTLibSCNsNb7aGNUzWg,CurbRamp,1.0,1,1,1,1,0R2JTLibSCNsNb7aGNUzWg,13312.0,...,50.125000,-26.937500,269.813477,0.678093,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317612,-122.952751,POINT (-122.95275 45.31761),2433,True
18,250,SIRiUbNpH54imxj-z01Wnw,CurbRamp,2.0,1,1,1,1,SIRiUbNpH54imxj-z01Wnw,13312.0,...,214.750000,-18.919643,270.914520,1.339951,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317303,-122.973167,POINT (-122.97317 45.31730),1295,True
19,22,dslJGqaol13AzhdnPA1epA,CurbRamp,1.0,1,1,0,1,dslJGqaol13AzhdnPA1epA,13312.0,...,157.750000,-34.437500,116.283493,-0.733246,a7b29eaa-8032-45f9-be2b-fce49aa0b8c0,45.320690,-122.972359,POINT (-122.97236 45.32069),1275,True
20,23,kI15oZgTQY8bxvRp-Io7nw,CurbRamp,1.0,1,1,0,1,kI15oZgTQY8bxvRp-Io7nw,13312.0,...,152.875000,-34.625000,44.911987,0.174957,a7b29eaa-8032-45f9-be2b-fce49aa0b8c0,45.320698,-122.972198,POINT (-122.97220 45.32070),1299,True
27,147,0R2JTLibSCNsNb7aGNUzWg,CurbRamp,1.0,1,1,1,1,0R2JTLibSCNsNb7aGNUzWg,13312.0,...,72.830360,-13.714286,269.813477,0.678093,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317596,-122.952675,POINT (-122.95267 45.31760),745,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16834,12806,EAqwqE0688xPgGVN1xao7g,CurbRamp,,,1,0,0,EAqwqE0688xPgGVN1xao7g,,...,247.562500,-13.750000,63.113789,-0.970879,f51f44e3-5bae-4413-b3c5-6f28e39669bb,45.299812,-122.984261,POINT (-122.98426 45.29981),23,False
16836,12819,YK0FBcsy7Y6cK3a7Bycd5w,CurbRamp,,1,1,1,0,YK0FBcsy7Y6cK3a7Bycd5w,13312.0,...,128.852676,-20.566965,182.359085,0.663254,f51f44e3-5bae-4413-b3c5-6f28e39669bb,45.299454,-122.982124,POINT (-122.98212 45.29945),17,False
16837,12824,DAhcrCO5lWjbW6ODCJyhuQ,CurbRamp,,1,1,1,0,DAhcrCO5lWjbW6ODCJyhuQ,13312.0,...,224.500000,-35.000000,59.087475,0.343269,f51f44e3-5bae-4413-b3c5-6f28e39669bb,45.291168,-122.979034,POINT (-122.97903 45.29117),10,True
16845,16595,--gG93FA53Lkv0H31-9Ruw,CurbRamp,,1,1,0,1,--gG93FA53Lkv0H31-9Ruw,13312.0,...,142.982147,-11.625000,270.257080,-1.495285,fc36440f-38b7-4060-892a-65fedd77f651,45.299377,-122.956367,POINT (-122.95637 45.29938),5,True


In [18]:
#concatenate all dataframes
clusters = pd.concat([df0, df1, df2, df3, df4, df5, df6, df7, df8])

In [19]:
clusters

Unnamed: 0,label_id,gsv_panora,label_type,severity,correct,has_valida,expired,high_quali,gsv_pano_1,pano_width,...,heading,pitch,camera_hea,camera_pit,user_id,lat,lng,geometry,cluster_id,clustered
16,146,0R2JTLibSCNsNb7aGNUzWg,CurbRamp,1.0,1,1.0,1.0,1.0,0R2JTLibSCNsNb7aGNUzWg,13312.0,...,50.125000,-26.937500,269.813477,0.678093,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317612,-122.952751,POINT (-122.95275 45.31761),2433,True
18,250,SIRiUbNpH54imxj-z01Wnw,CurbRamp,2.0,1,1.0,1.0,1.0,SIRiUbNpH54imxj-z01Wnw,13312.0,...,214.750000,-18.919643,270.914520,1.339951,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317303,-122.973167,POINT (-122.97317 45.31730),1295,True
19,22,dslJGqaol13AzhdnPA1epA,CurbRamp,1.0,1,1.0,0.0,1.0,dslJGqaol13AzhdnPA1epA,13312.0,...,157.750000,-34.437500,116.283493,-0.733246,a7b29eaa-8032-45f9-be2b-fce49aa0b8c0,45.320690,-122.972359,POINT (-122.97236 45.32069),1275,True
20,23,kI15oZgTQY8bxvRp-Io7nw,CurbRamp,1.0,1,1.0,0.0,1.0,kI15oZgTQY8bxvRp-Io7nw,13312.0,...,152.875000,-34.625000,44.911987,0.174957,a7b29eaa-8032-45f9-be2b-fce49aa0b8c0,45.320698,-122.972198,POINT (-122.97220 45.32070),1299,True
27,147,0R2JTLibSCNsNb7aGNUzWg,CurbRamp,1.0,1,1.0,1.0,1.0,0R2JTLibSCNsNb7aGNUzWg,13312.0,...,72.830360,-13.714286,269.813477,0.678093,d363619a-53e1-4ef4-a541-bb0278cae7c3,45.317596,-122.952675,POINT (-122.95267 45.31760),745,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,18212,8wMxu-1AcIPQtvxnk0nSMg,Crosswalk,,,0.0,0.0,1.0,8wMxu-1AcIPQtvxnk0nSMg,3328.0,...,223.205353,-11.812500,269.475708,-2.551773,e76a46bc-4cab-4943-bbf8-92015a186bc6,45.292736,-122.977249,POINT (-122.97725 45.29274),12,False
553,18279,NIqd17WmG0oogKQrs23cKA,Crosswalk,,,0.0,0.0,1.0,NIqd17WmG0oogKQrs23cKA,13312.0,...,303.000000,-35.000000,296.686005,1.060738,e76a46bc-4cab-4943-bbf8-92015a186bc6,45.296886,-122.982231,POINT (-122.98223 45.29689),13,False
562,18261,Gzht24CNnGoVPHNI_NOjHw,Crosswalk,,,0.0,0.0,1.0,Gzht24CNnGoVPHNI_NOjHw,13312.0,...,193.625000,-35.000000,90.651543,1.083519,e76a46bc-4cab-4943-bbf8-92015a186bc6,45.299419,-122.977150,POINT (-122.97715 45.29942),14,False
572,18260,Gzht24CNnGoVPHNI_NOjHw,Crosswalk,,,0.0,0.0,1.0,Gzht24CNnGoVPHNI_NOjHw,13312.0,...,265.250000,-30.875000,90.651543,1.083519,e76a46bc-4cab-4943-bbf8-92015a186bc6,45.299480,-122.977226,POINT (-122.97723 45.29948),15,False


In [20]:
#function to fix the problem that cluster_ids are not unique across label_types
def fix_cluster_ids(df):
    #sort the df by label type and cluster id
    df = df.sort_values(by=['label_type', 'cluster_id'])
    #create a new cluster id column that is unique 
    df['new_cluster_id'] = range(1, len(df) + 1)
    #create a mapping of new cluster id to old cluster id
    mapping = df.drop_duplicates(['label_type', 'cluster_id']).set_index(['label_type', 'cluster_id'])['new_cluster_id'].to_dict()
    #map the new cluster id to the old cluster id
    df['new_cluster_id'] = df.apply(lambda row: mapping[(row['label_type'], row['cluster_id'])], axis=1)
    #drop the old cluster id column and rename the new cluster id column
    df = df.drop(columns=['cluster_id'])
    df = df.rename(columns={'new_cluster_id': 'cluster_id'})
    return df

In [21]:
clusters = fix_cluster_ids(clusters)

In [22]:
#count the number of lables in each cluster
clusters['cluster_label_count'] = clusters.groupby('cluster_id')['cluster_id'].transform('count')

In [23]:
#select only label_id label_type, clustered and count columns
clusters = clusters[['label_id', 'cluster_id', 'clustered','cluster_label_count']]

In [24]:
#change True and False in clustered to 1 and 0
clusters['clustered'] = clusters['clustered'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clusters['clustered'] = clusters['clustered'].astype(int)


In [25]:
clusters 

Unnamed: 0,label_id,cluster_id,clustered,cluster_label_count
341,18321,1,0,1
356,18214,2,0,1
422,18318,3,0,1
424,18337,4,0,1
426,18338,5,0,1
...,...,...,...,...
3398,4800,16843,0,1
1412,1918,16844,0,1
1500,11872,16845,0,1
1127,1558,16846,0,1


In [26]:
#export to csv
clusters.to_csv('../../../data/{}/features/clustered.csv'.format(city), index=False)