In [83]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine

# Initial Processing

In [26]:
# Load the label records (one point geometry per label) from data/all.json.
geo = gpd.read_file('data/all.json')

In [27]:
# Drop the gsv_panorama_id, correct, high_quality_user and audit_task_id columns.
geo = geo.drop(
    columns=[
        'gsv_panorama_id',
        'correct',
        'high_quality_user',
        'audit_task_id',
    ]
)

In [28]:
geo

Unnamed: 0,label_id,label_type,severity,geometry
0,85055,CurbRamp,1.0,POINT (-122.33279 47.61860)
1,85057,Obstacle,2.0,POINT (-122.33189 47.61781)
2,85059,CurbRamp,1.0,POINT (-122.33183 47.61751)
3,85060,CurbRamp,1.0,POINT (-122.33188 47.61752)
4,85062,CurbRamp,1.0,POINT (-122.33205 47.61764)
...,...,...,...,...
195539,231270,CurbRamp,1.0,POINT (-122.26726 47.54077)
195540,231285,NoSidewalk,5.0,POINT (-122.26344 47.55229)
195541,231338,CurbRamp,1.0,POINT (-122.28179 47.56327)
195542,231339,CurbRamp,1.0,POINT (-122.28165 47.56347)


In [29]:
# Load the per-label metadata from data/cvMetadata.json.
meta = pd.read_json('data/cvMetadata.json')

In [31]:
# Drop the image/canvas size and position columns, label_type_id and the
# agree/disagree/notsure counts from the metadata.
meta = meta.drop(
    columns=[
        'image_width', 'image_height',
        'sv_image_x', 'sv_image_y',
        'canvas_width', 'canvas_height',
        'canvas_x', 'canvas_y',
        'label_type_id',
        'agree_count', 'disagree_count', 'notsure_count',
    ]
)

In [66]:
# Join the metadata with the geo records on label_id (pandas' default inner join).
df = meta.merge(geo, on='label_id')

In [67]:
df

Unnamed: 0,label_id,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,label_type,severity,geometry
0,85053,3sAn6u8bQPVW3hTDfNUP1w,1,107.750000,-15.625000,270.675171,0.168884,CurbRamp,1.0,POINT (-122.33305 47.61862)
1,85004,CpU83mS7vz17EnE02JGOkg,1,41.750000,-10.375000,129.873657,0.026001,CurbRamp,1.0,POINT (-122.34052 47.61813)
2,85024,RDV4HddwNqwfBZndZRqqkA,1,100.437500,-12.625000,128.847916,0.042198,CurbRamp,1.0,POINT (-122.33761 47.61531)
3,85025,NpkOF1LfE024Ks0XIhCttw,1,136.062500,-11.500000,350.971405,1.353600,CurbRamp,1.0,POINT (-122.33766 47.61509)
4,85034,-l9SjmKbZmUfr3JV3PuTVg,1,180.312500,-15.562500,171.933517,-0.698708,CurbRamp,1.0,POINT (-122.33792 47.61485)
...,...,...,...,...,...,...,...,...,...,...
195539,96388,Ek8nn67kKNgCFjZkKf9ISA,2,348.645081,-3.973214,318.706665,0.226311,NoSidewalk,3.0,POINT (-122.38979 47.65871)
195540,122316,6U9DDI4UppbHDTyp8_J3JA,1,3.171875,-32.000000,186.167953,0.891960,NoCurbRamp,5.0,POINT (-122.38770 47.64471)
195541,122337,lydaO-3xUm3YPy2sNPyPoQ,1,288.500000,-20.250000,181.292786,5.152237,SurfaceProblem,2.0,POINT (-122.38781 47.64174)
195542,122341,ubr500pdt0vImqg71_dHsQ,1,198.312500,-23.562500,180.803787,3.150978,CurbRamp,1.0,POINT (-122.38776 47.64110)


In [68]:
# Read label-and-user-ids.csv and keep only the rows whose city is 'seattle'.
user_ids = (
    pd.read_csv('data/label-and-user-ids.csv')
    .loc[lambda rows: rows['city'] == 'seattle']
)

In [69]:
# Merge the user ids into df on label_id, keeping only labels present in both frames.
df = df.merge(user_ids, on='label_id', how='inner')

In [71]:
# The city column is no longer needed once only 'seattle' rows remain.
df = df.drop(columns='city')

In [72]:
df

Unnamed: 0,label_id,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,label_type,severity,geometry,user_id
0,85053,3sAn6u8bQPVW3hTDfNUP1w,1,107.750000,-15.625000,270.675171,0.168884,CurbRamp,1.0,POINT (-122.33305 47.61862),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184
1,85004,CpU83mS7vz17EnE02JGOkg,1,41.750000,-10.375000,129.873657,0.026001,CurbRamp,1.0,POINT (-122.34052 47.61813),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184
2,85024,RDV4HddwNqwfBZndZRqqkA,1,100.437500,-12.625000,128.847916,0.042198,CurbRamp,1.0,POINT (-122.33761 47.61531),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184
3,85025,NpkOF1LfE024Ks0XIhCttw,1,136.062500,-11.500000,350.971405,1.353600,CurbRamp,1.0,POINT (-122.33766 47.61509),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184
4,85034,-l9SjmKbZmUfr3JV3PuTVg,1,180.312500,-15.562500,171.933517,-0.698708,CurbRamp,1.0,POINT (-122.33792 47.61485),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184
...,...,...,...,...,...,...,...,...,...,...,...
195539,96388,Ek8nn67kKNgCFjZkKf9ISA,2,348.645081,-3.973214,318.706665,0.226311,NoSidewalk,3.0,POINT (-122.38979 47.65871),f2a1a9cc-466f-4ee2-83d4-2d8ff46a31fe
195540,122316,6U9DDI4UppbHDTyp8_J3JA,1,3.171875,-32.000000,186.167953,0.891960,NoCurbRamp,5.0,POINT (-122.38770 47.64471),7d65ec0f-4ba8-4895-abfb-eb75fab3104e
195541,122337,lydaO-3xUm3YPy2sNPyPoQ,1,288.500000,-20.250000,181.292786,5.152237,SurfaceProblem,2.0,POINT (-122.38781 47.64174),7d65ec0f-4ba8-4895-abfb-eb75fab3104e
195542,122341,ubr500pdt0vImqg71_dHsQ,1,198.312500,-23.562500,180.803787,3.150978,CurbRamp,1.0,POINT (-122.38776 47.64110),7d65ec0f-4ba8-4895-abfb-eb75fab3104e


# Clustering

In [85]:
def custom_dist(u, v):
    """Pairwise distance used for multi-user clustering.

    Both arguments are indexed as (lat, lng, user_id). Rows that share a
    user_id get the largest representable float, which prevents one user's
    labels from being clustered together; any other pair is separated by the
    haversine distance between their (lat, lng) positions.
    """
    same_user = u[2] == v[2]
    return sys.float_info.max if same_user else haversine([u[0], u[1]], [v[0], v[1]])

# Distance cut-off per label type, passed to fcluster as `t` when cutting the
# cluster tree (the clustering code describes these distances as kilometers).
thresholds = {
    **dict.fromkeys(['CurbRamp', 'NoCurbRamp'], 0.0075),
    **dict.fromkeys(
        ['SurfaceProblem', 'Obstacle', 'NoSidewalk', 'Crosswalk',
         'Signal', 'Occlusion', 'Other', 'Problem'],
        0.01,
    ),
}

def cluster(labels, label_type, thresholds):
    """Group the labels of one label type with complete-linkage clustering.

    Parameters
    ----------
    labels : pandas.DataFrame
        One row per label; must provide the 'lat', 'lng' and 'user_id' columns.
    label_type : str
        Key into ``thresholds``; also stored as the label_type of every
        cluster row that is returned.
    thresholds : dict
        Maps a label type to the distance at which the cluster tree is cut
        (kilometers, the default unit returned by haversine).

    Returns
    -------
    (cluster_df, labelsCopy)
        ``cluster_df`` has one row per cluster with the columns
        'label_type' and 'cluster_id'. ``labelsCopy`` is a copy of ``labels``
        with an added 'cluster_id' column; ``labels`` itself is not modified.
        Cluster centre, severity and temporariness are not computed here.

    Notes
    -----
    pdist calls custom_dist once for every pair of rows, so the running time
    and the size of the distance matrix grow quadratically with len(labels).
    """
    labelsCopy = labels.copy()

    if len(labels) < 2:
        # With fewer than two rows the distance matrix is empty and linkage()
        # rejects it, so any single label simply becomes cluster 1.
        labelsCopy['cluster_id'] = np.ones(len(labels), dtype=int)
    else:
        # custom_dist prevents the same user's labels from being clustered together.
        dist_matrix = pdist(np.array(labels[['lat', 'lng', 'user_id']].values), custom_dist)
        link = linkage(dist_matrix, method='complete')

        # Cut the tree so that only labels less than the threshold distance
        # apart end up in the same cluster.
        labelsCopy['cluster_id'] = fcluster(link, t=thresholds[label_type], criterion='distance')

    # One (label_type, cluster_id) row per cluster. The scalar label_type is
    # used here; the original stored the whole label_type column in every row.
    cluster_list = [(label_type, clust_num) for clust_num, _ in labelsCopy.groupby('cluster_id')]
    cluster_df = pd.DataFrame(cluster_list, columns=['label_type', 'cluster_id'])

    return (cluster_df, labelsCopy)


# Label types that take part in clustering; cluster_label_type_at_index picks
# one of them by position. Folding several types into a combined "Problem"
# type is currently switched off (see the commented-out list below).
label_types = [
    'CurbRamp',
    'NoSidewalk',
    'Occlusion',
    'SurfaceProblem',
    'Obstacle',
    'Other',
    'NoCurbRamp',
    'Crosswalk',
    'Signal',
]
# problem_types = ['SurfaceProblem', 'Obstacle']

# Columns required in the POST requests for the labels and the clusters, respectively.
label_cols = 'label_id label_type cluster_id'.split()
cluster_cols = 'label_type cluster_id lat lng severity temporary'.split()


def cluster_label_type_at_index(label_data,i):
    """Cluster the rows of ``label_data`` whose label_type is ``label_types[i]``.

    Returns a tuple ``(label_type, clusters, labels)``:

    * more than one matching row: the two frames returned by ``cluster``;
    * exactly one matching row: a copy of that row with cluster_id set to 1,
      plus its projection onto whichever ``cluster_cols`` it contains;
    * no matching rows: empty frames with the ``cluster_cols`` and
      ``label_cols`` columns.
    """
    label_type = label_types[i]
    type_data = label_data[label_data.label_type == label_type]
    n_labels = len(type_data)

    if n_labels > 1:
        # Enough labels to build a cluster tree.
        clusters_for_type_i, labels_for_type_i = cluster(type_data, label_type, thresholds)
    elif n_labels == 1:
        # A lone label forms its own cluster with id 1.
        labels_for_type_i = type_data.copy()
        labels_for_type_i.loc[:, 'cluster_id'] = 1
        labels_for_type_i.loc[:, 'label_type'] = label_type
        clusters_for_type_i = labels_for_type_i.filter(items=cluster_cols)
    else:
        # Nothing of this type: hand back empty, correctly-shaped frames.
        clusters_for_type_i = pd.DataFrame(columns=cluster_cols)
        labels_for_type_i = pd.DataFrame(columns=label_cols)

    return (label_type, clusters_for_type_i, labels_for_type_i)

In [79]:
# Split each point geometry into plain latitude (y) and longitude (x) columns.
df['lat'] = df['geometry'].map(lambda point: point.y)
df['lng'] = df['geometry'].map(lambda point: point.x)

In [80]:
df

Unnamed: 0,label_id,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,label_type,severity,geometry,user_id,lat,lng
0,85053,3sAn6u8bQPVW3hTDfNUP1w,1,107.750000,-15.625000,270.675171,0.168884,CurbRamp,1.0,POINT (-122.33305 47.61862),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618618,-122.333054
1,85004,CpU83mS7vz17EnE02JGOkg,1,41.750000,-10.375000,129.873657,0.026001,CurbRamp,1.0,POINT (-122.34052 47.61813),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618134,-122.340515
2,85024,RDV4HddwNqwfBZndZRqqkA,1,100.437500,-12.625000,128.847916,0.042198,CurbRamp,1.0,POINT (-122.33761 47.61531),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.615307,-122.337608
3,85025,NpkOF1LfE024Ks0XIhCttw,1,136.062500,-11.500000,350.971405,1.353600,CurbRamp,1.0,POINT (-122.33766 47.61509),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.615089,-122.337662
4,85034,-l9SjmKbZmUfr3JV3PuTVg,1,180.312500,-15.562500,171.933517,-0.698708,CurbRamp,1.0,POINT (-122.33792 47.61485),4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.614845,-122.337921
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195539,96388,Ek8nn67kKNgCFjZkKf9ISA,2,348.645081,-3.973214,318.706665,0.226311,NoSidewalk,3.0,POINT (-122.38979 47.65871),f2a1a9cc-466f-4ee2-83d4-2d8ff46a31fe,47.658714,-122.389786
195540,122316,6U9DDI4UppbHDTyp8_J3JA,1,3.171875,-32.000000,186.167953,0.891960,NoCurbRamp,5.0,POINT (-122.38770 47.64471),7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.644711,-122.387703
195541,122337,lydaO-3xUm3YPy2sNPyPoQ,1,288.500000,-20.250000,181.292786,5.152237,SurfaceProblem,2.0,POINT (-122.38781 47.64174),7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.641743,-122.387810
195542,122341,ubr500pdt0vImqg71_dHsQ,1,198.312500,-23.562500,180.803787,3.150978,CurbRamp,1.0,POINT (-122.38776 47.64110),7d65ec0f-4ba8-4895-abfb-eb75fab3104e,47.641102,-122.387764


In [88]:
# Turn df into a GeoDataFrame, using its existing 'geometry' column as the geometry.
df = gpd.GeoDataFrame(df, geometry='geometry')


In [90]:
# Keep only the rows whose longitude is below -120.
df = df.loc[df['lng'] < -120]

In [92]:
# Save the filtered labels as GeoJSON in data/seattle_labels.geojson.
df.to_file('data/seattle_labels.geojson', driver='GeoJSON')

In [93]:
# Cluster label_types[0] ('CurbRamp') and keep element [2] of the returned
# tuple: the labels of that type with their cluster_id column.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt —
# presumably because the pairwise distance step is too slow for this many
# rows; confirm before re-running on the full data.
df0 = cluster_label_type_at_index(df,0)[2]

KeyboardInterrupt: 