In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine

# Initial Processing

In [3]:
# Read the raw label points into a GeoDataFrame named `geo`.
geo = gpd.read_file(filename='data/source/all.json')

In [4]:
# Drop the columns that are not used downstream:
# gsv_panorama_id, correct, high_quality_user and audit_task_id.
geo = geo.drop(
    columns=[
        'gsv_panorama_id',
        'correct',
        'high_quality_user',
        'audit_task_id',
    ]
)

In [5]:
# Inspect the trimmed frame; label_id, label_type, severity and geometry remain.
geo

Unnamed: 0,label_id,label_type,severity,geometry
0,85055,CurbRamp,1.0,POINT (-122.33279 47.61860)
1,85057,Obstacle,2.0,POINT (-122.33189 47.61781)
2,85059,CurbRamp,1.0,POINT (-122.33183 47.61751)
3,85060,CurbRamp,1.0,POINT (-122.33188 47.61752)
4,85062,CurbRamp,1.0,POINT (-122.33205 47.61764)
...,...,...,...,...
195539,231270,CurbRamp,1.0,POINT (-122.26726 47.54077)
195540,231285,NoSidewalk,5.0,POINT (-122.26344 47.55229)
195541,231338,CurbRamp,1.0,POINT (-122.28179 47.56327)
195542,231339,CurbRamp,1.0,POINT (-122.28165 47.56347)


In [7]:
# Read the per-label metadata file into a plain DataFrame named `meta`.
meta = pd.read_json(path_or_buf='data/source/cvMetadata.json')

In [8]:
# Inspect the metadata: one row per label_id with the panorama id, vote counts,
# image/canvas coordinates, and zoom/heading/pitch values.
meta

Unnamed: 0,label_id,gsv_panorama_id,label_type_id,agree_count,disagree_count,notsure_count,image_width,image_height,sv_image_x,sv_image_y,canvas_width,canvas_height,canvas_x,canvas_y,zoom,heading,pitch,photographer_heading,photographer_pitch
0,85053,3sAn6u8bQPVW3hTDfNUP1w,1,0,0,0,16384.0,8192.0,2484,-390,720,480,62,232,1,107.750000,-15.625000,270.675171,0.168884
1,85004,CpU83mS7vz17EnE02JGOkg,1,0,0,0,16384.0,8192.0,394,-389,720,480,149,254,1,41.750000,-10.375000,129.873657,0.026001
2,85024,RDV4HddwNqwfBZndZRqqkA,1,0,0,0,16384.0,8192.0,3867,-135,720,480,386,187,1,100.437500,-12.625000,128.847916,0.042198
3,85025,NpkOF1LfE024Ks0XIhCttw,1,0,1,0,13312.0,6656.0,5484,-295,720,480,435,222,1,136.062500,-11.500000,350.971405,1.353600
4,85034,-l9SjmKbZmUfr3JV3PuTVg,1,1,0,0,13312.0,6656.0,7887,-504,720,480,584,247,1,180.312500,-15.562500,171.933517,-0.698708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195539,96388,Ek8nn67kKNgCFjZkKf9ISA,7,0,0,0,16384.0,8192.0,12898,-166,720,480,364,259,2,348.645081,-3.973214,318.706665,0.226311
195540,122316,6U9DDI4UppbHDTyp8_J3JA,2,2,0,0,16384.0,8192.0,13246,-636,720,480,327,145,1,3.171875,-32.000000,186.167953,0.891960
195541,122337,lydaO-3xUm3YPy2sNPyPoQ,4,2,0,0,16384.0,8192.0,10316,-614,720,480,302,220,1,288.500000,-20.250000,181.292786,5.152237
195542,122341,ubr500pdt0vImqg71_dHsQ,1,1,0,0,16384.0,8192.0,7437,-322,720,480,377,148,1,198.312500,-23.562500,180.803787,3.150978


In [9]:
# Discard the image/canvas coordinates, label_type_id and the vote counts.
# What remains: label_id, gsv_panorama_id, zoom, heading, pitch,
# photographer_heading and photographer_pitch.
meta = meta.drop(
    columns=[
        'image_width',
        'image_height',
        'sv_image_x',
        'sv_image_y',
        'canvas_width',
        'canvas_height',
        'canvas_x',
        'canvas_y',
        'label_type_id',
        'agree_count',
        'disagree_count',
        'notsure_count',
    ]
)

In [31]:
# Combine the label geometry/severity with the panorama metadata.
# Inner join on label_id: a label missing from either source is dropped.
df = pd.merge(left=geo, right=meta, how='inner', on='label_id')

In [32]:
# Distribution of severity ratings; dropna=False keeps the NaN bucket visible.
df.loc[:, 'severity'].value_counts(dropna=False)

1.0    61526
3.0    41385
2.0    30493
5.0    29941
4.0    23544
NaN     8655
Name: severity, dtype: int64

In [15]:
# Same severity distribution as the previous cell.
# NOTE(review): the recorded output of this cell shows 8655 rows at 0.0 where
# the previous cell shows 8655 NaN, yet no cell in between fills missing
# values -- the output looks like it came from an earlier kernel state.
# Re-run from a fresh kernel to confirm.
df.loc[:, 'severity'].value_counts(dropna=False)

1.0    61526
3.0    41385
2.0    30493
5.0    29941
4.0    23544
0.0     8655
Name: severity, dtype: int64

In [28]:
# Median (not mean) severity for each label_type.
df.groupby('label_type')['severity'].agg('median')

label_type
Crosswalk         1.0
CurbRamp          1.0
NoCurbRamp        3.0
NoSidewalk        4.0
Obstacle          3.0
Occlusion         0.0
Other             3.0
Signal            0.0
SurfaceProblem    2.0
Name: severity, dtype: float64

In [34]:
# Number of labels per label_type; a missing label_type would show up as NaN.
df.loc[:, 'label_type'].value_counts(dropna=False)

CurbRamp          76024
NoCurbRamp        37207
NoSidewalk        36428
SurfaceProblem    30372
Obstacle          13012
Occlusion           970
Crosswalk           849
Other               460
Signal              222
Name: label_type, dtype: int64

In [42]:
# Impute missing severity with the median severity of the row's label_type.
# The built-in 'median' transform broadcasts each group's median back onto the
# rows of that group, so no Python-level lambda is needed.
df['severity'] = df['severity'].fillna(
    df.groupby('label_type')['severity'].transform('median')
)
# A label_type with no rated rows has a NaN median, so its rows are still
# missing at this point. They fall back to 0, a value outside the 1-5 ratings
# present in the data (in the recorded run: Occlusion 970 + Signal 222 = 1192 rows).
df['severity'] = df['severity'].fillna(0)
df

Unnamed: 0,label_id,label_type,severity,geometry,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch
0,85055,CurbRamp,1.0,POINT (-122.33279 47.61860),NjPAkwTxWsayAq3kCugvdA,1,107.750000,-15.62500,270.556610,0.907036
1,85057,Obstacle,2.0,POINT (-122.33189 47.61781),f_Mibt-6nJU7HTN7kx5ZGQ,1,38.375000,-21.87500,164.133438,0.335564
2,85059,CurbRamp,1.0,POINT (-122.33183 47.61751),f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564
3,85060,CurbRamp,1.0,POINT (-122.33188 47.61752),f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564
4,85062,CurbRamp,1.0,POINT (-122.33205 47.61764),f_Mibt-6nJU7HTN7kx5ZGQ,1,257.187500,-35.00000,164.133438,0.335564
...,...,...,...,...,...,...,...,...,...,...
195539,231270,CurbRamp,1.0,POINT (-122.26726 47.54077),R-sTl5O6sOdVPurgTWy3EQ,1,332.468750,-35.00000,215.130295,-0.157570
195540,231285,NoSidewalk,5.0,POINT (-122.26344 47.55229),N5MrAAtHZO2JGXUrYAYnSQ,3,0.890625,-11.65625,179.161057,-5.825714
195541,231338,CurbRamp,1.0,POINT (-122.28179 47.56327),d3oBrJ-nDtKMwlE7svc6pg,1,253.562500,-35.00000,177.959915,-0.962440
195542,231339,CurbRamp,1.0,POINT (-122.28165 47.56347),AOIBJ12BjFanDJF1Ark05Q,1,110.312500,-35.00000,179.365723,-1.093400


In [43]:
# Severity distribution after imputation; no NaN bucket should remain.
df.loc[:, 'severity'].value_counts(dropna=False)

1.0    63768
3.0    42427
2.0    31663
5.0    29941
4.0    26553
0.0     1192
Name: severity, dtype: int64

In [44]:
# Load label-and-user-ids.csv and keep only the rows whose city is 'seattle'.
user_ids = (
    pd.read_csv('data/source/label-and-user-ids.csv')
    .loc[lambda frame: frame['city'] == 'seattle']
)

In [45]:
# Merge the user ids onto the labels by label_id.
# Inner join: labels without a Seattle user row are dropped.
df = pd.merge(left=df, right=user_ids, how='inner', on='label_id')

In [46]:
# city is always 'seattle' after the filter on user_ids, so drop it.
df = df.drop(columns='city')

In [47]:
# Expose each point's coordinates as plain numeric columns.
# GeoSeries.y / GeoSeries.x read the coordinates of all points in one
# vectorized call instead of running a Python lambda once per row.
# Points are stored as (x, y) = (longitude, latitude), hence y -> lat, x -> lng.
df['lat'] = df['geometry'].y
df['lng'] = df['geometry'].x


In [48]:
# Keep only labels whose longitude is below -120, i.e. west of 120 degrees W.
# NOTE(review): presumably this removes mis-located points far from Seattle
# (around -122.3) -- confirm the intent behind the threshold.
df = df.loc[df['lng'].lt(-120)]

In [49]:
# Reorder the columns so that geometry comes last; all other columns keep
# their relative order.
df = df[[name for name in df.columns if name != 'geometry'] + ['geometry']]
df

Unnamed: 0,label_id,label_type,severity,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,user_id,lat,lng,geometry
0,85055,CurbRamp,1.0,NjPAkwTxWsayAq3kCugvdA,1,107.750000,-15.62500,270.556610,0.907036,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618599,-122.332787,POINT (-122.33279 47.61860)
1,85057,Obstacle,2.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,38.375000,-21.87500,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617805,-122.331886,POINT (-122.33189 47.61781)
2,85059,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617512,-122.331833,POINT (-122.33183 47.61751)
3,85060,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617519,-122.331879,POINT (-122.33188 47.61752)
4,85062,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,257.187500,-35.00000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617638,-122.332047,POINT (-122.33205 47.61764)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195539,231270,CurbRamp,1.0,R-sTl5O6sOdVPurgTWy3EQ,1,332.468750,-35.00000,215.130295,-0.157570,155af0a9-4149-4436-8aa3-7573e160036d,47.540771,-122.267258,POINT (-122.26726 47.54077)
195540,231285,NoSidewalk,5.0,N5MrAAtHZO2JGXUrYAYnSQ,3,0.890625,-11.65625,179.161057,-5.825714,155af0a9-4149-4436-8aa3-7573e160036d,47.552292,-122.263443,POINT (-122.26344 47.55229)
195541,231338,CurbRamp,1.0,d3oBrJ-nDtKMwlE7svc6pg,1,253.562500,-35.00000,177.959915,-0.962440,3ac3d4de-a70a-48ec-b46e-498d6e26ee6c,47.563271,-122.281792,POINT (-122.28179 47.56327)
195542,231339,CurbRamp,1.0,AOIBJ12BjFanDJF1Ark05Q,1,110.312500,-35.00000,179.365723,-1.093400,3ac3d4de-a70a-48ec-b46e-498d6e26ee6c,47.563469,-122.281654,POINT (-122.28165 47.56347)


In [50]:
# Write the processed labels to CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf='data/processed-labels/seattle_labels_all.csv', index=False)

In [51]:
# Write the same frame as an ESRI Shapefile.
# NOTE(review): shapefile field names are limited to 10 characters, so longer
# column names (gsv_panorama_id, photographer_heading, photographer_pitch)
# will be shortened in the output. warnings.filterwarnings('ignore') at the
# top of the notebook hides any warning about it -- check the written schema.
df.to_file(
    filename='data/processed-labels/seattle_labels_all/seattle_labels_all.shp',
    driver='ESRI Shapefile',
)