In [1]:
import numpy as np
from scipy import stats
import math
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
import pandas as pd
import geopandas as gpd

In [3]:
# read in gt.csv
gt = pd.read_csv('data/gt.csv')

In [5]:
#select label_id and verified
gt = gt[['label_id', 'verified']]

In [6]:
gt

Unnamed: 0,label_id,verified
0,86788,1
1,91037,1
2,93842,1
3,190528,1
4,190672,1
...,...,...
27340,84922,0
27341,230293,0
27342,231338,0
27343,231339,0


In [7]:
#read in labels.csv
labels = pd.read_csv('data/labels.csv')

In [8]:
#merge gt and labels
gt_labels = pd.merge(gt, labels, on='label_id', how='left')

In [9]:
gt_labels

Unnamed: 0,label_id,verified,audit_task_id,label_type,severity,correct,high_quality_user,gsv_panorama_id,agree_count,disagree_count,...,heading,pitch,photographer_heading,photographer_pitch,label_tags,label_description,heading_diff,pitch_diff,geometry,clustered
0,86788,1,19418,SurfaceProblem,2.0,1.0,False,BpVD1H9f1A6URlrjPUcyZg,2,0,...,183.062500,-7.125000,358.678406,6.516869,0.0,0.0,-175.615906,-13.641869,POINT (-122.28125 47.698307037353516),-1
1,91037,1,21038,CurbRamp,,1.0,True,g0kskJSQVRkZXwH8mmA3gA,1,0,...,34.062500,-35.000000,99.724670,-2.702782,0.0,0.0,-65.662170,-32.297218,POINT (-122.31624603271484 47.55794906616211),-1
2,93842,1,22195,CurbRamp,1.0,1.0,True,npPLy6aGDOcUNgSXA4sq8Q,1,0,...,175.187500,-19.250000,299.396454,2.216354,0.0,0.0,-124.208954,-21.466354,POINT (-122.32888793945312 47.5999755859375),-1
3,190528,1,47238,CurbRamp,1.0,1.0,True,KIOwpToFQQi4QK6hxq2Jjg,1,0,...,31.437500,-16.187500,132.761383,-1.222160,0.0,0.0,-101.323883,-14.965340,POINT (-122.3386001586914 47.6960334777832),-1
4,190672,1,47291,CurbRamp,1.0,1.0,True,00WCIfeoGC33NDp8s0_KGg,1,0,...,335.875000,-15.803572,180.815201,0.315163,0.0,0.0,155.059799,-16.118734,POINT (-122.28343200683594 47.546104431152344),-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27650,84922,0,18558,CurbRamp,,,False,-B8963iDbur4NYIYzdTJXA,0,0,...,324.500000,-17.812500,220.064758,-4.507965,0.0,0.0,104.435242,-13.304535,POINT (-122.33837890625 47.612701416015625),-1
27651,230293,0,75784,CurbRamp,1.0,,True,3KIM9a467DImsFzpQ9xAqQ,0,0,...,169.227676,-15.875000,22.459303,2.169060,0.0,0.0,146.768373,-18.044060,POINT (-122.29075622558594 47.556880950927734),-1
27652,231338,0,76193,CurbRamp,1.0,,True,d3oBrJ-nDtKMwlE7svc6pg,0,0,...,253.562500,-35.000000,177.959915,-0.962440,0.0,0.0,75.602585,-34.037560,POINT (-122.28179168701172 47.563270568847656),-1
27653,231339,0,76193,CurbRamp,1.0,,True,AOIBJ12BjFanDJF1Ark05Q,0,0,...,110.312500,-35.000000,179.365723,-1.093400,0.0,0.0,-69.053223,-33.906600,POINT (-122.28165435791016 47.56346893310547),-1


In [10]:
#export to csv
gt_labels.to_csv('data/gt.csv', index=False)

In [14]:
#number of verfied true labels in gt_labels and verified false labels in gt_labels and percentage of verified true labels in gt_labels

print(gt_labels['verified'].value_counts())
print(gt_labels['verified'].value_counts(normalize=True))

0    14518
1    13137
Name: verified, dtype: int64
0    0.524968
1    0.475032
Name: verified, dtype: float64


In [15]:
# read in gt_3.csv
gt_3 = pd.read_csv('data/gt_3.csv')
gt_3

Unnamed: 0,verified,label_id,audit_task_id,label_type,severity,correct,high_quality_user,gsv_panorama_id,agree_count,disagree_count,...,heading,pitch,photographer_heading,photographer_pitch,label_tags,label_description,heading_diff,pitch_diff,geometry,clustered
0,1,86788,19418,SurfaceProblem,2.0,1.0,False,BpVD1H9f1A6URlrjPUcyZg,2,0,...,183.062500,-7.125000,358.678406,6.516869,0.0,0.0,-175.615906,-13.641869,POINT (-122.28125 47.698307037353516),-1
1,1,91037,21038,CurbRamp,,1.0,True,g0kskJSQVRkZXwH8mmA3gA,1,0,...,34.062500,-35.000000,99.724670,-2.702782,0.0,0.0,-65.662170,-32.297218,POINT (-122.31624603271484 47.55794906616211),1
2,1,93842,22195,CurbRamp,1.0,1.0,True,npPLy6aGDOcUNgSXA4sq8Q,1,0,...,175.187500,-19.250000,299.396454,2.216354,0.0,0.0,-124.208954,-21.466354,POINT (-122.32888793945312 47.5999755859375),1
3,1,190528,47238,CurbRamp,1.0,1.0,True,KIOwpToFQQi4QK6hxq2Jjg,1,0,...,31.437500,-16.187500,132.761383,-1.222160,0.0,0.0,-101.323883,-14.965340,POINT (-122.3386001586914 47.6960334777832),1
4,1,190672,47291,CurbRamp,1.0,1.0,True,00WCIfeoGC33NDp8s0_KGg,1,0,...,335.875000,-15.803572,180.815201,0.315163,0.0,0.0,155.059799,-16.118734,POINT (-122.28343200683594 47.546104431152344),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16885,0,65281,14016,SurfaceProblem,3.0,0.0,True,5TVU8BDY3ZtYnRwYnLkFjw,0,1,...,49.455357,-17.437500,359.584869,0.085213,0.0,0.0,-310.129513,-17.522713,POINT (-122.32270050048828 47.532718658447266),-1
16886,0,79862,16930,SurfaceProblem,5.0,0.0,True,Tgdv_crZ2G73pDZs8Xo81w,0,2,...,295.883942,-15.598214,271.188171,-3.895775,0.0,0.0,24.695770,-11.702439,POINT (-122.33906555175781 47.701438903808594),-1
16887,0,79880,16933,SurfaceProblem,,,True,D5wUl-EWRUD5bl8H1fDd-Q,1,1,...,289.812500,-10.892858,269.897766,-1.949219,1.0,0.0,19.914734,-8.943639,POINT (-122.34080505371094 47.70146179199219),0
16888,0,79882,16933,SurfaceProblem,5.0,,True,D5wUl-EWRUD5bl8H1fDd-Q,1,1,...,331.508942,-29.910715,269.897766,-1.949219,1.0,1.0,61.611176,-27.961496,POINT (-122.34072875976562 47.70145797729492),0


In [16]:
print(gt_3['verified'].value_counts())
print(gt_3['verified'].value_counts(normalize=True))

1    13137
0     3753
Name: verified, dtype: int64
1    0.777798
0    0.222202
Name: verified, dtype: float64


In [12]:
#summarize labels df
labels

Unnamed: 0,audit_task_id,label_id,label_type,severity,correct,high_quality_user,gsv_panorama_id,agree_count,disagree_count,notsure_count,...,heading,pitch,photographer_heading,photographer_pitch,label_tags,label_description,heading_diff,pitch_diff,geometry,clustered
0,18604,85055,CurbRamp,1.0,,False,NjPAkwTxWsayAq3kCugvdA,0,0,0,...,107.750000,-15.62500,270.556610,0.907036,0.0,0.0,-162.806610,-16.532036,POINT (-122.3327865600586 47.61859893798828),-1
1,18608,85057,Obstacle,2.0,1.0,False,f_Mibt-6nJU7HTN7kx5ZGQ,2,0,0,...,38.375000,-21.87500,164.133438,0.335564,0.0,0.0,-125.758438,-22.210564,POINT (-122.3318862915039 47.61780548095703),-1
2,18608,85059,CurbRamp,1.0,,False,f_Mibt-6nJU7HTN7kx5ZGQ,0,0,0,...,131.187500,-24.50000,164.133438,0.335564,0.0,0.0,-32.945938,-24.835564,POINT (-122.33183288574219 47.61751174926758),-1
3,18608,85060,CurbRamp,1.0,,False,f_Mibt-6nJU7HTN7kx5ZGQ,0,0,0,...,131.187500,-24.50000,164.133438,0.335564,0.0,0.0,-32.945938,-24.835564,POINT (-122.33187866210938 47.61751937866211),-1
4,18608,85062,CurbRamp,1.0,,False,f_Mibt-6nJU7HTN7kx5ZGQ,0,0,0,...,257.187500,-35.00000,164.133438,0.335564,0.0,0.0,93.054062,-35.335564,POINT (-122.33204650878906 47.617637634277344),-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196809,76181,231270,CurbRamp,1.0,,True,R-sTl5O6sOdVPurgTWy3EQ,0,0,0,...,332.468750,-35.00000,215.130295,-0.157570,0.0,0.0,117.338455,-34.842430,POINT (-122.26725769042969 47.540771484375),-1
196810,76183,231285,NoSidewalk,5.0,,True,N5MrAAtHZO2JGXUrYAYnSQ,0,0,0,...,0.890625,-11.65625,179.161057,-5.825714,0.0,0.0,-178.270432,-5.830536,POINT (-122.26344299316406 47.55229187011719),-1
196811,76193,231338,CurbRamp,1.0,,True,d3oBrJ-nDtKMwlE7svc6pg,0,0,1,...,253.562500,-35.00000,177.959915,-0.962440,0.0,0.0,75.602585,-34.037560,POINT (-122.28179168701172 47.563270568847656),-1
196812,76193,231339,CurbRamp,1.0,,True,AOIBJ12BjFanDJF1Ark05Q,0,0,0,...,110.312500,-35.00000,179.365723,-1.093400,0.0,0.0,-69.053223,-33.906600,POINT (-122.28165435791016 47.56346893310547),-1
