In [7]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
import math

In [8]:
# Load the two validated-label CSVs from data/source:
# df1 <- labels validated as correct, df2 <- labels validated as incorrect.
df1, df2 = (
    pd.read_csv('data/source/jon-mikey-validated-labels-correct.csv'),
    pd.read_csv('data/source/jon-mikey-validated-labels-incorrect.csv'),
)

In [9]:
# Restrict both validated sets to rows whose `city` column equals 'chicago'.
df1 = df1.loc[df1['city'].eq('chicago')]
df2 = df2.loc[df2['city'].eq('chicago')]

In [10]:
# Drop every column except the label identifier; the double brackets/list
# selection keeps the result a one-column DataFrame rather than a Series.
df1 = df1.loc[:, ['label_id']]
df2 = df2.loc[:, ['label_id']]

In [11]:
# Tag each validated set with its binary ground-truth class:
#   1 -> rows loaded from the "correct" CSV, 0 -> rows loaded from the "incorrect" CSV.
# `.assign` returns a new frame instead of writing a column into a frame that
# was produced by slicing, so it cannot trigger pandas' SettingWithCopyWarning
# (which this notebook silences globally) and the cell stays safe to re-run.
df1 = df1.assign(ground_truth=1)
df2 = df2.assign(ground_truth=0)

In [12]:
# Stack the correct and incorrect sets row-wise into a single table.
# The original row indices of both inputs are kept (no re-indexing).
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=False)

In [13]:
# Order the combined table by ascending label_id.
df = df.sort_values(by='label_id', ascending=True)

In [14]:
# Display the combined ground-truth table (label_id, ground_truth) for a visual check.
df

Unnamed: 0,label_id,ground_truth
20851,8,1
20852,9,1
20853,11,1
20854,14,1
20855,18,1
...,...,...
25957,22713,1
25958,22714,1
25959,22739,1
25960,22740,1


In [17]:
# Load the Chicago label points (attributes + point geometry) from the
# processed shapefile into a GeoDataFrame.
labels = gpd.read_file(
    'data/processed-labels/labels_all_chicago/labels_all_chicago.shp'
)

In [18]:
# Display the label records read from the shapefile for a visual check.
labels

Unnamed: 0,audit_task,label_id,gsv_panora,label_type,severity,correct,high_quali,gsv_pano_1,zoom,heading,pitch,photograph,photogra_1,user_id,lat,lng,geometry
0,3,8,pMph48Z6Xz2hexFJ7q3d_Q,CurbRamp,2.0,,1,pMph48Z6Xz2hexFJ7q3d_Q,2,136.687500,-18.625000,268.225098,-0.562027,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,42.083668,-87.980347,POINT (-87.98035 42.08367)
1,3,9,pMph48Z6Xz2hexFJ7q3d_Q,CurbRamp,1.0,1,1,pMph48Z6Xz2hexFJ7q3d_Q,2,249.937500,-14.125000,268.225098,-0.562027,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,42.083694,-87.980606,POINT (-87.98061 42.08369)
2,3,10,pMph48Z6Xz2hexFJ7q3d_Q,CurbRamp,2.0,0,1,pMph48Z6Xz2hexFJ7q3d_Q,3,288.656250,-13.187500,268.225098,-0.562027,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,42.083786,-87.980606,POINT (-87.98061 42.08379)
3,3,11,pMph48Z6Xz2hexFJ7q3d_Q,CurbRamp,3.0,1,1,pMph48Z6Xz2hexFJ7q3d_Q,3,49.245537,-16.220982,268.225098,-0.562027,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,42.083794,-87.980354,POINT (-87.98035 42.08379)
4,3,12,pMph48Z6Xz2hexFJ7q3d_Q,CurbRamp,3.0,1,1,pMph48Z6Xz2hexFJ7q3d_Q,3,42.636162,-16.033482,268.225098,-0.562027,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,42.083817,-87.980362,POINT (-87.98036 42.08382)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18280,4588,21507,hhAF_qDTHgN0dL8SNsr-oA,SurfaceProblem,3.0,1,1,hhAF_qDTHgN0dL8SNsr-oA,1,126.341515,-32.937500,179.073898,0.395355,71959a82-3f47-4dec-a02b-86669bf8095b,41.611179,-87.664291,POINT (-87.66429 41.61118)
18281,4588,21508,hhAF_qDTHgN0dL8SNsr-oA,SurfaceProblem,2.0,,1,hhAF_qDTHgN0dL8SNsr-oA,1,213.154022,-35.000000,179.073898,0.395355,71959a82-3f47-4dec-a02b-86669bf8095b,41.611137,-87.664551,POINT (-87.66455 41.61114)
18282,4589,21509,tZWhmOw4CIP9bQLJycSyqQ,CurbRamp,1.0,,1,tZWhmOw4CIP9bQLJycSyqQ,3,217.380630,-16.302500,179.386520,-0.660645,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,41.609871,-87.646339,POINT (-87.64634 41.60987)
18283,4601,21517,nWTcoxo4S0MImzzvlQGwpQ,CurbRamp,1.0,1,1,nWTcoxo4S0MImzzvlQGwpQ,3,248.244995,-13.510625,359.491150,-0.079690,3e25ea1d-f762-421a-b98b-3cba36e1bbfb,41.625626,-87.651291,POINT (-87.65129 41.62563)


In [19]:
# Sanity check: evaluates to True only when every label_id in the ground-truth
# table also occurs in the labels layer.
(
    df['label_id']
    .isin(labels['label_id'])
    .all()
)

False

In [20]:
# Keep only the ground-truth rows whose label_id exists in the labels layer;
# ids that are absent from `labels` are dropped.
df = df.loc[
    df['label_id'].isin(labels['label_id'])
]

In [21]:
df

Unnamed: 0,label_id,ground_truth
20851,8,1
20852,9,1
20853,11,1
20854,14,1
20855,18,1
...,...,...
25635,21502,1
25636,21505,1
25637,21506,1
25638,21507,1


In [22]:
# Class balance: number of rows per ground_truth value, most frequent first.
df['ground_truth'].value_counts(sort=True, ascending=False)

1    4789
0    1467
Name: ground_truth, dtype: int64

In [23]:
# Persist the filtered ground-truth table as CSV; the DataFrame index is not written.
df.to_csv(
    path_or_buf='data/processed-labels/chicago_ground_truth_labels.csv',
    index=False,
)

In [24]:
# Join label attributes and geometry onto the ground-truth table via label_id.
# A right join keeps every row of `df` (the right-hand table).
new = labels.merge(right=df, how='right', on='label_id')

In [25]:
# Write the merged layer to disk as an ESRI Shapefile.
# NOTE(review): shapefile attribute names are limited to 10 characters, so the
# `ground_truth` column will presumably be stored under a truncated name
# (compare the already-truncated names such as `gsv_panora` in `labels`) -
# confirm the field name when reading this file back.
new.to_file(
    filename='data/processed-labels/chicago_ground_truth_labels.shp',
    driver='ESRI Shapefile',
)

In [26]:
# Interactive map of the merged labels, coloured by ground_truth, with the
# label type and id shown in the hover tooltip.
new.explore(
    column="ground_truth",
    tooltip=["label_type", "label_id"],
    tiles="CartoDB positron",
    cmap="Set1",
    legend=True,
)