In [297]:
import pandas as pd
import snorkel
import matplotlib.pyplot as plt
from snorkel.labeling import labeling_function
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
import seaborn as sns

In [298]:
# Read the training set from CSV into a DataFrame.
# The bare `df_train` on the last line makes the frame this cell's displayed output.
df_train = pd.read_csv("data/processed-labels/training_set_seattle_4.csv")
df_train

Unnamed: 0,label_id,label_type,severity,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,user_id,...,geometry,clustered,count,false_positive,distance,tag_list,description,tag_count,way_type,intersection_distance
0,85055,CurbRamp,1.0,NjPAkwTxWsayAq3kCugvdA,1,107.750000,-15.62500,270.556610,0.907036,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,...,POINT (-122.3327865600586 47.61859893798828),0.0,1.0,0,5.435411,0.0,0.0,0.0,residential,59.130699
1,85057,Obstacle,2.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,38.375000,-21.87500,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,...,POINT (-122.3318862915039 47.61780548095703),0.0,7.0,0,12.187807,0.0,1.0,0.0,-1,-1.000000
2,85059,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,...,POINT (-122.33183288574219 47.61751174926758),1.0,6.0,0,1.261442,0.0,0.0,0.0,tertiary,37.121175
3,85060,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.187500,-24.50000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,...,POINT (-122.33187866210938 47.61751937866211),0.0,4.0,0,0.859638,0.0,0.0,0.0,tertiary,33.832501
4,85062,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,257.187500,-35.00000,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,...,POINT (-122.33204650878906 47.617637634277344),0.0,6.0,0,2.443982,0.0,0.0,0.0,tertiary,46.324623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178958,231270,CurbRamp,1.0,R-sTl5O6sOdVPurgTWy3EQ,1,332.468750,-35.00000,215.130295,-0.157570,155af0a9-4149-4436-8aa3-7573e160036d,...,POINT (-122.26725769042969 47.540771484375),1.0,10.0,0,19.281464,0.0,0.0,0.0,residential,33.457578
178959,231285,NoSidewalk,5.0,N5MrAAtHZO2JGXUrYAYnSQ,3,0.890625,-11.65625,179.161057,-5.825714,155af0a9-4149-4436-8aa3-7573e160036d,...,POINT (-122.26344299316406 47.55229187011719),1.0,12.0,0,7.199212,0.0,0.0,0.0,-1,-1.000000
178960,231338,CurbRamp,1.0,d3oBrJ-nDtKMwlE7svc6pg,1,253.562500,-35.00000,177.959915,-0.962440,3ac3d4de-a70a-48ec-b46e-498d6e26ee6c,...,POINT (-122.28179168701172 47.563270568847656),0.0,9.0,1,14.445776,0.0,0.0,0.0,residential,152.820404
178961,231339,CurbRamp,1.0,AOIBJ12BjFanDJF1Ark05Q,1,110.312500,-35.00000,179.365723,-1.093400,3ac3d4de-a70a-48ec-b46e-498d6e26ee6c,...,POINT (-122.28165435791016 47.56346893310547),0.0,9.0,1,20.171446,0.0,0.0,0.0,residential,225.060263


In [302]:
# Read the held-out test set; its `verified` column is used as ground truth.
df_test = pd.read_csv("data/processed-labels/test_set_seattle_4.csv")
Y_test = df_test["verified"].values

In [299]:
# Distinct values of the `label_type` column, in order of first appearance.
label_types = pd.unique(df_train["label_type"]).tolist()

In [300]:
# Display the list of label types found in the training set.
label_types

['CurbRamp',
 'Obstacle',
 'SurfaceProblem',
 'NoSidewalk',
 'NoCurbRamp',
 'Occlusion',
 'Other',
 'Signal',
 'Crosswalk']

## Labeling functions based on heuristics

In [357]:
# Votes a labeling function can return.
WRONG = 0       # the label is judged incorrect
CORRECT = 1     # the label is judged correct
NOT_SURE = -1   # the labeling function abstains (no opinion on this row)

In [358]:
@labeling_function()
def intersection(x):
    """Vote WRONG for labels placed far from an intersection on small streets.

    `x` is a row of the dataframe. The vote is WRONG when `way_type` is
    'residential' or 'living_street' and `intersection_distance` is at
    least 120; otherwise the function abstains (NOT_SURE).

    NOTE(review): the unit of `intersection_distance` is not visible
    here — confirm against the preprocessing step.
    """
    # The way type is checked first, so the distance is only read for matching rows.
    if x["way_type"] in ('residential', 'living_street') and x["intersection_distance"] >= 120:
        return WRONG
    return NOT_SURE

    
# Vote CORRECT when the row's `count` is at least 2
@labeling_function()
def clustered(x):
    """Vote CORRECT when `x['count']` is 2 or more; abstain otherwise.

    `x` is a row of the dataframe.
    NOTE(review): `count` presumably is the size of the cluster the label
    belongs to — confirm against the preprocessing step.
    """
    if x['count'] >= 2:
        return CORRECT
    return NOT_SURE


# Vote CORRECT for labels rated severity 4 or higher
@labeling_function()
def severity(x):
    """Vote CORRECT when `x['severity']` is 4 or more; abstain otherwise.

    `x` is a row of the dataframe.
    """
    if x["severity"] >= 4:
        return CORRECT
    return NOT_SURE


# Vote based on the zoom level recorded for the label
@labeling_function()
def zoom(x):
    """Vote from the row's `zoom` value.

    `x` is a row of the dataframe. Returns CORRECT when zoom is greater
    than 2, WRONG when zoom equals 1, and abstains (NOT_SURE) for any
    other value.
    """
    zoom_level = x["zoom"]
    if zoom_level > 2:
        return CORRECT
    if zoom_level == 1:
        return WRONG
    return NOT_SURE


# Vote CORRECT when the row is flagged as having a tag
@labeling_function()
def tags(x):
    """Vote CORRECT when `x['tag_list']` equals 1; abstain otherwise.

    `x` is a row of the dataframe.
    NOTE(review): `tag_list` presumably is a 0/1 flag for "the user added
    a tag" — confirm against the preprocessing step.
    """
    if x['tag_list'] == 1:
        return CORRECT
    return NOT_SURE


# Vote CORRECT when the row is flagged as having a comment
@labeling_function()
def description(x):
    """Vote CORRECT when `x['description']` equals 1; abstain otherwise.

    `x` is a row of the dataframe.
    NOTE(review): `description` presumably is a 0/1 flag for "the user
    wrote a comment" — confirm against the preprocessing step.
    """
    if x['description'] == 1:
        return CORRECT
    return NOT_SURE

In [359]:
# Labeling functions to apply; their order here is the order passed to the applier.
lfs = [intersection, clustered, severity, zoom, tags, description]

In [360]:
# Applier that runs every labeling function in `lfs` over the rows of a DataFrame.
applier = PandasLFApplier(lfs=lfs)

In [361]:
# Empty results table with the two columns used to report accuracy per label type.
summary = pd.DataFrame(columns=['label_type', 'accuracy'])

In [362]:
# Train and evaluate a separate Snorkel LabelModel for each of the first five
# label types.
#
# Results are collected in a plain list and turned into a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0, and growing a frame row
# by row copies it on every iteration. Building `summary` from scratch also
# makes this cell idempotent, so re-running it no longer duplicates rows.
accuracy_records = []
for label_type in label_types[:5]:  # slicing tolerates fewer than five label types
    # Filter each split once and reuse the subset for both the labeling-function
    # matrix and the ground-truth vector.
    train_subset = df_train[df_train['label_type'] == label_type]
    test_subset = df_test[df_test['label_type'] == label_type]

    L_train = applier.apply(df=train_subset)
    L_test = applier.apply(df=test_subset)
    # NOTE: this rebinds the notebook-level Y_test to the current label type's
    # slice, exactly as the original loop did.
    Y_test = test_subset.verified.values

    # Binary task (WRONG / CORRECT); a fixed seed keeps training reproducible.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    label_model_acc = label_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random"
    )["accuracy"]

    accuracy_records.append(
        {'label_type': label_type, 'accuracy': round(label_model_acc, 4)}
    )

# One row per evaluated label type, with the same columns as before.
summary = pd.DataFrame(accuracy_records, columns=['label_type', 'accuracy'])

100%|██████████| 70690/70690 [00:02<00:00, 33390.82it/s]
100%|██████████| 5333/5333 [00:00<00:00, 40131.01it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=1.141]
INFO:root:[100 epochs]: TRAIN:[loss=0.001]
INFO:root:[200 epochs]: TRAIN:[loss=0.001]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 74%|███████▍  | 371/500 [00:00<00:00, 3701.90epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 3681.78epoch/s]
INFO:root:Finished Training
100%|██████████| 10103/10103 [00:00<00:00, 38563.83it/s]
100%|██████████| 2909/2909 [00:00<00:00, 36736.20it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=3.232]
INFO:root:[100 epochs]: TRAIN:[loss=0.003]
INFO:root:[200 epochs]: TRAIN:[loss=0.002]
INFO:root:[300 epochs]: TRAIN:[loss=0.002]
 70%|███████   | 351/500 [00:00<00:00, 3507.37epoch/s]INFO:root:[4

In [363]:
# Display the label-model accuracy obtained for each evaluated label type.
summary

Unnamed: 0,label_type,accuracy
0,CurbRamp,0.8731
1,Obstacle,0.5359
2,SurfaceProblem,0.5355
3,NoSidewalk,0.8157
4,NoCurbRamp,0.807
