In [190]:
import pandas as pd
import snorkel
import matplotlib.pyplot as plt
from snorkel.labeling import labeling_function
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
import seaborn as sns

In [340]:
# read in the training data
df_train = pd.read_csv("data/processed-labels/training_set_seattle.csv")
df_train.head()

Unnamed: 0,label_id,label_type,severity,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,user_id,lat,lng,geometry,clustered,count,false_positive,distance,tag_list,description
0,85055,CurbRamp,1.0,NjPAkwTxWsayAq3kCugvdA,1,107.75,-15.625,270.55661,0.907036,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618599,-122.332787,POINT (-122.3327865600586 47.61859893798828),0.0,1.0,0,5.435411,0.0,0.0
1,85057,Obstacle,2.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,38.375,-21.875,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617805,-122.331886,POINT (-122.3318862915039 47.61780548095703),0.0,7.0,0,12.187807,0.0,1.0
2,85059,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.1875,-24.5,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617512,-122.331833,POINT (-122.33183288574219 47.61751174926758),1.0,6.0,0,1.261442,0.0,0.0
3,85060,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.1875,-24.5,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617519,-122.331879,POINT (-122.33187866210938 47.61751937866211),0.0,4.0,0,0.859638,0.0,0.0
4,85062,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,257.1875,-35.0,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617638,-122.332047,POINT (-122.33204650878906 47.617637634277344),0.0,6.0,0,2.443982,0.0,0.0


In [None]:
# read in the training data
df_train = pd.read_csv("data/processed-labels/training_set_seattle.csv")
df_train.head()

Unnamed: 0,label_id,label_type,severity,gsv_panorama_id,zoom,heading,pitch,photographer_heading,photographer_pitch,user_id,lat,lng,geometry,clustered,count,false_positive,distance,tag_list,description
0,85055,CurbRamp,1.0,NjPAkwTxWsayAq3kCugvdA,1,107.75,-15.625,270.55661,0.907036,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.618599,-122.332787,POINT (-122.3327865600586 47.61859893798828),0.0,1.0,0,5.435411,0.0,0.0
1,85057,Obstacle,2.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,38.375,-21.875,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617805,-122.331886,POINT (-122.3318862915039 47.61780548095703),0.0,7.0,0,12.187807,0.0,1.0
2,85059,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.1875,-24.5,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617512,-122.331833,POINT (-122.33183288574219 47.61751174926758),1.0,6.0,0,1.261442,0.0,0.0
3,85060,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,131.1875,-24.5,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617519,-122.331879,POINT (-122.33187866210938 47.61751937866211),0.0,4.0,0,0.859638,0.0,0.0
4,85062,CurbRamp,1.0,f_Mibt-6nJU7HTN7kx5ZGQ,1,257.1875,-35.0,164.133438,0.335564,4f2a3a9a-a5d5-4d57-b2d1-6be61a6c2184,47.617638,-122.332047,POINT (-122.33204650878906 47.617637634277344),0.0,6.0,0,2.443982,0.0,0.0


In [341]:
#select only label_type = curb ramp
df_train = df_train[df_train['label_type'] == 'NoCurbRamp']
#df_train

In [342]:
# read in gt.csv file
df_test = pd.read_csv('data/processed-labels/test_set_seattle.csv')
#select only label_type = curb ramp
df_test = df_test[df_test['label_type'] == 'NoCurbRamp']
Y_test = df_test.verified.values

In [343]:
df_train['severity'] = df_train['severity'].fillna(0)
df_test['severity'] = df_train['severity'].fillna(0)

## Labeling functions based on heuristics

In [344]:
# Define the label mappings for convenience
NOT_SURE = -1
WRONG = 0
CORRECT = 1

### intersection

In [345]:
#check if it is in the curb false positive list
@labeling_function()
def intersection(x):
    # x in this case is a row of the dataframe
    return WRONG if x["false_positive"] == 1 else NOT_SURE

### cluster

In [346]:
#check if the label is in the cluster
@labeling_function()
def clustered(x):
    # x in this case is a row of the dataframe
    return CORRECT if x['count'] >= 5 else NOT_SURE

### severity

In [347]:
#check if it is a severity 4 or higher
@labeling_function()
def severity(x):
    # x in this case is a row of the dataframe
    return CORRECT if x["severity"] >= 4 else NOT_SURE

### gsv

In [348]:
#check if the user zoomed in
@labeling_function()
def zoom(x):
    # x in this case is a row of the dataframe
    if x["zoom"] > 2:
        return CORRECT
    elif x["zoom"] == 1:
        return WRONG
    else:
        return NOT_SURE

### optional input

In [349]:
#check if the user has put a tag
@labeling_function()
def tags(x):
    # x in this case is a row of the dataframe
    return CORRECT if x['tag_list']  == 1 else NOT_SURE

In [350]:
#check if the user has put a comment
@labeling_function()
def description(x):
    # x in this case is a row of the dataframe
    return CORRECT if x['description']  == 1 else NOT_SURE

In [351]:
#check for distance to the curb
@labeling_function()
def distance(x):
    # x in this case is a row of the dataframe
    return WRONG if x['distance']  >= 40 else NOT_SURE

In [352]:
lfs = [
    #intersection, 
    clustered,
    severity,
    zoom,
    tags, 
    description,
    distance
    ]

In [353]:
applier = PandasLFApplier(lfs=lfs)

In [354]:
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|██████████| 32968/32968 [00:00<00:00, 39683.14it/s]
100%|██████████| 4239/4239 [00:00<00:00, 31585.31it/s]


In [355]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
clustered,0,[1],0.888104,0.836872,0.550443
severity,1,[1],0.494085,0.491416,0.319279
zoom,2,"[0, 1]",0.741841,0.723914,0.606437
tags,3,[1],0.623514,0.612806,0.367023
description,4,[1],0.005793,0.005733,0.003579
distance,5,[0],0.003064,0.003064,0.002882


In [356]:
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [357]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=2.114]
INFO:root:[100 epochs]: TRAIN:[loss=0.002]
INFO:root:[200 epochs]: TRAIN:[loss=0.001]
INFO:root:[300 epochs]: TRAIN:[loss=0.001]
 77%|███████▋  | 385/500 [00:00<00:00, 3848.45epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.001]
100%|██████████| 500/500 [00:00<00:00, 3860.25epoch/s]
INFO:root:Finished Training


In [358]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   79.6%
Label Model Accuracy:     74.7%
