In [68]:
%autosave 0
import pandas as pd
import numpy as np
import sklearn.preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from math import exp
from sklearn.metrics import precision_score, recall_score
%matplotlib inline
# Seaborn styling for any figures drawn in this notebook.
sns.set(style="white", palette="muted", color_codes=True)
# Let pandas render up to 100 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 100)

Autosave disabled


# Load data

In [13]:
# Image-presentation log; the first CSV column is used as the row index.
# Per the preview, each row carries timestamp, uuid, task, source, timedelta,
# id and label.
images = pd.read_csv("06_RSVP_images.csv", index_col=0)
images.head(2)

Unnamed: 0,timestamp,uuid,task,source,timedelta,id,label
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.0,379,0
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.102,8,1


In [14]:
# Key-press log; the first CSV column is used as the row index.
# Per the preview, each row carries timestamp, uuid, task, source and timedelta.
keys = pd.read_csv("06_RSVP_keys.csv", index_col=0)
keys.head(2)

Unnamed: 0,timestamp,uuid,task,source,timedelta
0,2017-05-02 20:51:16.026,5fb56330-3c5f-4243-924c-e5457789918b,easy,key,0.433
1,2017-05-02 20:51:18.506,5fb56330-3c5f-4243-924c-e5457789918b,easy,key,2.913


In [42]:
# global algorithm parameters

# Number of logged key presses per logged image presentation.
# float() keeps this a true division under Python 2 as well, matching the
# explicit float literal used for p_Ii below.
p_Ck = len(keys) / float(len(images))

# Prior multiplier applied to every per-worker likelihood.
# NOTE(review): presumably 36 target images out of 240 shown -- confirm.
p_Ii = 36.0/240

# Mean and standard deviation of the normal density used to score the delay
# between an image onset and a later key press (same unit as the `timedelta`
# columns, which look like seconds in the previews above).
# NOTE(review): the source of these two values is not recorded here -- confirm.
mu = 0.378
sigma = 0.092

# Develop algorithm

In [46]:
def calc_individual_proba_contribution(C_w, timedelta, prior=None, loc=None, scale=None):
    """Score one worker's key presses as evidence for a single image.

    Every key press that happened strictly after the image onset contributes
    the normal log-density of its delay (press time minus onset time). The
    summed log-densities are exponentiated and multiplied by the prior.

    Parameters
    ----------
    C_w : iterable of float
        Key-press times of one worker, on the same clock as ``timedelta``.
    timedelta : float or single-element pandas Series
        Onset time of the image for that worker.
    prior : float, optional
        Prior multiplier; defaults to the notebook-level ``p_Ii``.
    loc, scale : float, optional
        Mean and standard deviation of the delay density; default to the
        notebook-level ``mu`` and ``sigma``.

    Returns
    -------
    float
        ``prior * exp(sum of log-densities)``. The value is not normalised
        by the probability of the key presses, and because it is built from
        densities it is not bounded by 1.

    Raises
    ------
    TypeError
        If ``timedelta`` holds more than one value (e.g. the same image was
        logged several times for the worker) while ``C_w`` is non-empty.
    """
    # Resolve defaults lazily so the notebook-level constants are only needed
    # when the caller does not supply explicit values.
    prior = p_Ii if prior is None else prior
    loc = mu if loc is None else loc
    scale = sigma if scale is None else scale

    log_likelihood = 0.0
    for press_time in C_w:
        # Collapse the difference to a plain scalar: `timedelta` usually
        # arrives as a one-row Series, and feeding that to logpdf would turn
        # the accumulator into an array.
        gap = np.asarray(press_time - timedelta, dtype=float)
        if gap.size != 1:
            raise TypeError(
                "timedelta must be a scalar or a single-element Series, "
                "got %d values" % gap.size
            )
        delay = gap.item()
        if delay > 0:
            log_likelihood += norm.logpdf(delay, loc=loc, scale=scale)

    # NOTE(review): when no key press follows the onset the likelihood term is
    # exp(0) == 1, so the score equals the prior. Every later key press of the
    # worker is multiplied in, not just the nearest one -- confirm this is the
    # intended model.
    return prior * exp(log_likelihood)

In [53]:
def assign_probabilities(images, keys, task, label, max_workers=5):
    """Compute a key-press-based score for every image of one task and label.

    Parameters
    ----------
    images : pandas.DataFrame
        Presentation log with at least the columns ``task``, ``label``,
        ``id``, ``uuid`` and ``timedelta``.
    keys : pandas.DataFrame
        Key-press log with at least the columns ``task``, ``uuid`` and
        ``timedelta``.
    task : str
        Task name to select in both logs.
    label : int
        Image label to select in ``images``.
    max_workers : int, optional
        Upper bound on the number of workers whose scores are summed per
        image (default 5, the value previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per distinct image id, sorted by id, with the columns
        ``id``, ``label``, ``task``, ``nworkers`` (float) and ``proba``
        (sum of the per-worker scores).
    """
    # Restrict both logs once instead of re-filtering them for every image.
    shown = images.loc[(images["task"] == task) & (images["label"] == label)]
    task_keys = keys.loc[keys["task"] == task]

    image_ids = np.sort(shown["id"].unique())

    worker_counts = []
    probabilities = []
    for image_id in image_ids:
        presentations = shown.loc[shown["id"] == image_id]
        # Workers are taken in order of first appearance in the log.
        user_ids = presentations["uuid"].unique()[:max_workers]

        p_Ii_C = 0.0
        for user_id in user_ids:
            # All key presses of this worker within the task ...
            C_w = task_keys.loc[task_keys["uuid"] == user_id, "timedelta"]
            # ... and the onset of this image for the same worker.
            timedelta = presentations.loc[presentations["uuid"] == user_id, "timedelta"]
            p_Ii_C += calc_individual_proba_contribution(C_w, timedelta)

        worker_counts.append(len(user_ids))
        probabilities.append(p_Ii_C)

    # Columns are added one at a time to keep the original column order.
    result = pd.DataFrame()
    result["id"] = image_ids
    result["label"] = label
    result["task"] = task
    # Kept as float columns, as they were when initialised with NaN.
    result["nworkers"] = np.asarray(worker_counts, dtype=float)
    result["proba"] = np.asarray(probabilities, dtype=float)
    return result

In [77]:
def make_predictions(result, threshold):
    """Return a copy of ``result`` with a binary ``prediction`` column.

    Parameters
    ----------
    result : pandas.DataFrame
        Frame with a ``proba`` column.
    threshold : float
        Rows whose ``proba`` is greater than or equal to this value are
        predicted as 1, all others as 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``result`` (the input is left untouched) with a float
        ``prediction`` column holding 1.0 or 0.0.
    """
    predicted = result.copy()
    # One vectorised comparison replaces the two masked `.ix` assignments
    # (`.ix` no longer exists in pandas >= 1.0). NaN probabilities compare
    # False and are therefore labelled 0, exactly as before.
    predicted["prediction"] = (predicted["proba"] >= threshold).astype(float)
    return predicted

In [79]:
def compute_precision_recall(result, threshold=0.5):
    """Threshold the scores in ``result`` and evaluate them against ``label``.

    Predictions come from ``make_predictions(result, threshold)``; the
    ``label`` column is the ground truth and ``prediction`` the estimate.

    Returns
    -------
    tuple
        ``(precision, recall)`` as computed by scikit-learn's
        ``precision_score`` and ``recall_score``.
    """
    scored = make_predictions(result, threshold)
    truth, predicted = scored["label"], scored["prediction"]
    return precision_score(truth, predicted), recall_score(truth, predicted)

In [70]:
def assign_probabilities_both(images, keys, task):
    """Score the images of ``task`` for both labels and stack the results.

    Label 1 rows come first, followed by label 0 rows; the combined frame
    gets a fresh 0..n-1 index.
    """
    per_label = [
        assign_probabilities(images, keys, task, label)
        for label in (1, 0)
    ]
    stacked = pd.concat(per_label, axis=0)
    return stacked.reset_index(drop=True)

# Compute results

In [87]:
# Summary table: one row per task, metrics start as NaN and are filled in
# once the precision/recall loop has run.
all_results = pd.DataFrame(
    {"task": ["easy", "medium", "hard", "all_concepts"]}
).assign(precision=np.nan, recall=np.nan)

In [83]:
# Per-task score tables (both labels), keyed by task name.
results = {
    task_name: assign_probabilities_both(images, keys, task_name)
    for task_name in ("easy", "medium", "hard")
}

In [84]:
# Pool the three tasks into one table. ignore_index=True gives the pooled
# frame a fresh 0..n-1 index instead of three repeated index ranges, in line
# with the per-task frames, which already have their index reset.
results["all_concepts"] = pd.concat(
    [results["easy"], results["medium"], results["hard"]],
    axis=0,
    ignore_index=True,
)

In [116]:
# Decision threshold applied to the summed per-worker scores.
# NOTE(review): 0.53 is not derived anywhere in this notebook -- confirm how
# it was chosen.
threshold = 0.53
for task in ["easy", "medium", "hard", "all_concepts"]:
    precision, recall = compute_precision_recall(results[task], threshold=threshold)
    # `.loc` replaces `.ix`, which no longer exists in pandas >= 1.0; with a
    # boolean row mask and column labels the two behave identically.
    all_results.loc[all_results["task"] == task, ["precision", "recall"]] = [precision, recall]

In [117]:
# Per-task precision and recall at the threshold set in the cell above.
all_results

Unnamed: 0,task,precision,recall
0,easy,0.2,0.055556
1,medium,0.181818,0.055556
2,hard,0.125,0.027778
3,all_concepts,0.172414,0.046296


In [99]:
# NOTE(review): this cell's execution count (99) is lower than that of the
# threshold cell (116), and the table below differs from the one shown just
# above -- it looks like stale output from an earlier run. Re-run or remove.
all_results

Unnamed: 0,task,precision,recall
0,easy,0.181818,0.055556
1,medium,0.181818,0.055556
2,hard,0.111111,0.027778
3,all_concepts,0.16129,0.046296
