In [4]:
%pylab inline
from qualityBaseline import *
from analysis_toolbox import *

Populating the interactive namespace from numpy and matplotlib


#### Variable definitions

|Math notation  | Variable name | Meaning|
|:-------------: |:-------------:| -----|
|$\mathcal{I}$ | I|set of all images seen |
|$\mathcal{U}$ | U|set of all images |
|$\mathcal{E}$ | E|expert list|
|$\mathcal{B}$ | B|'bot' (bad workers) list|
|$\mathcal{A}_i$ | Ai|set of all workers that have provided labels for image $i$|
|$\mathcal{A'}$ | A_all|set of all workers|
|$p(l_{ij} \mid z_i)$ | plij|posterior on worker annotation|
|$\hat{p}(z_i)$ | pzi,phat(Lij,a)|posterior on target value (i.e. what we think BB is objectively)|
|$p(a_j)$ | pa| prob of honest worker|
|$\mathcal{L}_{ij}$|Lij |set of all labels ij provided so far |
|$\tau$|tau|confidence threshold s.t. the zi estimate is "good enough" that we can stop requesting labels|
|$m$|m|max cutoff for the number of labels we can request (for now, let's set this to all the labels available in $\mathcal{U}$)|
|$\theta_v$|var_thres|threshold on variance (how much evidence we need to determine whether someone is an expert or a bot)|

In [5]:
# Unpack the four tables returned by load_info(), then read the
# interpolated/aligned bounding-box CSV using its first column as the index.
(img_info, object_tbl, bb_info, hit_info) = load_info()
ai_tbl = pd.read_csv(filepath_or_buffer="interpolated_aligned_bb_info.csv", index_col=0)

In [6]:
# U: every image id listed in img_info, as a plain Python list.
U = [image_id for image_id in img_info["id"]]

In [7]:
def set_union(a, b):
    """Merge two lists into a single list holding each distinct element once.

    The order of the returned list is whatever the underlying set yields,
    so callers must not rely on it.
    """
    left_items = set(a)
    right_items = set(b)
    return list(left_items.union(right_items))

In [85]:
# --- Online label-collection loop (work in progress) ------------------------
# Skeleton of the procedure the variable table describes: reveal images one at
# a time, keep requesting labels for each seen image until the estimate of the
# target is confident enough (tau) or the per-image label budget (m) is spent,
# then re-estimate worker quality and rebuild the expert / bot lists.
A_all = list(set(bb_info.worker_id))
U = img_info.id.values
tau = 0.8
m = len(U)

I = []
# L[i] collects the labels gathered for image id i. The extra slot lets ids be
# used directly as list indices (assumes ids run 1..len(U) -- TODO confirm).
L = [[] for _ in range(len(U) + 1)]
E = []
B = []
A_seen = []  # every worker that has supplied at least one label, across all images

while len(I) < len(U):
    # Add an image in U but not in I (picked at random).
    I.append(random.choice([u for u in U if u not in I]))
    # Loop through all the images seen so far.
    for i in I:
        Li = L[i]
        Ai = []
        # Placeholder posterior until phat()/delta() are wired in. Because
        # max([10, 10]) >= tau, the inner loop below does not execute yet.
        phatzi = [10, 10]  # delta(x,zi)
        # The budget m limits the labels for THIS image, so test len(Li);
        # len(L) is the number of per-image slots and never changes.
        while max(phatzi) < tau and len(Li) < m:
            # Obtain a label from some annotator: prefer the expert list and
            # otherwise fall back to any worker not flagged as a bot. The
            # explicit emptiness check avoids depending on which exception
            # random.choice raises for an empty sequence.
            candidates = E if E else [worker for worker in A_all if worker not in B]
            j = random.choice(candidates)
            # NOTE(review): i is an image id but is matched against object_id --
            # confirm these share the same id space.
            lij = ai_tbl[(ai_tbl.object_id == i) & (ai_tbl.worker_id == j)]
            # Li <- Li U {lij}: store the label itself (Li aliases L[i]).
            Li.append(lij)
            # Ai <- Ai U {j}: wrap the scalar id so set_union receives a list.
            Ai = set_union(Ai, [j])
            A_seen = set_union(A_seen, [j])
            # Recompute phatzi from the updated Li and a (placeholder for now).
            phatzi = [10, 10]  # delta(x,zi)
    # Expert and bot lists are rebuilt from scratch after every pass.
    E = []
    B = []
    # Iterate over every worker seen so far; Ai alone would only cover the
    # last image processed by the loop above.
    for j in A_seen:
        # TODO: estimate a_j from phatzi by maximising Q over a_j
        # (originally sketched as `a = np.argmax(Q)`), then add j to E or B
        # once the variance of the estimate falls below var_thres.
        # Q is not defined yet, so this step is an explicit no-op.
        pass

In [9]:
def area(image_id,wh=False):
    '''
    Area of the image with the given id (lambda_i).

    The image's filename is looked up in img_info, its size is read from the
    matching PNG under ../web-app/app/static/, and width*height is returned
    as a float. With wh=True the result is the tuple (area, width, height).
    '''
    matching_rows = img_info[img_info["id"]==image_id]
    img_name = matching_rows.filename.values[0]
    width,height = get_size("../web-app/app/static/"+img_name+".png")
    img_area = float(width*height)
    return (img_area,width,height) if wh else img_area

In [10]:
# Quick check: evaluate area() for image id 1.
area(image_id=1)

307200.0

In [11]:
def phat(i,Li,a):
    '''
    Compute the posterior on the target value
    given worker annotations (Lij) and best estimates of the worker accuracy (a).

    Parameters
    i  : image id (passed to area() to obtain the image area)
    Li : *-by-|A| vector containing all worker annotations for image i
         (NOTE(review): not referenced anywhere in the body yet)
    a  : 1-by-|A| vector containing all worker accuracies, indexed by worker id
    |A| : the total number of workers (workers who have not provided a response
          are denoted by -1, so we can index with id)

    Names read from the enclosing scope (they are not parameters)
    Ai : list of worker ids that have provided responses
    zi : value passed as the first argument of scipy.stats.multivariate_normal

    Returns the accumulated product over the workers in Ai divided by the
    image area.
    '''
    img_area = area(i)
    prod_plij =1
    # Loop through all the workers that have provided responses.
    # NOTE(review): Ai is not a parameter, so it is resolved from the global
    # namespace at call time -- consider passing it in explicitly.
    for j in Ai: 
        # Per-worker factor: a[j] * <normal term> + (1 - a[j]) / img_area**2.
        # NOTE(review): scipy.stats.multivariate_normal(zi,a) builds a frozen
        # distribution object rather than a density value, and it is handed
        # the whole accuracy vector `a` as its second (covariance) argument.
        # This presumably should evaluate a pdf at worker j's annotation --
        # confirm the intended formula before relying on this function.
        prod_plij *= a[j]*scipy.stats.multivariate_normal(zi,a)+(1-a[j])/(img_area**2)
    return prod_plij/img_area

In [12]:
# Take the row labelled 0 as a worked example and pass its aix_locs / aiy_locs
# fields through process_raw_locs.
# `.ix` was removed in pandas 1.0; `.loc` is its label-based replacement
# (assumes the CSV index holds integer labels -- TODO confirm; use `.iloc[0]`
# instead if the first row by position is what is wanted).
bb = ai_tbl.loc[0]
bbx, bby = process_raw_locs([bb["aix_locs"], bb["aiy_locs"]])

In [13]:
# lij is the annotation as a single flat vector (1x100 per the original note):
# the (x, y) pairs produced by zip are run through flatten and collected.
lij = np.array([coord for coord in flatten(zip(bbx, bby))])

In [14]:
# zi: an all-zero vector with the same shape and dtype as lij.
zi = np.zeros(lij.shape, dtype=lij.dtype)

In [15]:
# Area, width and height of image id 1 (note: rebinds the name `a`).
a, w, h = area(image_id=1, wh=True)

In [17]:
# 4d mgrid will crash, we need 100-d mgrid 
# np.mgrid[0:w,0:h,0:w,0:h]
# we could do interpolation on smaller number of points, but that would not be enough 

In [21]:
# scipy.stats.multivariate_normal.pdf(x,zi,cov=4)

In [20]:
def delta(x,zi):
    '''
    Approximate the posterior on zi by a delta function.

    x, zi : scalars or array-likes to compare.
    Returns np.inf when x and zi have the same shape and every element is
    equal, otherwise 0.
    '''
    # np.array_equal reduces the comparison to a single bool. A bare
    # `x == zi` is element-wise for numpy arrays and raises
    # "truth value of an array is ambiguous" when used in an `if`.
    if np.array_equal(x, zi):
        return np.inf
    return 0