# Dumping new ground-truth

Apparently, there is a notion of weak agreement that we can use to subsample the annotators and drop the "outliers".


## 1. Weak agreement

- 1st criterion: __3 or more__ annotators match either start or end.

- 2nd criterion: The other endpoint is off at most by 1 "clip" (5 seconds).

In [2]:
import json
import numpy as np
# NOTE(review): VALID_OFFSETS, MIN_TIME and MAX_TIME are not referenced by the
# code visible in this cell; presumably left over from an earlier version.
VALID_OFFSETS = [-1, 1]
MIN_TIME, MAX_TIME = 0, 5
# Minimum number of annotators that must vote for the same start (or end)
# point for the 1st weak-agreement criterion to hold.
NUM_ANNOTATORS_IN_WA = 3

# Path templates; '{}' is filled with the subset name via str.format.
# `filename` is read from, `new_gt` is written to.
filename = '../data/raw/{}_data.json'
new_gt = '../data/raw/{}_data_wwa.json'
# Per-subset bookkeeping, filled by the loop over subsets below.
multimodal_agreement = {}
unanimity = {}
def validate_criterion2(endpoint_c1, endpoint, min_quota=3, max_offset=1):
    """Check the 2nd weak-agreement criterion for one choice of endpoint.

    The most voted value of `endpoint_c1` is picked first. Only the
    annotators that voted for it are kept, and their values for the *other*
    endpoint are compared against the median of that subset.

    Args:
        endpoint_c1: array-like with the endpoint used for the 1st criterion
            (one value per annotator).
        endpoint: array-like with the other endpoint, same shape as
            `endpoint_c1`.
        min_quota: minimum number of annotators that must stay within
            `max_offset` of the median for the criterion to hold.
        max_offset: largest allowed absolute distance to the median of the
            other endpoint. Defaults to 1, the previously hard-coded value.

    Returns:
        `(True, picked_value, kept_values)` when the criterion holds, where
        `picked_value` is the most voted value of `endpoint_c1` and
        `kept_values` is a sorted 1-D array with the other-endpoint values
        that lie within `max_offset` of the median. Otherwise
        `(False, None, None)`.
    """
    # Accept plain lists as well; ndarray inputs pass through untouched.
    endpoint_c1 = np.asarray(endpoint_c1)
    endpoint = np.asarray(endpoint)

    # Pick the most voted end-point. np.unique returns sorted values and
    # np.argmax the first maximum, so ties resolve to the smallest value.
    unique_endpoint_c1, votes_c1 = np.unique(endpoint_c1, return_counts=True)
    picked_endpoint_c1 = unique_endpoint_c1[np.argmax(votes_c1)]
    ind_criterion1 = endpoint_c1 == picked_endpoint_c1

    # Check the offset only for annotators that satisfy the first criterion.
    # Boolean-mask indexing flattens the selection to 1-D.
    endpoint = np.sort(endpoint[ind_criterion1])
    # For an even count this takes the upper of the two middle values.
    median_endpoint = endpoint[len(endpoint) // 2]
    ind_criterion2 = np.abs(endpoint - median_endpoint) <= max_offset
    if ind_criterion2.sum() >= min_quota:
        return True, picked_endpoint_c1, endpoint[ind_criterion2]
    return False, None, None
    
def make_annotations(start_, end_):
    """Pair one fixed endpoint with every distinct value of the other one.

    Exactly one of the arguments is expected to be a numpy array (the
    endpoint that varies); the other is a single value that is repeated.
    Which one varies is decided by checking whether `start_` is an ndarray.

    Returns:
        Array of shape (n, 2) holding `[start, end]` rows, one per distinct
        value of the varying endpoint, using the dtype of the end column.
    """
    start_varies = isinstance(start_, np.ndarray)
    varying = np.unique(start_ if start_varies else end_)
    fixed = np.repeat(end_ if start_varies else start_, len(varying))
    if start_varies:
        starts, ends = varying, fixed
    else:
        starts, ends = fixed, varying

    pairs = np.empty((len(ends), 2), dtype=ends.dtype)
    pairs[:, 0] = starts
    pairs[:, 1] = ends
    return pairs

# For every subset: load the raw annotations, keep only the annotations that
# are in weak agreement, and dump the result as the new ground-truth.
for subset in ['train', 'val', 'test']:
    with open(filename.format(subset), 'r') as f:
        data = json.load(f)

    new_data = []
    unanimity[subset] = {'ids': []}
    multimodal_agreement[subset] = {'ids': [], 'tied': []}
    unanimity_ids = unanimity[subset]['ids']
    multimodal_agreement_ids = multimodal_agreement[subset]['ids']
    multimodal_agreement_tied = multimodal_agreement[subset]['tied']
    print('Subset:', subset)
    for query in data:
        # One row per annotator, split into a start and an end column.
        annotations = np.array(query['times'])
        start, end = np.split(annotations, 2, axis=1)
        unique_start_points, start_votes = np.unique(start, return_counts=True)
        unique_end_points, end_votes = np.unique(end, return_counts=True)
        # 1st criterion: enough annotators agree on the start or on the end.
        start_matched = (start_votes >= NUM_ANNOTATORS_IN_WA).any()
        end_matched = (end_votes >= NUM_ANNOTATORS_IN_WA).any()
        one_endpoint_matched_wac1 = start_matched or end_matched
        assert one_endpoint_matched_wac1, (
            f"1st weak-agreement criterion failed for "
            f"annotation_id={query['annotation_id']}")
        criterion2 = False

        # if all are equal, no purpose of testing offset
        new_annotations, counts = np.unique(annotations, axis=0, return_counts=True)
        if new_annotations.shape[0] == 1:
            unanimity_ids.append(query['annotation_id'])
            criterion2 = True
            start_, end_ = None, None

        # Check multiple modes (statistics only; does not affect the output).
        # The end-point is only inspected when the start-point branch is
        # not taken.
        if start_matched and len(start_votes) > 1:
            num_annot_in_wa = start_votes >= NUM_ANNOTATORS_IN_WA
            possible_tied = np.where((start_votes - start_votes.max()) == 0)[0]
            # Two or more distinct values reached the quota.
            if num_annot_in_wa.sum() >= 2:
                multimodal_agreement_ids.append(query['annotation_id'])
            # Two or more distinct values share the maximum vote count.
            if len(possible_tied) > 1:
                multimodal_agreement_tied.append(query['annotation_id'])
        elif end_matched and len(end_votes) > 1:
            num_annot_in_wa = end_votes >= NUM_ANNOTATORS_IN_WA
            possible_tied = np.where((end_votes - end_votes.max()) == 0)[0]
            if num_annot_in_wa.sum() >= 2:
                multimodal_agreement_ids.append(query['annotation_id'])
            if len(possible_tied) > 1:
                multimodal_agreement_tied.append(query['annotation_id'])

        # 2nd criterion: try the start-point first, then fall back to the
        # end-point if the start-point did not work out.
        if start_matched and not criterion2:
            criterion2, start_, end_ = validate_criterion2(start, end, NUM_ANNOTATORS_IN_WA)
        if end_matched and not criterion2:
            criterion2, end_, start_ = validate_criterion2(end, start, NUM_ANNOTATORS_IN_WA)

        if criterion2 and start_ is not None:
            new_annotations = make_annotations(start_, end_)
        elif not criterion2:
            # A bare `raise` here has no active exception and only yields
            # "RuntimeError: No active exception to reraise"; keep the same
            # exception type but say which annotation is at fault.
            raise RuntimeError(
                f"2nd weak-agreement criterion failed for "
                f"annotation_id={query['annotation_id']}")

        # Copy the query and overwrite its times with the filtered ones.
        new_data.append(dict(query))
        new_data[-1]['times'] = new_annotations.tolist()

    with open(new_gt.format(subset), 'w') as f:
        json.dump(new_data, f)

    print(f'Pctg unanimity {100 * len(unanimity_ids) / len(data):.2f}')
    print('Number of multimodal annotations', len(multimodal_agreement_ids))
    print(f'Pctg of multimodal annotations {100 * len(multimodal_agreement_ids) / len(data):.2f}')
    print(f'Tied multimodal annotations', len(multimodal_agreement_tied))
    print()

Subset: train
Pctg unanimity 34.70
Number of multimodal annotations 12
Pctg of multimodal annotations 0.04
Tied multimodal annotations 3

Subset: val
Pctg unanimity 34.14
Number of multimodal annotations 1
Pctg of multimodal annotations 0.02
Tied multimodal annotations 0

Subset: test
Pctg unanimity 34.59
Number of multimodal annotations 2
Pctg of multimodal annotations 0.05
Tied multimodal annotations 0



## 2. Matching criterion

We opt for using IOU=1.0 and merging multiple annotations, if any, with the $\max := \text{OR}$ operation.

In other words, a prediction is valid iff it matches one of the annotations with IOU=1.0.

Why IOU=1.0?
1. the annotations are coarse, so the matching is already relaxed.
    
2. we are dealing with ambiguity by considering multiple annotations with the weak-agreement enforced during the collection of the annotations.

In [1]:
import sys
sys.path.append('..')
from evaluation import RetrievalEvaluation
import h5py
import numpy as np

# Sanity check of the matching criterion: query the corpus with the features
# of the ground-truth segments themselves and evaluate against the new
# weak-agreement annotations.
file_corpus = '../data/interim/mcn/corpus_test_flow.hdf5'
file_queries = '../data/interim/mcn/queries_test_flow.hdf5'
file_annotations = '../data/raw/test_data_wwa.json'
# NOTE(review): the third argument is presumably the k values for recall@k
# (it matches what `judge.k` prints in the recorded output) - confirm against
# RetrievalEvaluation.
judge = RetrievalEvaluation(file_corpus, file_annotations, (1, 5, 10))

with h5py.File(file_queries, 'r') as fid:
    import time
    start = time.time()
    # Only the keys of the HDF5 file are used (as integer query ids); the
    # stored datasets (`h5ds`) are never read.
    for sample_key, h5ds in fid.items():
        query_id = int(sample_key)
        # Look up the corpus rows of the ground-truth segments of this query
        video_index = judge.gt_queries[query_id]['video_index']
        segment_indices = judge.gt_queries[query_id]['segment_indices']
        corpus_indices = judge.corpus.repo_to_ind(video_index, segment_indices)
        for sampled_index in corpus_indices:
            # Use the corpus feature of a ground-truth segment as the query
            query_vector = judge.corpus.features[sampled_index, :]
            judge.eval_single_vector(query_vector, query_id)
            # Assert a hit at every k and rank 0, then drop the last entry of
            # each metric list so this check leaves no trace in the final
            # scores.
            # NOTE(review): assumes eval_single_vector appends exactly one
            # entry to each of these lists - confirm.
            for i, k in enumerate(judge.k):
                assert judge.hit_k[i][-1]
                judge.hit_k[i].pop()
                judge.hit_k_iou[i].pop()
            assert judge.rank[-1] == 0
            judge.rank.pop()
            judge.miou.pop()
            judge.avg_rank.pop()
            
        # Keep a single evaluation per query, using one of the ground-truth
        # segments picked at random (in-place shuffle, then first element).
        np.random.shuffle(corpus_indices)
        sampled_index = corpus_indices[0]
        query_vector = judge.corpus.features[sampled_index, :]
        judge.eval_single_vector(query_vector, query_id)
    # The result is unpacked into format fields {2}..{6} below.
    performace = judge.eval(full=True)
    print('Test Flow')
    print('R@{0:}={2:};\nR@{0:},{1:}={3:};\nR@{0:},didemo={4:};\n'
          'mIOU={5:.4f};\nmRank={6:.2f};'
          .format(judge.k, judge.iou_threshold,
                  *performace))
    print('Elapsed time:', time.time() - start)


Test Flow
R@(1, 5, 10)=[1.0, 1.0, 1.0];
R@(1, 5, 10),0.75=[1.0, 1.0, 1.0];
R@(1, 5, 10),didemo=[0.6607809002735638, 0.7689629445411589, 0.8067644864461577];
mIOU=0.9055;
mRank=0.00;
Elapsed time: 65.75612425804138


## Exemplifying weak agreement
Finding an example

In [14]:
import json
import numpy as np
import random
# NOTE(review): VALID_OFFSETS, MIN_TIME and MAX_TIME are not referenced by the
# code visible in this cell; presumably left over from an earlier version.
VALID_OFFSETS = [-1, 1]
MIN_TIME, MAX_TIME = 0, 5
# Minimum number of annotators that must vote for the same start (or end)
# point for the 1st weak-agreement criterion to hold.
NUM_ANNOTATORS_IN_WA = 3

# Path template; '{}' is filled with the subset name via str.format.
filename = '../data/raw/{}_data.json'
# Per-subset bookkeeping, filled by the loop over subsets below.
multimodal_agreement = {}
def validate_criterion2(endpoint_c1, endpoint, min_quota=3, max_offset=1):
    """Check the 2nd weak-agreement criterion for one choice of endpoint.

    The most voted value of `endpoint_c1` is picked first. Only the
    annotators that voted for it are kept, and their values for the *other*
    endpoint are compared against the median of that subset.

    Args:
        endpoint_c1: array-like with the endpoint used for the 1st criterion
            (one value per annotator).
        endpoint: array-like with the other endpoint, same shape as
            `endpoint_c1`.
        min_quota: minimum number of annotators that must stay within
            `max_offset` of the median for the criterion to hold.
        max_offset: largest allowed absolute distance to the median of the
            other endpoint. Defaults to 1, the previously hard-coded value.

    Returns:
        `(True, picked_value, kept_values)` when the criterion holds, where
        `picked_value` is the most voted value of `endpoint_c1` and
        `kept_values` is a sorted 1-D array with the other-endpoint values
        that lie within `max_offset` of the median. Otherwise
        `(False, None, None)`.
    """
    # Accept plain lists as well; ndarray inputs pass through untouched.
    endpoint_c1 = np.asarray(endpoint_c1)
    endpoint = np.asarray(endpoint)

    # Pick the most voted end-point. np.unique returns sorted values and
    # np.argmax the first maximum, so ties resolve to the smallest value.
    unique_endpoint_c1, votes_c1 = np.unique(endpoint_c1, return_counts=True)
    picked_endpoint_c1 = unique_endpoint_c1[np.argmax(votes_c1)]
    ind_criterion1 = endpoint_c1 == picked_endpoint_c1

    # Check the offset only for annotators that satisfy the first criterion.
    # Boolean-mask indexing flattens the selection to 1-D.
    endpoint = np.sort(endpoint[ind_criterion1])
    # For an even count this takes the upper of the two middle values.
    median_endpoint = endpoint[len(endpoint) // 2]
    ind_criterion2 = np.abs(endpoint - median_endpoint) <= max_offset
    if ind_criterion2.sum() >= min_quota:
        return True, picked_endpoint_c1, endpoint[ind_criterion2]
    return False, None, None
    
def make_annotations(start_, end_):
    """Build `[start, end]` rows from one fixed and one varying endpoint.

    When `start_` is a numpy array it is treated as the varying endpoint and
    `end_` as the fixed one; otherwise the roles are swapped. Duplicated
    values of the varying endpoint are collapsed, so the result has one row
    per distinct value.

    Returns:
        Array of shape (n, 2) with starts in column 0 and ends in column 1,
        using the dtype of the end column.
    """
    if isinstance(start_, np.ndarray):
        starts = np.unique(start_)
        ends = np.repeat(end_, len(starts))
    else:
        ends = np.unique(end_)
        starts = np.repeat(start_, len(ends))

    num_rows = len(ends)
    result = np.empty((num_rows, 2), dtype=ends.dtype)
    result[:, 0], result[:, 1] = starts, ends
    return result

# Shuffle the queries with a fixed seed and print the first one whose
# weak-agreement filtering yields more than one annotation.
for subset in ['val']:
    with open(filename.format(subset), 'r') as f:
        data = json.load(f)
    # 1406 -> showcase trouble of choosing start-point first
    # 71193 -> recover example of weak-agreement in slides
    random.seed(1406)
    random.shuffle(data)

    trigger = False
    multimodal_agreement[subset] = {'ids': [], 'tied': []}
    multimodal_agreement_ids = multimodal_agreement[subset]['ids']
    multimodal_agreement_tied = multimodal_agreement[subset]['tied']
    print('Subset:', subset)
    # Number of non-unanimous queries whose best start and best end received
    # the same number of votes (counted until the loop breaks).
    start_end_tied = 0
    for query in data:
        # One row per annotator, split into a start and an end column.
        annotations = np.array(query['times'])
        start, end = np.split(annotations, 2, axis=1)
        unique_start_points, start_votes = np.unique(start, return_counts=True)
        unique_end_points, end_votes = np.unique(end, return_counts=True)
        # 1st criterion: enough annotators agree on the start or on the end.
        start_matched = (start_votes >= NUM_ANNOTATORS_IN_WA).any()
        end_matched = (end_votes >= NUM_ANNOTATORS_IN_WA).any()
        one_endpoint_matched_wac1 = start_matched or end_matched
        assert one_endpoint_matched_wac1, (
            f"1st weak-agreement criterion failed for "
            f"annotation_id={query['annotation_id']}")
        criterion2 = False

        # if all are equal, no purpose of testing offset
        new_annotations, counts = np.unique(annotations, axis=0, return_counts=True)
        if new_annotations.shape[0] == 1:
            criterion2 = True
            start_, end_ = None, None

        # Check multiple modes (statistics only; does not affect the output).
        # The end-point is only inspected when the start-point branch is
        # not taken.
        if start_matched and len(start_votes) > 1:
            num_annot_in_wa = start_votes >= NUM_ANNOTATORS_IN_WA
            possible_tied = np.where((start_votes - start_votes.max()) == 0)[0]
            # Two or more distinct values reached the quota.
            if num_annot_in_wa.sum() >= 2:
                multimodal_agreement_ids.append(query['annotation_id'])
            # Two or more distinct values share the maximum vote count.
            if len(possible_tied) > 1:
                multimodal_agreement_tied.append(query['annotation_id'])
        elif end_matched and len(end_votes) > 1:
            num_annot_in_wa = end_votes >= NUM_ANNOTATORS_IN_WA
            possible_tied = np.where((end_votes - end_votes.max()) == 0)[0]
            if num_annot_in_wa.sum() >= 2:
                multimodal_agreement_ids.append(query['annotation_id'])
            if len(possible_tied) > 1:
                multimodal_agreement_tied.append(query['annotation_id'])

        # 2nd criterion: try the start-point first, then fall back to the
        # end-point if the start-point did not work out.
        if (start_votes.max() == end_votes.max()) and not criterion2:
            start_end_tied += 1
        if start_matched and not criterion2:
            criterion2, start_, end_ = validate_criterion2(start, end, NUM_ANNOTATORS_IN_WA)
        if end_matched and not criterion2:
            criterion2, end_, start_ = validate_criterion2(end, start, NUM_ANNOTATORS_IN_WA)

        if criterion2 and start_ is not None:
            new_annotations = make_annotations(start_, end_)
            # More than one surviving annotation: this is the example we want.
            if len(new_annotations) > 1:
                trigger = True
        elif not criterion2:
            # A bare `raise` here has no active exception and only yields
            # "RuntimeError: No active exception to reraise"; keep the same
            # exception type but say which annotation is at fault.
            raise RuntimeError(
                f"2nd weak-agreement criterion failed for "
                f"annotation_id={query['annotation_id']}")

        if trigger:
            print(query['video'])
            print(query['description'])
            print('Original annotations')
            print(annotations)
            print('New annotations')
            print(new_annotations)
            break
#     print('Number of multimodal annotations', len(multimodal_agreement_ids))
#     print(f'Pctg of multimodal annotations {100 * len(multimodal_agreement_ids) / len(data):.2f}')
#     print(f'Tied multimodal annotations', len(multimodal_agreement_tied))
    print(start_end_tied, start_end_tied/len(data))

Subset: val
730 0.17464114832535885
