## Post Annotation Processing
* This notebook illustrates all the data processing steps involved in handling annotated data from the full task, including:
* * Converting raw AMT output to more readable JSON data objects;
* * Calculating inter-annotator agreement;
* * Merging multiple annotations into one following a set of merging rules;
* * Basic statistical analysis of the processed data.

### Imports

In [None]:
import pandas as pd
import os
import pprint
import ujson
import random
import numpy as np
from collections import Counter


random.seed(5)

from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

### Functions

In [None]:
def convert_sample(df_sample):

    """
    convert one annotated dataframe row into a nested JSON-style dict

    df_sample: dict-like row keyed by the raw AMT column names
    returns: {'sample': <all 'Input.' columns + 'HIT_id'>,
              'worker': <'worker_id', 'work_time_in_seconds', 'annotation'>}
    """

    # everything AMT was given as input is copied over untouched
    sample_info = {column: df_sample[column] for column in df_sample.keys() if 'Input.' in column}
    sample_info['HIT_id'] = df_sample['HITId']

    worker_info = {
        'worker_id': df_sample['WorkerId'],
        'work_time_in_seconds': df_sample['WorkTimeInSeconds'],
    }

    # the annotation is the base attributes plus the list of victims
    annotation = dict(get_base_info(df_sample))
    annotation['victims'] = get_victims(df_sample)
    worker_info['annotation'] = annotation

    return {'sample': sample_info, 'worker': worker_info}
    
    
def get_base_info(df_sample):

    """
    Parse base attributes such as perpetrators, violation types, locations, and time

    df_sample: dict-like row keyed by the raw AMT column names
    returns: dict with the keys 'perpetrator_mention', 'perpetrator_type',
             'violation_types', 'year', 'month', 'date', 'city', 'region', 'country'

    Notes:
    * 'perpetrator_type' is None when no perpetrator is mentioned, and also when a
      mention is flagged but none of the type columns is checked.
    * 'year' / 'month' / 'date' are None when the raw value is falsy.
    * 'city' / 'region' / 'country' always go through str(), so a missing value
      comes out as the string 'None'.
    """

    # (substring of the column name, readable label) -- checked in order, first hit wins
    perp_type_labels = (
        ('insufficient_info_', 'insufficient info'),
        ('state_actor_', 'state actor'),
        ('other_actor_w_', 'other actors with permissions'),
        ('other_actor_wo_', 'other actors without permissions'),
        ('other_actor_na_', 'other actors with unknown permissions'),
    )
    violation_type_labels = (
        ('_arbitrary_detention_', 'arbitrary detention'),
        ('_enforced_disappearance_', 'enforced disappearance'),
        ('_kidnapping_', 'kidnapping'),
        ('_killing_', 'killing'),
        ('_torture_', 'torture'),
        ('_other_', 'other'),
    )

    def label_for(column, labels):
        # columns that match no known marker are flagged rather than dropped
        return next((label for marker, label in labels if marker in column), 'error type')

    # get perpetrator mention
    perp_mention_key = 'Answer.perpetrator_mention_true_1.perpetrator_mention_true_1'
    perp_mentioned = df_sample[perp_mention_key]

    # get perpetrator type
    # Bound up front: without this default a flagged mention with no checked type
    # column left the name undefined and the return statement raised UnboundLocalError.
    perp_type = None
    if perp_mentioned:
        perp_type_keys = [i for i in df_sample.keys() if '_actor_' in i] + ['Answer.insufficient_info_1_0.insufficient_info_1_0']
        for perp_type_key in perp_type_keys:
            if df_sample[perp_type_key]:
                perp_type = label_for(perp_type_key, perp_type_labels)
                break

    # get violation types (multiple choice: every checked column contributes one label)
    violation_types = [
        label_for(violation_type_key, violation_type_labels)
        for violation_type_key in df_sample.keys()
        if 'Answer.abuse_type_' in violation_type_key and df_sample[violation_type_key]
    ]

    # get time info (the year goes through int() so that e.g. 2019.0 becomes '2019')
    year_input = str(int(df_sample['Answer.year_incident_1'])) if df_sample['Answer.year_incident_1'] else None
    month_input = str(df_sample['Answer.month_incident_1']) if df_sample['Answer.month_incident_1'] else None
    date_input = str(df_sample['Answer.date_incident_1']) if df_sample['Answer.date_incident_1'] else None

    # get location info
    city_input = str(df_sample['Answer.location_city_1'])
    region_input = str(df_sample['Answer.location_region_1'])
    country_input = str(df_sample['Answer.location_country_1'])

    return {
        'perpetrator_mention': perp_mentioned,
        'perpetrator_type': perp_type,
        'violation_types': violation_types,
        'year': year_input,
        'month': month_input,
        'date': date_input,
        'city': city_input,
        'region': region_input,
        'country': country_input,
    }

# (substring of the column name, readable label) -- checked in order, first hit wins
_VICTIM_TYPE_LABELS = (
    ('journalist', 'journalist'),
    ('trade_unionist', 'trade unionist'),
    ('_hrd_', 'human rights defender'),
    ('_na_', 'insufficient information'),
)
_VICTIM_SEX_LABELS = (
    ('_man_', 'man'),
    ('_woman_', 'woman'),
    ('_other_', 'other'),
    # the misspelt marker is the one the original code looked for; the correct
    # spelling is accepted as well so that either column name resolves to 'unknown'
    ('_unkown_', 'unknown'),
    ('_unknown_', 'unknown'),
)
_VICTIM_AGE_LABELS = (
    ('_adult_', 'adult'),
    ('_child_', 'child'),
    ('_other_', 'other'),
    ('_unknown_', 'unknown'),
)


def _trailing_index(column):
    """return the run of digits that ends the column name ('' if there is none)"""
    return column[len(column.rstrip('0123456789')):]


def _victim_label(column, labels):
    """map a column name to its label; unrecognised columns become 'error type'"""
    return next((label for marker, label in labels if marker in column), 'error type')


def _first_checked_label(df_sample, columns, idx, labels):
    """label of the first checked column that belongs to victim idx, or None if none is checked"""
    for column in columns:
        if _trailing_index(column) == idx and df_sample[column]:
            return _victim_label(column, labels)
    return None


def get_victims(df_sample):

    """
    Parse victim annotations

    df_sample: dict-like row keyed by the raw AMT column names
    returns: list with one dict per annotated victim, holding 'victim_idx',
             'victim_population_type', 'victim_name', 'victim_keywords',
             'victim_type' (list), 'victim_sex_type' and 'victim_age_group'

    Notes:
    * 'victim_sex_type' / 'victim_age_group' are None when nothing is checked for
      that victim. Previously the first victim raised UnboundLocalError in that
      case and later victims silently inherited the previous victim's value.
    * Columns are matched to a victim by the full number that ends the column name,
      so indices of 10 and above work too.
    * The number of victims is the count of non-None 'multiple' columns, and victims
      1..count are read -- this assumes filled-in victims are numbered contiguously.
    """

    multi_key = 'Answer.victim_pop_type_multiple_1_{0}.victim_pop_type_multiple_1_{0}'

    # get the maximal number of victims
    max_num_vic = len([i for i in df_sample.keys() if 'Answer.victim_pop_type_multiple_1' in i])
    # get the actual number of victims
    num_vic = len([i for i in range(1, max_num_vic+1) if df_sample[multi_key.format(i)] is not None])

    # the candidate columns are the same for every victim, so collect them once
    vic_type_keys = [i for i in df_sample.keys() if 'Answer.victim_type_' in i]
    vic_sex_keys = [i for i in df_sample.keys() if 'Answer.victim_sex_type_' in i]
    vic_age_keys = [i for i in df_sample.keys() if 'Answer.victim_age_group_' in i]

    vics = []
    for vic_idx in range(1, num_vic+1):
        idx = str(vic_idx)
        # get population type
        pop_type = "multiple" if df_sample[multi_key.format(vic_idx)] else "individual"
        # get name (only individuals have one)
        vic_name = df_sample['Answer.victim_name_1_'+idx] if pop_type == "individual" else None
        # get keywords
        vic_keywords = df_sample['Answer.victim_keywords_1_'+idx]
        # get victim types (multiple choice)
        vic_types = [
            _victim_label(vic_type_key, _VICTIM_TYPE_LABELS)
            for vic_type_key in vic_type_keys
            if _trailing_index(vic_type_key) == idx and df_sample[vic_type_key]
        ]
        vics.append({
            'victim_idx': vic_idx,
            'victim_population_type': pop_type,
            'victim_name': vic_name,
            'victim_keywords': vic_keywords,
            'victim_type': vic_types,
            # single choice: first checked column wins
            'victim_sex_type': _first_checked_label(df_sample, vic_sex_keys, idx, _VICTIM_SEX_LABELS),
            'victim_age_group': _first_checked_label(df_sample, vic_age_keys, idx, _VICTIM_AGE_LABELS),
        })
    return vics


def merge_annotations(annotated_samples):
    """
    merge replicated annotations (one per worker, normally 3) into one report per
    sample that carries the annotations from all workers

    annotated_samples: list of records as produced by convert_sample
    returns: list of {'sample': <shared sample info>, 'annotations': [<worker dict>, ...]},
             one entry per HIT id, in the order the HIT ids first appear in the input
             (so the result is reproducible from run to run)
    raises: ValueError if records sharing a HIT id do not carry identical sample info
    """
    # single pass grouping; a plain dict keeps the first-seen order of the HIT ids
    HITs_by_id = {}
    for annotated_sample in annotated_samples:
        HITs_by_id.setdefault(annotated_sample['sample']['HIT_id'], []).append(annotated_sample)

    merged_samples = []
    for HIT_id, HITs in HITs_by_id.items():
        sample_info = HITs[0]['sample']
        # every replica of a HIT must describe the very same input sample
        if any(HIT['sample'] != sample_info for HIT in HITs[1:]):
            raise ValueError("annotations of HIT {} disagree on the sample info".format(HIT_id))
        merged_samples.append({
            'sample': sample_info,
            'annotations': [HIT['worker'] for HIT in HITs],
        })
    return merged_samples


def norm_label(label):
    """
    turn a label into a string; any falsy label (None, False, 0, '', empty list)
    becomes the string "None"
    """
    if label:
        return str(label)
    return "None"


def norm_score(score):
    """
    replace a nan score by 0; any other score is returned unchanged
    """
    if np.isnan(score):
        return 0
    return score


def flat_nested_dict(input_dict):
    """
    flatten one level of nesting: a value that is exactly a dict is replaced by
    one '<key>_<child key>' entry per child; all other values are copied as-is
    """
    flattened = {}
    for outer_key, value in input_dict.items():
        if type(value) == dict:
            flattened.update({
                outer_key + '_' + inner_key: inner_value
                for inner_key, inner_value in value.items()
            })
            continue
        flattened[outer_key] = value
    return flattened


def convert_annotation_df(df_path):
    """
    load a raw AMT batch-result CSV and convert every row with convert_sample

    df_path: location of the CSV file (anything pd.read_csv accepts)
    returns: list with one converted record per CSV row
    """
    df = pd.read_csv(df_path)
    # Cast to object before masking: on numeric columns some pandas versions coerce
    # the None fill value straight back to NaN. NaN is truthy, so it would pass the
    # `if value:` checks downstream and break e.g. int() on the year column.
    df1 = df.astype(object).where(pd.notnull(df), None)
    samples = df1.to_dict('records')
    return [convert_sample(i) for i in samples]


def _majority_or_best_worker(ans_list):
    """
    pick one value out of [worker_id, value] pairs: a value given more than once
    wins, otherwise the value of the worker with the highest worker_qa_score
    """
    tally = Counter(value for _, value in ans_list)
    # fewer distinct values than answers means at least two workers agree
    if len(tally) != len(ans_list):
        return tally.most_common(1)[0][0]
    # ascending sort, last element: among equally scored workers the later annotation wins
    ranked = sorted(ans_list, key=lambda pair: worker_qa_score[pair[0]])
    return ranked[-1][1]


def merge_sample(sample):
    """
    merge the replicated annotations of one HIT into a single annotation dict

    sample: one record produced by merge_annotations (needs the 'annotations' list)
    returns: dict holding one merged value per key of text_input_keys,
             selection_input_keys and list_input_keys

    The key collections and worker_qa_score are module-level names that are looked
    up when the function is called.
    """
    annotations = sample['annotations']
    merged_annotation = {}

    # free-text answers: ignore empty answers (and the string 'None'), then vote
    for key in text_input_keys:
        answers = [
            [worker['worker_id'], worker['annotation'][key]]
            for worker in annotations
            if worker['annotation'][key] and worker['annotation'][key] != 'None'
        ]
        merged_annotation[key] = _majority_or_best_worker(answers) if answers else None

    for key in selection_input_keys:
        if key == 'perpetrator_mention':
            # plain majority vote, no fallback to the better worker
            votes = Counter(worker['annotation'][key] for worker in annotations)
            merged_annotation[key] = votes.most_common(1)[0][0]
        elif key == 'perpetrator_type':
            # relies on 'perpetrator_mention' having been merged earlier in this loop
            if not merged_annotation['perpetrator_mention']:
                merged_annotation[key] = None
                continue
            # only the workers who said a perpetrator is mentioned get a vote
            yes_workers = {worker['worker_id'] for worker in annotations
                           if worker['annotation']['perpetrator_mention']}
            answers = [[worker['worker_id'], worker['annotation'][key]]
                       for worker in annotations if worker['worker_id'] in yes_workers]
            merged_annotation[key] = _majority_or_best_worker(answers)
        else:
            # any other selection key: majority vote, else the better worker
            answers = [[worker['worker_id'], worker['annotation'][key]] for worker in annotations]
            merged_annotation[key] = _majority_or_best_worker(answers)

    # list answers are never voted on: the whole list of the best worker is taken
    for key in list_input_keys:
        candidates = [[worker['worker_id'], worker['annotation'][key]] for worker in annotations]
        # descending sort, first element: among equally scored workers the earlier annotation wins
        best_first = sorted(candidates, key=lambda pair: -worker_qa_score[pair[0]])
        merged_annotation[key] = best_first[0][1]

    return merged_annotation

### Step 1. Load and parse the raw output dataframe
* After loading, convert the data format first
* Then merge the same HITs from multiple workers into a single record containing the annotations from all workers using `merge_annotations`. Note that this is different from merging within the sample using `merge_sample`, where all replicated annotations for a single HIT are merged into one annotation using a set of rules.

In [None]:
# load the raw AMT batch results
df = pd.read_csv('../dataframes/full_task_results/Batch_4797737_batch_results_full_1.csv')
# Cast to object before masking: on numeric columns some pandas versions coerce the
# None fill value straight back to NaN. NaN is truthy, so it would pass the
# `if value:` checks in the parsing functions and break int() on the year column.
df1 = df.astype(object).where(pd.notnull(df), None)
samples = df1.to_dict('records')
# one record per (HIT, worker) row, then one record per HIT holding all its annotations
res_samples = [convert_sample(i) for i in samples]
merged_samples = merge_annotations(res_samples)
# eyeball one random merged record as a sanity check
pprint.pprint(random.choice(merged_samples))

### Step 2. Calculate inter-annotator agreement
1. Inter-annotator agreement is calculated with the pairwise Cohen's kappa score
2. on these tags: 1). Perpetrator mention; 2). Perpetrator type; 3). First violation type; 4). First victim population type; 5). First victim type.

In [None]:
# Steps to evaluate each worker:
#    for each unique worker id:
#    locate the related merged samples 
#    calculate pair-wise cohen-kappa for the five questions
#    average two pair-wise scores to generate a score per question
#    average 5 scores to get final agreement score for this worker.

In [None]:
# every distinct worker id that appears in the converted annotations
worker_ids = list(set(annotated['worker']['worker_id'] for annotated in res_samples))

In [None]:
single_value_keys = ['perpetrator_mention', 'perpetrator_type']
first_choice_keys = ['violation_types']
first_victim_keys = ['victim_population_type']
first_victim_fist_choice_keys = ['victim_type']


def _first_or_none(values):
    """return the first element, or None when the list is empty or missing"""
    return values[0] if values else None


def _single_value(annotation, key):
    """label = the annotated value itself"""
    return annotation[key]


def _first_choice(annotation, key):
    """label = first entry of a multiple-choice list (None if nothing was chosen)"""
    return _first_or_none(annotation[key])


def _first_victim_value(annotation, key):
    """label = attribute of the first victim (None if no victim was annotated)"""
    first_victim = _first_or_none(annotation['victims'])
    return None if first_victim is None else first_victim[key]


def _first_victim_first_choice(annotation, key):
    """label = first entry of a list attribute of the first victim (None if absent)"""
    return _first_or_none(_first_victim_value(annotation, key))


def _paired_kappa(worker_labels, add_worker1_labels, add_worker2_labels):
    """mean of the two pair-wise Cohen's kappa scores; a nan mean is reported as 0"""
    pair1_score = cohen_kappa_score(worker_labels, add_worker1_labels)
    pair2_score = cohen_kappa_score(worker_labels, add_worker2_labels)
    return norm_score((pair1_score + pair2_score) / 2)


# (annotation keys, how to pull the label out of one annotation), scored in this order
question_groups = [
    (single_value_keys, _single_value),
    (first_choice_keys, _first_choice),
    (first_victim_keys, _first_victim_value),
    (first_victim_fist_choice_keys, _first_victim_first_choice),
]

score_list = []

for worker_id in worker_ids:
    worker_scores = {}
    # get HITs that this worker finished
    worker_HITs = [i for i in merged_samples if worker_id in [j['worker_id'] for j in i['annotations']]]
    # split every HIT into this worker's annotation and the annotations of the others
    paired_annotations = []
    worker_annotations = []
    for worker_HIT in worker_HITs:
        worker_annotations.append([i for i in worker_HIT['annotations'] if i['worker_id']==worker_id][0])
        paired_annotations.append([i for i in worker_HIT['annotations'] if i['worker_id']!=worker_id])
    # transpose into "first other worker" / "second other worker" columns;
    # this unpacking requires exactly two other annotations per HIT
    add_worker1_annotations, add_worker2_annotations = [list(i) for i in zip(*paired_annotations)]

    for keys, extract_label in question_groups:
        for key in keys:
            # missing answers become None and are turned into the label "None" by
            # norm_label, instead of raising IndexError on an empty list
            worker_labels, add_worker1_labels, add_worker2_labels = [
                [norm_label(extract_label(i['annotation'], key)) for i in annotation_group]
                for annotation_group in (worker_annotations, add_worker1_annotations, add_worker2_annotations)
            ]
            worker_scores[key] = _paired_kappa(worker_labels, add_worker1_labels, add_worker2_labels)

    score_list.append({
        'worker_id': worker_id,
        'scores': worker_scores,
        'num_HITs_done': len(worker_annotations),
        'avg_score': sum(list(worker_scores.values()))/len(worker_scores)
    })

In [None]:
# best average agreement first (stable: ties keep their current order)
score_list = sorted(score_list, key=lambda entry: entry['avg_score'], reverse=True)

In [None]:
# one row per worker; the nested 'scores' dict becomes 'scores_<question>' columns
score_df = pd.DataFrame(list(map(flat_nested_dict, score_list)))

In [None]:
# display the per-worker agreement table
score_df

### Step 3. Merge samples via majority voting/merging
* Rules for merging the samples:
1. For text input answers, if only one answer exists (the other two are None), take that answer; if more than one exists, take the answer from the worker with the higher qualification score.
2. For single-choice answers, take the majority vote; if there is no majority, take the answer from the worker with the higher qualification score.
3. For victim answers, take the victim list with the largest number of victims; in case of a tie, take the list from the worker with the higher qualification score.

In [None]:
# Note: the worker IDs are anonymized.
# Score per worker id; merge_sample sorts by it to decide whose answer wins when
# there is no majority (presumably the qualification scores named in the rules above).
worker_qa_score = {
    'da86f058-00ca-4ec2-b63e-b87f42dab844': 82,
    'fd5ebe40-5dbb-4a22-b959-6cf3d0ae0829': 75,
    '24528428-dbe4-46d7-bd43-27324cef42f6': 80,
    '7b387f77-1f0b-48dd-b77d-ff8b7e2af7ef': 84,
    '7b8989dd-46ba-4242-a051-7457344fba3d': 98,
    '1f400fe4-8bb5-413a-ba43-1c46576aeb51': 91,
    '3da648d3-cbda-4938-8d40-4afce6ed7973': 82,
    'ac287307-81d2-4211-a19d-c6a1c38a754a': 77,
    'ea7ca118-9f34-4464-9425-0be339155b76': 75,
    'cfe772f1-9e88-4332-850f-791cd09c3fd2': 84,
}

# annotation keys grouped by the merging strategy merge_sample applies to them
# free-text answers: empty answers are dropped before voting
text_input_keys = {'city', 'country', 'date', 'month', 'region', 'year'}
# selection answers; kept as a list because 'perpetrator_mention' has to be merged
# before 'perpetrator_type'
selection_input_keys = ['perpetrator_mention', 'perpetrator_type']
# list answers: taken as a whole from the best-scored worker
list_input_keys = {'victims', 'violation_types'}

In [None]:
# store the rule-based merged annotation on every HIT record (modified in place)
for merged_record in merged_samples:
    merged_record['merged_sample'] = merge_sample(merged_record)

In [None]:
# persist all merged records as a single JSON array
output_path = '../processed_annotated_data/full_batch_1.json'
with open(output_path, 'w') as out_file:
    ujson.dump(merged_samples, out_file)

### Data EDA

In [None]:
### Worker distribution: number of raw annotation rows per worker
df['WorkerId'].value_counts()

In [None]:
# share of merged samples whose merged annotation DOES have an explicit perpetrator mention
mentioned_samples = [record for record in merged_samples if record['merged_sample']['perpetrator_mention']]
mention_ratio = len(mentioned_samples) / len(merged_samples)
print("Ratio: {}".format(round(mention_ratio, 2)))

In [None]:
import numpy as np
# mean length of the merged victim list over all merged samples
victim_counts = [len(record['merged_sample']['victims']) for record in merged_samples]
print("Avg. # Victims: {}".format(np.mean(victim_counts)))