## Qualification Task Worker Evaluation
* This notebook demonstrates how to calculate an aggregated accuracy score for each worker based on their submissions for the qualification task.

### Imports

In [None]:
import pandas as pd
import os
import random
import pprint
import ujson
from copy import deepcopy

random.seed(4)

from thefuzz import process, fuzz

### Functions

In [None]:
def get_fuzzy_score(target, candidate):
    """
    Score how similar two texts are using thefuzz's token-sort ratio.

    Params:
        target: String, the first text to compare
        candidate: String, the second text to compare
    Returns:
        the value returned by fuzz.token_sort_ratio; a higher score means
        the two texts are more similar
    """
    similarity = fuzz.token_sort_ratio(target, candidate)
    return similarity


def valid_checkpoint(answer):
    """
    Check whether the worker copy/pasted the checkpoint code correctly.

    Params:
        answer: Dict, one worker submission; must contain the keys
            'Answer.checkpoint_code_gold_input' and
            'Answer.checkpoint_code_worker_input'
    Returns:
        Boolean, True only when both values are strings that are equal after
        stripping surrounding whitespace
    """
    gold_code = answer['Answer.checkpoint_code_gold_input']
    worker_code = answer['Answer.checkpoint_code_worker_input']

    # A value that is not a string (e.g. a NaN float for an empty cell) can
    # never be a valid checkpoint; rejecting it here also avoids calling
    # .strip() on a non-string.
    if not isinstance(gold_code, str) or not isinstance(worker_code, str):
        return False
    return gold_code.strip() == worker_code.strip()


def valid_selected_keys(selected_keys):
    """
    Split the selected ground-truth keys into the three groups used for scoring.

    Params:
        selected_keys: List[Dict], each entry has a 'key' and a 'value';
            entries that are not victim names also need a 'fuzzy' flag
    Returns:
        names: List[List], victim-name values grouped by page (three pages)
        fuzzy: Dict, key -> value for answers scored with fuzzy matching
        other: Dict, key -> value for answers scored with exact matching
    """
    names_per_page = [[], [], []]
    fuzzy_answers, exact_answers = {}, {}

    for entry in selected_keys:
        column = entry['key']

        # checkpoint columns are validated separately, never scored here
        if 'checkpoint_code' in column:
            continue

        if 'Answer.victim_name' in column:
            # the third '_'-separated token is the 1-based page number
            page_number = int(column.split('_')[2])
            names_per_page[page_number - 1].append(entry['value'])
            continue

        bucket = fuzzy_answers if entry['fuzzy'] else exact_answers
        bucket[column] = entry['value']

    return names_per_page, fuzzy_answers, exact_answers


def answer_validation(answer, names, fuzzy, other, name_weight=2, verbose=False):
    """
    Evaluate one worker submission against the selected ground-truth keys.

    Params:
        answer: Dict, key-value pairs from the AMT batch for a particular worker
        names: List[List[String]], ground-truth names for each page (three pages)
        fuzzy: Dict, a dictionary of answers that use fuzzy matching to evaluate
        other: Dict, a dictionary of answers on exact match
        name_weight: Int, how many times each submitted name counts in the
            accuracy, to scale up the contribution of finding correct names
        verbose: Boolean, print every comparison while evaluating
    Returns:
        rejected: Boolean, True when the accuracy is below 0.4
        accuracy: Float, share of passed checks rounded to 2 decimals;
            0.0 when the submission contains nothing that can be scored
        checkpoint_failed: Boolean, True when the checkpoint code check failed
    """
    acc_list = []

    # Matched ground-truth names are removed below so that each one can be
    # matched only once; work on a copy to leave the caller's lists intact.
    gt_names = deepcopy(names)
    answer_names = [[], [], []]

    #step 1: check checkpoint
    checkpoint_failed = not valid_checkpoint(answer)

    #step 2: check each key, validate names in the end
    for key, val in answer.items():
        if 'Answer.victim_name' in key and 'example' not in key:
            # the third '_'-separated token is the 1-based page number
            page_idx = int(key.split('_')[2]) - 1
            answer_names[page_idx].append(val)
        elif key in fuzzy:
            res = get_fuzzy_score(val, fuzzy[key]) >= 85
            if verbose:
                print(key, val, fuzzy[key], res)
            acc_list.append(res)
        elif key in other:
            # compare as text so that e.g. True and 'True' are treated as equal
            res = str(val) == str(other[key])
            if verbose:
                print(key, val, other[key], res)
            acc_list.append(res)

    #step 3: validate names
    for names_on_page, gt_names_on_page in zip(answer_names, gt_names):
        # drop non-string entries (name fields the worker left empty)
        names_on_page = [name for name in names_on_page if isinstance(name, str)]
        if verbose:
            print(names_on_page, gt_names_on_page)
        for cadi_name in names_on_page:
            matched = False
            for gt_name in gt_names_on_page:
                if get_fuzzy_score(cadi_name, gt_name) >= 85:
                    matched = True
                    # safe to mutate while iterating: we leave the loop right away
                    gt_names_on_page.remove(gt_name)
                    break
            acc_list += [matched] * name_weight

    # A submission with no scorable field would otherwise divide by zero;
    # score it 0.0 so that it falls below the rejection threshold.
    if acc_list:
        accuracy = round(sum(acc_list) / len(acc_list), 2)
    else:
        accuracy = 0.0
    if verbose:
        print(acc_list)
    return accuracy < .4, accuracy, checkpoint_failed

def eval_qa_batch(df_path, selected_keys):
    """
    Score every submission in an AMT batch file and save the results.

    Params:
        df_path: String, path to the batch CSV downloaded from AMT
        selected_keys: List[Dict], the ground-truth keys used for scoring
    Returns:
        None; the batch plus the added columns 'should_reject', 'accuracy'
        and 'checkpoint_failed' is written next to the input file with the
        suffix '_evaluated.csv', and the output path is printed
    """
    names, fuzzy, other = valid_selected_keys(selected_keys)

    submissions = pd.read_csv(df_path).to_dict('records')
    for submission in submissions:
        rejected, accuracy, checkpoint_failed = answer_validation(
            submission, names, fuzzy, other
        )
        submission['should_reject'] = rejected
        submission['accuracy'] = accuracy
        submission['checkpoint_failed'] = checkpoint_failed

    # everything before the first '.csv' in the path, plus the new suffix
    path_stem = df_path.split('.csv')[0]
    res_df_path = path_stem + '_evaluated.csv'

    pd.DataFrame(submissions).to_csv(res_df_path)
    print("Evaluation results saved to {}".format(res_df_path))

### Step 1. Processing ground truth annotations
* To calculate the accuracy, we first manually submit ONE qualification HIT in the sandbox environment using the ground truth answers, then download the response.
* The annotated data is stored in `Batch_360680_batch_results_gt_1.csv`

In [None]:
# Load the ground-truth submission that was downloaded from the sandbox.
gt_df_name = 'Batch_360680_batch_results_gt_1.csv'
gt_df_path = os.path.join('../dataframes', gt_df_name)
gt_df = pd.read_csv(gt_df_path)

In [None]:
# number of columns in the ground-truth batch file
len(gt_df.columns)

### Step 2. Select the keys (columns) that should be included in the calculation
* There are more than 200 columns in the annotated dataframe, but only the ones corresponding to the attributes should be included in the calculation.
* We first compile all the keys into a dataframe, then copy the dataframe into a Google Sheet and manually mark the keys to be included.

In [None]:
# The ground-truth batch holds a single submission; take it as a dict of
# column name -> answer value.
gt_answer = gt_df.to_dict('records')[0]

# One row per column of the batch file, so that the keys to score can be
# marked by hand afterwards.
keys, vals = list(gt_answer.keys()), list(gt_answer.values())
metric_df = pd.DataFrame({
    'key': keys,
    'value': vals,
})

In [None]:
# export the key/value table so it can be copied into a spreadsheet and marked up by hand
metric_df.to_csv('../dataframes/metric_df_v1.csv')

### Step 3. Edit in Google Sheets, then load the dataframe to get the selected keys
* Here we provide the edited dataframe in `../dataframes/qualification_metric_keys.csv`;
* All the selected keys are stored in `../util_data/qualification_metric_key_values.json`

In [None]:
# Load the manually edited key sheet back and keep only the rows whose
# `check` column is set: these are the keys used for scoring.
qa_keys_df = pd.read_csv('../dataframes/qualification_metric_keys.csv')
checked_rows = qa_keys_df[qa_keys_df['check']]
selected_keys = checked_rows.to_dict('records')

# Values stored as the text 'TRUE' are turned back into a real boolean
# (presumably an artifact of the spreadsheet export -- verify against the sheet).
for key_dict in selected_keys:
    if key_dict['value'] == 'TRUE':
        key_dict['value'] = True

# persist the selected keys
with open('../util_data/qualification_metric_key_values.json', 'w') as json_file:
    ujson.dump(selected_keys, json_file)

### Step 4. Run evaluation functions
* The `eval_qa_batch` function wraps all the steps required to evaluate a batch of qualification HITs.
* Refer to `valid_selected_keys` and `answer_validation` for more details.

In [None]:
# Sanity check: evaluate the ground-truth batch against itself.
# The saved dataframe should report an accuracy of 1 (a perfect score);
# the evaluation results (`should_reject`, `accuracy`, `checkpoint_failed`)
# are located in the last three columns.
qa_batch_df_name = '../dataframes/Batch_360680_batch_results_gt_1.csv'
eval_qa_batch(qa_batch_df_name, selected_keys)

In [None]:
# The same evaluation can be applied to any other qualification batch,
# for example the pilot batch below:
eval_qa_batch('../dataframes/Batch_4796448_batch_results_pilot_qualification_3_2nd.csv', 
              selected_keys)