In [161]:
import json
import numpy as np
import os
from pprint import pprint

# Common prefix of every run directory; the selected task id is appended below.
task_dir = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

# Run registry: uncomment exactly one `prompt_type, task_id` line (and the matching
# `dataset` line) to choose which MTurk run the rest of the notebook analyzes.

### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554069931  # Oracle
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence
# dataset = 'race'

### DREAM
dataset = 'dream'
# prompt_type, task_id = 'question', 1554582693  # Q-only
prompt_type, task_id = 'quote and question', 1554587404  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554596686  # TFIDF(Q+O)


# Optional substring a question id must contain to be analyzed (None keeps every question).
# split = 'middle'
split = None

# Derived settings
# task_id may be given as an int or a str in the registry above, hence the str() call.
task_dir = task_dir + str(task_id)
# 'dream' questions have three answer options; every other dataset is treated as four.
num_options = 4 if dataset != 'dream' else 3
options = list('ABCD')[:num_options]
# Maps the roman-numeral debate mode stored with each sample to the option letter it stands for.
debate_mode_to_option = {
    'Ⅰ': 'A', 'Ⅱ': 'B', 'Ⅲ': 'C', 'Ⅳ': 'D',
    'ⅰ': 'A', 'ⅱ': 'B', 'ⅲ': 'C', 'ⅳ': 'D',
    None: None,
}

# Read HIT data
# The directory is listed once and sorted: os.listdir returns entries in arbitrary order,
# so sorting keeps `hit_results` (and therefore `hit_results[0]`) stable across re-runs.
hit_dir_names = sorted(os.listdir(task_dir))
print('# HIT Files:', len(hit_dir_names))
hit_results = []
num_passed_test = 0
for hit_dir in hit_dir_names:
    # Entries prefixed with 'o_' are not loaded as HIT results; they are only counted
    # below as "tested" (presumably one qualification-test record per worker - TODO confirm).
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    data_path = os.path.join(task_dir, hit_dir, 'custom', 'data.json')
    # The `with` block closes the file; no explicit close() is needed.
    with open(data_path, 'r', encoding='utf-8') as file:
        hit_results.append(json.load(file))
# Number of 'o_' entries; the pass rate is only reported when there is at least one.
num_total_tested = len(hit_dir_names) - num_passed_test
if num_total_tested > 0:
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 241
# Passed Test: 90 / 151 = 59.6 %


In [162]:
# Tally HIT outcomes and accumulate per-question metrics, timings and response statistics.
num_valid_hits, num_rejected_hits, num_incomplete_hits = 0, 0, 0
metrics = {}  # qid -> {model stance -> counter dict}, plus an 'answer' entry per qid
task_ratings = {i: 0 for i in range(11)}  # histogram of task ratings 0..10
durations = []  # per-prompt durations over all valid HITs
durations_by_worker = []  # summed prompt durations, one entry per valid HIT
accuracy_by_worker = {}  # question type -> list of per-HIT accuracies
max_response_freqs = []  # per HIT: share of responses given to the most-picked option
worker_ids = []

for hit_result in hit_results:
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print(hit_result['worker_id'], hit_result['assignment_id'],
              '| reject_reasons:', hit_result['reject_reasons'],
              '| block_reasons:', hit_result['block_reasons'],
              '| bonus_reasons: ' + str(hit_result['bonus_reasons']) if 'bonus_reasons' in hit_result else '')
        # Rejected/blocked HITs are reported above and excluded from everything below;
        # without this `continue` they were also counted as valid and mixed into all metrics.
        continue
    if hit_result['feedback'] is None:
        # A HIT without feedback is counted as incomplete and skipped.
        num_incomplete_hits += 1
        continue

    worker_ids.append(hit_result['worker_id'])
    num_valid_hits += 1

    # Only digit-string ratings inside the histogram's 0..10 range are tallied; anything
    # else (missing, non-numeric, out of range) is ignored rather than raising KeyError.
    task_rating = hit_result['task_rating']
    if (task_rating is not None) and task_rating.isdigit() and (int(task_rating) in task_ratings):
        task_ratings[int(task_rating)] += 1
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        # Append in place instead of rebuilding the list for every HIT.
        accuracy_by_worker.setdefault(qtype, []).append(qtype_accuracy)

    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    responses = []
    for prompt in hit_result['data']:
        qid = prompt['sample']['qid']
        if (split is not None) and (split not in qid):
            continue
        model_stance = debate_mode_to_option[prompt['sample']['debate_mode']]
        answer = prompt['sample']['eval_labels'][0]
        assert answer in options, 'Answer must be in options.'
        response = prompt['response']
        human_correct = (response == answer)

        # Calculate metrics
        if qid not in metrics:
            # Counters are created for every option (or only for None when the first
            # prompt seen for this question has no model stance).
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            metrics[qid]['answer'] = answer
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        # Counts responses that match the stance the model argued for.
        prompt_metrics['num_debate_mode_responses'] += (response == model_stance)

        # presumably `duration` is in milliseconds, converted here to seconds - TODO confirm
        hit_durations.append(prompt['duration'] / 1000.)
        response_option_counts[response] += 1
        responses.append(response)

    if not hit_durations:
        # Every prompt of this HIT was filtered out by `split`: there is nothing to time and
        # normalizing the all-zero response counts would yield NaN and poison the medians.
        continue

    print(np.array(hit_durations))
    duration = np.sum(hit_durations)
    durations_by_worker.append(duration)
    durations.extend(hit_durations)
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    print('| Time:', round(duration, 1),
          '| Acc:', round(100 * hit_result['accuracy'][prompt_type]),
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', hit_result.get('quote_rating'),
          '| Quote Desc:', hit_result.get('quote_description'))

# Turn the accumulated lists into ascending-sorted arrays so that later cells can
# display their distributions directly.
durations = np.sort(np.array(durations))
durations_by_worker = np.sort(np.array(durations_by_worker))
max_response_freqs = np.sort(np.array(max_response_freqs))
for qtype in list(accuracy_by_worker):
    accuracy_by_worker[qtype] = np.sort(np.array(accuracy_by_worker[qtype]))

# HIT outcome counts
print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)

# Medians over prompts / HITs
print('Median Question Duration:', np.median(durations))
print('Median Worker Duration:', np.median(durations_by_worker))
print('Median Worker Accuracy:', np.median(accuracy_by_worker[prompt_type]))
print('Median Max Response Freq:', np.median(max_response_freqs))
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0]['worker_id'])

[58.533 12.91  19.734 15.303 20.246 12.989  8.932 17.311 22.268 15.985
  7.445 24.956 30.346 15.313 35.885 11.967  5.537  6.776  8.182  7.741]
| Time: 358.4 | Acc: 60 | Max Freq: 40.0 | Rate: 10 | Feedback: It would be great to know which ones we got correct because I'm surpirised I only got 60%. It was fun, though, so thank you! | Quote Rating: 10 | Quote Desc: A little gray. Just had to figure it out. #12 didn't have a quote, though, only answer choices.
[19.867  7.862 14.073  8.025  6.46  10.425  7.278 20.321  8.723 11.987
  4.675  7.33   5.555 31.089  9.281 12.022  5.315  6.794  6.724  6.707]
| Time: 210.5 | Acc: 60 | Max Freq: 35.0 | Rate: 9 | Feedback: Correct the instances where there is a single full stop only instead of a few words or a sentence. | Quote Rating: 6 | Quote Desc: Some of them provided no background context for answering the question. 
[35.845 25.8   20.604 18.192 13.857 10.78  16.246  5.222  9.378 15.727
 23.051 21.492  4.329 18.419 30.741  9.413 26.134  4.628  

In [163]:
# Per-sample statistics: one "sample" is a (question id, model stance) pair from `metrics`.

# Target number of evaluations per sample. Defined first so the "insufficient evals"
# check and the "extra evals" estimate below share the same threshold.
num_target_evals = 5


def _mean_percent(values):
    """Return the mean of array `values` as a percentage rounded to 2 decimals.

    Returns NaN for an empty array (e.g. the debater statistics of a question-only
    run) instead of triggering NumPy's empty-mean RuntimeWarning.
    """
    if len(values) == 0:
        return float('nan')
    return round(100 * values.mean(), 2)


accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
valid_stances = [None] + options
for qid, qid_metrics in metrics.items():
    answer = qid_metrics['answer']
    for model_stance, prompt_metrics in qid_metrics.items():
        # Skips the per-question 'answer' entry, which is stored next to the stance counters.
        if model_stance not in valid_stances:
            continue

        # Q-only stats
        num_evals = prompt_metrics['num']
        num_evals_by_sample.append(num_evals)
        if num_evals == 0:
            # This stance was never evaluated for this question. It still counts as a
            # sample with 0 evals, but has no accuracy/convinced ratio (avoids 0 / 0).
            continue
        accuracy_by_sample.append(prompt_metrics['num_correct'] / num_evals)
        if model_stance is None:
            continue

        # Debater stats
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / num_evals
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

worker_ids = set(worker_ids)

num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', num_evals_by_sample.mean())
print('Fraction insuffient evals:', (num_evals_by_sample < num_target_evals).mean())

convinced_freqs = np.array(convinced_freqs)
print('Convinced:', _mean_percent(convinced_freqs), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', _mean_percent(convinced_freqs_with_correct_debate_mode), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(convinced_freqs_with_incorrect_debate_mode), '%')

accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', _mean_percent(accuracy_by_sample), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', _mean_percent(accuracy_by_sample_correct_debate_mode), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(accuracy_by_sample_incorrect_debate_mode), '%')

# Share of all collected evaluations that exceed the per-sample target.
print('Extra Evals:', round(((100. * (num_evals_by_sample - num_target_evals).sum()) / num_evals_by_sample.sum()), 2), '%')
num_evals_by_sample.sort()
print('Evals per sample distribution:', num_evals_by_sample)

Evals per sample: 6.0
Fraction insuffient evals: 0.0
Convinced: 43.43 %
- Correct debater: 75.19 %
- Incorrect debater: 27.55 %
Accuracy: 61.43 %
- Correct debater: 75.19 %
- Incorrect debater: 54.55 %
Extra Evals: 16.67 %
Evals per sample distribution: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8]


In [86]:
def nps(task_ratings):
    """Compute the Net Promoter Score and the mean rating from a rating histogram.

    Args:
        task_ratings: Mapping from a numeric rating to the number of raters who gave it.

    Returns:
        None when the histogram holds no ratings. Otherwise a tuple
        ``(nps, mean)``: ``nps`` is the percentage of raters scoring >= 9 minus the
        percentage scoring <= 6, and ``mean`` is the average rating; both are
        rounded to two decimals.
    """
    total_raters = sum(task_ratings.values())
    if total_raters == 0:
        return None

    promoters = sum(count for score, count in task_ratings.items() if score >= 9)
    detractors = sum(count for score, count in task_ratings.items() if score <= 6)
    rating_total = sum(count * score for score, count in task_ratings.items())

    net_promoter_score = 100 * ((promoters - detractors) / float(total_raters))
    mean_rating = rating_total / float(total_raters)
    return round(net_promoter_score, 2), round(mean_rating, 2)

# Report (NPS, mean rating) for the task-rating histogram tallied while looping over the HITs.
print('NPS, Mean:', nps(task_ratings))

NPS, Mean: (30.49, 7.8)


In [135]:
# Inspect the per-HIT total durations (sorted ascending by the analysis cell).
# NOTE(review): this cell's execution count (135) is lower than the analysis cell's (162),
# so the saved output may come from an earlier run - re-run after Restart & Run All.
durations_by_worker

array([ 1.865 ,  2.096 ,  3.5675,  4.523 ,  8.587 ,  9.014 ,  9.157 ,
       11.431 , 11.846 , 12.01  , 12.033 , 12.121 , 12.461 , 12.4825,
       12.684 , 12.8755, 13.097 , 13.237 , 13.924 , 13.976 , 14.271 ,
       14.456 , 14.584 , 14.689 , 14.855 , 15.1985, 15.308 , 15.371 ,
       15.394 , 15.4975, 15.576 , 15.836 , 16.038 , 16.048 , 16.0685,
       16.184 , 16.2045, 16.251 , 16.443 , 16.835 , 16.989 , 17.041 ,
       17.4575, 17.739 , 17.863 , 17.877 , 18.096 , 18.199 , 18.455 ,
       18.657 , 18.75  , 18.852 , 18.986 , 19.074 , 19.099 , 19.29  ,
       19.4735, 19.695 , 20.275 , 20.299 , 20.371 , 20.533 , 20.656 ,
       20.678 , 20.893 , 21.024 , 21.289 , 21.811 , 21.813 , 21.933 ,
       22.505 , 22.603 , 22.858 , 23.007 , 23.206 , 23.332 , 23.457 ,
       23.573 , 23.582 , 23.8445, 23.929 , 24.056 , 24.239 , 24.363 ,
       24.6885, 24.787 , 25.59  , 25.932 , 25.965 , 26.165 , 26.249 ,
       26.285 , 26.672 , 27.746 , 28.301 , 30.25  , 30.848 , 31.149 ,
       31.509 , 31.7

In [132]:
# Median per-HIT accuracy for the active prompt type (the same statistic the analysis cell prints).
# NOTE(review): execution count (132) predates the analysis cell (162); the saved output may be stale.
np.median(accuracy_by_worker[prompt_type])

0.5263157894736842

In [89]:
# Fraction of HITs whose accuracy is strictly above the median. Because the comparison is
# strict, HITs tied with the median are excluded, so the result can be well below 0.5.
(accuracy_by_worker[prompt_type] > np.median(accuracy_by_worker[prompt_type])).mean()

0.3048780487804878

In [91]:
# Inspect the first loaded HIT record to see its structure.
# NOTE(review): this dumps the whole record into the notebook output, including full sample
# text and worker-provided fields - consider showing only the keys or a small slice.
hit_results[0]

{'data': [{'sample': {'id': 'dream',
    'text': 'W: What does the term "alternative energy source" mean? M: When we think of energy or fuel for our homes and cars, we think of petroleum, a fossil fuel processed from oil removed from the ground, of which there is a limited supply. But alternative fuels can be many things. Wind, sun and water can all be used to create fuel. W: Is the threat of running out of petroleum real? M: It has taken thousands of years to create the natural stores of petroleum we have available at a much faster rate than it is being produced over time. The real controversy surrounding the amount of petroleum we have is how much we need to keep in reserve for future use. Most experts agree that by around 2025, the amount of petroleum we use will reach a peak. Then production and availability will begin to seriously decline. This is not to say there will be no petroleum at this point. But it\'ll become very difficult and therefore expensive to extract. W: Is that th

In [155]:
# Inspect the ascending-sorted per-HIT accuracies for the active prompt type
# (one entry per valid HIT, sorted by the analysis cell).
accuracy_by_worker[prompt_type]

array([0.10526316, 0.10526316, 0.15      , 0.15789474, 0.2       ,
       0.25      , 0.26315789, 0.26315789, 0.26315789, 0.26315789,
       0.26315789, 0.26315789, 0.26315789, 0.3       , 0.3       ,
       0.31578947, 0.35      , 0.35      , 0.36842105, 0.36842105,
       0.36842105, 0.36842105, 0.36842105, 0.36842105, 0.36842105,
       0.36842105, 0.36842105, 0.4       , 0.4       , 0.42105263,
       0.42105263, 0.42105263, 0.42105263, 0.42105263, 0.42105263,
       0.42105263, 0.42105263, 0.42105263, 0.42105263, 0.42105263,
       0.42105263, 0.42105263, 0.45      , 0.45      , 0.45      ,
       0.47368421, 0.47368421, 0.47368421, 0.47368421, 0.47368421,
       0.47368421, 0.47368421, 0.47368421, 0.47368421, 0.47368421,
       0.47368421, 0.5       , 0.5       , 0.5       , 0.52631579,
       0.52631579, 0.52631579, 0.52631579, 0.52631579, 0.52631579,
       0.52631579, 0.52631579, 0.52631579, 0.52631579, 0.52631579,
       0.52631579, 0.52631579, 0.55      , 0.55      , 0.55   