In [41]:
import json
import numpy as np
import os
from pprint import pprint

# Prefix of the run-data directory; the task id chosen below is appended to it.
# NOTE(review): this is an absolute, machine-specific path.
task_dir = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

# Exactly one (prompt_type, task_id) pair and one dataset must be active.
# The commented-out lines are other runs kept for reference.

### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554069931  # Oracle
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence
# dataset = 'race'

### DREAM
dataset = 'dream'
# prompt_type, task_id = 'question', 1554582693  # Q-only
# NOTE(review): this run was labelled "Q-only" even though its prompt type is
# 'quote and question' -- confirm which quote-selection method it used.
prompt_type, task_id = 'quote and question', 1554587404

# Optional substring that a question id must contain to be analysed (None = keep all).
# split = 'middle'
split = None

# Derived settings
task_dir = '{}{}'.format(task_dir, task_id)  # task ids may be given as int or str
num_options = {'dream': 3}.get(dataset, 4)  # 'dream' uses 3 answer options, anything else 4
options = list('ABCD')[:num_options]
# Maps the Roman-numeral debate mode (upper or lower case) to the answer letter
# it stands for; a missing debate mode (None) maps to None.
debate_mode_to_option = dict(zip(
    ('Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', None),
    ('A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', None),
))

# Read HIT data
# List the run directory once: it lives under a "live" folder, so repeated
# listings could disagree with each other if entries appear while this runs.
hit_dir_names = os.listdir(task_dir)
print('# HIT Files:', len(hit_dir_names))
hit_results = []
num_passed_test = 0
for hit_dir in hit_dir_names:
    # Entries prefixed with 'o_' are not task results and carry no data to load;
    # they are only counted (below) as the number of tested workers.
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    # The context manager closes the file; no explicit close() is needed.
    with open(os.path.join(task_dir, hit_dir, 'custom/data.json'), 'r') as data_file:
        hit_results.append(json.load(data_file))
# Number of 'o_' entries, i.e. everything that is not a loaded task result.
num_total_tested = len(hit_dir_names) - num_passed_test
if num_total_tested > 0:
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 241
# Passed Test: 90 / 151 = 59.6 %


In [42]:
# HIT-level counters.
num_valid_hits = num_rejected_hits = num_incomplete_hits = 0
# Histogram of task ratings on the 0..10 scale.
task_ratings = dict.fromkeys(range(11), 0)
# metrics: per-question counters keyed by question id.
# accuracy_by_worker: per-HIT accuracies keyed by question type.
metrics, accuracy_by_worker = {}, {}
# Flat per-prompt durations, per-HIT median durations, per-HIT share of the
# most frequently chosen answer, and the worker id of every analysed HIT.
durations, durations_by_worker, max_response_freqs, worker_ids = [], [], [], []

# Walk over every loaded HIT: update the HIT-level counters, the per-question
# `metrics`, and the per-HIT duration / answer-frequency statistics, and print
# one summary line per analysed HIT.
for hit_result in hit_results:
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print('| reject_reasons:', hit_result['reject_reasons'], '| block_reasons:', hit_result['block_reasons'])
        # NOTE(review): the skip below is commented out, so rejected/blocked HITs
        # are reported but still included in all statistics -- confirm intent.
#         continue
    elif hit_result['feedback'] is None:
        # No feedback recorded: treated as an incomplete HIT and ignored.
        num_incomplete_hits += 1
        continue

    worker_ids.append(hit_result['worker_id'])
    num_valid_hits += 1
    task_rating = hit_result['task_rating']
    if (task_rating is not None) and task_rating.isdigit():
        rating = int(task_rating)
        # Ignore numeric ratings outside the 0..10 histogram instead of raising KeyError.
        if rating in task_ratings:
            task_ratings[rating] += 1
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        accuracy_by_worker.setdefault(qtype, []).append(qtype_accuracy)

    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    responses = []
    for prompt in hit_result['data']:
        sample = prompt['sample']
        qid = sample['qid']
        if (split is not None) and (split not in qid):
            continue
        response = prompt['response']
        # Option the model argued for in this prompt (None when there is no debate mode).
        model_stance = debate_mode_to_option[sample['debate_mode']]
        # First element of eval_labels is used as the gold answer letter.
        answer = sample['eval_labels'][0]
        assert answer in options, 'Answer must be in options.'
        human_correct = (response == answer)

        # Calculate metrics
        if qid not in metrics:
            # One counter dict per possible stance: a single None entry for
            # prompts without a debate mode, otherwise one entry per option.
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            metrics[qid]['answer'] = answer
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        # Counts responses that agree with the stance the model argued for.
        prompt_metrics['num_debate_mode_responses'] += (response == model_stance)

        hit_durations.append(prompt['duration'] / 1000.)  # presumably ms -> s; confirm units
        response_option_counts[response] += 1
        responses.append(response)

    if not hit_durations:
        # Every prompt of this HIT was filtered out by `split`. Taking the median
        # of an empty list and normalising all-zero counts would both yield NaN
        # and turn the summary medians into NaN, so skip the per-HIT statistics.
        continue

    duration = np.median(np.array(hit_durations))
    durations_by_worker.append(duration)
    durations += hit_durations
    # Share of this HIT's responses that went to its most frequently chosen option.
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    print('| Time:', round(duration, 1),
          '| Acc:', round(100 * hit_result['accuracy'][prompt_type]),
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', None if 'quote_rating' not in hit_result else hit_result['quote_rating'], 
          '| Quote Desc:', None if 'quote_description' not in hit_result else hit_result['quote_description'])

# Convert the collected statistics to ascending-sorted NumPy arrays.
durations = np.sort(np.array(durations))
durations_by_worker = np.sort(np.array(durations_by_worker))
max_response_freqs = np.sort(np.array(max_response_freqs))
for qtype, worker_accuracies in list(accuracy_by_worker.items()):
    accuracy_by_worker[qtype] = np.sort(np.array(worker_accuracies))

# HIT counts followed by the headline medians.
print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)
print('Median Question Duration:', np.median(durations))
print('Median Worker Duration:', np.median(durations_by_worker))
print('Median Worker Accuracy:', np.median(accuracy_by_worker[prompt_type]))
print('Median Max Response Freq:', np.median(max_response_freqs))
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0]['worker_id'])

| Time: 15.3 | Acc: 60 | Max Freq: 40.0 | Rate: 10 | Feedback: It would be great to know which ones we got correct because I'm surpirised I only got 60%. It was fun, though, so thank you! | Quote Rating: 10 | Quote Desc: A little gray. Just had to figure it out. #12 didn't have a quote, though, only answer choices.
| Time: 7.9 | Acc: 60 | Max Freq: 35.0 | Rate: 9 | Feedback: Correct the instances where there is a single full stop only instead of a few words or a sentence. | Quote Rating: 6 | Quote Desc: Some of them provided no background context for answering the question. 
| Time: 16.0 | Acc: 55 | Max Freq: 45.0 | Rate: 4 | Feedback: maybe make some of the questions not just one word | Quote Rating: 3 | Quote Desc: Some of them were very difficult to understand, and hard to gather information from. 
| Time: 17.7 | Acc: 80 | Max Freq: 50.0 | Rate: 10 | Feedback: Not sure, I enjoyed it a lot like it is. | Quote Rating: 8 | Quote Desc: Most were helpful for answering but some were hard 

| Time: 7.0 | Acc: 70 | Max Freq: 55.0 | Rate: 10 | Feedback: A progress counter, perhaps? | Quote Rating: 4 | Quote Desc: snippets
| Time: 10.1 | Acc: 65 | Max Freq: 45.0 | Rate: 8 | Feedback: I enjoyed this task.  I don't think that I would change anything.  Thank you for the opportunity! | Quote Rating: 6 | Quote Desc: They were alright.  Some of them were pretty detailed but other's only consisted of sentence fragments or just a few words.  It was sometimes difficult to determine what the subject was about but I tried my best.  Thank you!
| Time: 10.7 | Acc: 55 | Max Freq: 50.0 | Rate: 10 | Feedback: Nada, I enjoyed it. THanks for letting me work! | Quote Rating: 7 | Quote Desc: Most were ok and I could guess. Some didn't provide enough context.
| Time: 10.8 | Acc: 75 | Max Freq: 55.0 | Rate: 10 | Feedback: more information on the quote | Quote Rating: 9 | Quote Desc: interesting fun
| Time: 18.2 | Acc: 60 | Max Freq: 50.0 | Rate: 10 | Feedback: NOthing great set up  | Quote Rating

In [43]:
# Per-sample statistics, where a sample is one (question id, model stance) pair.
accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
# Keys of metrics[qid] that hold stance counters; anything else (the 'answer'
# entry) is bookkeeping and is skipped.
stance_keys = [None] + options
for qid, qid_metrics in metrics.items():
    answer = qid_metrics['answer']
    for model_stance, prompt_metrics in qid_metrics.items():
        if model_stance not in stance_keys:
            continue

        # Q-only stats
        num_evals = prompt_metrics['num']
        num_evals_by_sample.append(num_evals)
        if num_evals == 0:
            # Counters exist for every option of a question, but a stance may
            # never have been evaluated. Record it as having 0 evals and skip
            # the ratios, which would otherwise divide by zero.
            continue
        accuracy_by_sample.append(prompt_metrics['num_correct'] / num_evals)
        if model_stance is None:
            continue

        # Debater stats
        # Fraction of responses that picked the option the model argued for.
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / num_evals
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

# De-duplicate the worker ids collected while aggregating the HITs.
worker_ids = set(worker_ids)


def _mean_percent(values):
    """Return the mean of a NumPy array as a percentage rounded to 2 decimals."""
    return round(100 * values.mean(), 2)


# How many evaluations each sample received.
num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', num_evals_by_sample.mean())
print('Fraction insuffient evals:', (num_evals_by_sample < 5).mean())

# How often responses matched the option the model argued for.
convinced_freqs = np.array(convinced_freqs)
print('Convinced:', _mean_percent(convinced_freqs), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', _mean_percent(convinced_freqs_with_correct_debate_mode), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(convinced_freqs_with_incorrect_debate_mode), '%')

# How often responses matched the gold answer.
accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', _mean_percent(accuracy_by_sample), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', _mean_percent(accuracy_by_sample_correct_debate_mode), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(accuracy_by_sample_incorrect_debate_mode), '%')

# Share of all evaluations that exceeded the per-sample target.
num_target_evals = 5
surplus_evals = (num_evals_by_sample - num_target_evals).sum()
print('Extra Evals:', round((100. * surplus_evals) / num_evals_by_sample.sum(), 2), '%')
num_evals_by_sample = np.sort(num_evals_by_sample)
print('Evals per sample distribution:', num_evals_by_sample)

Evals per sample: 6.0
Fraction insuffient evals: 0.0
Convinced: 43.43 %
- Correct debater: 75.19 %
- Incorrect debater: 27.55 %
Accuracy: 61.43 %
- Correct debater: 75.19 %
- Incorrect debater: 54.55 %
Extra Evals: 16.67 %
Evals per sample distribution: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8]


In [44]:
def nps(task_ratings):
    """Compute a net-promoter-style score and the mean rating.

    Args:
        task_ratings: mapping from a numeric score to the number of raters
            who gave that score.

    Returns:
        None when there are no ratings at all; otherwise a tuple
        ``(nps, mean)`` where ``nps`` is 100 * (raters scoring >= 9 minus
        raters scoring <= 6) / total raters and ``mean`` is the average
        score, both rounded to 2 decimals.
    """
    total_raters = sum(task_ratings.values())
    if total_raters == 0:
        return None

    rating_counts = list(task_ratings.items())
    promoters = sum(count for score, count in rating_counts if score >= 9)
    detractors = sum(count for score, count in rating_counts if score <= 6)
    weighted_total = sum(count * score for score, count in rating_counts)

    net_fraction = (promoters - detractors) / float(total_raters)
    mean_rating = weighted_total / float(total_raters)
    return round(100 * net_fraction, 2), round(mean_rating, 2)

# nps() returns None when no ratings were recorded, otherwise an (NPS, mean) tuple.
nps_summary = nps(task_ratings)
print('NPS, Mean:', nps_summary)

NPS, Mean: (31.11, 8.08)


In [45]:
# Display the per-HIT median prompt durations, which an earlier cell sorted in
# ascending order (values are raw durations / 1000 -- presumably seconds; confirm).
durations_by_worker

array([ 6.172 ,  6.6075,  6.849 ,  7.0265,  7.3355,  7.3735,  7.5445,
        7.6285,  7.691 ,  7.7515,  7.9435,  7.9615,  8.1425,  8.274 ,
        8.3575,  8.5165,  8.74  ,  8.7515,  8.9515,  9.087 ,  9.3095,
        9.3615,  9.464 ,  9.563 ,  9.716 , 10.0015, 10.082 , 10.661 ,
       10.6665, 10.7675, 10.807 , 10.8775, 11.073 , 11.425 , 11.437 ,
       11.566 , 11.582 , 11.5915, 12.297 , 12.2985, 12.4075, 12.707 ,
       12.78  , 13.002 , 13.062 , 13.148 , 13.336 , 13.373 , 13.6955,
       13.706 , 13.739 , 14.6715, 14.714 , 14.844 , 15.1865, 15.308 ,
       15.329 , 15.563 , 15.6425, 15.672 , 15.8845, 15.9865, 16.093 ,
       16.4835, 16.6105, 16.6815, 16.846 , 16.954 , 17.2005, 17.5205,
       17.658 , 18.2075, 18.9795, 19.3685, 20.156 , 20.824 , 21.4605,
       21.5295, 21.683 , 21.9415, 22.4395, 22.5105, 22.6785, 23.6245,
       25.8175, 29.5245, 33.3565, 33.9535, 35.054 , 42.987 ])

In [46]:
# Median of the per-HIT accuracies recorded for the active prompt type.
np.median(accuracy_by_worker[prompt_type])

0.6

In [47]:
# Pretty-print the first loaded HIT record to inspect its structure.
# NOTE(review): the record includes assignment/worker identifiers; consider
# clearing this output before sharing the notebook.
pprint(hit_results[0])

{'accuracy': {'quote and question': 0.6},
 'assignment_id': '3WQQ9FUS6B9K3IUPFGQKZESUEGQB81',
 'block_reasons': [],
 'data': [{'context': 'Woman: Is that the most important reason to develop '
                      'alternative fuel and energy sources?\n'
                      '\n'
                      'What do most experts agree on according to the man?\n'
                      '\n'
                      'A: Oil will soon be replaced by alternative energy '
                      'sources.\n'
                      'B: Oil production will begin to decline worldwide by '
                      '2025.\n'
                      'C: Oil consumption has given rise to many global '
                      'problems.',
           'duration': 58533,
           'response': 'B',
           'sample': {'answer_starts': 1,
                      'debate_mode': 'Ⅰ',
                      'episode_done': True,
                      'eval_labels': 'B: Oil production will begin to decline '
                