In [68]:
import json
import numpy as np
import os
from pprint import pprint

# Path prefix shared by every run's log directory; the numeric task id chosen
# below is appended to it to obtain the directory of one specific run.
task_dir_prefix = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

# ---------------------------------------------------------------------------
# Run selection: keep exactly one `dataset` line and one
# `prompt_type, task_id` line uncommented.
# ---------------------------------------------------------------------------

### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554069931  # Search
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence
# dataset = 'race'

### DREAM
dataset = 'dream'
# prompt_type, task_id = 'question', 1554582693  # Q-only
# prompt_type, task_id = 'quote and question', 1554596686  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554587404  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554662280  # FastText(O)
# prompt_type, task_id = 'quote and question', 1554675304  # Search
# prompt_type, task_id = 'quote and question', 1554685131  # SL
# prompt_type, task_id = 'quote and question', 1554692472  # SL-Sents
# prompt_type, task_id = 'quote and question', 1554729998  # SL-Sents-Influence
# prompt_type, task_id = 'quote and question', 1555333992  # SL-Theory-of-Mind
## All answers at once
# prompt_type, task_id = 'question, answers, and quotes', 1555707929  # TFIDF(O)
# prompt_type, task_id = 'question, answers, and quotes', 1555722489  # Search
prompt_type = 'question and quotes'  # SL
task_id = 1555789302

# Optional substring filter applied to question ids during aggregation;
# None keeps every question.
# split = 'middle'
split = None

# Derived settings
task_dir = '{}{}'.format(task_dir_prefix, task_id)
num_options = {'dream': 3}.get(dataset, 4)  # 3 answer options for 'dream', 4 otherwise
options = list('ABCD'[:num_options])
# Maps the Roman-numeral `debate_mode` stored with each sample (upper- or
# lower-case, or None when absent) to the corresponding answer letter.
debate_mode_to_option = {
    'Ⅰ': 'A',
    'Ⅱ': 'B',
    'Ⅲ': 'C',
    'Ⅳ': 'D',
    'ⅰ': 'A',
    'ⅱ': 'B',
    'ⅲ': 'C',
    'ⅳ': 'D',
    None: None,
}

# Read HIT data
# List the run directory once so that every count below is derived from the
# same snapshot of its contents.
hit_dirs = os.listdir(task_dir)
print('# HIT Files:', len(hit_dirs))
hit_results = []
num_passed_test = 0
for hit_dir in hit_dirs:
    # Entries prefixed with 'o_' are not loaded; they are only counted as
    # "tested" below (presumably onboarding/qualification records — confirm).
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(os.path.join(task_dir, hit_dir, 'custom/data.json'), 'r') as file:
        hit_results.append(json.load(file))
if len(hit_dirs) != num_passed_test:
    # Number of skipped ('o_') entries; non-zero inside this branch, so the
    # division below is safe.
    num_total_tested = len(hit_dirs) - num_passed_test
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 132
# Passed Test: 34 / 98 = 34.69 %


In [69]:
# Counters for how each HIT was classified
num_valid_hits = 0
num_rejected_hits = 0
num_incomplete_hits = 0

# Per-question statistics, keyed by question id
metrics = dict()
# Histogram of task ratings: rating 0..10 -> number of HITs giving it
task_ratings = dict.fromkeys(range(11), 0)

# Flat accumulators filled while iterating over the HITs
quote_ratings = list()
durations = list()
durations_by_worker = list()
accuracy_by_worker = dict()
max_response_freqs = list()
worker_ids = list()

# Aggregate per-HIT and per-question statistics over every loaded HIT.
for hit_result in hit_results:
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print(hit_result['worker_id'], hit_result['assignment_id'],
              '| reject_reasons:', hit_result['reject_reasons'],
              '| block_reasons:', hit_result['block_reasons'],
              '| bonus_reasons: ' + str(hit_result['bonus_reasons']) if 'bonus_reasons' in hit_result else '')
        # NOTE(review): there is no `continue` here, so rejected/blocked HITs
        # are also counted in `num_valid_hits` and aggregated below. Confirm
        # that this is intended before relying on the VALID count.
    elif hit_result['feedback'] is None:
        # No feedback recorded: treat the HIT as incomplete and skip it.
        num_incomplete_hits += 1
        continue

    worker_ids.append(hit_result['worker_id'])
#     if hit_result['worker_id'] == 'A1PUHCEBSOWETV':
#         print('*** A1PUHCEBSOWETV ***', hit_result['assignment_id'])
    num_valid_hits += 1
    if (hit_result['task_rating'] is not None) and (hit_result['task_rating'].isdigit()):
        task_ratings[int(hit_result['task_rating'])] += 1
    # 'quote_rating' is optional (the summary print below already treats it
    # that way), so read it with .get() instead of indexing, which would raise
    # KeyError for HITs that lack the field.
    quote_rating = hit_result.get('quote_rating')
    if (quote_rating is not None) and (quote_rating.isdigit()):
        quote_ratings.append(int(quote_rating))
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        # Append in place rather than rebuilding the list on every HIT.
        accuracy_by_worker.setdefault(qtype, []).append(qtype_accuracy)

    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    responses = []
    for prompt in hit_result['data']:
        qid = prompt['sample']['qid']
        # Optional filter: keep only questions whose id contains `split`.
        if (split is not None) and (split not in qid):
            continue
        # Answer letter the model argued for (None when no debate mode is set).
        model_stance = debate_mode_to_option[prompt['sample']['debate_mode']]
        answer = prompt['sample']['eval_labels'][0]
        human_correct = (prompt['response'] == answer)
        assert answer in options, 'Answer must be in options.'

        # Calculate metrics
        if qid not in metrics:
            # One counter set per possible stance; a single None entry when
            # the first prompt seen for this question has no stance.
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            metrics[qid]['answer'] = answer
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        # Counts responses that match the stance the model argued for.
        prompt_metrics['num_debate_mode_responses'] += (prompt['response'] == model_stance)

        hit_durations.append(prompt['duration'] / 1000.)
        response_option_counts[prompt['response']] += 1
        responses.append(prompt['response'])
    print(np.array(hit_durations))
    duration = np.median(np.array(hit_durations))
    durations_by_worker.append(duration)
    durations += hit_durations
    # Fraction of this HIT's responses that went to the most-chosen option.
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    print('| Time:', round(duration, 1),
          '| Acc:', round(100 * hit_result['accuracy'][prompt_type]),
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', quote_rating,
          '| Quote Desc:', hit_result.get('quote_description'))

# Convert the accumulators to sorted NumPy arrays and print run-level summaries.
quote_ratings = np.array(quote_ratings)
durations = np.array(durations)
durations_by_worker = np.array(durations_by_worker)
max_response_freqs = np.array(max_response_freqs)
durations.sort()
durations_by_worker.sort()
max_response_freqs.sort()
for qtype in accuracy_by_worker:
    accuracy_by_worker[qtype] = np.array(accuracy_by_worker[qtype])
    accuracy_by_worker[qtype].sort()
print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)
print('Median Question Duration:', np.median(durations))
print('Median Worker Duration:', np.median(durations_by_worker))
print('Median Worker Accuracy:', np.median(accuracy_by_worker[prompt_type]))
print('Median Max Response Freq:', np.median(max_response_freqs))
if quote_ratings.size > 0:
    print('Quote Rating:',
          '| Mean:', round(quote_ratings.mean(), 2),
          '| Median:', round(np.median(quote_ratings), 2),
          '| Std:', round(np.std(quote_ratings), 2))
else:
    # No HIT supplied a numeric quote rating; taking mean/median/std of an
    # empty array would only emit NumPy RuntimeWarnings and print NaNs.
    print('Quote Rating: no ratings collected')
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0])

[12.709  7.208 29.393 20.374 15.629  8.965 12.496 28.015 14.957 12.516
  8.2   19.473 15.736 16.859 10.372 19.74  10.425  8.739 10.94  29.596]
| Time: 13.8 | Acc: 85 | Max Freq: 35.0 | Rate: 8 | Feedback: A couple of them were pretty impossible | Quote Rating: 9 | Quote Desc: Interesting
[77.276 51.117 33.66  24.222 93.05  35.05  13.801 33.338 30.877 29.096
 30.975 52.833 13.62  42.506 67.475 26.976 27.185 26.089 33.053 42.753]
| Time: 33.2 | Acc: 80 | Max Freq: 50.0 | Rate: 10 | Feedback: I honestly think the set up is really good the way it is and does not need improved. The practice questions helped a lot in understanding the task. | Quote Rating: 8 | Quote Desc: They were helpful overall in figuring out which answer was best. The passage quotes were concise and informative.
[20.652 12.837  9.077 12.312 12.004  9.241  7.124  7.497  8.581 10.245
 12.425 10.398  7.317  4.607  8.445  6.366  6.163  4.728  7.822  3.345]
| Time: 8.5 | Acc: 50 | Max Freq: 55.0 | Rate: 2 | Feedback: make th

 10.305 29.596 12.264 41.943 21.053  9.591 11.772  7.471  7.774  9.578]
| Time: 12.0 | Acc: 70 | Max Freq: 50.0 | Rate: 10 | Feedback: I found this to be enjoyable just the way it is. Thank you! | Quote Rating: 7 | Quote Desc: They were helpful about 75% of the time.  For a couple questions there was no help and it was a complete guess.  For one example(14), the quotes alone indicated two correct answers, of which I picked one that must have been invalidated elsewhere in the paragraph.
[15.359 18.591  9.476 19.093  8.182 10.688 11.084 14.588 18.361 12.346
 10.44  20.772 16.208  8.871 12.64   9.447 21.405 11.564  9.785 15.687]
| Time: 12.5 | Acc: 75 | Max Freq: 40.0 | Rate: 0 | Feedback: The interface was a bit glitchy at times and slow to load | Quote Rating: 5 | Quote Desc: Short, without much relevant information to the question
[ 22.336 122.982  45.753  72.634  46.478  35.19   77.269  34.388  39.978
 136.835  21.359  36.638  32.543  28.013  79.506  13.765  28.899  27.43
  28.477  21

In [70]:
def _mean_percent(values):
    """Return the mean of `values` as a percentage rounded to two decimals.

    An empty input yields NaN directly, without calling `.mean()` on an empty
    array (which emits a NumPy RuntimeWarning). This happens for every
    debater statistic when no sample has a model stance.
    """
    values = np.asarray(values)
    if values.size == 0:
        return float('nan')
    return round(100 * values.mean(), 2)


# Target number of human evaluations per (question, stance) sample; used for
# both the "insufficient evals" and the "extra evals" figures below.
num_target_evals = 5

accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
for qid, qid_metrics in metrics.items():
    answer = qid_metrics['answer']
    for model_stance, prompt_metrics in qid_metrics.items():
        # Skip the 'answer' entry stored next to the per-stance counters.
        if model_stance not in [None] + options:
            continue

        # Q-only stats
        num_evals_by_sample.append(prompt_metrics['num'])
        accuracy_by_sample.append(prompt_metrics['num_correct'] / prompt_metrics['num'])
        if model_stance is None:
            continue

        # Debater stats
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / prompt_metrics['num']
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

worker_ids = set(worker_ids)

num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', num_evals_by_sample.mean())
print('Fraction insuffient evals:', (num_evals_by_sample < num_target_evals).mean())

convinced_freqs = np.array(convinced_freqs)
print('Convinced:', _mean_percent(convinced_freqs), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', _mean_percent(convinced_freqs_with_correct_debate_mode), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(convinced_freqs_with_incorrect_debate_mode), '%')

accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', _mean_percent(accuracy_by_sample), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', _mean_percent(accuracy_by_sample_correct_debate_mode), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', _mean_percent(accuracy_by_sample_incorrect_debate_mode), '%')

# Share of all collected evaluations that exceed the per-sample target.
print('Extra Evals:', round(((100. * (num_evals_by_sample - num_target_evals).sum()) / num_evals_by_sample.sum()), 2), '%')
num_evals_by_sample.sort()
print('Evals per sample distribution:', num_evals_by_sample)

Evals per sample: 5.8
Fraction insuffient evals: 0.2
Convinced: nan %
- Correct debater: nan %
- Incorrect debater: nan %
Accuracy: 75.17 %
- Correct debater: nan %
- Incorrect debater: nan %
Extra Evals: 13.79 %
Evals per sample distribution: [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]


  ret = ret.dtype.type(ret / rcount)


In [71]:
def nps(task_ratings):
    """Compute a Net Promoter Score and the mean rating from a rating histogram.

    Args:
        task_ratings: dict mapping a numeric score to the number of raters
            who gave that score.

    Returns:
        None when the histogram contains no ratings; otherwise a tuple
        ``(nps, mean)`` where ``nps`` is the percentage of raters scoring
        9 or above minus the percentage scoring 6 or below, and ``mean`` is
        the average score. Both values are rounded to two decimals.
    """
    total_raters = sum(task_ratings.values())
    if total_raters == 0:
        return None

    histogram = list(task_ratings.items())
    promoters = sum(count for score, count in histogram if score >= 9)
    detractors = sum(count for score, count in histogram if score <= 6)
    weighted_total = sum(count * score for score, count in histogram)

    net_score = round(100 * ((promoters - detractors) / float(total_raters)), 2)
    mean_score = round((weighted_total / float(total_raters)), 2)
    return net_score, mean_score

# Report the (NPS, mean rating) pair for the collected task ratings;
# nps() returns None instead of a tuple when no ratings were recorded.
print('NPS, Mean:', nps(task_ratings))

NPS, Mean: (10.34, 7.38)


In [72]:
# Display the sorted per-HIT median question durations
# (each value is a median of `duration / 1000` — presumably seconds; confirm).
durations_by_worker

array([ 5.401 ,  8.513 , 11.2215, 11.412 , 11.552 , 12.018 , 12.493 ,
       13.4265, 13.6475, 13.833 , 15.1395, 16.7235, 17.716 , 18.0665,
       19.6935, 20.0455, 20.8885, 21.2925, 22.543 , 23.987 , 24.2415,
       24.81  , 25.385 , 25.4535, 26.0395, 26.204 , 32.859 , 33.1955,
       34.789 ])

In [73]:
# Mean (as opposed to the median printed earlier) of the per-HIT accuracies
# recorded for the selected prompt type.
np.mean(accuracy_by_worker[prompt_type])

0.746551724137931

In [74]:
# Display the task-rating histogram: rating (0-10) -> number of HITs.
task_ratings

{0: 1, 1: 0, 2: 1, 3: 1, 4: 0, 5: 3, 6: 2, 7: 7, 8: 3, 9: 1, 10: 10}