In [2]:
import json
import numpy as np
import os
from pprint import pprint

# Common path prefix of the MTurk run directories; the chosen task id is
# appended to it in the "derived settings" section below.
task_dir = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

# ---- Experiment selection -------------------------------------------------
# Un-comment exactly one (prompt_type, task_id) line together with the
# matching `dataset` assignment. The trailing comment on each line is the
# author's label for that run.

### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554069931  # Search
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence
# dataset = 'race'

### DREAM
dataset = 'dream'
# prompt_type, task_id = 'question', 1554582693  # Q-only
# prompt_type, task_id = 'quote and question', 1554596686  # TFIDF(Q+O)
# prompt_type, task_id = 'quote and question', 1554587404  # TFIDF(O)
# prompt_type, task_id = 'quote and question', 1554662280  # FastText(O)
# prompt_type, task_id = 'quote and question', 1554675304  # Search
# prompt_type, task_id = 'quote and question', 1554685131  # SL
# prompt_type, task_id = 'quote and question', 1554692472  # SL-Sents
# prompt_type, task_id = 'quote and question', 1554729998  # SL-Sents-Influence
prompt_type, task_id = 'quote and question', 1555333992  # SL-Theory-of-Mind

# Optional qid filter: when not None, only questions whose qid contains this
# substring are analysed.
# split = 'middle'
split = None

# ---- Derived settings -----------------------------------------------------
# task_id may be written as an int or a str above; formatting handles both.
task_dir = '{}{}'.format(task_dir, task_id)

# 'dream' is configured with three answer options, anything else with four.
if dataset == 'dream':
    num_options = 3
else:
    num_options = 4
options = list('ABCD')[:num_options]

# Translates a sample's `debate_mode` marker (upper- or lower-case Roman
# numeral character, or None) into the answer-option letter it stands for.
debate_mode_to_option = {
    'Ⅰ': 'A', 'Ⅱ': 'B', 'Ⅲ': 'C', 'Ⅳ': 'D',
    'ⅰ': 'A', 'ⅱ': 'B', 'ⅲ': 'C', 'ⅳ': 'D',
    None: None,
}

# Read HIT data
# Every entry of `task_dir` whose name does not start with 'o_' is expected to
# hold a 'custom/data.json' file, which is parsed and collected in
# `hit_results`.
# NOTE(review): the 'o_' entries are presumably the qualification-test
# conversations (they are what the "Passed Test" denominator counts) - confirm.
hit_dirs = os.listdir(task_dir)  # list once so every count below refers to the same snapshot
print('# HIT Files:', len(hit_dirs))
hit_results = []
num_passed_test = 0
for hit_dir in hit_dirs:
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(os.path.join(task_dir, hit_dir, 'custom/data.json'), 'r') as file:
        hit_results.append(json.load(file))
if len(hit_dirs) != num_passed_test:
    # Number of skipped 'o_' entries; non-zero inside this branch, so the
    # division below cannot fail.
    num_total_tested = len(hit_dirs) - num_passed_test
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 261
# Passed Test: 82 / 179 = 45.81 %


In [3]:
# Aggregate worker-level and question-level statistics over all loaded HITs.
#
# Populated here for use by later cells:
#   metrics             - qid -> {model_stance: counters, 'answer': gold option}
#   task_ratings        - histogram of integer task ratings (keys 0..10)
#   quote_ratings       - integer quote ratings, one per HIT that supplied one
#   durations           - per-question durations pooled over all HITs
#   durations_by_worker - median per-question duration of each HIT
#   accuracy_by_worker  - question type -> per-HIT accuracies
#   max_response_freqs  - per HIT, share of answers given to its most-chosen option
#   worker_ids          - worker id of every HIT that was processed
num_valid_hits, num_rejected_hits, num_incomplete_hits = 0, 0, 0
metrics = {}
task_ratings = {i: 0 for i in range(11)}
quote_ratings = []
durations = []
durations_by_worker = []
accuracy_by_worker = {}
max_response_freqs = []
worker_ids = []

for hit_result in hit_results:
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print(hit_result['worker_id'], hit_result['assignment_id'],
              '| reject_reasons:', hit_result['reject_reasons'],
              '| block_reasons:', hit_result['block_reasons'],
              '| bonus_reasons: ' + str(hit_result['bonus_reasons']) if 'bonus_reasons' in hit_result else '')
        # NOTE(review): there is no `continue` here, so a rejected/blocked HIT
        # is still processed below and is also counted in num_valid_hits.
        # Confirm that including these HITs in the statistics is intended.
    elif hit_result['feedback'] is None:
        # No feedback recorded: the HIT is counted as incomplete and skipped.
        num_incomplete_hits += 1
        continue
    
    worker_ids.append(hit_result['worker_id'])
#     if hit_result['worker_id'] == 'A1PUHCEBSOWETV':
#         print('*** A1PUHCEBSOWETV ***', hit_result['assignment_id'])
    num_valid_hits += 1
    # Ratings are stored as strings; only purely numeric ones are counted.
    if (hit_result['task_rating'] is not None) and (hit_result['task_rating'].isdigit()):
        task_ratings[int(hit_result['task_rating'])] += 1
    # 'quote_rating' is optional (the summary print below treats it that way
    # too), so read it with .get() instead of indexing.
    quote_rating = hit_result.get('quote_rating')
    if (quote_rating is not None) and (quote_rating.isdigit()):
        quote_ratings.append(int(quote_rating))
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        accuracy_by_worker.setdefault(qtype, []).append(qtype_accuracy)
    
    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    for prompt in hit_result['data']:
        qid = prompt['sample']['qid']
        if (split is not None) and (split not in qid):
            continue
        model_stance = debate_mode_to_option[prompt['sample']['debate_mode']]
        answer = prompt['sample']['eval_labels'][0]
        human_correct = (prompt['response'] == answer)  # bool; adds as 0/1 to the counters
        assert answer in options, 'Answer must be in options.'
        
        # Calculate metrics
        if qid not in metrics:
            # First sighting of this question: create zeroed counters for every
            # stance (or a single None entry when no stance is attached) and
            # remember the gold answer next to them.
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            metrics[qid]['answer'] = answer
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        # Did the worker pick the option the model was arguing for?
        prompt_metrics['num_debate_mode_responses'] += (prompt['response'] == model_stance)
        
        # presumably milliseconds -> seconds; TODO confirm the unit of 'duration'
        hit_durations.append(prompt['duration'] / 1000.)
        response_option_counts[prompt['response']] += 1
    duration = np.median(np.array(hit_durations))
    durations_by_worker.append(duration)
    durations += hit_durations
    # Normalise the per-option answer counts to frequencies; a high maximum
    # means the worker chose the same option for most questions.
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    print('| Time:', round(duration, 1),
          '| Acc:', round(100 * hit_result['accuracy'][prompt_type]),
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', hit_result.get('quote_rating'),
          '| Quote Desc:', hit_result.get('quote_description'))

# Convert the collected lists to sorted arrays for the summary statistics.
quote_ratings = np.array(quote_ratings)
durations = np.array(durations)
durations_by_worker = np.array(durations_by_worker)
max_response_freqs = np.array(max_response_freqs)
durations.sort()
durations_by_worker.sort()
max_response_freqs.sort()
for qtype in accuracy_by_worker:
    accuracy_by_worker[qtype] = np.array(accuracy_by_worker[qtype])
    accuracy_by_worker[qtype].sort()
print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)
print('Median Question Duration:', np.median(durations))
print('Median Worker Duration:', np.median(durations_by_worker))
print('Median Worker Accuracy:', np.median(accuracy_by_worker[prompt_type]))
print('Median Max Response Freq:', np.median(max_response_freqs))
print('Quote Rating:',
      '| Mean:', round(quote_ratings.mean(), 2),
      '| Median:', round(np.median(quote_ratings), 2),
      '| Std:', round(np.std(quote_ratings), 2))
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0]['worker_id'])

| Time: 7.9 | Acc: 50 | Max Freq: 50.0 | Rate: 10 | Feedback: Nothing | Quote Rating: 8 | Quote Desc: Interesting
| Time: 11.9 | Acc: 35 | Max Freq: 50.0 | Rate: 10 | Feedback: I don't know, I enjoyed it the way it was.  I thought it was easy to understand the directions and fun to do. | Quote Rating: 8 | Quote Desc: Some of them were much more helpful than others,  some didn't help at all.
| Time: 15.8 | Acc: 55 | Max Freq: 55.0 | Rate: 10 | Feedback: Make more of the quotes relevant, or perhaps longer interchange/conversation. | Quote Rating: 8 | Quote Desc: Mostly useful, some not helpful at all nor relevant to the question.
| Time: 9.7 | Acc: 45 | Max Freq: 55.0 | Rate: 7 | Feedback: Add more details into the passage quotes | Quote Rating: 6 | Quote Desc: Some of them indicated the correct answer, some of them were too vague.
| Time: 14.3 | Acc: 70 | Max Freq: 40.0 | Rate: 9 | Feedback: I think it's fine the way it is. | Quote Rating: 8 | Quote Desc: Brief and usually helpful.
| Ti

In [4]:
# Per-sample statistics, where a "sample" is one (question, model stance)
# entry of `metrics`. Reports how often workers answered correctly and how
# often they chose the option the model argued for, overall and split by
# whether that stance was the gold answer.

# Number of evaluations wanted per sample; used for both the "insufficient"
# and the "extra" evaluation statistics below.
num_target_evals = 5

accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
for qid, qid_metrics in metrics.items():
    answer = qid_metrics['answer']
    for model_stance, prompt_metrics in qid_metrics.items():
        # Skip the 'answer' bookkeeping entry; only stance keys hold counters.
        if model_stance not in [None] + options:
            continue

        # Q-only stats
        num_evals = prompt_metrics['num']
        num_evals_by_sample.append(num_evals)
        if num_evals == 0:
            # Counters are pre-created for every option when a question is
            # first seen, so a stance that was never shown to a worker has
            # zero evaluations. It is recorded above (and so counts as
            # insufficient) but has no accuracy; dividing would raise
            # ZeroDivisionError.
            continue
        accuracy_by_sample.append(prompt_metrics['num_correct'] / num_evals)
        if model_stance is None:
            continue
        
        # Debater stats
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / num_evals
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

# De-duplicate worker ids (a worker may have completed several HITs).
worker_ids = set(worker_ids)
        
num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', num_evals_by_sample.mean())
print('Fraction insuffient evals:', (num_evals_by_sample < num_target_evals).mean())

convinced_freqs = np.array(convinced_freqs)
print('Convinced:', round(100 * convinced_freqs.mean(), 2), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', round(100 * convinced_freqs_with_correct_debate_mode.mean(), 2), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * convinced_freqs_with_incorrect_debate_mode.mean(), 2), '%')

accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', round(100 * accuracy_by_sample.mean(), 2), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', round(100 * accuracy_by_sample_correct_debate_mode.mean(), 2), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * accuracy_by_sample_incorrect_debate_mode.mean(), 2), '%')

# Share of all collected evaluations that exceed the per-sample target.
print('Extra Evals:', round(((100. * (num_evals_by_sample - num_target_evals).sum()) / num_evals_by_sample.sum()), 2), '%')
num_evals_by_sample.sort()
print('Evals per sample distribution:', num_evals_by_sample)

Evals per sample: 5.4
Fraction insuffient evals: 0.0
Convinced: 48.46 %
- Correct debater: 75.39 %
- Incorrect debater: 35.0 %
Accuracy: 52.89 %
- Correct debater: 75.39 %
- Incorrect debater: 41.64 %
Extra Evals: 7.41 %
Evals per sample distribution: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7]


In [47]:
def nps(task_ratings):
    """Summarise a rating histogram as a Net Promoter Score and a mean.

    Args:
        task_ratings: dict mapping a numeric score to the number of raters
            who gave that score.

    Returns:
        None when the histogram contains no ratings. Otherwise a tuple
        ``(nps, mean)``, both rounded to two decimals, where ``nps`` is
        100 * (raters scoring >= 9 minus raters scoring <= 6) / all raters
        and ``mean`` is the rater-weighted average score.
    """
    total_raters = sum(task_ratings.values())
    if total_raters == 0:
        return None

    # Scores of 7 and 8 count towards the total but neither add to nor
    # subtract from the net score.
    promoters = sum(count for score, count in task_ratings.items() if score >= 9)
    detractors = sum(count for score, count in task_ratings.items() if score <= 6)
    weighted_total = sum(count * score for score, count in task_ratings.items())

    net_score = 100 * ((promoters - detractors) / float(total_raters))
    mean_score = weighted_total / float(total_raters)
    return round(net_score, 2), round(mean_score, 2)

# Report the (NPS, mean) pair for the task-rating histogram; nps() returns
# None instead of a pair when no ratings were collected.
print('NPS, Mean:', nps(task_ratings))

NPS, Mean: (23.33, 8.03)


In [48]:
# Inspect the per-HIT median question durations (sorted ascending in place by
# the aggregation cell; presumably in seconds - TODO confirm the unit).
durations_by_worker

array([ 3.0275,  4.4515,  5.2565,  5.6405,  5.6475,  5.703 ,  6.156 ,
        6.2705,  6.6475,  6.8395,  7.2245,  7.317 ,  7.4135,  7.7045,
        7.7315,  7.917 ,  7.932 ,  8.148 ,  8.1855,  8.6555,  8.87  ,
        8.9845,  8.9915,  9.451 ,  9.4655,  9.6525, 10.696 , 11.7755,
       12.808 , 18.698 ])

In [29]:
# Mean (rather than the median printed by the aggregation cell) of the
# per-HIT accuracies for the selected prompt type.
np.mean(accuracy_by_worker[prompt_type])

0.6372222222222224