In [46]:
import json
import numpy as np
import os
from pprint import pprint
from scipy.stats import ttest_ind

# Prefix of the per-run data directories; the numeric task id selected below is
# appended to it (see "Set useful variables") to address a single evaluation run.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a machine
# that has this directory. Consider making it configurable.
task_dir = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

# Run selection: exactly one `prompt_type, task_id = ...` line should be active.
# `prompt_type` is later used as the key into each HIT's 'accuracy' dict, and
# `task_id` (int or str) is stringified and appended to `task_dir`.
### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF-QA
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF-A
# prompt_type, task_id = 'quote and question', 1554069931  # Cross-Ranker
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence
### RACE: Test
prompt_type, task_id = 'quote and question', 1556671432  # TFIDF-QA
### RACE Test
# prompt_type, task_id = 'passage and question', 1555823963  # Full Passage
# prompt_type, task_id = 'quotes and question', 1555946909  # FastText
# prompt_type, task_id = 'quotes and question', 1555952058  # Search (6-10 sentence incorrectly placed at end)
# Must match the group the active run above belongs to ('race' here; 'dream' below).
dataset = 'race'

### DREAM
# prompt_type, task_id = 'question', 1554582693  # Q-only
# prompt_type, task_id = 'quote and question', 1554596686  # TFIDF-QA
# prompt_type, task_id = 'quote and question', 1554587404  # TFIDF-A
# prompt_type, task_id = 'quote and question', 1554662280  # BoW-A
# prompt_type, task_id = 'quote and question', 1556670413  # Bi-Ranker
# prompt_type, task_id = 'quote and question', 1554675304  # Cross-Ranker
# prompt_type, task_id = 'quote and question', 1554685131  # SL
# prompt_type, task_id = 'quote and question', 1554692472  # SL-Sents
# prompt_type, task_id = 'quote and question', 1554729998  # SL-Sents-Influence
# prompt_type, task_id = 'quote and question', 1555333992  # SL-Theory-of-Mind
## All answers at once
# prompt_type, task_id = 'question, answers, and quotes', 1555707929  # TFIDF(O): 64.21%: (Less filter / no feedback)
# prompt_type, task_id = 'question, answers, and quotes', 1555722489  # Search: 65.38%: (Less filter / no feedback)
# prompt_type, task_id = 'question and quotes', 1555789302  # SL: 75.17% (4/5 filter)
# prompt_type, task_id = 'question and quotes', 1555812443  # SL: 79.32% Actually: quotes and question (4/5 filter)
# prompt_type, task_id = 'passage and question', 1555804551  # Full Passage: 92.97%
# prompt_type, task_id = 'quotes and question', 1555823257  # FastText (5/5 filter) (77.33%)
# prompt_type, task_id = 'quotes and question', 1555946647  # Search on RACE (4 sentences incorrectly placed at end) (80.84%)
# dataset = 'dream'


# Substring filter on question ids: prompts whose qid does not contain `split`
# are skipped during aggregation; None disables the filter.
split = 'high'  # 'middle', 'high', None


# Set useful variables
task_dir += str(task_id)
# The split filter is only applied to RACE runs.
if dataset != 'race':
    split = None
# DREAM questions have 3 answer options, everything else 4.
num_options = 3 if dataset == 'dream' else 4
options = ['A', 'B', 'C', 'D'][:num_options]
# Maps the `debate_mode` symbol stored on each sample (upper- or lower-case
# Roman-numeral characters) to the answer letter it stands for; a missing
# debate mode (None) maps to None.
debate_mode_to_option = {'Ⅰ': 'A', 'Ⅱ': 'B', 'Ⅲ': 'C', 'Ⅳ': 'D', 'ⅰ': 'A', 'ⅱ': 'B', 'ⅲ': 'C', 'ⅳ': 'D', None: None}
# Single-character question-type codes used as per-question metric buckets
# (lower case, so they cannot collide with the upper-case option letters).
question_type_labels = ['a', 'c', 'l', 'm', 's'] if dataset == 'dream' else ['a', 'b', 'c', 'd', 'e']

# Read HIT data
# List the run directory exactly once: os.listdir returns entries in arbitrary
# order, so sorting makes `hit_results` (and everything printed from it)
# reproducible, and reusing the listing keeps the counts below consistent.
hit_dir_names = sorted(os.listdir(task_dir))
print('# HIT Files:', len(hit_dir_names))
hit_results = []
num_passed_test = 0
for hit_dir in hit_dir_names:
    # Entries prefixed with 'o_' carry no task data and are only counted as
    # "tested" below (presumably onboarding/qualification runs -- TODO confirm).
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    # The context manager closes the file; no explicit close() is needed.
    with open(os.path.join(task_dir, hit_dir, 'custom/data.json'), 'r') as file:
        hit_results.append(json.load(file))
# Only report a pass rate when at least one 'o_' entry exists (this also
# guarantees a non-zero denominator).
if len(hit_dir_names) != num_passed_test:
    num_total_tested = len(hit_dir_names) - num_passed_test
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 275
# Passed Test: 108 / 167 = 64.67 %


In [54]:
# --- Accumulators filled while iterating over the HIT results ---

# How many HITs fell into each category.
num_valid_hits = num_rejected_hits = num_incomplete_hits = 0

# Accuracy (in percent) a worker must reach for their HIT duration to be kept
# in `durations_by_worker_with_min_acc`.
min_acc = 85

# Histogram of task ratings: one counter per integer score 0..10.
task_ratings = dict.fromkeys(range(11), 0)

# Dict-valued accumulators (each one a distinct object).
metrics = dict()
accuracy_by_worker = dict()
hits_by_qid = dict()

# List-valued accumulators (each one a distinct object).
quote_ratings = list()
durations = list()
durations_by_worker = list()
durations_by_worker_with_min_acc = list()
max_response_freqs = list()
worker_ids = list()

# Walk over every loaded HIT, classify it (rejected / incomplete / valid), and
# fold its prompts into the per-question `metrics` plus the per-worker timing,
# rating and response-frequency accumulators. One summary line is printed per HIT.
for hit_result in hit_results:
    # NOTE(review): this branch has no `continue`, so a HIT with reject/block
    # reasons is counted in num_rejected_hits AND falls through to be counted
    # as valid and aggregated below. Because of the `elif`, such a HIT is also
    # never treated as incomplete. Confirm this is intended.
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print(hit_result['worker_id'], hit_result['assignment_id'],
              '| reject_reasons:', hit_result['reject_reasons'],
              '| block_reasons:', hit_result['block_reasons'],
              # The conditional expression applies to this last argument only.
              '| bonus_reasons: ' + str(hit_result['bonus_reasons']) if 'bonus_reasons' in hit_result else '')
    elif hit_result['feedback'] is None:
        # No feedback recorded: the HIT is treated as unfinished and skipped.
        num_incomplete_hits += 1
        continue
    
    worker_ids.append(hit_result['worker_id'])
#     if hit_result['worker_id'] == 'A1PUHCEBSOWETV':
#         print('*** A1PUHCEBSOWETV ***', hit_result['assignment_id'])
    num_valid_hits += 1
    # Ratings are stored as strings; only purely numeric ones are counted.
    # An integer outside 0..10 would raise KeyError on `task_ratings`.
    if (hit_result['task_rating'] is not None) and (hit_result['task_rating'].isdigit()):
        task_ratings[int(hit_result['task_rating'])] += 1
    if (hit_result.get('quote_rating') is not None) and (hit_result['quote_rating'].isdigit()):
        quote_ratings.append(int(hit_result['quote_rating']))
    # Collect this HIT's accuracy under every key of its 'accuracy' dict
    # (the same dict is indexed with `prompt_type` further down).
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        accuracy_by_worker[qtype] = accuracy_by_worker.get(qtype, []) + [qtype_accuracy]
    
    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    responses = []
    for prompt in hit_result['data']:
        qid = prompt['sample']['qid']
        # Apply the split filter: keep only question ids containing `split`.
        if (split is not None) and (split not in qid):
            continue
        hits_by_qid[qid] = hits_by_qid.get(qid, [])
        hits_by_qid[qid].append(prompt)
        # Answer letter the debater argued for on this prompt (None if no debate mode).
        model_stance = debate_mode_to_option[prompt['sample']['debate_mode']]
        answer = prompt['sample']['eval_labels'][0]
        human_correct = (prompt['response'] == answer)
        assert answer in options, 'Answer must be in options.'
        
        # Calculate metrics
        if qid not in metrics:
            # First time this question is seen: create one counter bucket per
            # possible stance. The bucket set is fixed by THIS prompt's stance
            # (a single None bucket, or one bucket per option), so a later
            # prompt for the same qid with the other kind of stance would
            # raise KeyError below.
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            # Per-question-type counters live next to the stance buckets.
            for qtype in question_type_labels:
                metrics[qid][qtype] = {
                    'num': 0,
                    'num_correct': 0,
                }
            metrics[qid]['answer'] = answer
        # The label strings are concatenated and split into unique characters,
        # i.e. every question-type label is treated as a single character.
        for qtype in set(''.join(prompt['sample'].get('question_type_labels', []))):
            qtype = qtype.lower()
            metrics[qid][qtype]['num'] += 1
            # Adding a bool increments the counter by 0 or 1.
            metrics[qid][qtype]['num_correct'] += human_correct
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        # Split the counts by whether the debater argued for the gold answer.
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        # Counts responses that picked the option the debater argued for.
        prompt_metrics['num_debate_mode_responses'] += (prompt['response'] == model_stance)
        
        # presumably milliseconds -> seconds; TODO confirm the unit of 'duration'
        hit_durations.append(prompt['duration'] / 1000.)
        # Raises KeyError if the response is not one of `options`.
        response_option_counts[prompt['response']] += 1
        responses.append(prompt['response'])
    # Total time spent on the (filtered) prompts of this HIT.
    duration = np.sum(np.array(hit_durations))
    durations_by_worker.append(duration)
    durations += hit_durations
    # Fraction of this HIT's responses that went to the most-chosen option.
    # NOTE(review): if every prompt was filtered out above, the sum is 0 and
    # this becomes 0/0 (NaN), which then propagates into the median below.
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    # HIT accuracy for the active prompt type, as a rounded percentage.
    acc = round(100 * hit_result['accuracy'][prompt_type])
    if acc >= min_acc:
        durations_by_worker_with_min_acc.append(duration)
#     print(np.array(hit_durations))
    print('| Time:', round(duration, 1),
          '| Acc:', acc,
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', None if 'quote_rating' not in hit_result else hit_result['quote_rating'], 
          '| Quote Desc:', None if 'quote_description' not in hit_result else hit_result['quote_description'])

def _stat_or_nan(stat_fn, values):
    """Return ``stat_fn(values)``, or NaN when ``values`` is empty.

    NumPy's mean/median/std already return NaN for empty input, but they also
    emit RuntimeWarnings; short-circuiting keeps the printed value ('nan')
    while keeping the cell output clean for runs that lack a given statistic
    (e.g. no HIT carries a 'quote_rating').
    """
    values = np.asarray(values)
    return stat_fn(values) if values.size else float('nan')


# Freeze the accumulators into arrays and sort them in place so that slicing
# (trimmed mean) and inspection in later cells operate on ordered data.
quote_ratings = np.array(quote_ratings)
durations = np.array(durations)
durations_by_worker = np.array(durations_by_worker)
durations_by_worker_with_min_acc = np.array(durations_by_worker_with_min_acc)
max_response_freqs = np.array(max_response_freqs)
durations.sort()
durations_by_worker.sort()
durations_by_worker_with_min_acc.sort()
max_response_freqs.sort()
for qtype in accuracy_by_worker:
    accuracy_by_worker[qtype] = np.array(accuracy_by_worker[qtype])
    accuracy_by_worker[qtype].sort()

print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)
print('Median Question Duration:', _stat_or_nan(np.median, durations))
# NOTE: despite the label this is a trimmed mean -- `durations` is sorted and
# the lowest and highest 10% of per-question durations are dropped first.
num_durations = durations.shape[0]
trimmed_durations = durations[int(num_durations / 10.):int(9. * num_durations / 10.)]
print('Mean Question Duration:', _stat_or_nan(np.mean, trimmed_durations))
print('Median Worker Duration:', _stat_or_nan(np.median, durations_by_worker))
# .get(): with no valid HITs there is no entry for `prompt_type` at all.
print('Median Worker Accuracy:', _stat_or_nan(np.median, accuracy_by_worker.get(prompt_type, [])))
print('Median Max Response Freq:', _stat_or_nan(np.median, max_response_freqs))
print('Quote Rating:',
      '| Mean:', round(_stat_or_nan(np.mean, quote_ratings), 2),
      '| Median:', round(_stat_or_nan(np.median, quote_ratings), 2),
      '| Std:', round(_stat_or_nan(np.std, quote_ratings), 2))
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0])

| Time: 641.4 | Acc: 70 | Max Freq: 42.9 | Rate: 6 | Feedback: Provide better pay. And have better quotes. | Quote Rating: 7 | Quote Desc: Some were easy. Others were very vague.
| Time: 276.0 | Acc: 55 | Max Freq: 35.7 | Rate: 1 | Feedback: Make the choices more relevant to the passages. | Quote Rating: 3 | Quote Desc: Largely unrelated to the choices.
| Time: 217.2 | Acc: 55 | Max Freq: 42.9 | Rate: 5 | Feedback: It would help if there was a more clear connection between things. Some of them are very hard. | Quote Rating: 2 | Quote Desc: Many of them seemed random and disconnected from the questions.
| Time: 243.4 | Acc: 65 | Max Freq: 28.6 | Rate: 8 | Feedback: No way that I can think of | Quote Rating: 6 | Quote Desc: Out of context
| Time: 371.5 | Acc: 55 | Max Freq: 35.7 | Rate: 4 | Feedback: Add some more context or information to the passage. | Quote Rating: 6 | Quote Desc: Some of them were straightforward and easy to understand, while some seemed like they might be trick ques

In [55]:
def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when ``values`` is empty.

    Avoids NumPy's "Mean of empty slice" RuntimeWarning for runs in which a
    statistic has no samples (e.g. the debater statistics of a run whose
    prompts have no debate mode) while still printing 'nan'.
    """
    values = np.asarray(values, dtype=float)
    return values.mean() if values.size else float('nan')


# Desired number of human evaluations per (question, debater stance) sample.
num_target_evals = 5

# Per-sample accumulators; a "sample" is one (question id, debater stance)
# bucket of `metrics` (stance is None for runs without a debate mode).
accuracy_by_qtype = {qtype: [] for qtype in question_type_labels}
accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
for qid, qid_metrics in metrics.items():
    answer = qid_metrics['answer']
    for qid_metric_key, bucket in qid_metrics.items():
        # Question-type buckets: per-question accuracy for that type.
        if qid_metric_key in question_type_labels:
            if bucket['num'] > 0:
                accuracy_by_qtype[qid_metric_key].append(bucket['num_correct'] / bucket['num'])
            continue
        # Anything that is neither a stance bucket nor a question-type bucket
        # (i.e. the 'answer' entry) is skipped.
        if not (qid_metric_key in [None] + options):
            continue
        model_stance = qid_metric_key

        # Q-only stats
        prompt_metrics = bucket
        num_evals = prompt_metrics['num']
        num_evals_by_sample.append(num_evals)
        if num_evals == 0:
            # Every option gets a bucket when the question is first seen, so a
            # stance nobody answered has num == 0. It is still recorded above
            # as an under-evaluated sample, but it has no accuracy/convinced
            # ratio (the unguarded division raised ZeroDivisionError).
            continue
        accuracy_by_sample.append(prompt_metrics['num_correct'] / num_evals)
        if model_stance is None:
            continue

        # Debater stats: how often workers picked the option the debater argued for.
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / num_evals
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

# (mean per-question accuracy, number of questions) for every question type.
accuracy_by_qtype = {qtype: (_mean_or_nan(accuracy_by_qtype[qtype]), len(accuracy_by_qtype[qtype])) for qtype in question_type_labels}
worker_ids = set(worker_ids)

num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', _mean_or_nan(num_evals_by_sample))
print('Fraction insuffient evals:', _mean_or_nan(num_evals_by_sample < num_target_evals))

convinced_freqs = np.array(convinced_freqs)
print('Convinced:', round(100 * _mean_or_nan(convinced_freqs), 2), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', round(100 * _mean_or_nan(convinced_freqs_with_correct_debate_mode), 2), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * _mean_or_nan(convinced_freqs_with_incorrect_debate_mode), 2), '%')

accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', round(100 * _mean_or_nan(accuracy_by_sample), 2), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', round(100 * _mean_or_nan(accuracy_by_sample_correct_debate_mode), 2), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * _mean_or_nan(accuracy_by_sample_incorrect_debate_mode), 2), '%')

# Share of all collected evaluations that exceeded the per-sample target.
total_evals = num_evals_by_sample.sum()
extra_evals_pct = ((100. * (num_evals_by_sample - num_target_evals).sum()) / total_evals) if total_evals else float('nan')
print('Extra Evals:', round(extra_evals_pct, 2), '%')
num_evals_by_sample.sort()
print('Evals per sample distribution:', num_evals_by_sample)

print('Accuracy/Num-Samples by Q Type:')
pprint(accuracy_by_qtype)

Evals per sample: 5.25
Fraction insuffient evals: 0.0
Convinced: 32.2 %
- Correct debater: 67.12 %
- Incorrect debater: 20.56 %
Accuracy: 56.38 %
- Correct debater: 67.12 %
- Incorrect debater: 52.8 %
Extra Evals: 4.76 %
Evals per sample distribution: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
Accuracy/Num-Samples by Q Type:
{'a': (0.5062777284826974, 14),
 'b': (0.5590782248627238, 23),
 'c': (0.6568056787932564, 35),
 'd': (0.5557201252853426, 39),
 'e': (0.4723667184

In [52]:
def nps(task_ratings):
    """Summarise a rating histogram as (Net Promoter Score, mean rating).

    Args:
        task_ratings: mapping from a numeric score to the number of raters
            who gave that score.

    Returns:
        None when the histogram holds no ratings at all. Otherwise a tuple
        ``(nps, mean)``, both rounded to two decimals: ``nps`` is the
        percentage of raters scoring >= 9 minus the percentage scoring <= 6,
        and ``mean`` is the rater-weighted average score.
    """
    total_raters = sum(task_ratings.values())
    if total_raters == 0:
        return None

    # Scores of 9+ count for the score, 6 and below against it, 7-8 are neutral.
    promoters = sum(count for score, count in task_ratings.items() if score >= 9)
    detractors = sum(count for score, count in task_ratings.items() if score <= 6)
    weighted_total = sum(score * count for score, count in task_ratings.items())

    denominator = float(total_raters)
    net_promoter_pct = round(100 * ((promoters - detractors) / denominator), 2)
    mean_rating = round(weighted_total / denominator, 2)
    return net_promoter_pct, mean_rating

# Report the (Net Promoter Score, mean rating) tuple for the workers' task
# ratings; prints None instead of a tuple when no numeric rating was collected.
print('NPS, Mean:', nps(task_ratings))

NPS, Mean: (10.48, 7.49)


In [50]:
# Total HIT durations of the workers whose accuracy reached `min_acc`.
print(durations_by_worker_with_min_acc)
# np.median returns NaN for an empty array but also emits RuntimeWarnings
# (which is what happens when no worker reaches `min_acc`); guard the empty
# case so the cell still prints 'nan' without the warning noise.
print(np.median(durations_by_worker_with_min_acc) if len(durations_by_worker_with_min_acc) else float('nan'))

[]
nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [53]:
# Display the sorted per-HIT total durations (one entry per aggregated HIT).
# NOTE(review): this dumps the whole array into the notebook output.
durations_by_worker

array([10.01  , 10.152 , 10.3005, 10.3535, 10.4935, 10.849 , 12.246 ,
       12.3235, 12.6385, 12.7865, 12.8425, 14.222 , 14.566 , 14.5935,
       14.8325, 15.276 , 15.481 , 15.634 , 16.0755, 16.129 , 16.14  ,
       16.161 , 16.69  , 16.7745, 16.7775, 17.459 , 17.537 , 17.5915,
       17.6525, 17.755 , 17.8375, 17.971 , 18.093 , 18.202 , 18.485 ,
       18.6425, 18.8865, 19.0105, 19.099 , 19.1015, 19.2035, 19.3185,
       19.3325, 19.6045, 20.084 , 20.1385, 20.4735, 20.701 , 21.097 ,
       21.35  , 21.4165, 21.545 , 21.641 , 21.835 , 21.8545, 21.919 ,
       22.164 , 22.322 , 22.46  , 22.4615, 22.8765, 24.4695, 24.592 ,
       24.6645, 24.679 , 25.074 , 25.434 , 25.473 , 25.6725, 25.912 ,
       26.1485, 26.435 , 27.097 , 27.3165, 27.4335, 27.627 , 27.8275,
       27.854 , 27.8945, 27.9855, 28.046 , 28.321 , 28.773 , 28.818 ,
       28.838 , 28.9105, 29.5465, 29.893 , 30.3375, 30.341 , 30.3445,
       30.7075, 30.776 , 31.2595, 31.468 , 32.7205, 33.775 , 34.969 ,
       36.055 , 41.9

In [28]:
# Welch's t-test (equal_var=False) between the convinced frequencies of two runs.
# NOTE(review): `convinced_freqs_2` is not defined anywhere in this notebook, so
# this cell raises NameError on a fresh kernel. Presumably it was meant to hold
# `convinced_freqs` from a second run with a different task_id -- TODO define it
# explicitly before this cell.
ttest_ind(convinced_freqs, convinced_freqs_2, equal_var=False)

NameError: name 'convinced_freqs_2' is not defined