In [125]:
import json
import numpy as np
import os
from pprint import pprint
from scipy.stats import ttest_ind

task_dir = '/Users/ethanperez/research/ParlAI/parlai/mturk/core/run_data/live/context_evaluator_'

### RACE: Unfiltered Workers
# prompt_type, task_id = 'question', '1553982706'  # Q-only
# prompt_type, task_id = 'context_question', 1553790696  # TFIDF
# prompt_type, task_id = 'context_question', 1553901953  # FastText
### RACE: Filtered Workers
# prompt_type, task_id = 'question', '1554052233'  # Q-only
# prompt_type, task_id = 'quote and question', 1554006689  # TFIDF-QA
# prompt_type, task_id = 'quote and question', 1554130485  # TFIDF-A
# prompt_type, task_id = 'quote and question', 1554069931  # Cross-Ranker
# prompt_type, task_id = 'quote and question', 1554072277  # SL
# prompt_type, task_id = 'quote and question', 1554132868  # SL-Influence

### RACE Test
## Convinced
# prompt_type, task_id = 'quote and question', 1556671432  # TFIDF-QA
# prompt_type, task_id = 'quote and question', 1556725767  # TFIDF-A
prompt_type, task_id = 'quote and question', 1556739336  # BoW-A
## Acc on Summary
# prompt_type, task_id = 'passage and question', 1555823963  # Full Passage
# prompt_type, task_id = 'quotes and question', 1555946909  # FastText
# prompt_type, task_id = 'quotes and question', 1555952058  # Search (6-10 sentence incorrectly placed at end)
dataset = 'race'

### DREAM
# prompt_type, task_id = 'question', 1554582693  # Q-only
# prompt_type, task_id = 'quote and question', 1554596686  # TFIDF-QA
# prompt_type, task_id = 'quote and question', 1554587404  # TFIDF-A
# prompt_type, task_id = 'quote and question', 1554662280  # BoW-A
# prompt_type, task_id = 'quote and question', 1556670413  # Bi-Ranker
# prompt_type, task_id = 'quote and question', 1554675304  # Cross-Ranker
# prompt_type, task_id = 'quote and question', 1554685131  # SL
# prompt_type, task_id = 'quote and question', 1554692472  # SL-Sents
# prompt_type, task_id = 'quote and question', 1554729998  # SL-Sents-Influence
# prompt_type, task_id = 'quote and question', 1555333992  # SL-Theory-of-Mind
## All answers at once
# prompt_type, task_id = 'question, answers, and quotes', 1555707929  # TFIDF(O): 64.21%: (Less filter / no feedback)
# prompt_type, task_id = 'question, answers, and quotes', 1555722489  # Search: 65.38%: (Less filter / no feedback)
# prompt_type, task_id = 'question and quotes', 1555789302  # SL: 75.17% (4/5 filter)
# prompt_type, task_id = 'question and quotes', 1555812443  # SL: 79.32% Actually: quotes and question (4/5 filter)
# prompt_type, task_id = 'passage and question', 1555804551  # Full Passage: 92.97%
# prompt_type, task_id = 'quotes and question', 1555823257  # FastText (5/5 filter) (77.33%)
# prompt_type, task_id = 'quotes and question', 1555946647  # RACE Cross-Ranker (4 sentences incorrectly placed at end) (80.84%)
# prompt_type, task_id = 'quotes and question', 1556727396  # Cross-Ranker
# prompt_type, task_id = 'quotes and question', 1556740293  # Bi-Ranker
# dataset = 'dream'


split = 'high'  # 'middle', 'high', None


# Set useful variables
task_dir += str(task_id)
if dataset != 'race':
    split = None
num_options = 3 if dataset == 'dream' else 4
options = ['A', 'B', 'C', 'D'][:num_options]
debate_mode_to_option = {'Ⅰ': 'A', 'Ⅱ': 'B', 'Ⅲ': 'C', 'Ⅳ': 'D', 'ⅰ': 'A', 'ⅱ': 'B', 'ⅲ': 'C', 'ⅳ': 'D', None: None}
question_type_labels = ['a', 'c', 'l', 'm', 's'] if dataset == 'dream' else ['a', 'b', 'c', 'd', 'e']

# Read HIT data
print('# HIT Files:', len(os.listdir(task_dir)))
hit_results = []
num_passed_test = 0
for hit_dir in os.listdir(task_dir):
    if hit_dir.startswith('o_'):
        continue
    num_passed_test += 1
    with open(os.path.join(task_dir, hit_dir, 'custom/data.json'), 'r') as file:
        hit_results.append(json.load(file))
        file.close()
if len(os.listdir(task_dir)) != num_passed_test:
    num_total_tested = len(os.listdir(task_dir)) - num_passed_test
    print('# Passed Test:', num_passed_test, '/', num_total_tested, '=', round((100. * num_passed_test) / num_total_tested, 2), '%')

# HIT Files: 301
# Passed Test: 109 / 192 = 56.77 %


In [126]:
num_valid_hits, num_rejected_hits, num_incomplete_hits = 0, 0, 0
metrics = {}
task_ratings = {i: 0 for i in range(11)}
quote_ratings = []
durations = []
durations_by_worker = []
durations_by_worker_with_min_acc = []
min_acc = 85
accuracy_by_worker = {}
max_response_freqs = []
worker_ids = []
hits_by_qid = {}

for hit_result in hit_results:
    if ((len(hit_result['reject_reasons']) > 0) or
        (len(hit_result['block_reasons']) > 0)):
        num_rejected_hits += 1
        print(hit_result['worker_id'], hit_result['assignment_id'],
              '| reject_reasons:', hit_result['reject_reasons'],
              '| block_reasons:', hit_result['block_reasons'],
              '| bonus_reasons: ' + str(hit_result['bonus_reasons']) if 'bonus_reasons' in hit_result else '')
    elif hit_result['feedback'] is None:
        num_incomplete_hits += 1
        continue
    
    worker_ids.append(hit_result['worker_id'])
    num_valid_hits += 1
    if (hit_result['task_rating'] is not None) and (hit_result['task_rating'].isdigit()):
        task_ratings[int(hit_result['task_rating'])] += 1
    if (hit_result.get('quote_rating') is not None) and (hit_result['quote_rating'].isdigit()):
        quote_ratings.append(int(hit_result['quote_rating']))
    for qtype, qtype_accuracy in hit_result['accuracy'].items():
        accuracy_by_worker[qtype] = accuracy_by_worker.get(qtype, []) + [qtype_accuracy]
    
    hit_durations = []
    response_option_counts = {option: 0 for option in options}
    responses = []
    for prompt in hit_result['data']:
        qid = prompt['sample']['qid']
        if (split is not None) and (split not in qid):
            continue
        hits_by_qid[qid] = hits_by_qid.get(qid, [])
        hits_by_qid[qid].append(prompt)
        model_stance = debate_mode_to_option[prompt['sample']['debate_mode']]
        answer = prompt['sample']['eval_labels'][0]
        human_correct = (prompt['response'] == answer)
        assert answer in options, 'Answer must be in options.'
        
        # Calculate metrics
        if qid not in metrics:
            metrics[qid] = {
                option: {
                    'num': 0,
                    'num_correct': 0,
                    'num_correct_debate_mode': 0,
                    'num_incorrect_debate_mode': 0,
                    'num_correct_with_correct_debate_mode': 0,
                    'num_correct_with_incorrect_debate_mode': 0,
                    'num_debate_mode_responses': 0,
                }
                for option in ([None] if model_stance is None else options)
            }
            for qtype in question_type_labels:
                metrics[qid][qtype] = {
                    'num': 0,
                    'num_correct': 0,
                }
            metrics[qid]['answer'] = answer
        for qtype in set(''.join(prompt['sample'].get('question_type_labels', []))):
            qtype = qtype.lower()
            metrics[qid][qtype]['num'] += 1
            metrics[qid][qtype]['num_correct'] += human_correct
        prompt_metrics = metrics[qid][model_stance]
        prompt_metrics['num'] += 1
        prompt_metrics['num_correct'] += human_correct
        if model_stance == answer:
            prompt_metrics['num_correct_with_correct_debate_mode'] += human_correct
            prompt_metrics['num_correct_debate_mode'] += 1
        else:
            prompt_metrics['num_correct_with_incorrect_debate_mode'] += human_correct
            prompt_metrics['num_incorrect_debate_mode'] += 1
        prompt_metrics['num_debate_mode_responses'] += (prompt['response'] == model_stance)
        
        hit_durations.append(prompt['duration'] / 1000.)
        response_option_counts[prompt['response']] += 1
        responses.append(prompt['response'])
    duration = np.median(np.array(hit_durations))
    durations_by_worker.append(duration)
    durations += hit_durations
    response_options_array = np.array(list(response_option_counts.values()))
    response_options_array = response_options_array / response_options_array.sum()
    max_response_freq = response_options_array.max()
    max_response_freqs.append(max_response_freq)
    acc = round(100 * hit_result['accuracy'][prompt_type])
    if acc >= min_acc:
        durations_by_worker_with_min_acc.append(duration)
#     print(np.array(hit_durations))
    print('| Time:', round(duration, 1),
          '| Acc:', acc,
          '| Max Freq:', round(100 * max_response_freq, 1),
          '| Rate:', hit_result['task_rating'],
          '| Feedback:', hit_result['feedback'],
          '| Quote Rating:', None if 'quote_rating' not in hit_result else hit_result['quote_rating'], 
          '| Quote Desc:', None if 'quote_description' not in hit_result else hit_result['quote_description'])

quote_ratings = np.array(quote_ratings)
durations = np.array(durations)
durations_by_worker = np.array(durations_by_worker)
durations_by_worker_with_min_acc = np.array(durations_by_worker_with_min_acc)
max_response_freqs = np.array(max_response_freqs)
durations.sort()
durations_by_worker.sort()
durations_by_worker_with_min_acc.sort()
max_response_freqs.sort()
for qtype in accuracy_by_worker:
    accuracy_by_worker[qtype] = np.array(accuracy_by_worker[qtype])
    accuracy_by_worker[qtype].sort()
print('REJECTED:', num_rejected_hits)
print('INCOMPLETE:', num_incomplete_hits)
print('VALID:', num_valid_hits)
print('Median Question Duration:', np.median(durations))
print('Mean Question Duration:', np.mean(durations[int(durations.shape[0] / 10.):int(9. * durations.shape[0] / 10.)]))
print('Median Worker Duration:', np.median(durations_by_worker))
print('Median Worker Accuracy:', np.median(accuracy_by_worker[prompt_type]))
print('Median Max Response Freq:', np.median(max_response_freqs))
print('Quote Rating:',
      '| Mean:', round(quote_ratings.mean(), 2),
      '| Median:', round(np.median(quote_ratings), 2),
      '| Std:', round(np.std(quote_ratings), 2))
# pprint(hit_results[0]['data'][0])
# pprint(hit_results[0])

| Time: 18.2 | Acc: 45 | Max Freq: 28.6 | Rate: 1 | Feedback: Add more to the passage so that it is not so vague and less cumbersome to choose an answer for | Quote Rating: 2 | Quote Desc: vague
| Time: 26.8 | Acc: 50 | Max Freq: 35.7 | Rate: 8 | Feedback: I think it's good and it works well.  | Quote Rating: 4 | Quote Desc: It made me think. I appreciate that. 
| Time: 10.9 | Acc: 60 | Max Freq: 50.0 | Rate: 5 | Feedback: Better passage help | Quote Rating: 2 | Quote Desc: A lot of them were useless
| Time: 16.9 | Acc: 40 | Max Freq: 28.6 | Rate: 7 | Feedback: I don't have any suggestions at this time | Quote Rating: 7 | Quote Desc: Some were downright impossible to understand it seemed but most were straightforward
| Time: 19.7 | Acc: 80 | Max Freq: 28.6 | Rate: 7 | Feedback: Better answer choices | Quote Rating: 6 | Quote Desc: Sometimes confusing, as none seemed to fit
| Time: 21.9 | Acc: 55 | Max Freq: 28.6 | Rate: 2 | Feedback: The guessing the title of the article was very diffi

| Time: 17.9 | Acc: 50 | Max Freq: 35.7 | Rate: 5 | Feedback: make it easier | Quote Rating: 5 | Quote Desc: Some of them were random and didn't provide enough context
| Time: 33.0 | Acc: 65 | Max Freq: 42.9 | Rate: 10 | Feedback: I think this task is perfect, I did find the drop down for answers seem to go back to A at times, one during one of my answers. | Quote Rating: 7 | Quote Desc: Seemingly unrelated passages at times that may or may not have different meanings or appeared unrelated.
| Time: 21.1 | Acc: 65 | Max Freq: 50.0 | Rate: 7 | Feedback: add a progress bar | Quote Rating: 6 | Quote Desc: sometimes quite helpful, and the remaining times nonsensical 
| Time: 31.5 | Acc: 50 | Max Freq: 50.0 | Rate: 8 | Feedback: It's fine as is | Quote Rating: 8 | Quote Desc: Informative
| Time: 14.0 | Acc: 60 | Max Freq: 42.9 | Rate: 9 | Feedback: I see no room for improvement, thanks | Quote Rating: 10 | Quote Desc: Random snippets of dialogue
| Time: 18.9 | Acc: 55 | Max Freq: 50.0 | Rate

In [127]:
accuracy_by_qtype = {qtype: [] for qtype in question_type_labels}
accuracy_by_sample = []
accuracy_by_sample_correct_debate_mode = []
accuracy_by_sample_incorrect_debate_mode = []
convinced_freqs = []
convinced_freqs_with_correct_debate_mode = []
convinced_freqs_with_incorrect_debate_mode = []
num_evals_by_sample = []
for qid, qid_metrics in metrics.items():
    answer = metrics[qid]['answer']
    for qid_metric_key, prompt in qid_metrics.items():
        if qid_metric_key in question_type_labels:
            qtype = qid_metric_key
            if qid_metrics[qtype]['num'] > 0:
                accuracy_by_qtype[qtype].append(qid_metrics[qtype]['num_correct'] / qid_metrics[qtype]['num'])
            continue
        if not (qid_metric_key in [None] + options):
            continue
        model_stance = qid_metric_key

        # Q-only stats
        prompt_metrics = metrics[qid][model_stance]
        num_evals_by_sample.append(prompt_metrics['num'])
        accuracy_by_sample.append(prompt_metrics['num_correct'] / prompt_metrics['num'])
        if model_stance is None:
            continue
        
        # Debater stats
#         if 'num_debate_mode_responses' not in prompt_metrics:
#             print(qid_metric_key, prompt_metrics)
        convinced_freq = prompt_metrics['num_debate_mode_responses'] / prompt_metrics['num']
        if model_stance == answer:
            convinced_freqs_with_correct_debate_mode.append(convinced_freq)
            accuracy_by_sample_correct_debate_mode.append(
                prompt_metrics['num_correct_with_correct_debate_mode'] /
                prompt_metrics['num_correct_debate_mode'])
        else:
            convinced_freqs_with_incorrect_debate_mode.append(convinced_freq)
            accuracy_by_sample_incorrect_debate_mode.append(
                prompt_metrics['num_correct_with_incorrect_debate_mode'] /
                prompt_metrics['num_incorrect_debate_mode'])
        convinced_freqs.append(convinced_freq)

accuracy_by_qtype = {qtype: (np.array(accuracy_by_qtype[qtype]).mean(), len(accuracy_by_qtype[qtype])) for qtype in question_type_labels}
worker_ids = set(worker_ids)
        
num_evals_by_sample = np.array(num_evals_by_sample)
print('Evals per sample:', num_evals_by_sample.mean())
print('Fraction insuffient evals:', (num_evals_by_sample < 5).mean())

convinced_freqs = np.array(convinced_freqs)
print('Convinced:', round(100 * convinced_freqs.mean(), 2), '%')
convinced_freqs_with_correct_debate_mode = np.array(convinced_freqs_with_correct_debate_mode)
print('- Correct debater:', round(100 * convinced_freqs_with_correct_debate_mode.mean(), 2), '%')
convinced_freqs_with_incorrect_debate_mode = np.array(convinced_freqs_with_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * convinced_freqs_with_incorrect_debate_mode.mean(), 2), '%')

accuracy_by_sample = np.array(accuracy_by_sample)
print('Accuracy:', round(100 * accuracy_by_sample.mean(), 2), '%')
accuracy_by_sample_correct_debate_mode = np.array(accuracy_by_sample_correct_debate_mode)
print('- Correct debater:', round(100 * accuracy_by_sample_correct_debate_mode.mean(), 2), '%')
accuracy_by_sample_incorrect_debate_mode = np.array(accuracy_by_sample_incorrect_debate_mode)
print('- Incorrect debater:', round(100 * accuracy_by_sample_incorrect_debate_mode.mean(), 2), '%')

num_target_evals = 5
print('Extra Evals:', round(((100. * (num_evals_by_sample - num_target_evals).sum()) / num_evals_by_sample.sum()), 2), '%')
num_evals_by_sample.sort()
print('Evals per sample distribution:', num_evals_by_sample)

print('Accuracy/Num-Samples by Q Type:')
pprint(accuracy_by_qtype)

Evals per sample: 5.25
Fraction insuffient evals: 0.0
Convinced: 35.65 %
- Correct debater: 70.84 %
- Incorrect debater: 23.92 %
Accuracy: 54.18 %
- Correct debater: 70.84 %
- Incorrect debater: 48.63 %
Extra Evals: 4.76 %
Evals per sample distribution: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
Accuracy/Num-Samples by Q Type:
{'a': (0.5089285714285714, 14),
 'b': (0.5316252587991718, 23),
 'c': (0.6055442176870748, 35),
 'd': (0.5604395604395606, 39),
 'e': (0.48430059

In [128]:
def nps(task_ratings):
    num_ratings = sum(list(task_ratings.values()))
    if num_ratings == 0:
        return None

    nps_sum_ratings = 0
    sum_ratings = 0
    for score, num_raters in task_ratings.items():
        sum_ratings += num_raters * score
        if score >= 9:
            nps_sum_ratings += num_raters
        elif score <= 6:
            nps_sum_ratings -= num_raters
    return round(100 * (nps_sum_ratings / float(num_ratings)), 2), round((sum_ratings / float(num_ratings)), 2)

print('NPS, Mean:', nps(task_ratings))

NPS, Mean: (-1.9, 7.06)


In [129]:
print(durations_by_worker_with_min_acc)
print(np.median(durations_by_worker_with_min_acc))

[25.369]
25.369


In [119]:
durations_by_worker

array([ 8.723 ,  9.1505, 10.613 , 10.623 , 11.5635, 12.0355, 12.677 ,
       12.6905, 13.394 , 13.664 , 13.7675, 13.859 , 14.045 , 14.109 ,
       14.224 , 14.228 , 14.3945, 14.407 , 14.6205, 14.741 , 14.7515,
       14.945 , 15.3465, 16.096 , 16.1455, 16.3895, 16.454 , 16.5925,
       17.169 , 17.193 , 17.6325, 17.8105, 18.1785, 18.1915, 18.206 ,
       18.294 , 18.517 , 18.583 , 18.706 , 19.2735, 19.388 , 19.846 ,
       19.934 , 19.966 , 20.1005, 20.214 , 20.2825, 20.333 , 20.387 ,
       20.3905, 20.401 , 20.42  , 20.9475, 21.588 , 21.6255, 21.686 ,
       21.8405, 21.8495, 22.2155, 22.9135, 23.0485, 23.2535, 23.3105,
       23.432 , 24.0465, 24.145 , 24.4615, 24.7615, 24.814 , 24.8435,
       24.864 , 25.32  , 25.8865, 25.9225, 26.215 , 26.494 , 26.525 ,
       26.827 , 27.3015, 27.751 , 27.8305, 27.937 , 28.0475, 28.904 ,
       29.808 , 30.0685, 31.5035, 31.8115, 32.2375, 32.8115, 32.9505,
       33.045 , 34.888 , 35.017 , 35.1045, 35.3805, 35.428 , 37.185 ,
       37.6045, 38.1

In [107]:
ttest_ind(convinced_freqs, convinced_freqs_2, equal_var=False)

NameError: name 'convinced_freqs_2' is not defined