In [None]:
from __future__ import print_function

import argparse
import json
import math
import os
import pickle
import random
import re
import shutil
import string
import sys
import time
from collections import Counter
from random import randint

import numpy as np
import scipy.stats as st
from matplotlib import cm
from scipy import stats
from scipy.stats import multivariate_normal
from tqdm import tqdm


Utilities for evaluation

In [None]:

def normalize_answer(s):
    """Canonicalise an answer string before token-level comparison.

    Steps, applied in this order: lower-case the text, drop every character
    in ``string.punctuation``, replace the stand-alone articles "a", "an"
    and "the" with a space, then collapse all whitespace runs to single
    spaces (which also trims both ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a single reference answer.

    Both strings are normalised with ``normalize_answer`` and split on
    whitespace. The overlap is the multiset intersection of the two token
    lists. Returns 0 when no token is shared, otherwise the harmonic mean
    of precision and recall.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    shared = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(shared.values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are identical after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and keep the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, reference)`` for each
    entry of ``ground_truths``. An empty ``ground_truths`` raises
    ``ValueError`` (from ``max`` on an empty list).
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])



def evaluate_truth_pred(truths, preds):
    '''
    Aggregate exact-match and F1 over matched truth/prediction arrays.

    truths, preds: matched arrays of ground truth answers and predictions
    (``preds[i]`` is the prediction for ``truths[i]``).

    Returns a dict with:
      'exact_match': percentage of exact matches (0-100)
      'f1':          mean token-level F1 as a percentage (0-100)
      'array_f1':    list of per-example F1 values (0-1) for the examples
                     that were actually scored

    NOTE: examples whose truth is exactly 'yes' or 'no' are not scored and do
    not appear in 'array_f1', but they are still counted in the denominator
    of 'exact_match' and 'f1'. So 'f1' can be lower than
    100 * mean(array_f1) when such examples are present.
    '''
    f1 = exact_match = total = 0
    array_f1 = []
    for i in range(len(truths)):
        total += 1
        if truths[i] in ['yes', "no"]:
            continue
        ground_truths = [truths[i]]
        prediction = preds[i]
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        # Compute the per-example F1 once and reuse it for both the running
        # sum and the per-example array.
        example_f1 = metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)
        f1 += example_f1
        array_f1.append(example_f1)
    # The tiny epsilon avoids a ZeroDivisionError on empty input.
    exact_match = 100.0 * exact_match / (total+ 0.00000000001)
    f1 = 100.0 * f1 / (total+ 0.00000000001)

    return {'exact_match': exact_match, 'f1': f1, 'array_f1':array_f1}


def get_answer(model, tokenizer, context, question):
    """Extract the answer span for ``question`` from ``context``.

    model:     span-prediction QA model; called as
               ``model(**inputs, return_dict=False)`` and expected to return
               ``(start_scores, end_scores)``.
    tokenizer: tokenizer providing ``encode_plus``, ``convert_ids_to_tokens``
               and ``convert_tokens_to_string``.

    Returns the decoded text between the highest-scoring start token and the
    highest-scoring end token (inclusive). If the best end precedes the best
    start the slice is empty and whatever the tokenizer produces for an empty
    token list is returned.
    """
    # Local import: the notebook's import cell does not import torch, and the
    # rest of the analysis does not need it.
    import torch

    # 1. TOKENIZE THE INPUT
    # return_tensors="pt" gives tensors that can be fed to the model.
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
    # Move the inputs to the device the model lives on. The previous code
    # relied on a global `device` that is never defined in this notebook.
    inputs = inputs.to(next(model.parameters()).device)

    # 2. OBTAIN MODEL SCORES
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs, return_dict=False)
    answer_start = int(torch.argmax(answer_start_scores))  # most likely first token
    answer_end = int(torch.argmax(answer_end_scores)) + 1  # one past the most likely last token

    # 3. GET THE ANSWER SPAN
    # Take the token ids between start and end and convert them back to text.
    span_ids = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))
    return answer
# https://huggingface.co/transformers/migration.html
def model_evaluation(model, tokenizer, questions, contexts, answers, to_print = False):
    """Run ``get_answer`` on every example and score the predictions.

    questions, contexts, answers are index-aligned; each ``answers[i]`` is
    indexed with the key 'text' to obtain the reference answer. When
    ``to_print`` is true, every 100th example is echoed (context, question,
    truth, prediction). The score dict from ``evaluate_truth_pred`` is
    printed and returned.
    """
    n_examples = len(questions)
    predictions = []
    with tqdm(total=n_examples) as progress:
        for idx in range(n_examples):
            predicted = get_answer(model, tokenizer, contexts[idx], questions[idx])
            predictions.append(predicted)
            if to_print and idx % 100 == 0:
                print("context " + contexts[idx])
                print("quest " + questions[idx])
                print("truth " + answers[idx]['text'])
                print("pred " + predicted)
            progress.update(1)
    references = [answers[k]['text'] for k in range(len(answers))]
    results = evaluate_truth_pred(references, predictions)
    print(results)
    return results


def get_conf_interval(arr):
    """Return the half-width of the 95% Student-t confidence interval of the mean.

    The interval is centred on ``np.mean(arr)`` with scale ``st.sem(arr)``
    and ``len(arr) - 1`` degrees of freedom. The returned value is
    ``upper/2 - lower/2``, i.e. the +/- error to report next to the mean.
    """
    confidence = 0.95
    lower, upper = st.t.interval(
        confidence, len(arr) - 1, loc=np.mean(arr), scale=st.sem(arr))[:2]
    return upper / 2 - lower / 2

In [None]:
# Load the saved study data. The context manager closes the file handle
# (the previous bare open() never did).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data_saved.pkl", "rb") as saved_file:
    data_saved = pickle.load(saved_file)

In [None]:

# Task records, looked up below by each response's 'task_id'
# (e.g. tasks[task_id]['test_as'], tasks[task_id]['test_ai_as']).
tasks = data_saved['tasks']

No-Teaching:

In [None]:
# Aggregate the test-phase results of the no-teaching condition.
# Only participants with a non-zero 'completed_task' are included.
responses_0 = data_saved['responses_0']
all_answers_0 = []        # final system answer per question (human answer, or AI answer if deferred)
all_truths_0 = []         # ground truth per question, aligned with all_answers_0
# NOTE: the two "defer" lists are named the wrong way round and are kept that
# way for the reporting cells: all_anwers_defer_0 holds the GROUND TRUTHS and
# all_truths_defer_0 holds the AI ANSWERS of the deferred questions.
all_anwers_defer_0 = []
all_truths_defer_0 = []
all_truths_human_0 = []   # ground truths of questions the participant answered
all_answers_human_0 = []  # participant answers for those questions
times_0 = []              # total time per participant, in minutes
all_defers_b_0 = []       # 1 if the defer decision agreed with AI quality, else 0
test_times_0 = []         # per-question test intervals, in minutes
defers_0 = []             # raw defer flag per question
# Iterate over responses_0 itself (the loop previously used an undefined
# `responses`).
for response in responses_0:
    if int(response['completed_task']) == 0:
        continue
    raw_task = tasks[response['task_id']]
    task_as = raw_task['test_as']
    task_ai_as = raw_task['test_ai_as']
    hum_answers = response['test_user_answers']
    hum_defer = response['test_user_defers']
    teach_pressed_times = response['teach_pressed_times']
    test_pressed_times = response['test_pressed_times']

    # Sum the gaps between consecutive button presses. The divisor 60*1000
    # suggests timestamps are in milliseconds, so gaps above 4 minutes are
    # dropped (presumably breaks) -- TODO confirm units.
    # `elapsed` replaces a variable that shadowed the imported `time` module.
    elapsed = 0
    for k in range(1, len(teach_pressed_times)):
        interval = teach_pressed_times[k] - teach_pressed_times[k-1]
        if interval/60/1000 <= 4:
            elapsed += interval
    for k in range(1, len(test_pressed_times)):
        interval = test_pressed_times[k] - test_pressed_times[k-1]
        if interval/60/1000 <= 4:
            test_times_0.append(interval/60/1000)
            elapsed += interval
    times_0.append(elapsed/60/1000)

    for j in range(len(hum_answers)):
        # F1 of the AI answer against the ground truth for this question.
        f1_ai = metric_max_over_ground_truths(f1_score, task_as[j], [task_ai_as[j]])
        defers_0.append(hum_defer[j])
        if hum_defer[j] == 0:
            # Participant answered: a good decision if the AI was weak (F1 <= 0.5).
            all_defers_b_0.append(1 if f1_ai <= 0.5 else 0)
            final_answer = hum_answers[j]
            all_truths_human_0.append(task_as[j])
            all_answers_human_0.append(hum_answers[j])
        else:
            # Participant deferred: a good decision if the AI was strong (F1 >= 0.5).
            all_defers_b_0.append(1 if f1_ai >= 0.5 else 0)
            final_answer = task_ai_as[j]
            all_anwers_defer_0.append(task_as[j])
            all_truths_defer_0.append(task_ai_as[j])
        all_answers_0.append(final_answer)
        all_truths_0.append(task_as[j])


In [None]:
# evaluate_truth_pred expects (truths, preds). Each split is evaluated once
# and the result reused for both the F1 and its confidence interval.
system_scores_0 = evaluate_truth_pred(all_truths_0, all_answers_0)
print("System F1")
print(system_scores_0['f1'])
print(get_conf_interval(system_scores_0['array_f1']))
# NOTE: despite their names, all_anwers_defer_0 holds the ground truths and
# all_truths_defer_0 holds the AI answers (see the aggregation cell), so this
# call is already in (truths, preds) order.
defer_scores_0 = evaluate_truth_pred(all_anwers_defer_0, all_truths_defer_0)
print("Defer F1")
print(defer_scores_0['f1'])
print(get_conf_interval(defer_scores_0['array_f1']))
human_scores_0 = evaluate_truth_pred(all_truths_human_0, all_answers_human_0)
print("Non-Defer F1")
print(human_scores_0['f1'])
print(get_conf_interval(human_scores_0['array_f1']))


Our Teaching:

In [None]:
# Aggregate the teaching-phase and test-phase results of the teaching
# condition. Only participants with a non-zero 'completed_task' are included.
responses_1 = data_saved['responses_1']
all_answers_1 = []        # final system answer per test question
all_truths_1 = []         # ground truth per test question
# NOTE: the "defer" lists are named the wrong way round and are kept that way
# for the reporting cells: *_anwers_defer_* hold the GROUND TRUTHS and
# *_truths_defer_* hold the AI ANSWERS of the deferred questions.
all_anwers_defer_1 = []
all_truths_defer_1 = []
all_truths_human_1 = []   # ground truths of questions the participant answered
all_answers_human_1 = []  # participant answers for those questions
times_1 = []              # total time per participant, in minutes
test_times_1 = []         # per-question test intervals, in minutes
all_defers_b_1 = []       # 1 if the defer decision agreed with AI quality, else 0
teach_all_answers_1 = []
teach_all_truths_1 = []
teach_all_anwers_defer_1 = []
teach_all_truths_defer_1 = []
teach_all_truths_human_1 = []
teach_all_answers_human_1 = []
seen_all_answers_1 = []     # test questions whose cluster appears in the teaching clusters
seen_all_truths_1 = []
notseen_all_answers_1 = []  # test questions whose cluster does not
notseen_all_truths_1 = []
seen_defers = []
notseen_defers = []
defers_1 = []             # raw defer flag per test question
user_lessons = {}         # random id -> the participant's 'user_lessons'
for response in responses_1:
    if int(response['completed_task']) == 0:
        continue
    raw_task = tasks[response['task_id']]
    task_as = raw_task['test_as']
    task_ai_as = raw_task['test_ai_as']
    hum_answers = response['test_user_answers']
    hum_defer = response['test_user_defers']
    teach_task_as = raw_task['teach_as']
    teach_task_ai_as = raw_task['teach_ai_as']
    teach_hum_answers = response['teach_user_answers']
    teach_hum_defer = response['teach_user_defers']
    test_clusters = raw_task['test_clusters']
    teach_clusters = raw_task['teaching_clusters']
    teach_pressed_times = response['teach_pressed_times']
    test_pressed_times = response['test_pressed_times']

    # Store the lessons under a random id. Redraw on collision so one
    # participant's lessons never silently overwrite another's.
    rand_id = np.random.randint(10000)
    while rand_id in user_lessons:
        rand_id = np.random.randint(10000)
    user_lessons[rand_id] = response['user_lessons']

    # Sum the gaps between consecutive button presses. The divisor 60*1000
    # suggests timestamps are in milliseconds, so gaps above 4 minutes are
    # dropped (presumably breaks) -- TODO confirm units.
    # `elapsed` replaces a variable that shadowed the imported `time` module.
    elapsed = 0
    for k in range(1, len(teach_pressed_times)):
        interval = teach_pressed_times[k] - teach_pressed_times[k-1]
        if interval/60/1000 <= 4:
            elapsed += interval
    for k in range(1, len(test_pressed_times)):
        interval = test_pressed_times[k] - test_pressed_times[k-1]
        if interval/60/1000 <= 4:
            test_times_1.append(interval/60/1000)
            elapsed += interval
    times_1.append(elapsed/60/1000)

    # Test phase.
    for j in range(len(hum_answers)):
        # F1 of the AI answer against the ground truth for this question.
        f1_ai = metric_max_over_ground_truths(f1_score, task_as[j], [task_ai_as[j]])
        defers_1.append(hum_defer[j])
        if hum_defer[j] == 0:
            # Participant answered: a good decision if the AI was weak (F1 <= 0.5).
            all_defers_b_1.append(1 if f1_ai <= 0.5 else 0)
            final_answer = hum_answers[j]
            all_truths_human_1.append(task_as[j])
            all_answers_human_1.append(hum_answers[j])
        else:
            # Participant deferred: a good decision if the AI was strong (F1 >= 0.5).
            all_defers_b_1.append(1 if f1_ai >= 0.5 else 0)
            final_answer = task_ai_as[j]
            all_anwers_defer_1.append(task_as[j])
            all_truths_defer_1.append(task_ai_as[j])
        if test_clusters[j] in teach_clusters:
            seen_all_answers_1.append(final_answer)
            seen_all_truths_1.append(task_as[j])
            seen_defers.append(hum_defer[j])
        else:
            notseen_all_answers_1.append(final_answer)
            notseen_all_truths_1.append(task_as[j])
            notseen_defers.append(hum_defer[j])
        all_answers_1.append(final_answer)
        all_truths_1.append(task_as[j])

    # Teaching phase.
    for j in range(len(teach_hum_answers)):
        if teach_hum_defer[j] == 0:
            teach_all_truths_human_1.append(teach_task_as[j])
            teach_all_answers_human_1.append(teach_hum_answers[j])
            teach_all_answers_1.append(teach_hum_answers[j])
        else:
            teach_all_anwers_defer_1.append(teach_task_as[j])
            teach_all_truths_defer_1.append(teach_task_ai_as[j])
            teach_all_answers_1.append(teach_task_ai_as[j])
        teach_all_truths_1.append(teach_task_as[j])


In [None]:
# evaluate_truth_pred expects (truths, preds). Each split is evaluated once
# and the result reused for both the F1 and its confidence interval.
print("Post-Teaching")
system_scores_1 = evaluate_truth_pred(all_truths_1, all_answers_1)
print("System F1")
print(system_scores_1['f1'])
print(get_conf_interval(system_scores_1['array_f1']))
# NOTE: despite their names, all_anwers_defer_1 holds the ground truths and
# all_truths_defer_1 holds the AI answers (see the aggregation cell), so this
# call is already in (truths, preds) order.
defer_scores_1 = evaluate_truth_pred(all_anwers_defer_1, all_truths_defer_1)
print("Defer F1")
print(defer_scores_1['f1'])
print(get_conf_interval(defer_scores_1['array_f1']))

human_scores_1 = evaluate_truth_pred(all_truths_human_1, all_answers_human_1)
print("Non-Defer F1")
print(human_scores_1['f1'])
print(get_conf_interval(human_scores_1['array_f1']))


In [None]:
# Mean of the per-question defer flags in the teaching condition, with the
# half-width of its 95% confidence interval.
print("Coverage")
mean_defer_1 = np.mean(defers_1)
print(mean_defer_1)
defer_ci_1 = get_conf_interval(defers_1)
print(defer_ci_1)

In [None]:
# Per-question F1 of the questions participants answered themselves:
# teaching condition (treat1) vs. no-teaching condition (treat0).
# evaluate_truth_pred expects (truths, preds); the arguments were reversed.
treat1 = evaluate_truth_pred(all_truths_human_1, all_answers_human_1)['array_f1']
treat0 = evaluate_truth_pred(all_truths_human_0, all_answers_human_0)['array_f1']

In [None]:
# `stats` is the same module as the `st` alias imported at the top of the
# notebook; the t-test cells below refer to it by this name.
from scipy import stats
# Independent two-sample t-test on the treat1 / treat0 arrays built in the
# previous cell; the result is displayed as the cell output.
stats.ttest_ind(treat1,treat0)

{'exact_match': 49.16666666666585, 'f1': 54.330939213291245}


In [None]:
# System F1 split by whether the test question's cluster appeared during
# teaching. evaluate_truth_pred expects (truths, preds); the arguments were
# reversed.
print("not seen clusters")
print(evaluate_truth_pred(notseen_all_truths_1, notseen_all_answers_1)['f1'])
print(" seen clusters")
print(evaluate_truth_pred(seen_all_truths_1, seen_all_answers_1)['f1'])

In [None]:
# t-test of per-question F1: not-seen clusters (treat1) vs. seen clusters (treat0).
# The not-seen answers were previously compared with themselves instead of
# their ground truths; both calls now use the (truths, preds) order.
treat1 = evaluate_truth_pred(notseen_all_truths_1, notseen_all_answers_1)['array_f1']
treat0 = evaluate_truth_pred(seen_all_truths_1, seen_all_answers_1)['array_f1']
print(stats.ttest_ind(treat1,treat0))

## testing differences

In [None]:
# F1 on DEFERRED questions, teaching (treat1) vs. no-teaching (treat0).
# The label previously read "Non defer" although the deferred lists are tested.
# NOTE: despite their names, all_anwers_defer_* hold the ground truths and
# all_truths_defer_* hold the AI answers, so these calls are already in
# (truths, preds) order.
print("Defer")
treat1 = evaluate_truth_pred(all_anwers_defer_1, all_truths_defer_1)['array_f1']
treat0 = evaluate_truth_pred(all_anwers_defer_0, all_truths_defer_0)['array_f1']
print(stats.ttest_ind(treat1,treat0))

In [None]:
# F1 on questions participants answered themselves (NOT deferred),
# teaching (treat1) vs. no-teaching (treat0).
# The label previously read " defer" although the human-answered lists are
# tested. evaluate_truth_pred expects (truths, preds); the arguments were
# reversed.
print("Non defer")
treat1 = evaluate_truth_pred(all_truths_human_1, all_answers_human_1)['array_f1']
treat0 = evaluate_truth_pred(all_truths_human_0, all_answers_human_0)['array_f1']
print(stats.ttest_ind(treat1,treat0))

In [None]:
# Overall system F1, teaching (treat1) vs. no-teaching (treat0).
# evaluate_truth_pred expects (truths, preds); the arguments were reversed.
print("overall")
treat1 = evaluate_truth_pred(all_truths_1, all_answers_1)['array_f1']
treat0 = evaluate_truth_pred(all_truths_0, all_answers_0)['array_f1']
print(stats.ttest_ind(treat1,treat0))

In [None]:
# t-test on the raw per-question defer flags of the two conditions.
# treat1 is the teaching condition and treat0 the no-teaching condition, as
# in every other t-test cell. The assignment was reversed here, which
# flipped the sign of the t statistic (the p-value is unaffected).
print("defer")
treat1 = defers_1
treat0 = defers_0
print(stats.ttest_ind(treat1,treat0))