In [136]:
import pandas as pd
import pickle
import re

In [137]:
def load_cache(cache_fname):
    """Read the pickle file at ``cache_fname`` and return the stored object.

    NOTE: unpickling can execute arbitrary code, so only point this at
    cache files from a trusted source.
    """
    with open(cache_fname, 'rb') as handle:
        return pickle.load(handle)

def retrieve_from_cache(cache, question, answer, narrative):
    """Look up the cached entry for a (question, answer, narrative) triple.

    The three strings are lower-cased before being combined into the lookup
    key, so the lookup only succeeds if the cache keys are lower-case tuples.

    Args:
        cache: Mapping keyed by ``(question, answer, narrative)`` tuples.
        question: Question text.
        answer: Answer text.
        narrative: Narrative text.

    Returns:
        The value stored under the key, or ``{'message': 'Key not found'}``
        when the key is absent.
    """
    key = (question.lower(), answer.lower(), narrative.lower())
    try:
        return cache[key]
    except KeyError:
        # Only a genuinely missing key counts as a miss; any other error
        # (or a keyboard interrupt) propagates instead of being hidden.
        return {'message': 'Key not found'}

In [138]:
# Load the pickled human-evaluation cache; retrieve_from_cache indexes it with
# lower-cased (question, answer, narrative) tuples.
cache = load_cache('../artifacts/human_eval_cache.pkl')

In [139]:
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    (``\\w``, which includes the underscore) nor whitespace removed."""
    return re.sub(pattern=r'[^\w\s]', repl='', string=text)

In [140]:
def overall_avg_likert(df, cache):
    """Print the mean of the per-answer average Likert scores for ``df``.

    For every row the answer text (``predicted_answer`` when usable, otherwise
    ``answer``) is stripped of punctuation and looked up in ``cache`` together
    with the row's ``question`` and ``narrative``. The ``val_annotations``
    scores of each hit are averaged per row, and the mean of those row
    averages is printed rounded to two decimals.

    Args:
        df: DataFrame with ``question`` and ``narrative`` columns plus either
            ``predicted_answer`` or ``answer``.
        cache: Mapping handed to ``retrieve_from_cache``; each hit must hold
            a ``val_annotations`` list of numeric scores.

    Raises:
        KeyError: If a row has no cached annotations.
    """
    per_answer_means = []
    for _, row in df.iterrows():
        question = row['question']
        try:
            answer = remove_punctuation(row['predicted_answer'])
        except (KeyError, TypeError):
            # KeyError: the frame has no 'predicted_answer' column.
            # TypeError: the cell is not a string (e.g. NaN).
            answer = remove_punctuation(row['answer'])
        narrative = row['narrative']
        info = retrieve_from_cache(cache, question, answer, narrative)
        if 'val_annotations' not in info:
            # A cache miss would otherwise surface as a bare
            # KeyError: 'val_annotations' with no hint of the offending row.
            raise KeyError(
                f'No cached annotations for question {question!r} with answer {answer!r}'
            )
        scores = info['val_annotations']
        per_answer_means.append(sum(scores) / len(scores))
    if not per_answer_means:
        # Nothing to average; avoid dividing by zero on an empty frame.
        print('Overall avg Likert for all answers n/a (no rows)')
        return
    print(f'Overall avg Likert for all answers {round(sum(per_answer_means) / len(per_answer_means), 2)}')

In [141]:
def overall_avg_binary_likert(df, cache):
    """Print the mean of the per-answer binarised Likert scores for ``df``.

    Works like ``overall_avg_likert`` except that each annotation is first
    mapped to 0 (score below 1) or 1 (otherwise), so the per-row value is
    the fraction of annotations that are not below 1.

    Args:
        df: DataFrame with ``question`` and ``narrative`` columns plus either
            ``predicted_answer`` or ``answer``.
        cache: Mapping handed to ``retrieve_from_cache``; each hit must hold
            a ``val_annotations`` list of numeric scores.

    Raises:
        KeyError: If a row has no cached annotations.
    """
    per_answer_rates = []
    for _, row in df.iterrows():
        question = row['question']
        try:
            answer = remove_punctuation(row['predicted_answer'])
        except (KeyError, TypeError):
            # KeyError: the frame has no 'predicted_answer' column.
            # TypeError: the cell is not a string (e.g. NaN).
            answer = remove_punctuation(row['answer'])
        narrative = row['narrative']
        info = retrieve_from_cache(cache, question, answer, narrative)
        if 'val_annotations' not in info:
            # A cache miss would otherwise surface as a bare
            # KeyError: 'val_annotations' with no hint of the offending row.
            raise KeyError(
                f'No cached annotations for question {question!r} with answer {answer!r}'
            )
        binary_scores = [0 if score < 1 else 1 for score in info['val_annotations']]
        per_answer_rates.append(sum(binary_scores) / len(binary_scores))
    if not per_answer_rates:
        # Nothing to average; avoid dividing by zero on an empty frame.
        print('Overall avg binary Likert for all answers: n/a (no rows)')
        return
    print(f'Overall avg binary Likert for all answers: {round(sum(per_answer_rates) / len(per_answer_rates), 2)}')

In [142]:
def get_all_numbers(df, cache):
    """Print the average Likert score followed by the average binary Likert
    score for the rows of ``df``, using annotations from ``cache``."""
    for report in (overall_avg_likert, overall_avg_binary_likert):
        report(df, cache)

In [143]:
def evaluate_df_by_onto(df, cache,
                        categories=('Consequence', 'Goal seeking', 'Reactionary', 'Desire', 'Other')):
    """Print the Likert metrics separately for each ontology category.

    Args:
        df: DataFrame with an ``onto`` column plus the columns that
            ``get_all_numbers`` needs.
        cache: Annotation cache forwarded to ``get_all_numbers``.
        categories: Ontology labels to report, in print order. Defaults to
            the five labels this function originally hard-coded.
    """
    for category in categories:
        subset = df[df['onto'] == category]
        print(category)
        if subset.empty:
            # An empty category has nothing to average; report it and move on
            # instead of failing part-way through the breakdown.
            print('No rows for this category')
            continue
        get_all_numbers(subset, cache)

In [144]:
# Ontology labels for the hidden test set; later cells read its
# 'question_meta' and 'Ontology' columns.
ontology_df = pd.read_csv('../artifacts/hidden_test_set_ontology.csv')

In [145]:
# Map every 'question_meta' value to its 'Ontology' label. If a value occurs
# more than once, the last occurrence wins (same as the row-by-row loop).
meta_to_ontology_dict = dict(zip(ontology_df['question_meta'], ontology_df['Ontology']))

In [146]:
def add_onto_to_df(df, meta_to_ontology_dict):
    """Attach an ``onto`` column holding the ontology label of each row.

    Each row is looked up by its ``question_meta`` value; if that column is
    missing, or its value is not in the mapping, the row's ``meta`` value is
    used instead.

    Args:
        df: DataFrame with a ``question_meta`` and/or ``meta`` column. It is
            modified in place.
        meta_to_ontology_dict: Mapping from meta value to ontology label.

    Returns:
        The same ``df`` object, now carrying the ``onto`` column.

    Raises:
        KeyError: If neither lookup succeeds for some row.
    """
    ontos = []
    for _, row in df.iterrows():
        try:
            label = meta_to_ontology_dict[row['question_meta']]
        except KeyError:
            # Covers both a missing 'question_meta' column and a value absent
            # from the mapping; unrelated errors are no longer swallowed.
            label = meta_to_ontology_dict[row['meta']]
        ontos.append(label)
    df['onto'] = ontos
    return df

In [147]:
# Number of rows in the ontology file per 'Ontology' label.
onto_count_dict = ontology_df['Ontology'].value_counts().to_dict()

In [148]:
# Display the per-label counts computed from the ontology file.
onto_count_dict

{'Consequence': 140,
 'Goal seeking': 135,
 'Reactionary': 118,
 'Desire': 41,
 'Other': 30}

In [149]:
# Load the predictions in t5base_w_n_separator.csv, attach ontology labels,
# and print the Likert metrics over all rows.
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_w_n_separator.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Overall avg Likert for all answers 0.58
Overall avg binary Likert for all answers: 0.61


In [150]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'
# (the impl_ prefix presumably means "implicit" questions — TODO confirm).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.02
Overall avg binary Likert for all answers: 0.42


In [151]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_t5_df, cache)

Consequence
Overall avg Likert for all answers -0.22
Overall avg binary Likert for all answers: 0.32
Goal seeking
Overall avg Likert for all answers 0.33
Overall avg binary Likert for all answers: 0.55
Reactionary
Overall avg Likert for all answers 0.25
Overall avg binary Likert for all answers: 0.49
Desire
Overall avg Likert for all answers 0.29
Overall avg binary Likert for all answers: 0.52
Other
Overall avg Likert for all answers -0.09
Overall avg binary Likert for all answers: 0.37


In [152]:
# Load the predictions in t5base_w_n_separator_w_knowl.csv, attach ontology
# labels, and print the Likert metrics over all rows.
t5_knowl_df = pd.read_csv('../artifacts/model_predictions/t5base_w_n_separator_w_knowl.csv')
t5_knowl_df = add_onto_to_df(t5_knowl_df, meta_to_ontology_dict)
get_all_numbers(t5_knowl_df, cache)

Overall avg Likert for all answers 0.91
Overall avg binary Likert for all answers: 0.73


In [153]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'.
impl_t5_knowl_df = t5_knowl_df[t5_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_knowl_df, cache)

Overall avg Likert for all answers 0.56
Overall avg binary Likert for all answers: 0.61


In [154]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_t5_knowl_df, cache)

Consequence
Overall avg Likert for all answers 0.45
Overall avg binary Likert for all answers: 0.57
Goal seeking
Overall avg Likert for all answers 0.7
Overall avg binary Likert for all answers: 0.68
Reactionary
Overall avg Likert for all answers 0.8
Overall avg binary Likert for all answers: 0.71
Desire
Overall avg Likert for all answers 0.63
Overall avg binary Likert for all answers: 0.63
Other
Overall avg Likert for all answers 0.41
Overall avg binary Likert for all answers: 0.53


In [155]:
# Load the predictions in t511b_w_n_separator.csv, attach ontology labels,
# and print the Likert metrics over all rows.
t511b_df = pd.read_csv('../artifacts/model_predictions/t511b_w_n_separator.csv')
t511b_df = add_onto_to_df(t511b_df, meta_to_ontology_dict)
get_all_numbers(t511b_df, cache)

Overall avg Likert for all answers 1.21
Overall avg binary Likert for all answers: 0.84


In [156]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'.
impl_t511b_df = t511b_df[t511b_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t511b_df, cache)

Overall avg Likert for all answers 0.97
Overall avg binary Likert for all answers: 0.75


In [157]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_t511b_df, cache)

Consequence
Overall avg Likert for all answers 0.93
Overall avg binary Likert for all answers: 0.72
Goal seeking
Overall avg Likert for all answers 0.97
Overall avg binary Likert for all answers: 0.75
Reactionary
Overall avg Likert for all answers 1.18
Overall avg binary Likert for all answers: 0.84
Desire
Overall avg Likert for all answers 0.98
Overall avg binary Likert for all answers: 0.76
Other
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.74


In [158]:
# Author's note: this file is the t511b run with the top-3 diverse COMET
# outputs verbalized.
# Attach ontology labels and print the Likert metrics over all rows.
t511b_knowl_df = pd.read_csv('../artifacts/model_predictions/t511b_w_n_separator_w_knowl.csv')
t511b_knowl_df = add_onto_to_df(t511b_knowl_df, meta_to_ontology_dict)
get_all_numbers(t511b_knowl_df, cache)

Overall avg Likert for all answers 1.27
Overall avg binary Likert for all answers: 0.85


In [159]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'.
impl_t511b_knowl_df = t511b_knowl_df[t511b_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t511b_knowl_df, cache)

Overall avg Likert for all answers 1.04
Overall avg binary Likert for all answers: 0.77


In [160]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_t511b_knowl_df, cache)

Consequence
Overall avg Likert for all answers 1.01
Overall avg binary Likert for all answers: 0.74
Goal seeking
Overall avg Likert for all answers 1.29
Overall avg binary Likert for all answers: 0.87
Reactionary
Overall avg Likert for all answers 0.96
Overall avg binary Likert for all answers: 0.73
Desire
Overall avg Likert for all answers 1.08
Overall avg binary Likert for all answers: 0.82
Other
Overall avg Likert for all answers 0.9
Overall avg binary Likert for all answers: 0.74


In [161]:
# Load the predictions in gpt3.csv, attach ontology labels, and print the
# Likert metrics over all rows.
gpt3_df = pd.read_csv('../artifacts/model_predictions/gpt3.csv')
gpt3_df = add_onto_to_df(gpt3_df, meta_to_ontology_dict)
get_all_numbers(gpt3_df, cache)

Overall avg Likert for all answers 1.17
Overall avg binary Likert for all answers: 0.83


In [162]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'.
impl_gpt3_df = gpt3_df[gpt3_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_gpt3_df, cache)

Overall avg Likert for all answers 1.1
Overall avg binary Likert for all answers: 0.8


In [163]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_gpt3_df, cache)

Consequence
Overall avg Likert for all answers 1.11
Overall avg binary Likert for all answers: 0.81
Goal seeking
Overall avg Likert for all answers 1.05
Overall avg binary Likert for all answers: 0.79
Reactionary
Overall avg Likert for all answers 1.09
Overall avg binary Likert for all answers: 0.78
Desire
Overall avg Likert for all answers 1.16
Overall avg binary Likert for all answers: 0.81
Other
Overall avg Likert for all answers 1.09
Overall avg binary Likert for all answers: 0.82


In [164]:
# Load the predictions in gpt3_w_knowl.csv, attach ontology labels, and print
# the Likert metrics over all rows.
gpt3_knowl_df = pd.read_csv('../artifacts/model_predictions/gpt3_w_knowl.csv')
gpt3_knowl_df = add_onto_to_df(gpt3_knowl_df, meta_to_ontology_dict)
get_all_numbers(gpt3_knowl_df, cache)

Overall avg Likert for all answers 1.32
Overall avg binary Likert for all answers: 0.87


In [165]:
# Metrics restricted to rows whose 'is_ques_answerable' is 'Not Answerable'.
impl_gpt3_knowl_df = gpt3_knowl_df[gpt3_knowl_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_gpt3_knowl_df, cache)

Overall avg Likert for all answers 1.24
Overall avg binary Likert for all answers: 0.85


In [166]:
# Per-ontology breakdown of the 'Not Answerable' subset.
evaluate_df_by_onto(impl_gpt3_knowl_df, cache)

Consequence
Overall avg Likert for all answers 1.26
Overall avg binary Likert for all answers: 0.85
Goal seeking
Overall avg Likert for all answers 1.3
Overall avg binary Likert for all answers: 0.9
Reactionary
Overall avg Likert for all answers 1.2
Overall avg binary Likert for all answers: 0.83
Desire
Overall avg Likert for all answers 1.23
Overall avg binary Likert for all answers: 0.85
Other
Overall avg Likert for all answers 1.16
Overall avg binary Likert for all answers: 0.82


# Model Setup

## Base

In [32]:
# Variant labelled 'Gtup top3': full-set metrics for t5base_tup_top3_diverse.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gtup top3')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_tup_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtup top3
Overall avg Likert for all answers 0.52
Overall avg binary Likert for all answers: 0.59


In [33]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.08
Overall avg binary Likert for all answers: 0.45


In [101]:
# Variant labelled 'Gtupsep top3': full-set metrics for t5base_tupsep_top3_diverse.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gtupsep top3')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_tupsep_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gtupsep top3
Overall avg Likert for all answers 0.52
Overall avg binary Likert for all answers: 0.6


In [102]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.1
Overall avg binary Likert for all answers: 0.46


In [103]:
# Variant labelled 'Gverb. top1': full-set metrics for t5base_verb_top1_diverse.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gverb. top1')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_verb_top1_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top1
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.72


In [104]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.51
Overall avg binary Likert for all answers: 0.59


In [105]:
# Variant labelled 'Gverb. top5 diverse': full-set metrics for t5base_verb_top5_diverse.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gverb. top5 diverse')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_verb_top5_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top5 diverse
Overall avg Likert for all answers 0.91
Overall avg binary Likert for all answers: 0.73


In [106]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.56
Overall avg binary Likert for all answers: 0.61


In [107]:
# Variant labelled 'Gverb. top3 original': full-set metrics for t5base_verb_top3_original.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gverb. top3 original')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_verb_top3_original.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 original
Overall avg Likert for all answers 0.75
Overall avg binary Likert for all answers: 0.67


In [108]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.26
Overall avg binary Likert for all answers: 0.5


In [109]:
# Variant labelled 'Gverb. top3 diverse': full-set metrics for t5base_verb_top3_diverse.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gverb. top3 diverse')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_verb_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 diverse
Overall avg Likert for all answers 0.84
Overall avg binary Likert for all answers: 0.7


In [110]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.47
Overall avg binary Likert for all answers: 0.58


In [111]:
# Variant labelled 'Gverb. top3 reranked': full-set metrics for t5base_verb_top3_reranked.csv.
# NOTE: rebinds t5_df, which the filter cell that follows reads.
print('Gverb. top3 reranked')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base_verb_top3_reranked.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

Gverb. top3 reranked
Overall avg Likert for all answers 0.88
Overall avg binary Likert for all answers: 0.71


In [112]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.59
Overall avg binary Likert for all answers: 0.62


In [169]:
# Full-set metrics for t5base.csv.
# NOTE: the recorded run stops with KeyError: 'val_annotations' right after
# the label is printed — presumably at least one (question, answer, narrative)
# triple from this file is missing from the cache (TODO confirm).
print('T5 Appendix D.3 format - no separator')
t5_df = pd.read_csv('../artifacts/model_predictions/t5base.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

T5 Appendix D.3 format - no separator


KeyError: 'val_annotations'

In [170]:
# 'Not Answerable' subset of whichever frame t5_df currently holds.
# NOTE: the recorded run of this cell also fails with KeyError: 'val_annotations'.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

KeyError: 'val_annotations'

## 11B

In [None]:
# Variant labelled 'Gtup top3': full-set metrics for t511b_tup_top3_diverse.csv.
# NOTE(review): this cell has no recorded execution, and it reads from
# ../data/camera-ready-predictions/ rather than ../artifacts/model_predictions/
# — confirm the path. t5_df holds t511b predictions here despite its name.
print('Gtup top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_tup_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gtupsep top3': full-set metrics for t511b_tupsep_top3_diverse.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gtupsep top3')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_tupsep_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gverb. top1': full-set metrics for t511b_verb_top1_diverse.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gverb. top1')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top1_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gverb. top5 diverse': full-set metrics for t511b_verb_top5_diverse.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gverb. top5 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top5_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gverb. top3 original': full-set metrics for t511b_verb_top3_original.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gverb. top3 original')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_original.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gverb. top3 diverse': full-set metrics for t511b_verb_top3_diverse.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gverb. top3 diverse')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_diverse.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [None]:
# Variant labelled 'Gverb. top3 reranked': full-set metrics for t511b_verb_top3_reranked.csv.
# NOTE(review): no recorded execution; confirm the ../data/camera-ready-predictions/ path.
print('Gverb. top3 reranked')
t5_df = pd.read_csv('../data/camera-ready-predictions/t511b_verb_top3_reranked.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

In [None]:
# 'Not Answerable' subset of whichever frame t5_df currently holds (no
# recorded execution for this cell).
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

In [52]:
# Full-set metrics for t511b.csv (t5_df holds t511b predictions here despite
# its name).
# NOTE(review): the recorded execution count (52) is out of sequence with the
# surrounding cells, so the saved output may come from an earlier session.
print('T5 Appendix D.3 format - no separator')
t5_df = pd.read_csv('../artifacts/model_predictions/t511b.csv')
t5_df = add_onto_to_df(t5_df, meta_to_ontology_dict)
get_all_numbers(t5_df, cache)

T5 Appendix D.3 format - no separator
Overall avg Likert for all answers 0.99
Overall avg binary Likert for all answers: 0.77


In [53]:
# 'Not Answerable' subset of whichever frame t5_df currently holds; run
# directly after the load cell above.
impl_t5_df = t5_df[t5_df['is_ques_answerable'] == 'Not Answerable']
get_all_numbers(impl_t5_df, cache)

Overall avg Likert for all answers 0.6
Overall avg binary Likert for all answers: 0.62
