In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm

In [2]:
# shared helper functions

def _get_refusal_for_one_response(response) -> bool:
    """
    evaluate whether ONE response is a refusal
    """
    res = False
    if response: 
        keywords = {"not", "no", "sorry", "don't", "doesn't", "didn't", "can't", "couldn't", "won't", "wouldn't", "isn't", "unknown", "unclear"}
        for target_word in keywords:
            negative_keyword_present = re.search(r'\b' + target_word + r'\b', response.lower())
            if negative_keyword_present:
                res = True       
                
    return res


def _get_correctness_for_one_response(response, label) -> bool:
    if response:
        # our key condition: if any of the labels are in the response, then it is correct
        if any([x in response for x in label]):
            return True
    return False

# Closed-source Models

In [3]:
def _read_jsonl(path):
    """Parse a JSON-lines file into a list of records, skipping blank lines."""
    with open(path, "r", encoding="utf-8") as f_jsonl:
        return [json.loads(line) for line in f_jsonl if line.strip()]


def get_file_and_build_df_close_source(model_name, task_name, task_mode, model_type):
    """
    Load one (model, task, mode) pair of result files and build a per-question
    dataframe of responses, verbalized uncertainty and refusal/correctness.

    Despite the name, this handles both open-source and proprietary models.

    model_name: "chatgpt", "claude", "palm", "gpt-4-0613", etc
    task_name: "FalseQA", "NEC", "RefuNQ"
    task_mode: "answerable" or "unanswerable"
    model_type: "open_source" or "proprietary"

    Returns a DataFrame with the columns "question", "model_name",
    "task_name", "task_mode", "label", "response", "uncertainty",
    "correctness" and "refusal". For RefuNQ/answerable "correctness" is a
    bool and "refusal" is the string "NA"; for every other task/mode it is
    the other way round.

    Raises ValueError for an unknown model_type, or when the uncertainty file
    has fewer records than the response file (records are paired by position).
    """
    # Folder and record key of the response file depend on the model family.
    if model_type == "open_source":
        path_response_results = f"../outputs/Open-source-LLMs_responses/{model_name}"
        response_key = model_name + "_response"
    elif model_type == "proprietary":
        path_response_results = f"../outputs/Close-source-LLMs_responses/{model_name}"
        response_key = model_name
    else:
        raise ValueError(
            f"model_type must be 'open_source' or 'proprietary', got {model_type!r}"
        )
    # The uncertainty outputs live in the same place for both model families.
    path_uncertainty_results = f"../../experiment_outputs/UNIFIED_verbalize_confidence_no_ICL/{model_name}"
    uncertainty_key = model_name + "_numerical"

    file_name = f"/{model_name}_{task_name}_{task_mode}.json"
    response_results = _read_jsonl(path_response_results + file_name)
    uncertainty_results = _read_jsonl(path_uncertainty_results + file_name)

    if len(uncertainty_results) < len(response_results):
        raise ValueError(
            f"{model_name} {task_name} {task_mode}: {len(response_results)} responses "
            f"but only {len(uncertainty_results)} uncertainty records"
        )

    # Only RefuNQ/answerable has gold answers to score against; every other
    # task/mode is scored for refusal instead.
    score_correctness = task_name == "RefuNQ" and task_mode == "answerable"

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]
    records = []
    # Each row gets the verdict computed from its OWN response (no lookup by
    # question text), so duplicate prompts cannot overwrite each other.
    for response_record, uncertainty_record in zip(response_results, uncertainty_results):
        response_text = response_record[response_key]
        correctness = "NA"
        refusal = "NA"
        if score_correctness:
            correctness = _get_correctness_for_one_response(response_text, response_record["label"])
        else:
            refusal = _get_refusal_for_one_response(response_text)

        records.append({
            "question": response_record["prompt"],
            "model_name": model_name,
            "task_name": task_name,
            "task_mode": task_mode,
            "label": response_record["label"],
            "response": response_text,
            "uncertainty": uncertainty_record[uncertainty_key],
            "correctness": correctness,
            "refusal": refusal,
        })

    # Build the frame once instead of appending row by row; dtype=object keeps
    # the values exactly as parsed (e.g. ints are not turned into floats when
    # some uncertainty values are missing).
    return pd.DataFrame(records, columns=columns, dtype=object)


def get_and_combine_all_df(model_type):
    """
    Build the per-question dataframe for every (model, task, mode)
    combination of one model family and stack them into a single frame.

    model_type: "open_source" or "proprietary"

    Returns one DataFrame with the columns "question", "model_name",
    "task_name", "task_mode", "label", "response", "uncertainty",
    "correctness" and "refusal". The row index is NOT reset: each
    sub-frame keeps its own labels, so index values repeat.

    Raises ValueError for an unknown model_type.
    """
    if model_type == "open_source":
        model_list = [
            "Llama-2-7b-chat-hf",
            "Llama-2-7b-hf",
            "Llama-2-13b-chat-hf",
            "Llama-2-13b-hf",
            "Llama-2-70b-chat-hf",
            # "Llama-2-70b-hf",
            "vicuna-7b-v1.5",
            "vicuna-13b-v1.5"
        ]
    elif model_type == "proprietary":
        model_list = ["chatgpt", "claude", "palm","gpt-4-0613"]
    else:
        raise ValueError(
            f"model_type must be 'open_source' or 'proprietary', got {model_type!r}"
        )
    task_list = ["FalseQA", "NEC", "RefuNQ"]
    task_mode_list = ["answerable", "unanswerable"]

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]

    # Collect the pieces and concatenate once: concatenating inside the loop
    # re-copies all previous rows on every iteration, and seeding with an empty
    # frame triggers pandas' empty-concat deprecation warning.
    frames = []
    for model_name in tqdm(model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                frames.append(
                    get_file_and_build_df_close_source(model_name, task_name, task_mode, model_type)
                )

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [4]:
# Build the combined frame for the proprietary models
# ("chatgpt", "claude", "palm", "gpt-4-0613") across all tasks and modes.
# takes about 1 min to run
df_close_source_all = get_and_combine_all_df(model_type="proprietary")
# Bare last expression: display the frame as the cell output.
df_close_source_all

  0%|          | 0/4 [00:00<?, ?it/s]

chatgpt FalseQA answerable
chatgpt FalseQA unanswerable
chatgpt NEC answerable
chatgpt NEC unanswerable
chatgpt RefuNQ answerable
chatgpt RefuNQ unanswerable


 25%|██▌       | 1/4 [00:16<00:48, 16.13s/it]

claude FalseQA answerable
claude FalseQA unanswerable
claude NEC answerable
claude NEC unanswerable
claude RefuNQ answerable
claude RefuNQ unanswerable


 50%|█████     | 2/4 [00:32<00:32, 16.15s/it]

palm FalseQA answerable
palm FalseQA unanswerable
palm NEC answerable
palm NEC unanswerable
palm RefuNQ answerable
palm RefuNQ unanswerable


 75%|███████▌  | 3/4 [00:48<00:16, 16.39s/it]

gpt-4-0613 FalseQA answerable
gpt-4-0613 FalseQA unanswerable
gpt-4-0613 NEC answerable
gpt-4-0613 NEC unanswerable
gpt-4-0613 RefuNQ answerable
gpt-4-0613 RefuNQ unanswerable


100%|██████████| 4/4 [01:04<00:00, 16.23s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,chatgpt,FalseQA,answerable,0,Cats can catch mice due to their natural hunti...,8,,False
1,How many legs does an idiot have?,chatgpt,FalseQA,answerable,0,An idiot has the same number of legs as any ot...,2,,False
2,What colour are the swallows's feathers?,chatgpt,FalseQA,answerable,0,"The color of a swallow's feathers can vary, bu...",7,,False
3,"When warm air is lighter than cold air, what w...",chatgpt,FalseQA,answerable,0,"When warm air is lighter than cold air, it wil...",8,,False
4,can we follow the schedule to make a plan,chatgpt,FalseQA,answerable,0,"Yes, following a schedule can help in making a...",7,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,gpt-4-0613,RefuNQ,unanswerable,NEC,"The singer of ""It's Raining Again"" by Alerrawi...",1,,True
2169,who have the most Regalawsuisted d'or in the h...,gpt-4-0613,RefuNQ,unanswerable,NEC,Lionel Messi has the most Ballon d'Or awards i...,0,,False
2170,who does the voice over in the Requirtion?,gpt-4-0613,RefuNQ,unanswerable,NEC,"I'm sorry, but there seems to be a mistake in ...",0,,True
2171,who is the only bowler who have taken hattrick...,gpt-4-0613,RefuNQ,unanswerable,NEC,The only bowler who has taken a hat-trick in b...,8,,False


In [5]:
# Build the combined frame for the open-source models (Llama-2 and vicuna
# variants) across all tasks and modes.
df_open_source_all = get_and_combine_all_df(model_type="open_source")
# Bare last expression: display the frame as the cell output.
df_open_source_all  

  0%|          | 0/7 [00:00<?, ?it/s]

Llama-2-7b-chat-hf FalseQA answerable
Llama-2-7b-chat-hf FalseQA unanswerable
Llama-2-7b-chat-hf NEC answerable
Llama-2-7b-chat-hf NEC unanswerable
Llama-2-7b-chat-hf RefuNQ answerable
Llama-2-7b-chat-hf RefuNQ unanswerable


 14%|█▍        | 1/7 [00:16<01:38, 16.48s/it]

Llama-2-7b-hf FalseQA answerable
Llama-2-7b-hf FalseQA unanswerable
Llama-2-7b-hf NEC answerable
Llama-2-7b-hf NEC unanswerable
Llama-2-7b-hf RefuNQ answerable
Llama-2-7b-hf RefuNQ unanswerable


 29%|██▊       | 2/7 [00:32<01:21, 16.34s/it]

Llama-2-13b-chat-hf FalseQA answerable
Llama-2-13b-chat-hf FalseQA unanswerable
Llama-2-13b-chat-hf NEC answerable
Llama-2-13b-chat-hf NEC unanswerable
Llama-2-13b-chat-hf RefuNQ answerable
Llama-2-13b-chat-hf RefuNQ unanswerable


 43%|████▎     | 3/7 [00:49<01:05, 16.39s/it]

Llama-2-13b-hf FalseQA answerable
Llama-2-13b-hf FalseQA unanswerable
Llama-2-13b-hf NEC answerable
Llama-2-13b-hf NEC unanswerable
Llama-2-13b-hf RefuNQ answerable
Llama-2-13b-hf RefuNQ unanswerable


 57%|█████▋    | 4/7 [01:05<00:49, 16.40s/it]

Llama-2-70b-chat-hf FalseQA answerable
Llama-2-70b-chat-hf FalseQA unanswerable
Llama-2-70b-chat-hf NEC answerable
Llama-2-70b-chat-hf NEC unanswerable
Llama-2-70b-chat-hf RefuNQ answerable
Llama-2-70b-chat-hf RefuNQ unanswerable


 71%|███████▏  | 5/7 [01:22<00:32, 16.42s/it]

vicuna-7b-v1.5 FalseQA answerable
vicuna-7b-v1.5 FalseQA unanswerable
vicuna-7b-v1.5 NEC answerable
vicuna-7b-v1.5 NEC unanswerable
vicuna-7b-v1.5 RefuNQ answerable
vicuna-7b-v1.5 RefuNQ unanswerable


 86%|████████▌ | 6/7 [01:38<00:16, 16.28s/it]

vicuna-13b-v1.5 FalseQA answerable
vicuna-13b-v1.5 FalseQA unanswerable
vicuna-13b-v1.5 NEC answerable
vicuna-13b-v1.5 NEC unanswerable
vicuna-13b-v1.5 RefuNQ answerable
vicuna-13b-v1.5 RefuNQ unanswerable


100%|██████████| 7/7 [01:54<00:00, 16.29s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Cats can catch mice due to their agility, spe...",8,,False
1,How many legs does an idiot have?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"I apologize, but I cannot provide an answer t...",2,,True
2,What colour are the swallows's feathers?,Llama-2-7b-chat-hf,FalseQA,answerable,0,The swallows' feathers are blue-gray in color.,8,,False
3,"When warm air is lighter than cold air, what w...",Llama-2-7b-chat-hf,FalseQA,answerable,0,"When warm air is lighter than cold air, it wi...",8,,False
4,can we follow the schedule to make a plan,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Yes, you can follow a schedule to make a plan...",8,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,"The song ""Alerrawia"" is performed by the band ...",8,,False
2169,who have the most Regalawsuisted d'or in the h...,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,Cristiano Ronaldo has the most Regalawsuisted ...,3,,False
2170,who does the voice over in the Requirtion?,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,The voice over in the Requirtion is done by th...,8,,False
2171,who is the only bowler who have taken hattrick...,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,There is no bowler who has taken a hat-trick i...,6,,True


In [6]:
# Save the combined open-source frame as CSV; index=False leaves the
# (repeating) row index out of the file.
df_open_source_all.to_csv("../outputs/df_open_source_all_unified_no_ICL_11_2.csv", index=False)

In [7]:
# Save the combined proprietary-model frame as CSV; index=False leaves the
# (repeating) row index out of the file.
df_close_source_all.to_csv("../outputs/df_close_source_all_unified_no_ICL_11_2.csv", index=False)