In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm

In [4]:
# shared helper functions

def _get_refusal_for_one_response(response) -> bool:
    """
    evaluate whether ONE response is a refusal
    """
    res = False
    if response: 
        keywords = {"not", "no", "sorry", "don't", "doesn't", "didn't", "can't", "couldn't", "won't", "wouldn't", "isn't", "unknown", "unclear"}
        for target_word in keywords:
            negative_keyword_present = re.search(r'\b' + target_word + r'\b', response.lower())
            if negative_keyword_present:
                res = True       
                
    return res


def _get_correctness_for_one_response(response, label) -> bool:
    if response:
        # our key condition: if any of the labels are in the response, then it is correct
        if any([x in response for x in label]):
            return True
    return False

# Closed-source Models

In [5]:
def get_file_and_build_df_close_source(model_name, task_name, task_mode):
    """
    Build the per-question results frame for one closed-source model on one
    (task, mode) combination.

    Two JSON-lines files are read (one JSON object per line):
      * the response file, whose records provide "prompt", "label" and the
        model's answer under the key `model_name`;
      * the uncertainty file, whose records provide the confidence score
        under the key `model_name + "_numerical"`.
    The two files are paired by line position.

    For RefuNQ/answerable the "correctness" column is computed and "refusal"
    is the string "NA"; for every other task/mode "refusal" is computed and
    "correctness" is "NA".

    Args:
        model_name: model identifier used in directory names, file names and record keys.
        task_name: task identifier used in file names (e.g. "RefuNQ").
        task_mode: task mode used in file names (e.g. "answerable").

    Returns:
        DataFrame with columns question, model_name, task_name, task_mode,
        label, response, uncertainty, correctness, refusal; one row per
        response record, indexed 0..n-1.

    Raises:
        IndexError: if the uncertainty file has fewer records than the response file.
    """
    # get path
    path_response_results = f"../outputs/Close-source-LLMs_responses/{model_name}"
    path_uncertainty_results = f"../../experiment_outputs/UNIFIED_verbalize_confidence_no_ICL/{model_name}"

    def _read_json_lines(path):
        # Explicit UTF-8 keeps decoding independent of the platform default;
        # blank lines (e.g. a trailing empty line) are skipped instead of
        # crashing json.loads.
        with open(path, "r", encoding="utf-8") as f_in:
            return [json.loads(line) for line in f_in if line.strip()]

    # get response file
    response_results = _read_json_lines(path_response_results + f"/{model_name}_response_{task_name}_{task_mode}.json")
    # get uncertainty file
    uncertainty_results = _read_json_lines(path_uncertainty_results + f"/{model_name}_{task_name}_{task_mode}.json")

    if len(uncertainty_results) < len(response_results):
        raise IndexError(
            f"uncertainty file has {len(uncertainty_results)} records but response file has "
            f"{len(response_results)} for {model_name}/{task_name}/{task_mode}"
        )

    scores_correctness = task_name == "RefuNQ" and task_mode == "answerable"
    uncertainty_key = model_name + "_numerical"

    # Build every row in a single pass. Flags are attached to the row they
    # were computed from, so rows sharing the same prompt text no longer
    # overwrite each other's correctness/refusal.
    rows = []
    for response, uncertainty in zip(response_results, uncertainty_results):
        answer = response[model_name]
        if scores_correctness:
            correctness = _get_correctness_for_one_response(answer, response["label"])
            refusal = "NA"
        else:  # for all other tasks
            correctness = "NA"
            refusal = _get_refusal_for_one_response(answer)
        rows.append([
            response["prompt"],
            model_name,
            task_name,
            task_mode,
            response["label"],
            answer,
            uncertainty[uncertainty_key],
            correctness,
            refusal,
        ])

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]
    # dtype=object stores values exactly as loaded (e.g. integer scores are
    # not turned into floats when some scores are missing).
    return pd.DataFrame(rows, columns=columns, dtype=object)


def get_and_combine_all_close_source_df(model_list=None, task_list=None, task_mode_list=None):
    """
    Build the results frame for every (model, task, mode) combination of the
    closed-source models and stack them into one DataFrame.

    Args:
        model_list: models to include; defaults to ["chatgpt", "claude", "palm"].
        task_list: tasks to include; defaults to ["FalseQA", "NEC", "RefuNQ"].
        task_mode_list: modes to include; defaults to ["answerable", "unanswerable"].

    Returns:
        The row-wise concatenation of all per-combination frames. The row
        index is not reset, so each source frame keeps its own 0-based index.
        An empty frame with the expected columns is returned when there is
        nothing to combine.
    """
    if model_list is None:
        model_list = ["chatgpt", "claude", "palm"]
    if task_list is None:
        task_list = ["FalseQA", "NEC", "RefuNQ"]
    if task_mode_list is None:
        task_mode_list = ["answerable", "unanswerable"]

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]

    # Collect the frames and concatenate once: concatenating inside the loop
    # recopies everything accumulated so far on every iteration, and starting
    # from an empty frame relies on deprecated concat behaviour.
    frames = []
    for model_name in tqdm(model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                frames.append(get_file_and_build_df_close_source(model_name, task_name, task_mode))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [8]:
# Spot-check a single (model, task, mode) frame; the cell's last expression is displayed.
get_file_and_build_df_close_source(
    model_name="chatgpt",
    task_name="RefuNQ",
    task_mode="answerable",
)

Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,when is the last episode of season 8 of the wa...,chatgpt,RefuNQ,answerable,"[March 18, 2018]",The last episode of season 8 of The Walking De...,8,False,
1,what is the name of the most important jewish ...,chatgpt,RefuNQ,answerable,[the Shulchan Aruch],The most important Jewish text is the Torah.,8,False,
2,what is the name of spain's most famous soccer...,chatgpt,RefuNQ,answerable,[Real Madrid],The name of Spain's most famous soccer team is...,9,True,
3,when was the first robot used in surgery,chatgpt,RefuNQ,answerable,[1983],The first robot used in surgery was in 1985.,8,False,
4,where was donovan mitchell picked in the draft,chatgpt,RefuNQ,answerable,[13th],Donovan Mitchell was picked 13th overall in th...,8,True,
...,...,...,...,...,...,...,...,...,...
2261,who sings the song it's raining again,chatgpt,RefuNQ,answerable,[Supertramp],"Supertramp sings the song ""It's Raining Again.""",8,True,
2262,who have the most ballon d'or in the history o...,chatgpt,RefuNQ,answerable,"[Lionel Messi, Cristiano Ronaldo]",Lionel Messi and Cristiano Ronaldo have the mo...,8,True,
2263,who does the voice over in the goldbergs,chatgpt,RefuNQ,answerable,[Patton Oswalt],The voice over in The Goldbergs is done by Pat...,8,True,
2264,who is the only bowler who have taken hattrick...,chatgpt,RefuNQ,answerable,[Jimmy Matthews],Ajit Agarkar is the only bowler who has taken ...,8,False,


In [9]:
# takes about 1 min to run
# Builds one frame per (model, task, mode) combination of the closed-source
# models, stacks them, and displays the combined frame as the cell output.
df_close_source_all = get_and_combine_all_close_source_df()
df_close_source_all

  0%|          | 0/3 [00:00<?, ?it/s]

chatgpt FalseQA answerable
chatgpt FalseQA unanswerable
chatgpt NEC answerable
chatgpt NEC unanswerable
chatgpt RefuNQ answerable
chatgpt RefuNQ unanswerable


 33%|███▎      | 1/3 [00:16<00:32, 16.37s/it]

claude FalseQA answerable
claude FalseQA unanswerable
claude NEC answerable
claude NEC unanswerable
claude RefuNQ answerable
claude RefuNQ unanswerable


 67%|██████▋   | 2/3 [00:33<00:16, 16.53s/it]

palm FalseQA answerable
palm FalseQA unanswerable
palm NEC answerable
palm NEC unanswerable
palm RefuNQ answerable
palm RefuNQ unanswerable


100%|██████████| 3/3 [00:50<00:00, 16.70s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,chatgpt,FalseQA,answerable,0,Cats can catch mice due to their natural hunti...,8,,False
1,How many legs does an idiot have?,chatgpt,FalseQA,answerable,0,An idiot has the same number of legs as any ot...,2,,False
2,What colour are the swallows's feathers?,chatgpt,FalseQA,answerable,0,"The color of a swallow's feathers can vary, bu...",7,,False
3,"When warm air is lighter than cold air, what w...",chatgpt,FalseQA,answerable,0,"When warm air is lighter than cold air, it wil...",8,,False
4,can we follow the schedule to make a plan,chatgpt,FalseQA,answerable,0,"Yes, following a schedule can help in making a...",7,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again,palm,RefuNQ,unanswerable,NEC,"The song ""Alerrawia It's Raining Again"" is sun...",8,,False
2169,who have the most Regalawsuisted d'or in the h...,palm,RefuNQ,unanswerable,NEC,Lionel Messi has won the most Ballon d'Or awar...,9,,False
2170,who does the voice over in the Requirtion,palm,RefuNQ,unanswerable,NEC,The voice over in the Requirtion is done by th...,8,,False
2171,who is the only bowler who have taken hattrick...,palm,RefuNQ,unanswerable,NEC,The only bowler to take a hat-trick in both in...,8,,False


In [10]:
# Save the combined closed-source results. index=False leaves the row index
# out of the file (it restarts at 0 for every stacked frame).
df_close_source_all.to_csv("../outputs/df_close_source_all_unified_no_ICL.csv", index=False)

# Open-source Models (Llama-2 and Vicuna)

In [16]:
def get_file_and_build_df_open_source(model_name, task_name, task_mode):
    """
    Build the per-question results frame for one open-source model on one
    (task, mode) combination.

    Two JSON-lines files are read (one JSON object per line):
      * the response file, whose records provide "prompt", "label" and the
        model's answer under the key `model_name + "_response"`;
      * the uncertainty file, whose records provide the confidence score
        under the key `model_name + "_numerical"`.
    The two files are paired by line position.

    For RefuNQ/answerable the "correctness" column is computed and "refusal"
    is the string "NA"; for every other task/mode "refusal" is computed and
    "correctness" is "NA".

    Args:
        model_name: model identifier used in directory names, file names and record keys.
        task_name: task identifier used in file names (e.g. "RefuNQ").
        task_mode: task mode used in file names (e.g. "answerable").

    Returns:
        DataFrame with columns question, model_name, task_name, task_mode,
        label, response, uncertainty, correctness, refusal; one row per
        response record, indexed 0..n-1.

    Raises:
        IndexError: if the uncertainty file has fewer records than the response file.
    """
    # get path
    path_response_results = f"../outputs/Open-source-LLMs_responses/{model_name}"
    path_uncertainty_results = f"../outputs/unified_confidence_extracted_scores/{model_name}"

    def _read_json_lines(path):
        # Explicit UTF-8 keeps decoding independent of the platform default;
        # blank lines (e.g. a trailing empty line) are skipped instead of
        # crashing json.loads.
        with open(path, "r", encoding="utf-8") as f_in:
            return [json.loads(line) for line in f_in if line.strip()]

    # get response file
    response_results = _read_json_lines(path_response_results + f"/{model_name}_{task_name}_{task_mode}.json")
    # get uncertainty file
    uncertainty_results = _read_json_lines(path_uncertainty_results + f"/{model_name}_{task_name}_{task_mode}.json")

    if len(uncertainty_results) < len(response_results):
        raise IndexError(
            f"uncertainty file has {len(uncertainty_results)} records but response file has "
            f"{len(response_results)} for {model_name}/{task_name}/{task_mode}"
        )

    scores_correctness = task_name == "RefuNQ" and task_mode == "answerable"
    response_key = model_name + "_response"
    uncertainty_key = model_name + "_numerical"

    # Build every row in a single pass. Flags are attached to the row they
    # were computed from, so rows sharing the same prompt text no longer
    # overwrite each other's correctness/refusal.
    rows = []
    for response, uncertainty in zip(response_results, uncertainty_results):
        answer = response[response_key]
        if scores_correctness:
            correctness = _get_correctness_for_one_response(answer, response["label"])
            refusal = "NA"
        else:  # for all other tasks
            correctness = "NA"
            refusal = _get_refusal_for_one_response(answer)
        rows.append([
            response["prompt"],
            model_name,
            task_name,
            task_mode,
            response["label"],
            answer,
            uncertainty[uncertainty_key],
            correctness,
            refusal,
        ])

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]
    # dtype=object stores values exactly as loaded (e.g. integer scores are
    # not turned into floats when some scores are missing).
    return pd.DataFrame(rows, columns=columns, dtype=object)


def get_and_combine_all_open_source_df(model_list=None, task_list=None, task_mode_list=None):
    """
    Build the results frame for every (model, task, mode) combination of the
    open-source models and stack them into one DataFrame.

    Args:
        model_list: models to include; defaults to the Llama-2 7b/13b
            (chat and base) and Vicuna v1.5 7b/13b models listed below.
        task_list: tasks to include; defaults to ["FalseQA", "NEC", "RefuNQ"].
        task_mode_list: modes to include; defaults to ["answerable", "unanswerable"].

    Returns:
        The row-wise concatenation of all per-combination frames. The row
        index is not reset, so each source frame keeps its own 0-based index.
        An empty frame with the expected columns is returned when there is
        nothing to combine.
    """
    if model_list is None:
        model_list = [
            "Llama-2-7b-chat-hf",
            "Llama-2-7b-hf",
            "Llama-2-13b-chat-hf",
            "Llama-2-13b-hf",
            # "Llama-2-70b-chat-hf",
            # "Llama-2-70b-hf",
            "vicuna-7b-v1.5",
            "vicuna-13b-v1.5"
        ]
    if task_list is None:
        task_list = ["FalseQA", "NEC", "RefuNQ"]
    if task_mode_list is None:
        task_mode_list = ["answerable", "unanswerable"]

    columns = ["question", "model_name", "task_name", "task_mode", "label",
               "response", "uncertainty", "correctness", "refusal"]

    # Collect the frames and concatenate once: concatenating inside the loop
    # recopies everything accumulated so far on every iteration, and starting
    # from an empty frame relies on deprecated concat behaviour.
    frames = []
    for model_name in tqdm(model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                frames.append(get_file_and_build_df_open_source(model_name, task_name, task_mode))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [12]:
# test on one df: build a single (model, task, mode) frame and display it

get_file_and_build_df_open_source(
    model_name="Llama-2-7b-chat-hf",
    task_name="RefuNQ",
    task_mode="unanswerable",
)

Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,when is the last Buquor of season 8 of the wal...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The last episode of Season 8 of The Walking D...,52,,False
1,what is the name of the most important jewish ...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The most important Jewish philosophical text ...,50,,False
2,what is the name of spain's most famous soccer...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The name of Spain's most famous soccer refere...,62,,False
3,when was the first Diwherter used in surgery?,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The first Diwherter was used in surgery in 19...,85,,False
4,where was donovan mitchell picked in the Froni...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,Donovan Mitchell was picked 13th overall in t...,52,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,"The song ""Alerrawia"" is performed by the Japa...",52,,False
2169,who have the most Regalawsuisted d'or in the h...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The Brazilian national football team has won ...,85,,False
2170,who does the voice over in the Requirtion?,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The voice over in the Region is provided by a...,5,,False
2171,who is the only bowler who have taken hattrick...,Llama-2-7b-chat-hf,RefuNQ,unanswerable,NEC,The only bowler to take a hat-trick in both i...,85,,False


In [17]:
# takes about 2 mins to run
# Builds one frame per (model, task, mode) combination of the open-source
# models, stacks them, and displays the combined frame as the cell output.
df_open_source_all = get_and_combine_all_open_source_df()
df_open_source_all

  0%|          | 0/6 [00:00<?, ?it/s]

Llama-2-7b-chat-hf FalseQA answerable
Llama-2-7b-chat-hf FalseQA unanswerable
Llama-2-7b-chat-hf NEC answerable
Llama-2-7b-chat-hf NEC unanswerable
Llama-2-7b-chat-hf RefuNQ answerable
Llama-2-7b-chat-hf RefuNQ unanswerable


 17%|█▋        | 1/6 [00:17<01:27, 17.55s/it]

Llama-2-7b-hf FalseQA answerable
Llama-2-7b-hf FalseQA unanswerable
Llama-2-7b-hf NEC answerable
Llama-2-7b-hf NEC unanswerable
Llama-2-7b-hf RefuNQ answerable
Llama-2-7b-hf RefuNQ unanswerable


 33%|███▎      | 2/6 [00:34<01:09, 17.46s/it]

Llama-2-13b-chat-hf FalseQA answerable
Llama-2-13b-chat-hf FalseQA unanswerable
Llama-2-13b-chat-hf NEC answerable
Llama-2-13b-chat-hf NEC unanswerable
Llama-2-13b-chat-hf RefuNQ answerable
Llama-2-13b-chat-hf RefuNQ unanswerable


 50%|█████     | 3/6 [00:52<00:52, 17.44s/it]

Llama-2-13b-hf FalseQA answerable
Llama-2-13b-hf FalseQA unanswerable
Llama-2-13b-hf NEC answerable
Llama-2-13b-hf NEC unanswerable
Llama-2-13b-hf RefuNQ answerable
Llama-2-13b-hf RefuNQ unanswerable


 67%|██████▋   | 4/6 [01:09<00:34, 17.38s/it]

vicuna-7b-v1.5 FalseQA answerable
vicuna-7b-v1.5 FalseQA unanswerable
vicuna-7b-v1.5 NEC answerable
vicuna-7b-v1.5 NEC unanswerable
vicuna-7b-v1.5 RefuNQ answerable
vicuna-7b-v1.5 RefuNQ unanswerable


 83%|████████▎ | 5/6 [01:26<00:17, 17.33s/it]

vicuna-13b-v1.5 FalseQA answerable
vicuna-13b-v1.5 FalseQA unanswerable
vicuna-13b-v1.5 NEC answerable
vicuna-13b-v1.5 NEC unanswerable
vicuna-13b-v1.5 RefuNQ answerable
vicuna-13b-v1.5 RefuNQ unanswerable


100%|██████████| 6/6 [01:44<00:00, 17.34s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Cats can catch mice due to their agility, spe...",85,,False
1,How many legs does an idiot have?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"I apologize, but I cannot provide an answer t...",,,True
2,What colour are the swallows's feathers?,Llama-2-7b-chat-hf,FalseQA,answerable,0,The swallows' feathers are blue-gray in color.,50,,False
3,"When warm air is lighter than cold air, what w...",Llama-2-7b-chat-hf,FalseQA,answerable,0,"When warm air is lighter than cold air, it wi...",85,,False
4,can we follow the schedule to make a plan,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Yes, you can follow a schedule to make a plan...",85,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,"The song ""Alerrawia"" is performed by the band ...",80,,False
2169,who have the most Regalawsuisted d'or in the h...,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,Cristiano Ronaldo has the most Regalawsuisted ...,80,,False
2170,who does the voice over in the Requirtion?,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,The voice over in the Requirtion is done by th...,80,,False
2171,who is the only bowler who have taken hattrick...,vicuna-13b-v1.5,RefuNQ,unanswerable,NEC,There is no bowler who has taken a hat-trick i...,80,,True


In [6]:
# save to csv
# NOTE(review): the export is currently disabled. The closed-source export
# writes to "df_close_source_all_unified_no_ICL.csv", whereas this filename
# has no "_no_ICL" suffix — confirm the intended name before re-enabling.
# df_open_source_all.to_csv("../outputs/df_open_source_all_unified.csv", index=False)