In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm

In [2]:
# shared helper functions

def _get_refusal_for_one_response(response) -> bool:
    """
    evaluate whether ONE response is a refusal
    """
    res = False
    if response: 
        keywords = {"not", "no", "sorry", "don't", "doesn't", "didn't", "can't", "couldn't", "won't", "wouldn't", "isn't", "unknown", "unclear"}
        for target_word in keywords:
            negative_keyword_present = re.search(r'\b' + target_word + r'\b', response.lower())
            if negative_keyword_present:
                res = True       
                
    return res


def _get_correctness_for_one_response(response, label) -> bool:
    if response:
        # our key condition: if any of the labels are in the response, then it is correct
        if any([x in response for x in label]):
            return True
    return False

# Closed-source Models

In [3]:
def _convert_uncertainty_to_int(uncertainty) -> int:
    """
    convert uncertainty to int, e.g. [3] -> 3
    """
    if not uncertainty: # if null
        uncertainty = 1
    elif "1" in uncertainty:
        uncertainty = 1
    elif "2" in uncertainty:
        uncertainty = 2
    elif "3" in uncertainty:
        uncertainty = 3
    elif "4" in uncertainty:
        uncertainty = 4
    elif "5" in uncertainty:
        uncertainty = 5
    else:
        uncertainty = 1 # all other cases, treat as 1
    return uncertainty

def get_file_and_build_df_close_source(model_name, task_name, task_mode):
    """
    Build the per-question result table of one closed-source model for one
    task / task mode.

    Two JSON Lines files (one JSON object per line) are read:

    * ``../outputs/Close-source-LLMs_responses/{model_name}/{model_name}_response_{task_name}_{task_mode}.json``
    * ``../outputs/Close-source-LLMs_verbalized_uncertainty/{model_name}/{model_name}_uncertainty_{task_name}_{task_mode}.json``

    The i-th uncertainty record is paired with the i-th response record.

    Args:
        model_name: key under which each record stores the model's output;
            also part of the file paths.
        task_name: task identifier used in the file names.
        task_mode: task mode identifier used in the file names.

    Returns:
        A DataFrame with one row per response record and the columns
        ``question``, ``model_name``, ``task_name``, ``task_mode``, ``label``,
        ``response``, ``uncertainty``, ``correctness``, ``refusal``.
        For ``RefuNQ`` / ``answerable`` the ``correctness`` column is a bool
        and ``refusal`` is the string ``"NA"``; for every other combination
        ``refusal`` is a bool and ``correctness`` is ``"NA"``.

    Raises:
        FileNotFoundError: if either input file does not exist.
        IndexError: if the uncertainty file has fewer records than the
            response file.
        KeyError: if a record lacks one of the expected keys.
    """
    # get path
    path_response_results = f"../outputs/Close-source-LLMs_responses/{model_name}"
    path_uncertainty_results = f"../outputs/Close-source-LLMs_verbalized_uncertainty/{model_name}"

    def _read_json_lines(path):
        # One JSON document per line; blank lines (e.g. a trailing newline) are skipped.
        with open(path, "r") as f_in:
            return [json.loads(line) for line in f_in if line.strip()]

    response_results = _read_json_lines(
        path_response_results + f"/{model_name}_response_{task_name}_{task_mode}.json"
    )
    uncertainty_results = _read_json_lines(
        path_uncertainty_results + f"/{model_name}_uncertainty_{task_name}_{task_mode}.json"
    )

    # Only answerable RefuNQ questions are scored for correctness;
    # everything else is scored for refusal.
    score_correctness = task_name == "RefuNQ" and task_mode == "answerable"

    # Each row is scored from its own record (by position), so duplicated
    # prompts keep their own result instead of sharing the last one.
    records = []
    for i, response in enumerate(response_results):
        answer = response[model_name]
        if score_correctness:
            correctness = _get_correctness_for_one_response(answer, response["label"])
            refusal = "NA"
        else:
            correctness = "NA"
            refusal = _get_refusal_for_one_response(answer)

        records.append({
            "question": response["prompt"],
            "model_name": model_name,
            "task_name": task_name,
            "task_mode": task_mode,
            "label": response["label"],
            "response": answer,
            # convert uncertainty to int
            "uncertainty": _convert_uncertainty_to_int(uncertainty_results[i][model_name]),
            "correctness": correctness,
            "refusal": refusal,
        })

    # Build the frame in one go instead of growing it row by row.
    df_close_source = pd.DataFrame(
        records,
        columns=["question", "model_name", "task_name", "task_mode", "label",
                 "response", "uncertainty", "correctness", "refusal"],
    )

    return df_close_source


def get_and_combine_all_close_source_df(model_list=None, task_list=None, task_mode_list=None):
    """
    Build the result table for every (model, task, task mode) combination of
    the closed-source models and stack them into one DataFrame.

    Args:
        model_list: models to include; defaults to
            ``["chatgpt", "claude", "palm"]``.
        task_list: tasks to include; defaults to
            ``["FalseQA", "NEC", "RefuNQ"]``.
        task_mode_list: task modes to include; defaults to
            ``["answerable", "unanswerable"]``.

    Returns:
        The row-wise concatenation of the per-combination frames returned by
        ``get_file_and_build_df_close_source``. Row indices are kept as they
        are in each per-combination frame (no re-numbering).
    """
    if model_list is None:
        model_list = ["chatgpt", "claude", "palm"]
    if task_list is None:
        task_list = ["FalseQA", "NEC", "RefuNQ"]
    if task_mode_list is None:
        task_mode_list = ["answerable", "unanswerable"]

    combined_columns = ["question", "model_name", "task_name", "task_mode", "label",
                        "response", "uncertainty", "correctness", "refusal"]

    # Collect the frames and concatenate once: concatenating inside the loop
    # re-copies all previous rows on every iteration.
    frames = []
    for model_name in tqdm(model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                frames.append(get_file_and_build_df_close_source(model_name, task_name, task_mode))

    if not frames:
        # pd.concat rejects an empty list; return an empty table with the expected columns
        return pd.DataFrame(columns=combined_columns)

    df_close_source_all = pd.concat(frames)

    return df_close_source_all

In [4]:
# Spot-check one (model, task, task mode) combination; the returned frame is shown as the cell output.
get_file_and_build_df_close_source("chatgpt", "RefuNQ", "answerable")

Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,when is the last episode of season 8 of the wa...,chatgpt,RefuNQ,answerable,"[March 18, 2018]",The last episode of season 8 of The Walking De...,2,False,
1,what is the name of the most important jewish ...,chatgpt,RefuNQ,answerable,[the Shulchan Aruch],The most important Jewish text is the Torah.,3,False,
2,what is the name of spain's most famous soccer...,chatgpt,RefuNQ,answerable,[Real Madrid],The name of Spain's most famous soccer team is...,4,True,
3,when was the first robot used in surgery,chatgpt,RefuNQ,answerable,[1983],The first robot used in surgery was in 1985.,3,False,
4,where was donovan mitchell picked in the draft,chatgpt,RefuNQ,answerable,[13th],Donovan Mitchell was picked 13th overall in th...,3,True,
...,...,...,...,...,...,...,...,...,...
2261,who sings the song it's raining again,chatgpt,RefuNQ,answerable,[Supertramp],"Supertramp sings the song ""It's Raining Again.""",3,True,
2262,who have the most ballon d'or in the history o...,chatgpt,RefuNQ,answerable,"[Lionel Messi, Cristiano Ronaldo]",Lionel Messi and Cristiano Ronaldo have the mo...,3,True,
2263,who does the voice over in the goldbergs,chatgpt,RefuNQ,answerable,[Patton Oswalt],The voice over in The Goldbergs is done by Pat...,2,True,
2264,who is the only bowler who have taken hattrick...,chatgpt,RefuNQ,answerable,[Jimmy Matthews],Ajit Agarkar is the only bowler who has taken ...,3,False,


In [4]:
# Build the combined table for all closed-source models, tasks and task modes.
# takes about 1 min to run
df_close_source_all = get_and_combine_all_close_source_df()
# bare expression: display the combined frame as the cell output
df_close_source_all

100%|██████████| 3/3 [00:51<00:00, 17.12s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,chatgpt,FalseQA,answerable,0,Cats can catch mice due to their natural hunti...,3,,False
1,How many legs does an idiot have?,chatgpt,FalseQA,answerable,0,An idiot has the same number of legs as any ot...,1,,False
2,What colour are the swallows's feathers?,chatgpt,FalseQA,answerable,0,"The color of a swallow's feathers can vary, bu...",2,,False
3,"When warm air is lighter than cold air, what w...",chatgpt,FalseQA,answerable,0,"When warm air is lighter than cold air, it wil...",4,,False
4,can we follow the schedule to make a plan,chatgpt,FalseQA,answerable,0,"Yes, following a schedule can help in making a...",2,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again,palm,RefuNQ,unanswerable,NEC,"The song ""Alerrawia It's Raining Again"" is sun...",3,,False
2169,who have the most Regalawsuisted d'or in the h...,palm,RefuNQ,unanswerable,NEC,Lionel Messi has won the most Ballon d'Or awar...,3,,False
2170,who does the voice over in the Requirtion,palm,RefuNQ,unanswerable,NEC,The voice over in the Requirtion is done by th...,3,,False
2171,who is the only bowler who have taken hattrick...,palm,RefuNQ,unanswerable,NEC,The only bowler to take a hat-trick in both in...,3,,False


# Open-source Llama-2 Models

In [7]:
def get_file_and_build_df_llama(model_name, task_name, task_mode):
    """
    Build the per-question result table of one Llama-2 model for one
    task / task mode.

    Two JSON Lines files (one JSON object per line) are read:

    * ``../outputs/Llama-2_responses/{model_name}/{model_name}_{task_name}_{task_mode}.json``
    * ``../outputs/Llama-2_logit_based_uncertainty/{model_name}/{model_name}_{task_name}_{task_mode}.json``

    The i-th uncertainty record is paired with the i-th response record.

    Args:
        model_name: model identifier; used in the file paths and as the
            prefix of the ``{model_name}_response``, ``{model_name}_entropy``
            and ``{model_name}_perplexity`` record keys.
        task_name: task identifier used in the file names.
        task_mode: task mode identifier used in the file names.

    Returns:
        A DataFrame with one row per response record and the columns
        ``question``, ``model_name``, ``task_name``, ``task_mode``, ``label``,
        ``response``, ``entropy``, ``perplexity``, ``correctness``,
        ``refusal``. For ``RefuNQ`` / ``answerable`` the ``correctness``
        column is a bool and ``refusal`` is the string ``"NA"``; for every
        other combination ``refusal`` is a bool and ``correctness`` is
        ``"NA"``.

    Raises:
        FileNotFoundError: if either input file does not exist.
        IndexError: if the uncertainty file has fewer records than the
            response file.
        KeyError: if a record lacks one of the expected keys.
    """
    # get path
    path_response_results = f"../outputs/Llama-2_responses/{model_name}"
    path_uncertainty_results = f"../outputs/Llama-2_logit_based_uncertainty/{model_name}"

    def _read_json_lines(path):
        # One JSON document per line; blank lines (e.g. a trailing newline) are skipped.
        with open(path, "r") as f_in:
            return [json.loads(line) for line in f_in if line.strip()]

    response_results = _read_json_lines(
        path_response_results + f"/{model_name}_{task_name}_{task_mode}.json"
    )
    uncertainty_results = _read_json_lines(
        path_uncertainty_results + f"/{model_name}_{task_name}_{task_mode}.json"
    )

    response_key = f"{model_name}_response"
    entropy_key = f"{model_name}_entropy"
    perplexity_key = f"{model_name}_perplexity"

    # Only answerable RefuNQ questions are scored for correctness;
    # everything else is scored for refusal.
    score_correctness = task_name == "RefuNQ" and task_mode == "answerable"

    # Each row is scored from its own record (by position), so duplicated
    # prompts keep their own result instead of sharing the last one.
    records = []
    for i, response in enumerate(response_results):
        answer = response[response_key]
        if score_correctness:
            # Score the model's answer text itself, not the name of the key it is stored under.
            correctness = _get_correctness_for_one_response(answer, response["label"])
            refusal = "NA"
        else:
            correctness = "NA"
            refusal = _get_refusal_for_one_response(answer)

        records.append({
            "question": response["prompt"],
            "model_name": model_name,
            "task_name": task_name,
            "task_mode": task_mode,
            "label": response["label"],
            "response": answer,
            "entropy": uncertainty_results[i][entropy_key],
            "perplexity": uncertainty_results[i][perplexity_key],
            "correctness": correctness,
            "refusal": refusal,
        })

    # Build the frame in one go instead of growing it row by row.
    df_llama = pd.DataFrame(
        records,
        columns=["question", "model_name", "task_name", "task_mode", "label",
                 "response", "entropy", "perplexity", "correctness", "refusal"],
    )

    return df_llama


def get_and_combine_all_llama_df(llama_2_model_list=None, task_list=None, task_mode_list=None):
    """
    Build the result table for every (model, task, task mode) combination of
    the Llama-2 models and stack them into one DataFrame.

    Args:
        llama_2_model_list: models to include; defaults to the six Llama-2
            checkpoints (7b / 13b / 70b, each as chat and base variant).
        task_list: tasks to include; defaults to
            ``["FalseQA", "NEC", "RefuNQ"]``.
        task_mode_list: task modes to include; defaults to
            ``["answerable", "unanswerable"]``.

    Returns:
        The row-wise concatenation of the per-combination frames returned by
        ``get_file_and_build_df_llama``, with the columns ``question``,
        ``model_name``, ``task_name``, ``task_mode``, ``label``, ``response``,
        ``uncertainty``, ``correctness``, ``refusal``, ``entropy``,
        ``perplexity``. Row indices are kept as they are in each
        per-combination frame (no re-numbering).
    """
    if llama_2_model_list is None:
        llama_2_model_list = [
            "Llama-2-7b-chat-hf",
            "Llama-2-7b-hf",
            "Llama-2-13b-chat-hf",
            "Llama-2-13b-hf",
            "Llama-2-70b-chat-hf",
            "Llama-2-70b-hf"
        ]
    if task_list is None:
        task_list = ["FalseQA", "NEC", "RefuNQ"]
    if task_mode_list is None:
        task_mode_list = ["answerable", "unanswerable"]

    # `uncertainty` is not produced by get_file_and_build_df_llama, so it is
    # always empty here; it is kept so the combined table has the same
    # columns, in the same order, as before.
    combined_columns = ["question", "model_name", "task_name", "task_mode", "label",
                        "response", "uncertainty", "correctness", "refusal",
                        "entropy", "perplexity"]

    # Collect the frames and concatenate once: concatenating inside the loop
    # re-copies all previous rows on every iteration.
    frames = []
    for model_name in tqdm(llama_2_model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                frames.append(get_file_and_build_df_llama(model_name, task_name, task_mode))

    if not frames:
        # pd.concat rejects an empty list; return an empty table with the expected columns
        return pd.DataFrame(columns=combined_columns)

    df_llama_all = pd.concat(frames).reindex(columns=combined_columns)

    return df_llama_all

In [22]:
# takes about 2 mins to run
df_llama_all = get_and_combine_all_llama_df()
df_llama_all

100%|██████████| 6/6 [01:46<00:00, 17.67s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal,entropy,perplexity
0,List the reason why cats can catch mice?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Cats can catch mice due to their agility, spe...",,,False,0.093158,224.947464
1,How many legs does an idiot have?,Llama-2-7b-chat-hf,FalseQA,answerable,0,"I apologize, but I cannot provide an answer t...",,,True,1.188019,140.602753
2,What colour are the swallows's feathers?,Llama-2-7b-chat-hf,FalseQA,answerable,0,The swallows' feathers are blue-gray in color.,,,False,0.091228,160.510925
3,"When warm air is lighter than cold air, what w...",Llama-2-7b-chat-hf,FalseQA,answerable,0,"When warm air is lighter than cold air, it wi...",,,False,0.196143,149.398468
4,can we follow the schedule to make a plan,Llama-2-7b-chat-hf,FalseQA,answerable,0,"Yes, you can follow a schedule to make a plan...",,,False,0.227807,273.737396
...,...,...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,Llama-2-70b-hf,RefuNQ,unanswerable,NEC,The song is sung by the band Supertramp.\n,,,False,5.436124,28.175888
2169,who have the most Regalawsuisted d'or in the h...,Llama-2-70b-hf,RefuNQ,unanswerable,NEC,Cristiano Ronaldo\n,,,False,4.367566,32.408855
2170,who does the voice over in the Requirtion?,Llama-2-70b-hf,RefuNQ,unanswerable,NEC,The voice over is done by the director of the ...,,,False,5.821088,20.552803
2171,who is the only bowler who have taken hattrick...,Llama-2-70b-hf,RefuNQ,unanswerable,NEC,Wasim Akram\n,,,False,5.002635,16.606394


In [23]:
# save to csv
# Write both combined tables under ../outputs; index=False leaves the row index out of the files.
df_close_source_all.to_csv("../outputs/df_close_source_all.csv", index=False)
df_llama_all.to_csv("../outputs/df_llama_all.csv", index=False)