In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from tqdm import tqdm

In [2]:
# shared helper functions

def _get_refusal_for_one_response(response) -> bool:
    """
    evaluate whether ONE response is a refusal
    """
    res = False
    if response: 
        keywords = {"not", "no", "sorry", "don't", "doesn't", "didn't", "can't", "couldn't", "won't", "wouldn't", "isn't", "unknown", "unclear"}
        for target_word in keywords:
            negative_keyword_present = re.search(r'\b' + target_word + r'\b', response.lower())
            if negative_keyword_present:
                res = True       
                
    return res


def _get_correctness_for_one_response(response, label) -> bool:
    if response:
        # our key condition: if any of the labels are in the response, then it is correct
        if any([x in response for x in label]):
            return True
    return False

# get df from JSON - unified

In [11]:
def get_file_and_build_df_close_source(model_name, task_name, task_mode, model_type):
    """
    model_type: "open_source" or "proprietary"
    model_name: "chatgpt", "claude", "palm", "gpt-4-0613", etc
    task_name: "FalseQA", "NEC", "RefuNQ"
    task_mode: "answerable" or "unanswerable"
    """
    # get path
    
    path_response_results = f"../outputs/Open-source-LLMs_responses/{model_name}"
    path_uncertainty_results = f"../../experiment_outputs/UNIFIED_verbalize_confidence_no_ICL/{model_name}"
    
    # get response file
    with open(path_response_results + f"/{model_name}_{task_name}_{task_mode}.json", "r") as f_response_results:
        response_results = f_response_results.readlines()
        response_results = [json.loads(line) for line in response_results]
        
    # get uncertainty file
    with open(path_uncertainty_results + f"/{model_name}_{task_name}_{task_mode}.json", "r") as f_uncertainty_results:
        uncertainty_results = f_uncertainty_results.readlines()
        uncertainty_results = [json.loads(line) for line in uncertainty_results]

    # construct a dataframe with the following columns:
    df = pd.DataFrame(columns=["question", "model_name", "task_name", "task_mode", "label", "response", "uncertainty"])
    for i in range(len(response_results)):
        if model_type == "open_source":
            df.loc[i] = [response_results[i]["prompt"], 
                    model_name, 
                    task_name, 
                    task_mode, 
                    response_results[i]["label"], 
                    response_results[i][model_name+"_response"], 
                    uncertainty_results[i][model_name + "_response"]]
        # elif model_type == "proprietary":
        #     df.loc[i] = [response_results[i]["prompt"], 
        #             model_name, 
        #             task_name, 
        #             task_mode, 
        #             response_results[i]["label"], 
        #             response_results[i][model_name], 
        #             uncertainty_results[i][model_name + "_numerical"]]
        
    # get the refusal OR correctness for each response
    for response in response_results:
        correctness = "NA"
        refusal = "NA"
        
        if task_name == "RefuNQ" and task_mode == "answerable":
            if model_type == "open_source":
                correctness = _get_correctness_for_one_response(response[model_name+"_response"], response["label"])
            # elif model_type == "proprietary":
            #     correctness = _get_correctness_for_one_response(response[model_name], response["label"])
            response["correctness"] = correctness
        else: # for all other tasks
            if model_type == "open_source":
                refusal = _get_refusal_for_one_response(response[model_name+"_response"])
            elif model_type == "proprietary":
                refusal = _get_refusal_for_one_response(response[model_name])
            response["refusal"] = refusal
            
            
        df.loc[df["question"] == response["prompt"], "correctness"] = correctness
        df.loc[df["question"] == response["prompt"], "refusal"] = refusal
        
    
    return df


def get_and_combine_all_df(model_type):

    if model_type == "open_source":
        model_list = [
            "mistral",
            "mistral-base"
        ]
    # elif model_type == "proprietary":
    #     model_list = ["chatgpt", "claude", "palm","gpt-4-0613"]
    task_list = ["FalseQA", "NEC", "RefuNQ"]
    task_mode_list = ["answerable", "unanswerable"]

    df_close_source_all = pd.DataFrame(columns=["question", "model_name", "task_name", "task_mode", "label", "response", "uncertainty", "correctness", "refusal"])

    for model_name in tqdm(model_list):
        for task_name in task_list:
            for task_mode in task_mode_list:
                print(model_name, task_name, task_mode)
                df_close_source = get_file_and_build_df_close_source(model_name, task_name, task_mode, model_type)
                # get df and concat
                df_close_source_all = pd.concat([df_close_source_all, df_close_source])

    return df_close_source_all

In [12]:
# takes about 1 min to run
df_mistral = get_and_combine_all_df(model_type="open_source")
df_mistral

  0%|          | 0/2 [00:00<?, ?it/s]

mistral FalseQA answerable
mistral FalseQA unanswerable
mistral NEC answerable
mistral NEC unanswerable
mistral RefuNQ answerable
mistral RefuNQ unanswerable


 50%|█████     | 1/2 [00:16<00:16, 16.67s/it]

mistral-base FalseQA answerable
mistral-base FalseQA unanswerable
mistral-base NEC answerable
mistral-base NEC unanswerable
mistral-base RefuNQ answerable
mistral-base RefuNQ unanswerable


100%|██████████| 2/2 [00:32<00:00, 16.46s/it]


Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,mistral,FalseQA,answerable,0,Cats have excellent hunting skills with acute...,10,,False
1,How many legs does an idiot have?,mistral,FalseQA,answerable,0,An idiot is not a living being with legs; it'...,0. This question is not answerable as it cont...,,True
2,What colour are the swallows's feathers?,mistral,FalseQA,answerable,0,"Swallows have glossy, streamlined feathers th...","Swallows come in various species, each with d...",,False
3,"When warm air is lighter than cold air, what w...",mistral,FalseQA,answerable,0,Warm air is actually less dense than cold air...,10. This question is answerable and the answe...,,False
4,can we follow the schedule to make a plan,mistral,FalseQA,answerable,0,"Yes, following a schedule can help you make a...",8,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,mistral-base,RefuNQ,unanswerable,NEC,"1. The song ""It's Raining Again"" was written b...",8,,False
2169,who have the most Regalawsuisted d'or in the h...,mistral-base,RefuNQ,unanswerable,NEC,"1986-87, 1987-88, 1988-89, 1989-",8,,False
2170,who does the voice over in the Requirtion?,mistral-base,RefuNQ,unanswerable,NEC,The voice over is done by the character of th...,8,,False
2171,who is the only bowler who have taken hattrick...,mistral-base,RefuNQ,unanswerable,NEC,1. Kapil Dev (India),9,,False


In [16]:
# for the uncertainty, we will turn it into a numerical value
# if there's one number in the string, we will take that number
# if there are two numbers, we will take the first
# if there are no numbers, we will take 0
df_mistral["uncertainty"] = df_mistral["uncertainty"].apply(lambda x: re.findall(r'\d+', x))
df_mistral["uncertainty"] = df_mistral["uncertainty"].apply(lambda x: x[0] if x else 0)
df_mistral["uncertainty"] = df_mistral["uncertainty"].apply(lambda x: float(x))

TypeError: expected string or bytes-like object

In [17]:
df_mistral

Unnamed: 0,question,model_name,task_name,task_mode,label,response,uncertainty,correctness,refusal
0,List the reason why cats can catch mice?,mistral,FalseQA,answerable,0,Cats have excellent hunting skills with acute...,10.0,,False
1,How many legs does an idiot have?,mistral,FalseQA,answerable,0,An idiot is not a living being with legs; it'...,0.0,,True
2,What colour are the swallows's feathers?,mistral,FalseQA,answerable,0,"Swallows have glossy, streamlined feathers th...",5.0,,False
3,"When warm air is lighter than cold air, what w...",mistral,FalseQA,answerable,0,Warm air is actually less dense than cold air...,10.0,,False
4,can we follow the schedule to make a plan,mistral,FalseQA,answerable,0,"Yes, following a schedule can help you make a...",8.0,,False
...,...,...,...,...,...,...,...,...,...
2168,who sings the Alerrawia it's raining again?,mistral-base,RefuNQ,unanswerable,NEC,"1. The song ""It's Raining Again"" was written b...",8.0,,False
2169,who have the most Regalawsuisted d'or in the h...,mistral-base,RefuNQ,unanswerable,NEC,"1986-87, 1987-88, 1988-89, 1989-",8.0,,False
2170,who does the voice over in the Requirtion?,mistral-base,RefuNQ,unanswerable,NEC,The voice over is done by the character of th...,8.0,,False
2171,who is the only bowler who have taken hattrick...,mistral-base,RefuNQ,unanswerable,NEC,1. Kapil Dev (India),9.0,,False


In [18]:
# save the dataframe
df_mistral.to_csv("mistral.csv", index=False)