In [23]:
import re
from typing import Literal, List

import dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Judge prompt used by the Google FACTS paper.
# It carries two str.format placeholders: {context} (the evidence) and
# {response} (the text being checked). This is a regular, non-raw string, so
# the "\n" escapes below become real newlines in the prompt.
facts_judge_prompt  = """
Your task is to check if the Response is accurate to the Evidence.
Generate 'Accurate' if the Response is accurate when verified according to the Evidence,
or 'Inaccurate' if the Response is inaccurate (contradicts the evidence) or cannot be
verified.
**Evidence**\n\n{context}\n\n**End of Evidence**\n
**Response**:\n\n{response}\n\n**End of Response**\n
Let's think step-by-step.
"""

In [3]:
# Load the rows to be judged; later cells read the 'source' and 'summary'
# columns of this frame.
r1_df = pd.read_csv('r1.csv')

In [4]:
# Load variables from a .env file, then build the module-level client that
# judge_by_openai uses.
# NOTE(review): presumably the client picks up its API key from the
# environment populated here -- confirm.
dotenv.load_dotenv()
openai_client = OpenAI()

In [18]:
def _parse_verdict(text: str) -> int:
    """Map a judge's free-text answer to 1 (accurate), 0 (inaccurate) or -1 (no verdict)."""
    # Match whole words only, so that "inaccurate" can never be half-seen as
    # "accurate" (which is what slicing fixed-size windows off the end did).
    # The prompt asks for step-by-step reasoning, so the LAST occurrence is
    # taken as the final verdict.
    last_match = None
    for last_match in re.finditer(r"(?<![a-z])(in)?accurate(?![a-z])", text.lower()):
        pass
    if last_match is None:
        return -1
    return 0 if last_match.group(1) else 1


def judge_by_openai(context:str, response:str, model: str) -> Literal[-1, 0, 1]:
    """Ask an OpenAI chat model whether `response` is accurate w.r.t. `context`.

    The FACTS judge prompt is filled with the evidence and the response, sent
    through the module-level `openai_client`, and the model's reply is parsed
    for its final verdict word.

    Args:
        context: The evidence text the response is checked against.
        response: The text whose accuracy is being judged.
        model: Name of the chat model to use as the judge.

    Returns:
        1 if the reply's last verdict word is "accurate", 0 if it is
        "inaccurate", and -1 if the reply contains neither word (or is empty).
    """
    prompt = facts_judge_prompt.format(context=context, response=response)
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # `content` may be None; treat that the same as "no verdict" rather than
    # crashing mid-run.
    verdict_text = completion.choices[0].message.content or ""
    return _parse_verdict(verdict_text)

# Smoke test: the response says "Berline", which contradicts the evidence, so
# the expected verdict is 0 (inaccurate).
judge_by_openai("The capital of France is Paris", "The capital of France is Berline", "gpt-4o-mini") # test

1

In [21]:
# Iterate over the rows of the dataframe and judge each response.
# Write the dataframe, including the new "<model>_judgement" column, to CSV after every judged row.

def loop_over_df(df, model: str, output_path=None):
    """Judge every not-yet-judged row of `df` with `model`, checkpointing to CSV.

    The verdicts are written in place into the column ``f'{model}_judgement'``
    of `df`. Rows whose value in that column is already different from -1 are
    skipped, so the function can be re-run to resume an interrupted pass or to
    retry rows that previously produced no verdict (-1). Drop the column
    first to force a full re-judgement.

    Args:
        df: DataFrame with 'source' (evidence) and 'summary' (response) columns.
            It is modified in place.
        model: Judge model name; also used to name the verdict column.
        output_path: CSV file to write after each judged row. Defaults to
            ``f'r1_{model}_judgement.csv'``.
    """
    judgement_column = f'{model}_judgement'
    if output_path is None:
        output_path = f'r1_{model}_judgement.csv'

    # Initialise only when the column is missing: resetting it on every call
    # would wipe earlier verdicts and make the skip below unreachable.
    if judgement_column not in df.columns:
        df[judgement_column] = -1

    for i, row in tqdm(df.iterrows(), total=len(df)):
        if row[judgement_column] != -1: # already judged
            continue
        df.at[i, judgement_column] = judge_by_openai(row['source'], row['summary'], model)

        # Save after every judged row so a crash loses at most one verdict.
        df.to_csv(output_path, index=False)

# Judge every row of r1_df with gpt-4o-mini. The recorded run below took about
# 1h37m for 1006 rows.
loop_over_df(r1_df, "gpt-4o-mini") # test

100%|██████████| 1006/1006 [1:36:47<00:00,  5.77s/it]


In [31]:
# Show the judged frame; the notebook's rich display elides the middle rows.
r1_df

Unnamed: 0.1,Unnamed: 0,source,summary,dataset,HHEM prediction,HHEM score,openai_judgement,gpt-4o-mini_judgement
0,0,Paul Merson has restarted his row with Andros ...,Paul Merson reignited his public feud with And...,summeval_valid,Not Hallucinated,0.591848,1,1
1,1,Chelsea have made an offer for FC Tokyo's 22-y...,Chelsea have submitted a formal offer to sign ...,summeval_valid,Not Hallucinated,0.970130,1,1
2,2,Babies given antibiotics in the first six mont...,A large-scale Finnish study published in *Pedi...,summeval_valid,Not Hallucinated,0.946560,-1,1
3,3,This is the embarrassing moment a Buckingham P...,A Buckingham Palace guard slipped and fell on ...,summeval_valid,Not Hallucinated,0.943379,-1,1
4,4,(CNN)One of the biggest TV events of all time ...,"The 1977 groundbreaking miniseries *Roots*, wh...",summeval_valid,Not Hallucinated,0.845592,-1,1
...,...,...,...,...,...,...,...,...
1001,1001,Headquartered in the Qatar Airways Tower in Do...,"Qatar Airways, headquartered in Doha's Qatar A...",vitc_dev,Not Hallucinated,0.942210,-1,1
1002,1002,Maxthon is reported to be the second most popu...,Maxthon is the second most popular browser in ...,vitc_dev,Not Hallucinated,0.781710,-1,1
1003,1003,Maxthon is reported to be the second most popu...,Maxthon is the second most popular browser in ...,vitc_dev,Not Hallucinated,0.880635,-1,1
1004,1004,more than have been reported worldwide ; more ...,"The passage reports over 16,700 deaths and mor...",vitc_dev,Not Hallucinated,0.806597,-1,1


In [69]:
def compute_hallu_rate(deepseek_version: str, models: List[str]) -> float:
    """Compute the hallucination rate of one model version by judge majority vote.

    Reads ``f'{deepseek_version}.csv'`` and, for each judge in `models`, the
    file ``f'{deepseek_version}_{model}_judgement.csv'``, copying its
    ``f'{model}_judgement'`` column onto the base frame (aligned by row index).

    Rows where any of the requested judges has no verdict (-1) are discarded.
    A remaining row counts as accurate only when strictly more than half of
    the judges returned 1; a tie therefore counts as a hallucination.

    Args:
        deepseek_version: File-name prefix of the CSVs to load (e.g. "r1").
        models: Judge model names whose verdict columns take part in the vote.

    Returns:
        1 minus the fraction of kept rows voted accurate (NaN if no row is kept).
    """
    judged_df = pd.read_csv(f'{deepseek_version}.csv')
    judgement_columns = [f'{model}_judgement' for model in models]

    # Attach each judge's verdict column to the local frame.
    for model, judgement_column in zip(models, judgement_columns):
        csv_path = f'{deepseek_version}_{model}_judgement.csv'
        print (f"loading {csv_path}")
        judged_df[judgement_column] = pd.read_csv(csv_path)[judgement_column]

    # Drop rows lacking a verdict from any requested judge. Only the judgement
    # columns are inspected: a -1 in an unrelated column (another judge's
    # leftovers, a score, ...) must not discard the row.
    print ("before removing rows", judged_df.shape)
    has_all_verdicts = (judged_df[judgement_columns] != -1).all(axis=1)
    judged_df = judged_df[has_all_verdicts].copy()  # copy: we add a column below
    print ("after removing rows", judged_df.shape)

    # Strict majority: mean verdict must exceed 0.5 to count as accurate.
    judged_df['vote'] = (judged_df[judgement_columns].mean(axis=1) > 0.5).astype(int)

    return float(1 - judged_df['vote'].mean())

# Judges taking part in the vote; the commented-out entries are excluded.
# With two judges a split decision has mean 0.5, which compute_hallu_rate does
# not count as accurate.
judging_models = [
    "gpt-4o-mini",
    # "gpt-4o", 
    # "claude-3-5-sonnet-latest", 
    "gemini-1.5-pro"]

# Hallucination rate for the "r1" files.
print (compute_hallu_rate("r1", judging_models)) 
       

# Hallucination rate for the "v3" files.
print (compute_hallu_rate("v3", judging_models)) 


loading r1_gpt-4o-mini_judgement.csv
loading r1_gemini-1.5-pro_judgement.csv
before removing rows (1006, 8)
after removing rows (1003, 8)
0.02193419740777669
loading v3_gpt-4o-mini_judgement.csv
loading v3_gemini-1.5-pro_judgement.csv
before removing rows (1006, 6)
after removing rows (9, 6)
0.0


In [64]:
# Single-judge check: with one model the vote is simply that judge's verdict.
compute_hallu_rate("v3", ["gpt-4o"]) # test


loading v3_gpt-4o_judgement.csv
before removing rows (1006, 5)
after removing rows (1005, 5)


0.012935323383084563