Notebook to evaluate the application scenarios with the FActScore prompt. We use GPT-4o-mini; optionally, the labels can be checked with BART-large-MNLI.

In [None]:
import os
from openai import OpenAI
# Read the OpenAI API key from a local file (one directory above the notebook)
# instead of embedding it in the notebook.
with open("../api.key", "r") as key_file:
    raw_key = key_file.read()

# Strip surrounding whitespace (e.g. a trailing newline) before exporting the key.
# call_llm constructs OpenAI() without an explicit key, so the client presumably
# picks it up from this environment variable.
api_key = raw_key.strip()
os.environ["OPENAI_API_KEY"] = api_key


In [2]:
def call_llm(messages, response_format):
    """Send a chat conversation to ``gpt-4o-mini`` and return the reply text.

    Args:
        messages: Chat messages, forwarded unchanged as the ``messages``
            argument of the chat-completions request.
        response_format: Value stored under the ``"type"`` key of the request's
            ``response_format`` (this notebook passes ``"text"`` and
            ``"json_object"``).

    Returns:
        The ``content`` of the first returned choice's message.
    """
    # Fixed decoding settings: temperature 0 and no penalties.
    # NOTE(review): max_tokens=256 caps the reply length; long answers may be
    # cut off before they are complete - confirm this limit is intended.
    request_options = dict(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.0,
        max_tokens=256,
        top_p=1.0,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": response_format},
    )
    completion = OpenAI().chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [3]:
# System prompt for the labeling step: the model must mark every bullet-point
# statement as True or False with respect to a given context.
# NOTE(review): this is a plain (non-f) string and is inserted below without
# further formatting, so the doubled braces are sent to the model literally;
# "seperated" is also a typo. Both are left untouched because the text is part
# of the evaluated prompt - confirm before changing.
labeling_instruction = """Please label each statement (the statements are seperated by a bullet point '- ') for truthfulness (True or False) based on the given context. 
Return the output in the JSON format where each statement is a key and the value is either True or False. The final format should be e.g.: {{'Mount Everest is a mountain': True, 'Mount Everest is the second highest mountain': False, 'Mount Everest is in the Himalaya': True}}"""


def get_prompt_labeling(mini_facts, ground_truth_source):
    """Build the two-message chat prompt for labeling statements against a context.

    Args:
        mini_facts: The statements to label; interpolated into the user message
            after ``Statements:``.
        ground_truth_source: The context text; interpolated into the user
            message after ``Context:``.

    Returns:
        A list ``[system_message, user_message]`` where each message is a dict
        with ``role`` and a single-element ``content`` list of type ``text``.
    """
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": labeling_instruction}],
    }
    user_text = f" Context: {ground_truth_source} \nStatements: {mini_facts}"
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_text}],
    }
    return [system_message, user_message]

In [4]:
# System prompt for the decomposition step: the model must split the input into
# independent statements, never correct or extend the input, and start every
# statement with the explicit topic entity.
# NOTE(review): the f-prefix has no effect here because the text contains no
# placeholders.
mini_facts_instruction = f"""Your task is to breakdown claims/sentences into independant statements. 
You must NEVER correct or comment the original claims/sentences even if something of the original claims/sentences is incorrect.
Do NEVER generate statements that are not in the original claims/sentences. Every statement must start with an entity that specifies the topic (e.g. **The Fox Broadcasting Company** and not **The company**)."""
        
# Few-shot inputs. get_prompt_mini_facts pairs entry i of this list with entry i
# of mini_facts_sample_outputs, so both lists must stay in the same order.
# NOTE(review): "TonÃ" looks like a mis-encoded "Toné" - confirm against the
# original data before changing, since these strings are sent to the model as is.
mini_facts_samples = ["The Hunger Games is a 2012 American science fiction film directed by John Peter and based on the novel of the same name by Suzanne Collins. Matt Lucena is an American former professional tennis player.",
"""Owen Wilson starred in the film "The Karate Kid" (2010) alongside martial arts expert Tom Wu. Owen Wilson voiced Lightning McQueen in the "Cars" franchise, not "The Royal Tenenbaums" franchise.""",
"Feels So Good is a song by the American R&B group Tony! Toni! TonÃ. The song was written by the group's lead vocalist Raphael Saadiq and producer Tony! Toni! TonÃ's lead vocalist Dwayne Wimberly."]
        
        
# Expected decompositions for the few-shot inputs above: one "- " bullet per
# statement, with the topic entity wrapped in ** markers.
mini_facts_sample_outputs = ["""- **The Hunger Games** is a 2012 American science fiction film.
- **The Hunger Games** was directed by John Peter.
- **The Hunger Games** is based on a novel of the same name by Suzanne Collins.
- **Matt Lucena** is an American former professional tennis player.""",
"""- **Owen Wilson** starred in the film The Karate Kid (2010) alongside martial arts expert Tom Wu.
- **Owen Wilson** voiced Lightning McQueen in the Cars franchise.
- **Owen Wilson** did not voice Lightning McQueen in the The Royal Tenenbaums franchise.""",
"""- **Feels So Good** is a song by the American R&B group Tony! Toni! TonÃ.
- **Feels So Good** was written by the group's lead vocalist Raphael Saadiq and producer Tony! Toni! TonÃ's lead vocalist Dwayne Wimberly."""]

In [5]:
def get_prompt_mini_facts(gen_evidence):
    """Build the few-shot chat prompt that asks for a breakdown into statements.

    The prompt consists of the system instruction, three user/assistant example
    pairs taken from ``mini_facts_samples`` and ``mini_facts_sample_outputs``,
    and a final user message carrying ``gen_evidence``.

    Args:
        gen_evidence: The text to decompose; rendered with ``str`` formatting
            into the last user message.

    Returns:
        A list of eight message dicts, each with ``role`` and a single-element
        ``content`` list of type ``text``.
    """
    def _text_message(role, text):
        # Every message in this prompt has the same single-text-part shape.
        return {"role": role, "content": [{"type": "text", "text": f"{text}"}]}

    messages = [_text_message("system", mini_facts_instruction)]
    for shot in range(3):
        messages.append(_text_message("user", mini_facts_samples[shot]))
        messages.append(_text_message("assistant", mini_facts_sample_outputs[shot]))
    messages.append(_text_message("user", gen_evidence))
    return messages

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


# Local snapshot directory of facebook/bart-large-mnli.
# Written as a raw string: in a normal string literal "\h", "\m", "\s" and "\d"
# are invalid escape sequences, which Python reports with a warning and plans to
# reject. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path - adjust it (or make
# it configurable) before running the notebook elsewhere.
bart_model_path = r"D:\huggingface\huggingface\hub\models--facebook--bart-large-mnli\snapshots\d7645e127eaf1aefc7862fd59a17a5aa8558b8ce"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading BART model...")
# local_files_only=True: load strictly from bart_model_path without contacting the hub.
bart_model = AutoModelForSequenceClassification.from_pretrained(bart_model_path, local_files_only=True)
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_path, local_files_only=True)
bart_model.to(device)

def split_source_to_fit_with_hypothesis(source, hypothesis, bart_tokenizer, max_length=1024):
    """Split ``source`` into text chunks that fit next to ``hypothesis`` in one input.

    The token budget for each chunk is ``max_length`` minus the number of
    hypothesis tokens minus the special tokens the tokenizer adds for a pair.

    Args:
        source: Text to split.
        hypothesis: Text that will be paired with every chunk.
        bart_tokenizer: Tokenizer providing ``encode``, ``decode``,
            ``num_special_tokens_to_add`` and a writable ``model_max_length``.
        max_length: Total token budget for chunk + hypothesis + special tokens.

    Returns:
        A list of decoded text chunks in source order; empty when ``source``
        yields no tokens.

    Raises:
        ValueError: If the hypothesis alone leaves no room for any source token.
    """
    original_max_length = bart_tokenizer.model_max_length
    # Temporarily lift the tokenizer's length limit while encoding the full
    # texts (presumably to avoid its "sequence too long" warning).
    bart_tokenizer.model_max_length = int(1e12)
    try:
        hypothesis_length = len(bart_tokenizer.encode(hypothesis, add_special_tokens=False))
        num_special_tokens = bart_tokenizer.num_special_tokens_to_add(pair=True)
        max_source_length = max_length - hypothesis_length - num_special_tokens
        if max_source_length <= 0:
            raise ValueError("The hypothesis is too long to fit within the max_length limit.")
        source_tokens = bart_tokenizer.encode(source, add_special_tokens=False)
    finally:
        # Always restore the shared tokenizer's limit, even if encoding fails.
        bart_tokenizer.model_max_length = original_max_length

    token_chunks = [
        source_tokens[start:start + max_source_length]
        for start in range(0, len(source_tokens), max_source_length)
    ]
    return [bart_tokenizer.decode(chunk, skip_special_tokens=True) for chunk in token_chunks]
    
def call_bart_model(source, statement):
    """Classify ``statement`` against ``source`` with the loaded BART NLI model.

    The source is split into chunks that fit into one model input together with
    the statement. Every chunk is classified separately and the chunk labels are
    combined by majority vote, ignoring "Neutral" votes whenever at least one
    chunk is decisive.

    Args:
        source: Premise text (may be longer than one model input).
        statement: Hypothesis text to check against the source.

    Returns:
        One of "Contradiction", "Neutral" or "Entailment". Ties go to the label
        that occurs first in chunk order. "Neutral" is returned when the source
        produces no chunks at all.
    """
    # Maps logit index to label name.
    # NOTE(review): assumes the checkpoint's label order is
    # contradiction/neutral/entailment - confirm against the model config.
    class_names = ["Contradiction", "Neutral", "Entailment"]

    source_chunks = split_source_to_fit_with_hypothesis(source, statement, bart_tokenizer, max_length=1024)
    pred_labels = []
    for chunk in source_chunks:
        inputs = bart_tokenizer(
            chunk,
            statement,
            return_tensors='pt',
            # Decoded chunks may tokenize differently than the original
            # token slices, so truncation stays on as a safety net.
            truncation=True,
            max_length=1024,
            add_special_tokens=True
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = bart_model(**inputs).logits
            probs = torch.softmax(logits, dim=1)
            dominating_class = probs.argmax(dim=1).item()
        pred_labels.append(class_names[dominating_class])

    if not pred_labels:
        # Nothing to compare against: no evidence for or against the statement.
        return "Neutral"

    decisive_labels = [label for label in pred_labels if label != "Neutral"]
    candidates = decisive_labels or pred_labels
    # dict.fromkeys keeps first-seen order and max() returns the first maximal
    # item, so ties are broken deterministically (a set would depend on hashing).
    return max(dict.fromkeys(candidates), key=candidates.count)

In [None]:
import pandas as pd
import json 

# Load the application scenario
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
df_evaluate = pd.read_pickle('application/whole_evidence.pkl')

# Clean generation artefacts. Both replacements are literal substring
# replacements: Series.replace (without .str) only matches whole cell values,
# and regex=False keeps "." from being treated as a wildcard on older pandas.
df_evaluate['gen_evidence'] = (
    df_evaluate['gen_evidence']
    .str.replace(" .###", ".", regex=False)
    .str.replace(". .", ".", regex=False)
)
# Drop rows whose generated evidence is only punctuation.
df_evaluate = df_evaluate[df_evaluate['gen_evidence'] != ' . .']
df_evaluate = df_evaluate[df_evaluate['gen_evidence'] != ' .']

# BART verdicts that translate into a binary label; "Neutral" has no entry and
# is recorded as missing instead of reusing a value from a previous statement.
bart_verdict_to_label = {"Entailment": 1, "Contradiction": 0}
result_columns = ['gen_evidence', 'docs', 'Source', 'Mini Fact', 'Label', 'Bart Label']
label_records = []

for index, row in df_evaluate.iterrows():
    print(index)
    gen_evidence = row['gen_evidence']
    docs = row['docs']
    ground_truth_source = row['Source']

    # Step 1: break the generated evidence into bullet-point statements.
    messages = get_prompt_mini_facts(gen_evidence)
    mini_facts_text = call_llm(messages, response_format="text")
    mini_facts_text = mini_facts_text.replace("**", "")

    # Step 2: let the LLM label every statement against the ground-truth source.
    messages = get_prompt_labeling(mini_facts_text, ground_truth_source)
    response = call_llm(messages, "json_object")
    try:
        response_json = json.loads(response)
        response_data = [(key.strip(), int(value)) for key, value in response_json.items()]
    except Exception as e:
        # Best effort: a malformed or truncated reply skips this row only.
        print(f"Error parsing JSON at index {index}: {e}. The response is skipped.")
        continue

    # this is optional
    for mini_fact, label in response_data:
        # Remove only a leading bullet marker; "- " inside the statement is kept.
        if mini_fact.startswith("- "):
            mini_fact = mini_fact[2:]
        pred_label = call_bart_model(ground_truth_source, mini_fact)
        bart_label = bart_verdict_to_label.get(pred_label)
        label_records.append({
            'gen_evidence': gen_evidence,
            'docs': docs,
            'Source': ground_truth_source,
            'Mini Fact': mini_fact,
            'Label': label,
            'Bart Label': bart_label,
        })

# Build the result frame once instead of concatenating inside the loop; fixed
# columns keep the frame usable even when no statement was collected.
df_true_bart_labels = pd.DataFrame(label_records, columns=result_columns)
# Nullable integer dtype keeps 0/1 readable while allowing missing BART verdicts.
df_true_bart_labels['Bart Label'] = df_true_bart_labels['Bart Label'].astype('Int64')

# calculate accuracy: share of mini facts that the LLM labelled True
# (NaN instead of an error when there are no rows)
df_true_bart_labels['Label'].mean()