## **Installing Dependencies**

In [None]:
!pip install selfcheckgpt bitsandbytes torch transformers accelerate datasets

Collecting selfcheckgpt
  Downloading selfcheckgpt-0.1.7.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bert_score (from selfcheckgpt)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-ma

In [None]:
# Interactive login to the Hugging Face Hub: the token is typed at the prompt, never stored in the notebook.
# NOTE(review): presumably required because the Llama checkpoint loaded further down is access-gated — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
# Mount Google Drive at /content/drive; the metrics CSV is written under this mount point
# at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
# Silence every Python warning for the rest of the session to keep the cell outputs readable.
# All categories are hidden, deprecation warnings included, so disable this line when debugging.
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import numpy as np
import torch
import ast
import spacy
import random
from collections import Counter
import re
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, Union

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt, SelfCheckMQAG, SelfCheckBERTScore, SelfCheckNgram, SelfCheckNLI

In [None]:
# Seed PyTorch's global random number generator (this call does not seed Python's `random` or NumPy).
torch.manual_seed(28)

<torch._C.Generator at 0x78df49cf2710>

In [None]:
# Load spaCy's small English pipeline; it is used later to split each response into sentences
# (via `nlp(...).sents` in get_selfcheckgpt_scores).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## **Load the data**

In [None]:
# Download the FactAlign dataset from the Hugging Face Hub; its 'train' and 'test' splits are used below.
# NOTE: trust_remote_code=True allows code shipped with the dataset repository to be executed locally.
hallu_factalign = load_dataset('chaoweihuang/factalign-gemma2-f1_0.75', trust_remote_code=True)

## **Data Preparation**

In [None]:
# Extract the prompt and the response from the JSON

def get_question_answer(example):
    """
    Pull the plain-text question and answer out of a chat-formatted record.

    :param example: record whose 'prompt' and 'completion' fields are sequences of messages,
                    each message exposing a 'content' entry
    :return: dict holding the 'content' of the first message of 'prompt' and of 'completion'
    """
    first_prompt_message = example["prompt"][0]
    first_completion_message = example["completion"][0]
    return dict(
        prompt=first_prompt_message["content"],
        completion=first_completion_message["content"],
    )

# Replace the chat-formatted 'prompt' / 'completion' fields with their plain-text content.
# NOTE(review): re-running this cell on the already-mapped dataset would presumably fail, because the
# fields no longer hold message lists — confirm before executing it twice.
hallu_factalign = hallu_factalign.map(get_question_answer)

In [None]:
# From here on each split is handled as its own pandas DataFrame
hallu_factalign_train = hallu_factalign['train'].to_pandas()
hallu_factalign_test = hallu_factalign['test'].to_pandas()

In [None]:
# Remove the instances without response.
# Each split is filtered with its OWN mask: a mask built from the train frame is indexed by train rows,
# so applying it to the test frame drops the wrong rows (or fails to align).
# reset_index() without drop=True keeps the previous row labels in an 'index' column.
hallu_factalign_train = hallu_factalign_train[hallu_factalign_train['completion'].notna()].reset_index()
hallu_factalign_test = hallu_factalign_test[hallu_factalign_test['completion'].notna()].reset_index()

In [None]:
# Remove the suffix "Provide as many specific details and examples as possible (such as names of people, numbers, events, locations, dates, times, etc.)"
def extract_question(prompt):
    """
    Keep only the text up to and including the first question mark.

    :param prompt: full prompt text
    :return: the prompt cut right after its first '?'; prompts without a '?' are returned unchanged
    """
    question, mark, _suffix = prompt.partition("?")
    return question + mark

# Overwrite the 'prompt' column of both frames with the trimmed question text
hallu_factalign_train['prompt'] = hallu_factalign_train['prompt'].apply(extract_question)
hallu_factalign_test['prompt'] = hallu_factalign_test['prompt'].apply(extract_question)

In [None]:
# Function that filters the questions based on if the prompt ends with a question mark
def is_question(prompt):
    """Return True when the prompt, ignoring surrounding whitespace, ends with a question mark."""
    return prompt.strip().endswith("?")

# Add a boolean 'is_question' column to both frames (True when the prompt ends with '?')
hallu_factalign_train['is_question'] = hallu_factalign_train['prompt'].apply(is_question)
hallu_factalign_test['is_question'] = hallu_factalign_test['prompt'].apply(is_question)

In [None]:
# Keep only the rows flagged as questions and renumber them from 0
train_question_mask = hallu_factalign_train['is_question'].eq(True)
hallu_factalign_train = hallu_factalign_train.loc[train_question_mask].reset_index(drop=True)

test_question_mask = hallu_factalign_test['is_question'].eq(True)
hallu_factalign_test = hallu_factalign_test.loc[test_question_mask].reset_index(drop=True)

In [None]:
# Randomly sample 100 test examples to be classified.
# NOTE: the reproducibility of the sample comes from random_state=42 passed to DataFrame.sample();
# random.seed(777) seeds Python's `random` module, which the sampling call below does not take as input.
random.seed(777)

hallu_factalign_sample = hallu_factalign_test.sample(n=100, random_state=42).reset_index(drop=True)

# Set the dataset for the examples to be shown to the LLM
# (this binds a second name to the train frame; no copy is made)
hallu_factalign_examples = hallu_factalign_train

In [None]:
hallu_factalign_sample

Unnamed: 0,index,prompt,completion,completion_sentences,label,sentence_label,is_question
0,268,What is the Kyoto School?,"The Kyoto School, also known as the Kyoto-Gaku...","[The Kyoto School, also known as the Kyoto-Gak...",False,"[False, True, False, False, False, False, Fals...",True
1,250,What is the Quadrilateral Security Dialogue?,The Quadrilateral Security Dialogue (QSD) is a...,[The Quadrilateral Security Dialogue (QSD) is ...,False,"[True, True, False, True, False, False, False,...",True
2,355,Who is Simone de Beauvoir?,"Simone de Beauvoir was a French philosopher, w...","[Simone de Beauvoir was a French philosopher, ...",True,"[True, False, False, True, False, False, False...",True
3,332,What is Fermat's Last Theorem?,Fermat's Last Theorem is a famous mathematical...,[Fermat's Last Theorem is a famous mathematica...,False,"[True, True, True, True, True, True, False, Fa...",True
4,56,What is the moral dispute over the use of soli...,The use of solitary confinement in prisons has...,[The use of solitary confinement in prisons ha...,True,"[True, False, False, False, False, True, True,...",True
...,...,...,...,...,...,...,...
95,19,What is the Dengue virus?,The dengue virus is a mosquito-borne viral dis...,[The dengue virus is a mosquito-borne viral di...,True,"[True, True, False, True, False, True, False, ...",True
96,131,Who is Harry Partch?,Harry Partch (1901-1974) was an American compo...,[Harry Partch (1901-1974) was an American comp...,False,"[True, True, True, False, False, False, False,...",True
97,340,What is the TensorFlow library?,TensorFlow is an open-source software library ...,[TensorFlow is an open-source software library...,True,"[True, True, True, True, True, True, True, Fal...",True
98,334,What is the Atacama Desert?,"The Atacama Desert is a vast, high-altitude de...","[The Atacama Desert is a vast, high-altitude d...",True,"[True, False, False, True, False, True, False,...",True


In [None]:
# Restrict both frames to the question, the response and the factuality label
selected_columns = ['prompt', 'completion', 'label']
hallu_factalign_sample = hallu_factalign_sample[selected_columns]
hallu_factalign_examples = hallu_factalign_examples[selected_columns]

In [None]:
# Check the distribution of the classes
Counter(hallu_factalign_sample['label'])

Counter({False: 47, True: 53})

In [None]:
hallu_factalign_sample = Dataset.from_pandas(hallu_factalign_sample) # Convert to Dataset format
hallu_factalign_sample[0]

{'prompt': 'What is the Kyoto School?',
 'completion': "The Kyoto School, also known as the Kyoto-Gakuen School or the Kyoto School of Economics and Management, was a group of Japanese economists who emerged in the 1960s and 1970s. The school was named after the city of Kyoto, where many of its members studied and taught. The Kyoto School was characterized by its emphasis on microeconomic theory and its rejection of Keynesian economics and other mainstream macroeconomic theories.\n\nThe members of the Kyoto School included a number of prominent Japanese economists, such as:\n\n* Haruhiko Kuroda (1924-2016)\n* Masaru Imai (1932-2010)\n* Akira Kubo (1925-2014)\n* Hirofumi Koyama (1926-2011)\n* Kenzo Nakamura (1925-2019)\n* Nobuo Okishio (1924-2011)\n* Yoshio Ohtsuka (1925-2012)\n* Shigeru Ono (1924-2015)\n* Yoshiki Toda (1939-2015)\n* Akira Watanabe (1925-2011)\n\nThe Kyoto School was notable for its emphasis on empirical research and its rejection of mainstream macroeconomic theories, s

In [None]:
hallu_factalign_examples = Dataset.from_pandas(hallu_factalign_examples) # Convert to Dataset format
hallu_factalign_examples[0]

{'prompt': 'What is the Phaedrus Dialogue?',
 'completion': 'The Phaedrus Dialogue is a philosophical text written by the ancient Greek philosopher Phaedrus. It is a dialogue between Phaedrus and a companion who is called "Socrates." The dialogue is considered one of the most important works in the history of philosophy, and it is known for its exploration of the nature of knowledge, the role of the individual in society, and the relationship between philosophy and religion.\n\nThe Phaedrus Dialogue is set in a garden, and Phaedrus and Socrates are joined by a third person, who is called "Hermotimus." The dialogue is divided into two parts, and each part is divided into two books. The first part of the dialogue is called "The Book of the Gods," and it deals with the nature of the gods and the role of religion in human life. The second part of the dialogue is called "The Book of the Men," and it deals with the nature of human beings and their relationship to the world around them.\n\nIn

## **Load the model**

In [None]:
# bitsandbytes configuration: load the LLM weights in 4-bit precision to reduce memory usage

# load_in_4bit: enables 4-bit quantization of the model weights
# bnb_4bit_use_double_quant: enables double (nested) quantization, i.e. the quantization constants are quantized as well, which saves additional memory
# bnb_4bit_quant_type: 4-bit data type used for the quantized weights ("nf4" = 4-bit NormalFloat)
# bnb_4bit_compute_dtype: data type used for the computations during inference


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the causal-LM (generative) checkpoint and its tokenizer from the Hugging Face Hub.
# The weights are quantized according to bnb_config; device_map='auto' leaves the placement of the model to the library.

model_id = "meta-llama/Llama-3.2-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

## **SelfCheckGPT Definition**

In [None]:
class SelfCheckLLMPrompt_Quantized:
    """
    SelfCheckGPT (LLM Prompt): Checking LLM's text against its own sampled texts via open-source LLM prompting.

    Same prompting scheme as selfcheckgpt's SelfCheckLLMPrompt, with the option of loading the
    checking model in 4-bit precision through bitsandbytes.
    """
    def __init__(
        self,
        model: str = None,
        device = None,
        quantization = True
    ):
        """
        :param model: Hugging Face model id; defaults to "meta-llama/Llama-3.2-3B-Instruct"
        :param device: torch device the model and the tokenized prompts are moved to; defaults to CPU
        :param quantization: if True (default) the model is loaded with a 4-bit NF4 bitsandbytes
                             configuration, otherwise it is loaded without quantization
        """
        model = model if model is not None else "meta-llama/Llama-3.2-3B-Instruct"

        # Gives the chance to load a quantized model from Hugging Face.
        # (Kept as a comment: a column-0 string literal here would terminate the method body.)
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        if quantization is not True:
            self.model = AutoModelForCausalLM.from_pretrained(model, torch_dtype="auto")
        else:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
            self.model = AutoModelForCausalLM.from_pretrained(model, quantization_config=bnb_config, torch_dtype="auto")
        self.model.eval()
        if device is None:
            device = torch.device("cpu")
        self.model.to(device)
        self.device = device
        self.prompt_template = "Context: {context}\n\nSentence: {sentence}\n\nIs the sentence supported by the context above? Answer Yes or No.\n\nAnswer: "
        self.text_mapping = {'yes': 0.0, 'no': 1.0, 'n/a': 0.5}
        # Unrecognized answers already reported, so each one is warned about only once
        self.not_defined_text = set()
        print(f"SelfCheck-LLMPrompt ({model}) initialized to device {device}")

    def set_prompt_template(self, prompt_template: str):
        """
        Replace the prompt used to query the model.

        :param prompt_template: format string containing the `{context}` and `{sentence}` placeholders
        """
        self.prompt_template = prompt_template

    @torch.no_grad()
    def predict(
        self,
        sentences: List[str],
        sampled_passages: List[str],
        verbose: bool = False,
    ):
        """
        This function takes sentences (to be evaluated) with sampled passages (evidence), and return sent-level scores
        :param sentences: list[str] -- sentences to be evaluated, e.g. GPT text response split by spacy
        :param sampled_passages: list[str] -- stochastically generated responses (without sentence splitting)
        :param verbose: bool -- if True tqdm progress bar will be shown
        :return sent_scores: sentence-level scores (mean over the sampled passages; 0.0 = supported, 1.0 = not supported)
        """
        num_sentences = len(sentences)
        num_samples = len(sampled_passages)
        scores = np.zeros((num_sentences, num_samples))

        # this seems to improve performance when using the simple prompt template;
        # the cleanup does not depend on the sentence, so it is done once up front
        contexts = [sample.replace("\n", " ") for sample in sampled_passages]

        for sent_i in tqdm(range(num_sentences), disable=not verbose):
            sentence = sentences[sent_i]
            for sample_i, context in enumerate(contexts):
                prompt = self.prompt_template.format(context=context, sentence=sentence)
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                # Pass the whole encoding so the attention mask reaches generate() as well
                generate_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False, # greedy decoding keeps the Yes/No answer deterministic
                )
                # Decode only the newly generated tokens: stripping the prompt with str.replace()
                # silently fails whenever decoding does not reproduce the prompt character for character
                new_token_ids = generate_ids[:, inputs.input_ids.shape[1]:]
                generate_text = self.tokenizer.batch_decode(
                    new_token_ids, skip_special_tokens=True,
                    clean_up_tokenization_spaces=False
                )[0]
                scores[sent_i, sample_i] = self.text_postprocessing(generate_text)
        scores_per_sentence = scores.mean(axis=-1)
        return scores_per_sentence

    def text_postprocessing(
        self,
        text,
    ):
        """
        To map from generated text to score
        Yes -> 0.0
        No  -> 1.0
        everything else -> 0.5

        Matching is case-insensitive and looks only at the start of the stripped text.
        """
        # tested on Llama-2-chat (7B, 13B) --- this code has 100% coverage on wikibio gpt3 generated data
        # however it may not work with other datasets, or LLMs
        text = text.lower().strip()
        if text.startswith('yes'):
            text = 'yes'
        elif text.startswith('no'):
            text = 'no'
        else:
            if text not in self.not_defined_text:
                print(f"warning: {text} not defined")
                self.not_defined_text.add(text)
            text = 'n/a'
        return self.text_mapping[text]

In [None]:
# Text generation pipeline built on the model and tokenizer loaded above

# max_new_tokens: Maximum number of new tokens generated
# temperature: Controls the randomness of the new tokens (only relevant when sampling is enabled)
# do_sample: Enables stochastic sampling instead of greedy decoding
# return_full_text: False returns only the generated continuation, without the prompt
# These values are defaults: keyword arguments passed when calling `pipe(...)` take precedence.

pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                device_map="auto",
                temperature=1.0,
                do_sample=True,
                max_new_tokens=128,
                return_full_text=False,
)

In [None]:
# Initializes the variants of SelfCheckGPT to be used
# NOTE: SelfCheckLLMPrompt_Quantized loads its own copy of `model_id` with from_pretrained,
# separate from the `model` object wrapped by `pipe`.

selfcheck_llm = SelfCheckLLMPrompt_Quantized(device=device, model = model_id)
selfcheck_nli = SelfCheckNLI(device=device)
selfcheck_bertscore = SelfCheckBERTScore()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

SelfCheck-LLMPrompt (meta-llama/Llama-3.2-3B-Instruct) initialized to device cuda
SelfCheck-NLI initialized to device cuda
SelfCheck-BERTScore initialized


## **Use of SelfCheckGPT**

In [None]:
# Applies SelfCheckGPT to the instances

def get_selfcheckgpt_scores(example):
  """
  Score one example with three SelfCheckGPT variants (LLM prompt, NLI, BERTScore).

  :param example: record with a 'prompt' (the question) and a 'completion' (the response to be checked)
  :return: dict with the response-level 'scores_llm', 'scores_nli' and 'scores_bertscore',
           each being the mean of the corresponding sentence-level scores
  """
  N = 5 # Number of sampled responses

  # Prompt for response sampling

  prompt = f"""Based on your knowledge, answer the following question giving as much detail as you can.
  Question: {example['prompt']}
  Answer:"""

  # SelfCheckGPT measures the consistency of the response against *stochastic* samples, so sampling
  # must be enabled: with greedy decoding (do_sample=False) the N generations for the same prompt
  # are identical and carry the information of a single sample.
  samples = pipe([prompt] * N, do_sample=True, max_new_tokens=128, return_full_text=False) # Gives the prompt to the LLM that returns N responses to the prompt
  samples = [sample[0]["generated_text"] for sample in samples]

  # Split the original response in sentences
  sentences = [sent.text.strip() for sent in nlp(example['completion']).sents]

  # Each variant returns one score per sentence; the response-level score is their mean, computed once
  scores = {
      'scores_llm': float(np.mean(selfcheck_llm.predict(
          sentences = sentences,
          sampled_passages = samples))),
      'scores_nli': float(np.mean(selfcheck_nli.predict(
          sentences = sentences,
          sampled_passages = samples))),
      'scores_bertscore': float(np.mean(selfcheck_bertscore.predict(
          sentences = sentences,
          sampled_passages = samples))),
  }
  print([scores['scores_llm'], scores['scores_nli'], scores['scores_bertscore']])
  return scores # Saves in the dataset the scores obtained

In [None]:
# Apply SelfCheckGPT to every sampled example; the score fields returned by
# get_selfcheckgpt_scores are added to the dataset as new columns

hallu_factalign_sample = hallu_factalign_sample.map(get_selfcheckgpt_scores)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[0.5, 0.9052528991763081, 0.864849155635706]
[0.07142857142857142, 0.3706311876740074, 0.7702695026860705]
[0.0, 0.5497625867995483, 0.8124528742934528]
[0.16666666666666666, 0.488745725985306, 0.6722757642467817]
[0.0, 0.3978205941944347, 0.680070793399444]
[0.3181818181818182, 0.8046167623516257, 0.7207422964274883]
[1.0, 0.9827245473861694, 0.8643668378703296]
[0.5, 0.5220734957683211, 0.9265716830889384]
[0.23076923076923078, 0.500452054825683, 0.723965211914709]
[0.14285714285714285, 0.48707663266742157, 0.7451105323575792]
[0.2727272727272727, 0.5670152935411104, 0.5885252139785073]
[0.0, 0.6112694295443362, 0.8579245088621974]
[0.23529411764705882, 0.561619413044194, 0.7112509945736212]
[0.06666666666666667, 0.3455552333345016, 0.7216607188185056]
[0.44, 0.5965348715521395, 0.8142807978857308]
[0.25, 0.35245789332839195, 0.5542404465377331]
[0.047619047619047616, 0.5302120086271316, 0.6719058734320459]
[0.6666666666666666, 0.7500241609038009, 0.6810979557534059]
[0.0, 0.55752263

In [None]:
hallu_factalign_sample

Dataset({
    features: ['prompt', 'completion', 'label', 'scores_llm', 'scores_nli', 'scores_bertscore'],
    num_rows: 100
})

In [None]:
def find_best_threshold(dataset, continuous_col, binary_col, num_thresholds=200):

    """
    Find the optimal threshold to map the scores in 0/1, maximizing the accuracy.

    A score lower than or equal to the threshold is mapped to 1, a higher one to 0.
    NaN scores are ignored when building the threshold grid and are always mapped to 0.

    :param dataset: DataFrame (or any column-indexable mapping) containing the scores and the binary labels
    :param continuous_col: Column name with scores
    :param binary_col: Column name with binary labels (ground truths)
    :param num_thresholds: Number of thresholds to test
    :return: Best threshold and corresponding accuracy; on ties the lowest threshold wins
    :raises ValueError: if there is no usable score, the columns differ in length, or num_thresholds < 1
    """
    # Cast explicitly so that boolean labels are compared with the 0/1 predictions as integers
    y_true = np.asarray(dataset[binary_col]).astype(int)  # Ground truths
    y_scores = np.asarray(dataset[continuous_col], dtype=float)  # SelfCheckGPT scores

    if num_thresholds < 1:
        raise ValueError("num_thresholds must be at least 1")
    if y_true.shape != y_scores.shape:
        raise ValueError("scores and labels must have the same length")

    valid_scores = y_scores[~np.isnan(y_scores)]
    if valid_scores.size == 0:
        raise ValueError("no valid score available to search a threshold on")

    # Generate thresholds between the minimum and maximum values of the (non-NaN) scores
    thresholds = np.linspace(valid_scores.min(), valid_scores.max(), num_thresholds)

    best_threshold = None
    # Start below any reachable accuracy so that a threshold is always returned, even when
    # every candidate scores an accuracy of 0
    best_accuracy = -1.0

    # Tests each threshold, maximizing the accuracy
    for threshold in thresholds:
        y_pred = (y_scores <= threshold).astype(int)  # Maps in 0/1
        acc = float(np.mean(y_pred == y_true))  # Accuracy = share of matching predictions

        if acc > best_accuracy:
            best_accuracy = acc
            best_threshold = float(threshold)

    return best_threshold, best_accuracy


In [None]:
# Search the accuracy-maximizing threshold of each SelfCheckGPT variant

(
    (best_threshold_llm, best_accuracy_llm),
    (best_threshold_nli, best_accuracy_nli),
    (best_threshold_bertscore, best_accuracy_bertscore),
) = [
    find_best_threshold(hallu_factalign_sample, score_column, 'label')
    for score_column in ('scores_llm', 'scores_nli', 'scores_bertscore')
]

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive.

In [None]:
# One row per SelfCheckGPT variant: its best threshold and the accuracy reached with it
metrics = dict([
    ('classification type', ['llm_prompt', 'nli', 'bertscore']),
    ('best_threshold', [best_threshold_llm, best_threshold_nli, best_threshold_bertscore]),
    ('best_accuracy', [best_accuracy_llm, best_accuracy_nli, best_accuracy_bertscore]),
])

metrics_df = pd.DataFrame(data=metrics)
metrics_df

Unnamed: 0,classification type,best_threshold,best_accuracy
0,llm_prompt,0.271357,0.62
1,nli,0.564352,0.67
2,bertscore,0.751316,0.59


In [None]:
# Folder on the mounted Google Drive where the results are stored
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

# pandas asks for newline="" when to_csv receives an already-open file object, so that
# line endings are written exactly once regardless of the platform
with open(path + "/metrics_selfcheckgpt_factalign.csv", "w", newline="") as f:
    metrics_df.to_csv(f, index=False)