## **Installing Dependencies**

In [None]:
!pip install selfcheckgpt bitsandbytes torch transformers accelerate datasets

Collecting selfcheckgpt
  Downloading selfcheckgpt-0.1.7.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bert_score (from selfcheckgpt)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-ma

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import numpy as np
import torch
import ast
import spacy
import random
from collections import Counter
import re
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, Union

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt, SelfCheckBERTScore, SelfCheckNLI

In [None]:
torch.manual_seed(28)

<torch._C.Generator at 0x7c33fd4faf50>

In [None]:
# Set spacy for processing the text
nlp = spacy.load("en_core_web_sm")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## **Load the data**

In [None]:
hallu_factbench = pd.read_json('/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/data/Factbench.jsonl', lines=True)

## **Data Preparation**

In [None]:
# Remove rows with a null response or with an 'NA' response label

hallu_factbench = hallu_factbench[~hallu_factbench['response'].isna()].reset_index(drop=True)
hallu_factbench = hallu_factbench[hallu_factbench['response_label'] != 'NA'].reset_index(drop=True)

In [None]:
# Function that flags whether a prompt is a question, i.e. whether it ends with a question mark

def is_question(prompt):
    """Return True when the prompt, ignoring surrounding whitespace, ends with a question mark."""
    trimmed = prompt.strip()
    return trimmed.endswith("?")

hallu_factbench['is_question'] = hallu_factbench['prompt'].apply(is_question)

In [None]:
# Filter the questions

hallu_factbench = hallu_factbench[hallu_factbench['is_question'] == True].reset_index(drop=True)

In [None]:
# Remove rows whose label is the 'NA' placeholder, then inspect the class balance.
# drop=True discards the old index instead of materialising it as an extra "index" column,
# consistent with the other filtering cells of this notebook.

hallu_factbench = hallu_factbench[hallu_factbench['response_label'] != 'NA'].reset_index(drop=True)
hallu_factbench.groupby('response_label').count()

Unnamed: 0_level_0,index,prompt,response,claims,claim_labels,ability_to_test,source,hallucination_spans,is_question
response_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,221,221,221,221,221,221,221,74,221
True,327,327,327,327,327,327,327,190,327


In [None]:
# Choose the same number of random examples from each label class (50 from True and 50 from False
# when there are two classes), in order to balance the dataset

random.seed(777)  # NOTE(review): seeds Python's `random` module only; the pandas sampling below is driven by its own random_state

num_classes = hallu_factbench["response_label"].nunique()
samples_per_class = 100 // num_classes  # 100 evaluation examples in total, split evenly across the classes

hallu_factbench_sample = hallu_factbench.groupby("response_label").sample(n=samples_per_class, random_state=42)

# Set the dataset for the examples to be shown to the LLM as the ones that were not chosen previously
# (rows are removed by index label, so the index of hallu_factbench must be unique at this point)
hallu_factbench_examples = hallu_factbench.drop(hallu_factbench_sample.index).reset_index(drop=True)

In [None]:
# Consider only the prompt, the response and the factuality label
hallu_factbench_sample = hallu_factbench_sample[['prompt', 'response', 'response_label']]

In [None]:
# Check for the correctness of the rebalance
Counter(hallu_factbench_sample['response_label'])

Counter({False: 50, True: 50})

In [None]:
hallu_factbench_sample = Dataset.from_pandas(hallu_factbench_sample) # Convert to Dataset format
hallu_factbench_sample[0]

{'prompt': 'Given this paragraph about autonomous buildings, why would they be safer during a military attack?',
 'response': 'Autonomous buildings are designed to not rely on external systems such as electricity, water, and gas grids. They are equipped with their own systems for heating, cooling, water supply, and waste management. This means that even during a military attack, when external systems may be compromised, autonomous buildings can continue to operate and provide a safe and sustainable haven for the people inside. Additionally, their self-sufficient systems may be more resilient to damage inflicted during the attack, resulting in a safer outcome for the occupants.',
 'response_label': False,
 '__index_level_0__': 255}

In [None]:
hallu_factbench_examples = hallu_factbench_examples[['prompt', 'response', 'response_label']] # Consider only the prompt, the response and the factuality label
hallu_factbench_examples = Dataset.from_pandas(hallu_factbench_examples) # Convert to Dataset format

## **Load the model**

In [None]:
# Configures bitsandbytes for a 4-bit quantization of the LLM with double quantization, for the efficiency
# load_in_4bit: Enables the 4-bit quantization of the model
# bnb_4bit_use_double_quant: Enables nested (double) quantization, which also quantizes the quantization constants to further reduce the memory footprint
# bnb_4bit_quant_type: Type of quantization used
# bnb_4bit_compute_dtype: Define the type of data used during computation in the inference phase

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Retrieve the model and the corresponding tokenizer from Hugging Face using the method AutoModelForCausalLM to use the generative part of the LLM

model_id = "meta-llama/Llama-3.2-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

## **SelfCheckGPT Definition**

In [None]:
class SelfCheckLLMPrompt_Quantized:
    """
    SelfCheckGPT (LLM Prompt): Checking LLM's text against its own sampled texts via open-source LLM prompting

    The judging model can optionally be loaded with a 4-bit bitsandbytes quantization config.
    """
    def __init__(
        self,
        model: str = None,
        device = None,
        quantization = True
    ):
        """
        :param model: str -- Hugging Face model id; "meta-llama/Llama-3.2-3B-Instruct" is used when None
        :param device: torch device the model and the tokenized prompts are moved to; CPU when None
        :param quantization: only the value True enables the 4-bit (NF4, double quantization) loading;
                             any other value loads the model without a quantization config
        """
        model = model if model is not None else "meta-llama/Llama-3.2-3B-Instruct"

        # Gives the chance to load a quantized model from Hugging Face.
        # Both variants share the same loading code and differ only in the quantization config.
        load_kwargs = {"torch_dtype": "auto"}
        if quantization is True:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForCausalLM.from_pretrained(model, **load_kwargs)
        self.model.eval()

        if device is None:
            device = torch.device("cpu")
        # NOTE(review): whether .to() is accepted for a 4-bit model depends on the installed
        # transformers/bitsandbytes versions -- confirm when the environment changes
        self.model.to(device)
        self.device = device
        self.prompt_template = "Context: {context}\n\nSentence: {sentence}\n\nIs the sentence supported by the context above? Answer Yes or No.\n\nAnswer: "
        self.text_mapping = {'yes': 0.0, 'no': 1.0, 'n/a': 0.5}
        # Unrecognised answers already reported, so that each one is warned about only once
        self.not_defined_text = set()
        print(f"SelfCheck-LLMPrompt ({model}) initialized to device {device}")

    def set_prompt_template(self, prompt_template: str):
        """
        Replace the prompt template; predict() fills its {context} and {sentence} placeholders.
        """
        self.prompt_template = prompt_template

    @torch.no_grad()
    def predict(
        self,
        sentences: List[str],
        sampled_passages: List[str],
        verbose: bool = False,
    ):
        """
        This function takes sentences (to be evaluated) with sampled passages (evidence), and return sent-level scores
        :param sentences: list[str] -- sentences to be evaluated, e.g. GPT text response split by spacy
        :param sampled_passages: list[str] -- stochastically generated responses (without sentence splitting)
        :param verbose: bool -- if True tqdm progress bar will be shown
        :return sent_scores: sentence-level scores (mean over the sampled passages)
        """
        num_sentences = len(sentences)
        num_samples = len(sampled_passages)
        scores = np.zeros((num_sentences, num_samples))
        disable = not verbose

        # this seems to improve performance when using the simple prompt template
        # (flattened once per passage rather than once per sentence/passage pair)
        flat_passages = [sample.replace("\n", " ") for sample in sampled_passages]

        for sent_i in tqdm(range(num_sentences), disable=disable):
            sentence = sentences[sent_i]
            for sample_i, sample in enumerate(flat_passages):
                prompt = self.prompt_template.format(context=sample, sentence=sentence)
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                generate_ids = self.model.generate(
                    inputs.input_ids,
                    max_new_tokens=5,
                    do_sample=False, # hf's default for Llama2 is True
                )
                # Decode only the newly generated tokens: removing the prompt with str.replace
                # relies on the decoded text reproducing the prompt character by character
                new_token_ids = generate_ids[:, inputs.input_ids.shape[1]:]
                generate_text = self.tokenizer.batch_decode(
                    new_token_ids, skip_special_tokens=True,
                    clean_up_tokenization_spaces=False
                )[0]
                scores[sent_i, sample_i] = self.text_postprocessing(generate_text)
        scores_per_sentence = scores.mean(axis=-1)
        return scores_per_sentence

    def text_postprocessing(
        self,
        text,
    ):
        """
        To map from generated text to score
        Yes -> 0.0
        No  -> 1.0
        everything else -> 0.5
        """
        # tested on Llama-2-chat (7B, 13B) --- this code has 100% coverage on wikibio gpt3 generated data
        # however it may not work with other datasets, or LLMs
        text = text.lower().strip()
        if text.startswith('yes'):
            text = 'yes'
        elif text.startswith('no'):
            text = 'no'
        else:
            # Warn once per distinct unrecognised answer
            if text not in self.not_defined_text:
                print(f"warning: {text} not defined")
                self.not_defined_text.add(text)
            text = 'n/a'
        return self.text_mapping[text]

In [None]:
# Text generation pipeline

# temperature: Controls the randomness of the new tokens
# do_sample: Enables stochastic sampling of the new tokens instead of greedy decoding
# max_new_tokens: Maximum number of new tokens generated
# return_full_text: When False, only the generated continuation is returned, without the prompt
# NOTE(review): no `truncation` argument is passed here -- confirm over-long prompts are handled as intended

pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                device_map="auto",
                temperature=1.0,
                do_sample=True,
                max_new_tokens=128,
                return_full_text=False,
)

In [None]:
# Initializes the variants of SelfCheckGPT to be used

selfcheck_llm = SelfCheckLLMPrompt_Quantized(device=device, model = model_id)
selfcheck_nli = SelfCheckNLI(device=device)
selfcheck_bertscore = SelfCheckBERTScore()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

SelfCheck-LLMPrompt (meta-llama/Llama-3.2-3B-Instruct) initialized to device cuda


tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

SelfCheck-NLI initialized to device cuda
SelfCheck-BERTScore initialized


In [None]:
def get_selfcheckgpt_scores(example, num_samples=5):
    """
    Applies the three SelfCheckGPT variants (LLM prompt, NLI, BERTScore) to one instance.

    :param example: dataset row; 'prompt' is the question and 'response' the answer to be checked
    :param num_samples: number of responses sampled from the LLM and used as evidence
    :return: dict with 'scores_llm', 'scores_nli' and 'scores_bertscore', each one the mean of the
             sentence-level scores of the response
    """
    # Prompt for response sampling.
    # The two leading spaces before "Question:" and "Answer:" are kept on purpose, so that the
    # text sent to the model stays exactly the same as before.
    prompt = (
        "Based on your knowledge, answer the following question giving as much detail as you can.\n"
        f"  Question: {example['prompt']}\n"
        "  Answer:"
    )

    # SelfCheckGPT measures how consistent the response is with *different* stochastic samples,
    # so sampling has to be enabled: with do_sample=False every passage comes from greedy
    # decoding and the num_samples passages are identical.
    outputs = pipe([prompt] * num_samples, do_sample=True, max_new_tokens=128, return_full_text=False)
    samples = [output[0]["generated_text"] for output in outputs]

    # Split the original response in sentences
    sentences = [sent.text.strip() for sent in nlp(example['response']).sents]

    # Each variant returns one score per sentence; the mean gives the score of the whole response
    scores = {
        'scores_llm': np.mean(selfcheck_llm.predict(sentences=sentences, sampled_passages=samples)),
        'scores_nli': np.mean(selfcheck_nli.predict(sentences=sentences, sampled_passages=samples)),
        'scores_bertscore': np.mean(selfcheck_bertscore.predict(sentences=sentences, sampled_passages=samples)),
    }
    print([scores['scores_llm'], scores['scores_nli'], scores['scores_bertscore']])
    return scores  # Saved in the dataset as new columns by Dataset.map

In [None]:
# Apply SelfCheckGPT

hallu_factbench_sample = hallu_factbench_sample.map(get_selfcheckgpt_scores)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

[0.5, 0.18386980591458268, 0.6383601129055023]
[0.5, 0.9283157289028168, 0.5971998870372772]
[0.0, 0.008266096003353596, 0.0]
[0.25, 0.897262305021286, 0.778950366191566]
[0.0, 0.002427738392725587, 0.4551329016685486]
[0.5, 0.9877556562423706, 0.7276678048074245]
[0.0, 0.6268665939569473, 0.6347406506538391]
[0.0, 0.649674654006958, 0.7222466841340065]
[1.0, 0.9994250535964966, 0.5589593350887299]
[0.0, 0.46665897965431213, 0.5640201270580292]
[1.0, 0.967917819817861, 0.7881844003374378]
[0.75, 0.9539938420057297, 0.7119792923331261]
[0.0, 0.010201307813986205, 0.5513566359877586]
[0.75, 0.744301266502589, 0.6493522897362709]
[0.3333333333333333, 0.3210804308085547, 0.6300262361764908]
[0.0, 0.36613505333662033, 0.5357581277688345]
[0.6666666666666666, 0.7740453481674194, 0.55217178662618]
[1.0, 0.3981570269912481, 0.6578655123710633]
[0.0, 0.18804129290704927, 0.7166339059670767]
[0.5, 0.8575700521469116, 0.5023190453648567]
[0.5, 0.5447797700762749, 0.5365189015865326]
[0.3333333333

In [None]:
hallu_factbench_sample

Dataset({
    features: ['prompt', 'response', 'response_label', '__index_level_0__', 'scores_llm', 'scores_nli', 'scores_bertscore'],
    num_rows: 100
})

In [None]:
def find_best_threshold(dataset, continuous_col, binary_col, num_thresholds=200):
    """
    Find the optimal threshold to map the scores in 0/1, maximizing the accuracy.

    A score lower than or equal to the threshold is mapped to 1, a higher score to 0.
    The accuracy is computed on the same rows that are used to choose the threshold.

    :param dataset: DataFrame (or any container indexable by column name) with the scores and the binary labels
    :param continuous_col: Column name with scores
    :param binary_col: Column name with binary labels (ground truths)
    :param num_thresholds: Number of evenly spaced thresholds to test between the minimum and maximum score
    :return: Best threshold and corresponding accuracy; among equally accurate thresholds the lowest one is returned
    :raises ValueError: if there are no rows, no finite score, or num_thresholds is lower than 1
    """
    y_true = np.asarray(dataset[binary_col])  # Ground truths
    y_scores = np.asarray(dataset[continuous_col], dtype=float)  # SelfCheckGPT scores

    if y_scores.size == 0:
        raise ValueError("find_best_threshold needs at least one row")
    if num_thresholds < 1:
        raise ValueError("num_thresholds must be at least 1")

    # NaN/inf scores would turn every generated threshold into NaN, so the range is taken
    # from the finite scores only (non-finite scores still count as rows when scoring)
    finite_scores = y_scores[np.isfinite(y_scores)]
    if finite_scores.size == 0:
        raise ValueError("find_best_threshold needs at least one finite score")

    # Generate thresholds between the minimum and maximum values of the scores
    thresholds = np.linspace(finite_scores.min(), finite_scores.max(), num_thresholds)

    # Test all thresholds at once: one row of 0/1 predictions per threshold
    y_pred = (y_scores[np.newaxis, :] <= thresholds[:, np.newaxis]).astype(int)
    accuracies = (y_pred == y_true[np.newaxis, :]).mean(axis=1)

    # argmax returns the first maximum, i.e. the lowest of the best thresholds; a threshold is
    # returned even when the best accuracy is 0
    best_idx = int(np.argmax(accuracies))
    return float(thresholds[best_idx]), float(accuracies[best_idx])

In [None]:
# Computes best thresholds and corresponding accuracy

best_threshold_llm, best_accuracy_llm = find_best_threshold(hallu_factbench_sample, 'scores_llm', 'response_label')
best_threshold_nli, best_accuracy_nli = find_best_threshold(hallu_factbench_sample, 'scores_nli', 'response_label')
best_threshold_bertscore, best_accuracy_bertscore = find_best_threshold(hallu_factbench_sample, 'scores_bertscore', 'response_label')

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive.

In [None]:
metrics = {
    'classification type' : ['llm_prompt', 'nli', 'bertscore'],
    'best_threshold' : [best_threshold_llm, best_threshold_nli, best_threshold_bertscore],
    'best_accuracy' : [best_accuracy_llm, best_accuracy_nli, best_accuracy_bertscore]
}

metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,best_threshold,best_accuracy
0,llm_prompt,0.231156,0.57
1,nli,0.642922,0.64
2,bertscore,0.605494,0.61


In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

with open(path + "/metrics_selfcheckgpt_factbench.csv", "w") as f:
    metrics_df.to_csv(f, index=False)