## **Installing Dependencies**

In [None]:
!pip install datasets bitsandbytes torch transformers accelerate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import numpy as np
import torch
import ast
import random
from collections import Counter
import re
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

## **Load the Data**

In [None]:
hallu_factbench = pd.read_json('/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/data/Factbench.jsonl', lines=True)

## **Data Preparation**

In [None]:
# Check and eliminates null responses

hallu_factbench = hallu_factbench[~hallu_factbench['response'].isna()].reset_index(drop=True)
hallu_factbench = hallu_factbench[hallu_factbench['response_label'] != 'NA'].reset_index(drop=True)

In [None]:
# Function that filters the questions based on if the prompt ends with a question mark

def is_question(prompt):
    if prompt.strip().endswith("?"):
        return True
    else:
        return False

hallu_factbench['is_question'] = hallu_factbench['prompt'].apply(is_question)

In [None]:
# Filter the questions

hallu_factbench = hallu_factbench[hallu_factbench['is_question'] == True].reset_index(drop=True)

In [None]:
# Check and eliminates null labels

hallu_factbench = hallu_factbench[hallu_factbench['response_label']!= 'NA'].reset_index()
hallu_factbench.groupby('response_label').count()

Unnamed: 0_level_0,index,prompt,response,claims,claim_labels,ability_to_test,source,hallucination_spans,is_question
response_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,221,221,221,221,221,221,221,74,221
True,327,327,327,327,327,327,327,190,327


In [None]:
# Choose 50 random examples from the True class and 50 random examples from the False class, in order to balance the dataset

random.seed(777)

num_classes = hallu_factbench["response_label"].nunique()
samples_per_class = 100 // num_classes

hallu_factbench_sample = hallu_factbench.groupby("response_label").sample(n=samples_per_class, random_state=42)

# Set the dataset for the examples to be shown to the LLM as the ones that were not chosen previously
hallu_factbench_examples = hallu_factbench.drop(hallu_factbench_sample.index).reset_index(drop=True)

In [None]:
# Consider only the prompt, the response and the factuality label

hallu_factbench_sample = hallu_factbench_sample[['prompt', 'response', 'response_label']]

In [None]:
# Check for the correctness of the rebalance

Counter(hallu_factbench_sample['response_label'])

Counter({False: 50, True: 50})

In [None]:
hallu_factbench_sample = Dataset.from_pandas(hallu_factbench_sample) # Convert to Dataset format
hallu_factbench_sample[0]

{'prompt': 'Given this paragraph about autonomous buildings, why would they be safer during a military attack?',
 'response': 'Autonomous buildings are designed to not rely on external systems such as electricity, water, and gas grids. They are equipped with their own systems for heating, cooling, water supply, and waste management. This means that even during a military attack, when external systems may be compromised, autonomous buildings can continue to operate and provide a safe and sustainable haven for the people inside. Additionally, their self-sufficient systems may be more resilient to damage inflicted during the attack, resulting in a safer outcome for the occupants.',
 'response_label': False,
 '__index_level_0__': 255}

In [None]:
hallu_factbench_examples = hallu_factbench_examples[['prompt', 'response', 'response_label']] # Consider only the prompt, the response and the factuality label
hallu_factbench_examples = Dataset.from_pandas(hallu_factbench_examples) # Convert to Dataset format

## **Load the model**

In [None]:
# Configures bitsandbytes for a 4-bit quantization of the LLM with double quantization, for the efficiency
# load_in_4bit: Enables the 4-bit quantization of the model
# bnb_4bit_use_double_quant: Enables double quantization that further decreases the computational efforts
# bnb_4bit_quant_type: Type of quantization used
# bnb_4bit_compute_dtype: Define the type of data used during computation in the inference phase

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Retrieve the model and the corresponding tokenizer from Hugging Face using the method AutoModelForCausalLM to use the generative part of the LLM

model_id = 'meta-llama/Llama-3.2-3B-Instruct'

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Text generation pipeline

# max_new_tokens: Maximum number of new tokens generated
# temperature: Controls the randomness of the new tokens
# truncation: if the input is too long, it gets truncated to respect the maximum limit of tokens supported by the LLM

text_generator = pipeline(task="text-generation",
                          model=model,
                          tokenizer=tokenizer,
                          do_sample = False,
                          max_new_tokens=128,
                          temperature = 1e-3,
                          truncation = True)

Device set to use cuda:0


In [None]:
# Function to get the text generated by the LLM

def get_response(prompt):
  return text_generator(prompt,
                        pad_token_id=text_generator.tokenizer.eos_token_id, return_full_text = False)[0]['generated_text']

## **Chat Building**



In [None]:
def build_chat(k = 1):

  # Prompt to instruct the LLM for the task

  messages = [{
      'role': 'system',
      'content': f"""I want you to act as a response judge. Given a user query and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge and on the definition of hallucination provided, analyze the user query and the response of LLM, and answer the following question: is the response factual or not?
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation."""}]

  # Add the examples to the list of messages

  data = hallu_factbench_examples
  for i in range(k):
    demo = {
        'role':'user',
        'content': f"""User query: {data['prompt'][i]}
LLM response: {data['response'][i]}"""
    }
    result = {
        'role':'assistant',
        'content': f"""Answer: {data['response_label'][i]}"""
    }
    messages.append(demo)
    messages.append(result)

  return messages

## **One-Shot Classification**



In [None]:
def one_shot_classification(example):

# Builds the prompt with one examples

  system_message = build_chat()

# Add the response to be classified to the list of messages, along with the corresponding prompt

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['response']}"""
  }
  system_message.append(question)
  prompt = tokenizer.apply_chat_template(system_message, tokenize = False, add_generation_prompt = True) # Transform the list of messages in a chat template readable by the LLM
  result = get_response(prompt) # Extract the response from the LLM
  result = result.split('Answer:')[-1].strip() # Post processing of the output to extract the response that we are interested in (True or False)
  return {'prediction': result} # Save the prediction in a new column of the dataset

In [None]:
print(tokenizer.apply_chat_template(build_chat(1), tokenize = False, add_generation_prompt = True))  # Example of chat template

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 28 Feb 2025

I want you to act as a response judge. Given a user query and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge and on the definition of hallucination provided, analyze the user query and the response of TLLM, and answer the following question: is the response factual or not?
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>

User query: Which country or city has the maximum number of nuclear power plants?
LLM response: The United States has the highest number of nuclear power plants in the world, with 94 operating reactors. Other countri

In [None]:
# Start the one shot prompting

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: one_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
def compute_metrics(y_true, y_pred):

# Function that computes the metrics with the predictions and the true labels

  metrics = {
      "accuracy": accuracy_score(y_true, y_pred),
      "precision": precision_score(y_true, y_pred, average="binary"),
      "recall": recall_score(y_true, y_pred, average="binary"),
      "macro_f1": f1_score(y_true, y_pred, average="binary"),
      "micro_f1": f1_score(y_true, y_pred, average="binary")
  }
  return metrics

In [None]:
# Function that extracts True of False from the response using a regex

def response(example):
  pattern = r'.*(True|False)'
  match = re.search(pattern, example['prediction'])
  if match:
    match_found = match.group(1)
    example['prediction'] = ast.literal_eval(match_found)

  return example

In [None]:
# Extracts True or False

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: response(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Compute the metrics

one_shot_metrics = compute_metrics(hallu_factbench_sample['response_label'], hallu_factbench_sample['prediction'])

In [None]:
one_shot_metrics

{'accuracy': 0.62,
 'precision': 0.6153846153846154,
 'recall': 0.64,
 'macro_f1': 0.6274509803921569,
 'micro_f1': 0.6274509803921569}

## **5-Shot Classification**

In [None]:
def five_shot_classification(example):

# Builds the prompt with 5 examples

  system_message = build_chat(5)

# Add the response to be classified to the list of messages, along with the corresponding prompt

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['response']}"""
  }
  system_message.append(question)
  prompt = tokenizer.apply_chat_template(system_message, tokenize = False, add_generation_prompt = True) # Transform the list of messages in a chat template readable by the LLM
  result = get_response(prompt) # Extract the response from the LLM
  result = result.split('Answer:')[-1].strip() # Post processing of the output to extract the response that we are interested in (True or False)
  return {'five_shot_prediction': result} # Save the prediction in a new column of the dataset

In [None]:
# Start the 5 shot prompting

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: five_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Function that extracts True of False from the response using a regex

def response_5shot(example):
  pattern = r'.*(True|False)'
  match = re.search(pattern, example['five_shot_prediction'])
  if match:
    match_found = match.group(1)
    example['five_shot_prediction'] = ast.literal_eval(match_found)

  return example

In [None]:
# Extracts True or False from the response

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: response_5shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Compute the metrics

five_shot_metrics = compute_metrics(hallu_factbench_sample['response_label'], hallu_factbench_sample['five_shot_prediction'])

In [None]:
five_shot_metrics

{'accuracy': 0.53,
 'precision': 0.5652173913043478,
 'recall': 0.26,
 'macro_f1': 0.3561643835616438,
 'micro_f1': 0.3561643835616438}

In [None]:
def ten_shot_classification(example):

# Builds the prompt with 10 examples

  system_message = build_chat(10)

# Add the response to be classified to the list of messages, along with the corresponding prompt

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['response']}"""
  }
  system_message.append(question)
  prompt = tokenizer.apply_chat_template(system_message, tokenize = False, add_generation_prompt = True) # Transform the list of messages in a chat template readable by the LLM
  result = get_response(prompt) # Extract the response from the LLM
  result = result.split('Answer:')[-1].strip() # Post processing of the output to extract the response that we are interested in (True or False)
  return {'ten_shot_prediction': result} # Save the prediction in a new column of the dataset

In [None]:
# Starts the ten shot classification

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: ten_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Function that extracts True of False from the response using a regex

def response_10shot(example):
  pattern = r'.*(True|False)'
  match = re.search(pattern, example['ten_shot_prediction'])
  if match:
    match_found = match.group(1)
    example['ten_shot_prediction'] = ast.literal_eval(match_found)

  return example

In [None]:
# Extracts True or False from the response

hallu_factbench_sample = hallu_factbench_sample.map(lambda x: response_10shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Computes the metrics

ten_shot_metrics = compute_metrics(hallu_factbench_sample['response_label'], hallu_factbench_sample['ten_shot_prediction'])

In [None]:
ten_shot_metrics

{'accuracy': 0.59,
 'precision': 0.6216216216216216,
 'recall': 0.46,
 'macro_f1': 0.5287356321839081,
 'micro_f1': 0.5287356321839081}

## **Saving dataframes with metrics**

Convert the results in a dataframe and save it on Google Drive.

In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
metrics = {
    'classification type' : ['one shot', 'five shot', 'ten shot'],
    'accuracy' : [one_shot_metrics['accuracy'], five_shot_metrics['accuracy'], ten_shot_metrics['accuracy']]
}

metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy
0,one shot,0.62
1,five shot,0.53
2,ten shot,0.59


In [None]:
with open(path + "/metrics_shot_classification_factbench.csv", "w") as f:
    metrics_df.to_csv(f, index=False)