## **Installing Dependencies**

In [None]:
# Install into the running kernel's environment (%pip) and pin the versions that the
# recorded install log below resolved to, so a fresh runtime rebuilds the same setup.
%pip install datasets==3.2.0 bitsandbytes==0.45.2 torch transformers accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Log in to the Hugging Face Hub; the token is typed interactively (input is hidden).
# NOTE(review): presumably required because the Llama checkpoint loaded later is gated — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
# Mount Google Drive (Colab only); the results CSV is later written under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and generation warnings that can matter when debugging.
import warnings
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import numpy as np
import torch
import ast
import random
from collections import Counter
import re
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

## **Load the Data**

In [None]:
# Download the FactAlign dataset from the Hugging Face Hub ('train' and 'test' splits are used below).
# NOTE(review): trust_remote_code=True presumably allows the dataset repository to run its own
# loading script — confirm the repository is trusted.
hallu_factalign = load_dataset('chaoweihuang/factalign-gemma2-f1_0.75', trust_remote_code=True)

## **Data Preparation**

In [None]:
# Extract the prompt and the response from the JSON

def get_question_answer(example):
    """Flatten one raw record into plain ``prompt`` / ``completion`` strings.

    Both fields are indexed as lists of message dicts; only the ``content`` of
    the first message of each list is kept.
    """
    first_prompt = example["prompt"][0]
    first_completion = example["completion"][0]
    flattened = dict(
        prompt=first_prompt["content"],
        completion=first_completion["content"],
    )
    return flattened

hallu_factalign = hallu_factalign.map(get_question_answer)

Map:   0%|          | 0/2177 [00:00<?, ? examples/s]

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

In [None]:
# Switch both splits to pandas DataFrames, which are easier to filter and sample
hallu_factalign_train = hallu_factalign['train'].to_pandas()
hallu_factalign_test = hallu_factalign['test'].to_pandas()

In [None]:
# Remove the instances without a response.
# Each split is filtered with its OWN mask: the test split used to be indexed with the
# train mask, which says nothing about which test rows have a missing completion.
# reset_index() keeps the previous row labels in an `index` column.

hallu_factalign_train = hallu_factalign_train[~hallu_factalign_train['completion'].isna()].reset_index()
hallu_factalign_test = hallu_factalign_test[~hallu_factalign_test['completion'].isna()].reset_index()

In [None]:
# Remove the suffix "Provide as many specific details and examples as possible (such as names of people, numbers, events, locations, dates, times, etc.)"

def extract_question(prompt):
    """Keep the prompt only up to, and including, its first question mark.

    A prompt that contains no "?" is returned unchanged.
    """
    head, mark, _ = prompt.partition("?")
    if not mark:
        return prompt
    return head + mark

hallu_factalign_train['prompt'] = hallu_factalign_train['prompt'].apply(extract_question)
hallu_factalign_test['prompt'] = hallu_factalign_test['prompt'].apply(extract_question)

In [None]:
# Function that filters the questions based on if the prompt ends with a question mark

def is_question(prompt):
    """Return True when the prompt, ignoring surrounding whitespace, ends with "?"."""
    return prompt.strip().endswith("?")

hallu_factalign_train['is_question'] = hallu_factalign_train['prompt'].apply(is_question)
hallu_factalign_test['is_question'] = hallu_factalign_test['prompt'].apply(is_question)

In [None]:
# Filter the questions

hallu_factalign_train = hallu_factalign_train[hallu_factalign_train['is_question'] == True].reset_index(drop=True)
hallu_factalign_test = hallu_factalign_test[hallu_factalign_test['is_question'] == True].reset_index(drop=True)

In [None]:
# Randomly sample 100 test examples to be classified.
# The draw is made reproducible by random_state=42, passed to DataFrame.sample.
# NOTE(review): random.seed(777) seeds Python's `random` module, which the pandas sample
# below presumably does not consult — confirm whether this seed is still needed.
random.seed(777)

hallu_factalign_sample = hallu_factalign_test.sample(n=100, random_state=42).reset_index(drop=True)


# Pool of in-context examples shown to the LLM: the whole filtered train split
hallu_factalign_examples = hallu_factalign_train

In [None]:
hallu_factalign_sample

Unnamed: 0,index,prompt,completion,completion_sentences,label,sentence_label,is_question
0,268,What is the Kyoto School?,"The Kyoto School, also known as the Kyoto-Gaku...","[The Kyoto School, also known as the Kyoto-Gak...",False,"[False, True, False, False, False, False, Fals...",True
1,250,What is the Quadrilateral Security Dialogue?,The Quadrilateral Security Dialogue (QSD) is a...,[The Quadrilateral Security Dialogue (QSD) is ...,False,"[True, True, False, True, False, False, False,...",True
2,355,Who is Simone de Beauvoir?,"Simone de Beauvoir was a French philosopher, w...","[Simone de Beauvoir was a French philosopher, ...",True,"[True, False, False, True, False, False, False...",True
3,332,What is Fermat's Last Theorem?,Fermat's Last Theorem is a famous mathematical...,[Fermat's Last Theorem is a famous mathematica...,False,"[True, True, True, True, True, True, False, Fa...",True
4,56,What is the moral dispute over the use of soli...,The use of solitary confinement in prisons has...,[The use of solitary confinement in prisons ha...,True,"[True, False, False, False, False, True, True,...",True
...,...,...,...,...,...,...,...
95,19,What is the Dengue virus?,The dengue virus is a mosquito-borne viral dis...,[The dengue virus is a mosquito-borne viral di...,True,"[True, True, False, True, False, True, False, ...",True
96,131,Who is Harry Partch?,Harry Partch (1901-1974) was an American compo...,[Harry Partch (1901-1974) was an American comp...,False,"[True, True, True, False, False, False, False,...",True
97,340,What is the TensorFlow library?,TensorFlow is an open-source software library ...,[TensorFlow is an open-source software library...,True,"[True, True, True, True, True, True, True, Fal...",True
98,334,What is the Atacama Desert?,"The Atacama Desert is a vast, high-altitude de...","[The Atacama Desert is a vast, high-altitude d...",True,"[True, False, False, True, False, True, False,...",True


In [None]:
# Consider only the prompt, the response and the factuality label

hallu_factalign_sample = hallu_factalign_sample[['prompt', 'completion', 'label']]
hallu_factalign_examples = hallu_factalign_examples[['prompt', 'completion', 'label']]

In [None]:
# Check the distribution of the classes
Counter(hallu_factalign_sample['label'])

Counter({False: 47, True: 53})

In [None]:
hallu_factalign_sample = Dataset.from_pandas(hallu_factalign_sample) # Convert to Dataset format
hallu_factalign_sample[0]

{'prompt': 'What is the Kyoto School?',
 'completion': "The Kyoto School, also known as the Kyoto-Gakuen School or the Kyoto School of Economics and Management, was a group of Japanese economists who emerged in the 1960s and 1970s. The school was named after the city of Kyoto, where many of its members studied and taught. The Kyoto School was characterized by its emphasis on microeconomic theory and its rejection of Keynesian economics and other mainstream macroeconomic theories.\n\nThe members of the Kyoto School included a number of prominent Japanese economists, such as:\n\n* Haruhiko Kuroda (1924-2016)\n* Masaru Imai (1932-2010)\n* Akira Kubo (1925-2014)\n* Hirofumi Koyama (1926-2011)\n* Kenzo Nakamura (1925-2019)\n* Nobuo Okishio (1924-2011)\n* Yoshio Ohtsuka (1925-2012)\n* Shigeru Ono (1924-2015)\n* Yoshiki Toda (1939-2015)\n* Akira Watanabe (1925-2011)\n\nThe Kyoto School was notable for its emphasis on empirical research and its rejection of mainstream macroeconomic theories, s

In [None]:
hallu_factalign_examples = Dataset.from_pandas(hallu_factalign_examples) # Convert to Dataset format
hallu_factalign_examples[0]

{'prompt': 'What is the Phaedrus Dialogue?',
 'completion': 'The Phaedrus Dialogue is a philosophical text written by the ancient Greek philosopher Phaedrus. It is a dialogue between Phaedrus and a companion who is called "Socrates." The dialogue is considered one of the most important works in the history of philosophy, and it is known for its exploration of the nature of knowledge, the role of the individual in society, and the relationship between philosophy and religion.\n\nThe Phaedrus Dialogue is set in a garden, and Phaedrus and Socrates are joined by a third person, who is called "Hermotimus." The dialogue is divided into two parts, and each part is divided into two books. The first part of the dialogue is called "The Book of the Gods," and it deals with the nature of the gods and the role of religion in human life. The second part of the dialogue is called "The Book of the Men," and it deals with the nature of human beings and their relationship to the world around them.\n\nIn

## **Load the model**

In [None]:
# Configure bitsandbytes so the LLM is loaded with 4-bit quantized weights

# load_in_4bit: Enables the 4-bit quantization of the model
# bnb_4bit_use_double_quant: Enables double (nested) quantization
#   NOTE(review): this presumably saves additional memory rather than compute — confirm against the BitsAndBytesConfig docs
# bnb_4bit_quant_type: Type of quantization used ("nf4" here)
# bnb_4bit_compute_dtype: Data type used for computation in the inference phase (bfloat16 here)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Retrieve the model and the matching tokenizer from Hugging Face.
# AutoModelForCausalLM is used to get the generative part of the LLM; the weights are
# quantized while loading through bnb_config.
# NOTE(review): device_map='auto' presumably lets the library choose where to place the
# weights; the pipeline output further down reports cuda:0.

model_id = 'meta-llama/Llama-3.2-3B-Instruct'

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Text generation pipeline wrapping the quantized model and its tokenizer

# do_sample: False turns sampling off
# max_new_tokens: Maximum number of new tokens generated
# temperature: Controls the randomness of the new tokens
#   NOTE(review): with do_sample=False decoding is presumably greedy, so this value should
#   have no effect — confirm, and consider dropping it
# truncation: if the input is too long, it gets truncated to respect the maximum limit of tokens supported by the LLM

text_generator = pipeline(task="text-generation",
                          model=model,
                          tokenizer=tokenizer,
                          do_sample = False,
                          max_new_tokens=128,
                          temperature = 1e-3,
                          truncation = True)

Device set to use cuda:0


In [None]:
# Function to get the text generated by the LLM

def get_response(prompt):
  """Run ``text_generator`` on ``prompt`` and return the generated text of its first result.

  ``return_full_text=False`` is passed to the pipeline, and the tokenizer's EOS
  token id is supplied as the padding token id.
  """
  generations = text_generator(
      prompt,
      pad_token_id=text_generator.tokenizer.eos_token_id,
      return_full_text=False,
  )
  first_generation = generations[0]
  return first_generation['generated_text']

## **Chat Building**



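`build_chat` assembles the few-shot chat shown to the LLM: a system instruction followed by solved examples taken from the training split, which guide the classification.

The parameter `k` controls how many examples are shown to the model.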

In [None]:
def build_chat(k = 1):
  """Build the few-shot chat: one system instruction followed by ``k`` solved examples.

  The examples are the first ``k`` rows of the global ``hallu_factalign_examples``.
  Each one adds a ``user`` message (query + response) and an ``assistant``
  message carrying its label.

  Args:
    k: number of solved examples to include (default 1).

  Returns:
    A list of ``{'role': ..., 'content': ...}`` dicts with ``1 + 2 * k`` entries.

  Raises:
    IndexError: if ``k`` exceeds the number of available examples.
  """

  # Prompt to instruct the LLM for the task.
  # NOTE(review): "TLLM" below looks like a typo for "the LLM"; it is left untouched
  # because editing the prompt changes the model input and the recorded metrics.

  messages = [{
      'role': 'system',
      'content': f"""I want you to act as a response judge. Given a user query and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge and on the definition of hallucination provided, analyze the user query and the response of TLLM, and answer the following question: is the response factual or not?
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation."""}]

  # Add the examples to the list of messages.
  # Each column is read once up front: indexing data[column] inside the loop would
  # fetch the whole column again on every iteration.

  data = hallu_factalign_examples
  prompts = data['prompt']
  completions = data['completion']
  labels = data['label']

  for i in range(k):
    messages.append({
        'role':'user',
        'content': f"""User query: {prompts[i]}
        LLM response: {completions[i]}"""
    })
    messages.append({
        'role':'assistant',
        'content': f"""Answer: {labels[i]}"""
    })

  return messages

## **One-Shot Classification**



In [None]:
def one_shot_classification(example):
  """Ask the LLM to judge one example, using the default number of demonstrations.

  Args:
    example: mapping with the ``prompt`` (user query) and the ``completion``
      (LLM response) to be judged.

  Returns:
    ``{'prediction': text}`` where ``text`` is whatever follows the last
    ``Answer:`` marker in the generation (the whole generation when the marker
    is absent), stripped of surrounding whitespace.
  """
  # System instruction followed by the demonstration(s)
  chat = build_chat()

  # The example to be judged goes last, as a new user turn
  chat.append({
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['completion']}"""
  })

  # Render the messages as a single chat-template string the LLM can read
  rendered = tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)
  generated = get_response(rendered)

  # Keep only what comes after the last 'Answer:' marker
  answer = generated.rpartition('Answer:')[2].strip()
  return {'prediction': answer}

In [None]:
print(tokenizer.apply_chat_template(build_chat(1), tokenize = False, add_generation_prompt = True)) # Example of chat template

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 12 Feb 2025

I want you to act as a response judge. Given a user query and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge and on the definition of hallucination provided, analyze the user query and the response of TLLM, and answer the following question: is the response factual or not?
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>

User query: What is the Phaedrus Dialogue?
LLM response: The Phaedrus Dialogue is a philosophical text written by the ancient Greek philosopher Phaedrus. It is a dialogue between Phaedrus and a companion who is calle

In [None]:
# Start the one shot prompting

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: one_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Turn the text answer of the LLM into a Python boolean.
# NOTE(review): ast.literal_eval raises on text that is not a valid Python literal, so a
# reply such as "True." would abort this cell; the 5- and 10-shot flows use a regex instead.

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: {'prediction': ast.literal_eval(x['prediction'])})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(y_true, y_pred):
  """Compute classification metrics from the true labels and the predictions.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned with ``y_true``.

  Returns:
    A dict with ``accuracy``, ``precision`` and ``recall`` (binary average),
    ``f1`` (binary average), ``macro_f1`` and ``micro_f1``.
  """
  return {
      "accuracy": accuracy_score(y_true, y_pred),
      "precision": precision_score(y_true, y_pred, average="binary"),
      "recall": recall_score(y_true, y_pred, average="binary"),
      # Binary F1 is the value both F1 entries used to report; kept under its own name.
      "f1": f1_score(y_true, y_pred, average="binary"),
      # These two previously both used average="binary", so they were always identical
      # and did not match their names.
      "macro_f1": f1_score(y_true, y_pred, average="macro"),
      "micro_f1": f1_score(y_true, y_pred, average="micro"),
  }

In [None]:
# Compute the metrics
one_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['prediction'])

In [None]:
one_shot_metrics

{'accuracy': 0.5,
 'precision': 0.5205479452054794,
 'recall': 0.7169811320754716,
 'macro_f1': 0.6031746031746031,
 'micro_f1': 0.6031746031746031}

## **5-Shot Classification**

In [None]:
def five_shot_classification(example):
  """Ask the LLM to judge one example, using 5 in-context demonstrations.

  Args:
    example: mapping with the ``prompt`` (user query) and the ``completion``
      (LLM response) to be judged.

  Returns:
    ``{'five_shot_prediction': text}`` where ``text`` is whatever follows the
    last ``Answer:`` marker in the generation (the whole generation when the
    marker is absent), stripped of surrounding whitespace.
  """
  # System instruction followed by the 5 demonstrations
  chat = build_chat(5)

  # The example to be judged goes last, as a new user turn
  chat.append({
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['completion']}"""
  })

  # Render the messages as a single chat-template string the LLM can read
  rendered = tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)
  generated = get_response(rendered)

  # Keep only what comes after the last 'Answer:' marker
  answer = generated.rpartition('Answer:')[2].strip()
  return {'five_shot_prediction': answer}

In [None]:
# Function that extracts True or False from the response using a regex

def response_5shot(example):
  """Replace the raw 5-shot answer with a boolean when it mentions True or False.

  The regex is greedy, so the last ``True``/``False`` on the matched line wins.
  When neither word is found the example is returned untouched, i.e. the
  prediction stays a string. The example is modified in place and returned.
  """
  found = re.search(r'.*(True|False)', example['five_shot_prediction'])
  if found is not None:
    verdict = found.group(1)
    example['five_shot_prediction'] = ast.literal_eval(verdict)
  return example

In [None]:
# Start the 5 shot prompting

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: five_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Extracts True or False from the response

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: response_5shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Compute the metrics

five_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['five_shot_prediction'])

In [None]:
five_shot_metrics

{'accuracy': 0.57,
 'precision': 0.5595238095238095,
 'recall': 0.8867924528301887,
 'macro_f1': 0.6861313868613139,
 'micro_f1': 0.6861313868613139}

In [None]:
def ten_shot_classification(example):
  """Ask the LLM to judge one example, using 10 in-context demonstrations.

  Args:
    example: mapping with the ``prompt`` (user query) and the ``completion``
      (LLM response) to be judged.

  Returns:
    ``{'ten_shot_prediction': text}`` where ``text`` is whatever follows the
    last ``Answer:`` marker in the generation (the whole generation when the
    marker is absent), stripped of surrounding whitespace.
  """
  # System instruction followed by the 10 demonstrations
  chat = build_chat(10)

  # The example to be judged goes last, as a new user turn
  chat.append({
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     LLM response: {example['completion']}"""
  })

  # Render the messages as a single chat-template string the LLM can read
  rendered = tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)
  generated = get_response(rendered)

  # Keep only what comes after the last 'Answer:' marker
  answer = generated.rpartition('Answer:')[2].strip()
  return {'ten_shot_prediction': answer}

In [None]:
# Starts the ten shot classification

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: ten_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Function that extracts True or False from the response using a regex

def response_10shot(example):
  """Replace the raw 10-shot answer with a boolean when it mentions True or False.

  The regex is greedy, so the last ``True``/``False`` on the matched line wins.
  When neither word is found the example is returned untouched, i.e. the
  prediction stays a string. The example is modified in place and returned.
  """
  found = re.search(r'.*(True|False)', example['ten_shot_prediction'])
  if found is not None:
    verdict = found.group(1)
    example['ten_shot_prediction'] = ast.literal_eval(verdict)
  return example

In [None]:
# Extracts True or False from the response

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: response_10shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Computes the metrics

ten_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['ten_shot_prediction'])

In [None]:
ten_shot_metrics

{'accuracy': 0.55,
 'precision': 0.5512820512820513,
 'recall': 0.8113207547169812,
 'macro_f1': 0.6564885496183206,
 'micro_f1': 0.6564885496183206}

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive as a CSV file.

In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
metrics = {
    'classification type' : ['one shot', 'five shot', 'ten shot'],
    'accuracy' : [one_shot_metrics['accuracy'], five_shot_metrics['accuracy'], ten_shot_metrics['accuracy']]
}

metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy
0,one shot,0.5
1,five shot,0.57
2,ten shot,0.55


In [None]:
with open(path + "/metrics_shot_classification_factalign.csv", "w") as f:
    metrics_df.to_csv(f, index=False)