## **Installing Dependencies**

In [None]:
!pip install datasets bitsandbytes torch transformers accelerate llama-index llama-index-embeddings-huggingface peft auto-gptq optimum einops

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting llama-index
  Downloading llama_index-0.12.18-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.1-py3-none-any.whl.metadata (767 bytes)
Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting optimum
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Colle

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore")

## **Importing Libraries**

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import numpy as np
import torch
import ast
import random
from collections import Counter
import re
import pandas as pd

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, VectorStoreIndex, Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from io import StringIO

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

## **Load the Data**

In [None]:
# Read the responses to classify and the pool of few-shot examples from Google Drive
hallu_factalign_sample, hallu_factalign_examples = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/factalign_w_knowledge.csv',
        '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/factalign_w_knowledge_examples.csv',
    ),
)

In [None]:
# Wrap the pandas DataFrame in a `datasets.Dataset` and preview its first row.
# NOTE(review): the same name is rebound from a DataFrame to a Dataset, so re-running
# this cell alone presumably fails — re-run the CSV-loading cell first.
hallu_factalign_sample = Dataset.from_pandas(hallu_factalign_sample) # Convert to Dataset format
hallu_factalign_sample[0]

{'prompt': 'What is the Kyoto School?',
 'completion': "The Kyoto School, also known as the Kyoto-Gakuen School or the Kyoto School of Economics and Management, was a group of Japanese economists who emerged in the 1960s and 1970s. The school was named after the city of Kyoto, where many of its members studied and taught. The Kyoto School was characterized by its emphasis on microeconomic theory and its rejection of Keynesian economics and other mainstream macroeconomic theories.\n\nThe members of the Kyoto School included a number of prominent Japanese economists, such as:\n\n* Haruhiko Kuroda (1924-2016)\n* Masaru Imai (1932-2010)\n* Akira Kubo (1925-2014)\n* Hirofumi Koyama (1926-2011)\n* Kenzo Nakamura (1925-2019)\n* Nobuo Okishio (1924-2011)\n* Yoshio Ohtsuka (1925-2012)\n* Shigeru Ono (1924-2015)\n* Yoshiki Toda (1939-2015)\n* Akira Watanabe (1925-2011)\n\nThe Kyoto School was notable for its emphasis on empirical research and its rejection of mainstream macroeconomic theories, s

In [None]:
# Wrap the few-shot examples DataFrame in a `datasets.Dataset` and preview its first row.
# NOTE(review): the same name is rebound from a DataFrame to a Dataset, so re-running
# this cell alone presumably fails — re-run the CSV-loading cell first.
hallu_factalign_examples = Dataset.from_pandas(hallu_factalign_examples) # Convert to Dataset format
hallu_factalign_examples[0]

{'prompt': 'What is the Phaedrus Dialogue?',
 'completion': 'The Phaedrus Dialogue is a philosophical text written by the ancient Greek philosopher Phaedrus. It is a dialogue between Phaedrus and a companion who is called "Socrates." The dialogue is considered one of the most important works in the history of philosophy, and it is known for its exploration of the nature of knowledge, the role of the individual in society, and the relationship between philosophy and religion.\n\nThe Phaedrus Dialogue is set in a garden, and Phaedrus and Socrates are joined by a third person, who is called "Hermotimus." The dialogue is divided into two parts, and each part is divided into two books. The first part of the dialogue is called "The Book of the Gods," and it deals with the nature of the gods and the role of religion in human life. The second part of the dialogue is called "The Book of the Men," and it deals with the nature of human beings and their relationship to the world around them.\n\nIn

## **Set embedding model and settings for RAG**

In [None]:
# Embedding model used to index and retrieve the knowledge for RAG.
# NOTE(review): trust_remote_code=True runs model code downloaded from the Hub;
# consider pinning a revision so that code cannot change between runs.
Settings.embed_model = HuggingFaceEmbedding(model_name="jinaai/jina-embeddings-v3", trust_remote_code = True)

Settings.llm = None # No LLM inside LlamaIndex: it is used for retrieval only
Settings.chunk_size = 256 # Size of each chunk. NOTE(review): LlamaIndex documents this in tokens, not characters — confirm
Settings.chunk_overlap = 25 # Overlap between consecutive chunks (same unit as chunk_size), so text is not cut abruptly at chunk boundaries

modules.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/734k [00:00<?, ?B/s]

custom_st.py:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v3:
- custom_st.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
- mha.py
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- block.py
- embedding.py
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

LLM is explicitly disabled. Using MockLLM.


In [None]:
# Basic cleaning of the knowledge retrieved

def clean_text(example):
  """Flatten the whitespace of the ``knowledge`` field of one example.

  Line breaks followed by a bullet marker, plain line breaks, tabs and carriage
  returns are each replaced by a single space.

  Args:
    example: Mapping with a 'knowledge' key. The value is expected to be a
      string; any other value (e.g. None for missing knowledge) is left as is.

  Returns:
    The same ``example`` object, updated in place when the knowledge is a string.
  """
  text = example['knowledge']
  if not isinstance(text, str):
    # Missing knowledge is deliberately passed through untouched
    return example

  # The bullet pattern must be handled first: once '\n' has been replaced,
  # '\n*' can no longer match anything.
  for token in ('\n*', '\n', '\t', '\r'):
    text = text.replace(token, ' ')

  example['knowledge'] = text
  return example

# Clean the knowledge column of both datasets, one example at a time
hallu_factalign_sample = hallu_factalign_sample.map(clean_text)
hallu_factalign_examples = hallu_factalign_examples.map(clean_text)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
hallu_factalign_sample[1]

{'prompt': 'What is the Quadrilateral Security Dialogue?',
 'completion': 'The Quadrilateral Security Dialogue (QSD) is a multilateral security dialogue between India, Japan, Australia, and the United States. The QSD was established in 2007, with the aim of promoting security and stability in the Indo-Pacific region.\n\nHere are some specific details and examples of the QSD:\n\n* Participants: The QSD includes India, Japan, Australia, and the United States.\n* Location: The QSD is held in various locations around the world, including Tokyo, India, Washington D.C., and other regional capitals.\n* Dates and times: The QSD takes place on a regular basis, with meetings typically held at the beginning of the year.\n* Agenda: The QSD agenda includes discussions on regional security issues, including counter-terrorism, maritime security, and regional stability.\n* Outcomes: The QSD has produced several outcomes, including the signing of the Enhanced Partnership Agreement between Japan and Ind

## **Load the model**

In [None]:
# bitsandbytes configuration used to load the LLM in 4-bit precision
# load_in_4bit: Loads the model weights quantized to 4 bits
# bnb_4bit_use_double_quant: Nested quantization — the quantization constants are quantized too, which saves additional memory
# bnb_4bit_quant_type: 4-bit data type used for the weights ("nf4" = 4-bit NormalFloat)
# bnb_4bit_compute_dtype: Data type used for the computations during inference

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the model from Hugging Face as a causal (generative) LM, quantized with bnb_config,
# together with the tokenizer that belongs to the same checkpoint.

model_id = 'meta-llama/Llama-3.2-3B-Instruct'

# device_map='auto' delegates the placement of the weights on the available devices
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Text generation pipeline

# do_sample: False selects greedy decoding, so the output is deterministic. Sampling-only
#            settings such as temperature have no effect in this mode, hence none is passed.
# max_new_tokens: Maximum number of new tokens generated
# truncation: Asks the tokenizer to truncate inputs that are too long for the LLM

text_generator = pipeline(task="text-generation",
                          model=model,
                          tokenizer=tokenizer,
                          do_sample=False,
                          max_new_tokens=128,
                          truncation=True)

Device set to use cuda:0


In [None]:
# Function to get the text generated by the LLM

def get_response(prompt):
  """Generate a completion for ``prompt`` and return only the newly generated text."""
  outputs = text_generator(
      prompt,
      return_full_text = False, # Do not echo the prompt back in the output
      pad_token_id = text_generator.tokenizer.eos_token_id,
  )
  first_output = outputs[0]
  return first_output['generated_text']

## **Chat Building**



In [None]:
def build_chat(k = 1, data = None, top_k = 5, similarity_cutoff = 0.5):
  """Build the few-shot chat used to prompt the judge model.

  The chat starts with the system instructions and is followed by ``k`` solved
  examples, each made of a user turn (query, retrieved knowledge, LLM response)
  and an assistant turn carrying the label of that example.

  Args:
    k: Number of solved examples to include.
    data: Column-indexable collection (``data[column][row]``) providing the
      columns 'prompt', 'completion', 'knowledge' and 'label'. Defaults to the
      module-level ``hallu_factalign_examples``.
    top_k: Maximum number of knowledge chunks retrieved for each example.
    similarity_cutoff: Value handed to ``SimilarityPostprocessor`` to filter the
      retrieved chunks.

  Returns:
    A list of ``{'role': ..., 'content': ...}`` message dicts.

  Raises:
    ValueError: If ``k`` is larger than the number of available examples.
  """
  if data is None:
    data = hallu_factalign_examples

  # Read every column once instead of materialising it again on each access
  prompts = data['prompt']
  completions = data['completion']
  knowledges = data['knowledge']
  labels = data['label']

  if k > len(prompts):
    raise ValueError(f"k={k} examples requested but only {len(prompts)} are available")

  # Prompt to instruct the LLM for the task

  system_prompt = """I want you to act as a response judge. Given a user query, a knowledge and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge, on the knowledge provided and on the definition of hallucination provided, analyze the user query and the response of the LLM, and answer the following question: is the response factual or not?
    BE CAREFUL: sometimes the knowledge may be empty or not useful, in which case you have to respond based only on your knowledge.
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation."""

  messages = [{'role': 'system', 'content': system_prompt}]

  # Add the examples to the list of messages

  for i in range(k):
    knowledge = ''
    if knowledges[i]: # Retrieval is skipped when the example has no knowledge
      index = VectorStoreIndex.from_documents([Document(text = knowledges[i])]) # Indexes the knowledge
      retriever = VectorIndexRetriever(index=index, similarity_top_k = top_k)
      query_engine = RetrieverQueryEngine(retriever=retriever,
                                          node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)])
      response = query_engine.query(prompts[i]) # The query is the prompt that generated the response

      # Every retrieved chunk is followed by a blank line
      knowledge = '\n\n' + ''.join(node.text + '\n\n' for node in response.source_nodes[:top_k])

    demo = {
        'role':'user',
        'content': f"""User query: {prompts[i]}
        Knowledge: {knowledge}
        LLM response: {completions[i]}"""
    }
    result = {
        'role':'assistant',
        'content': f"""Answer: {labels[i]}"""
    }
    messages.append(demo)
    messages.append(result)

  return messages

## **One-Shot Classification**



In [None]:
system_message = build_chat() # Builds the prompt with one example

In [None]:
def one_shot_classification(example):
  """Classify one response as factual or not with the one-shot chat.

  The knowledge chunks most relevant to the example's prompt are retrieved, a
  user turn is built from them and added after the few-shot chat, and the LLM
  is asked for its verdict.

  Note: the chat is read from the module-level ``system_message`` when the
  function is called, so that variable must hold the one-shot chat at that time.

  Args:
    example: Mapping with the keys 'prompt', 'completion' and 'knowledge'.

  Returns:
    ``{'prediction_w_knowledge': text}`` where ``text`` is what follows the
    last 'Answer:' in the model output, stripped of surrounding whitespace.
  """
  knowledge = ''
  if example['knowledge']: # If the knowledge exists and it is not None...
    top_k = 5 # Number of top relevant chunks to be retrieved
    index = VectorStoreIndex.from_documents([Document(text = example['knowledge'])]) # Indexes the knowledge
    retriever = VectorIndexRetriever(index=index, similarity_top_k = top_k)
    query_engine = RetrieverQueryEngine(retriever=retriever,
                                        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)])
    response = query_engine.query(example['prompt']) # The query is the prompt that generated the response

    # Every retrieved chunk is followed by a blank line
    knowledge = '\n\n' + ''.join(node.text + '\n\n' for node in response.source_nodes[:top_k])

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     Knowledge: {knowledge}
                     LLM response: {example['completion']}"""
  }

  # Build a new list instead of append/pop on the shared chat: the few-shot
  # messages stay intact even if generation raises midway.
  messages = [*system_message, question]
  prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) # Chat template readable by the LLM
  result = get_response(prompt)
  return {'prediction_w_knowledge': result.split('Answer:')[-1].strip()} # Saved in a new column of the dataset

In [None]:
print(tokenizer.apply_chat_template(build_chat(), tokenize = False, add_generation_prompt = True)) # Example of chat template

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 17 Feb 2025

I want you to act as a response judge. Given a user query, a knowledge and a response by an LLM, your objective is to determine if the response is an hallucination or not.
    In the context of NLP, an "hallucination" refers to a phenomenon where the LLM generates text that is incorrect, nonsensical, or not real.
    Based on your knowledge, on the knowledge provided and on the definition of hallucination provided, analyze the user query and the response of the LLM, and answer the following question: is the response factual or not?
    BE CAREFUL: sometimes the knowledge may be empty or not useful, in which case you have to respond based only on your knowledge.
    Answer True if you consider the response factual, False otherwise. You don't have to provide any explanation.<|eot_id|><|start_header_id|>user<|end_header_id|>

User query: What is the Phaedrus Dialogue

In [None]:
# Start the one shot prompting

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: one_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Consider the response of the LLM as boolean

def _to_bool_prediction(raw):
  """Extract True/False from a raw model answer.

  Booleans are returned unchanged so the cell can be re-run safely. For strings,
  the same pattern as the few-shot post-processing is used. A ValueError is
  raised when the answer contains neither 'True' nor 'False'.
  """
  if isinstance(raw, bool):
    return raw
  match = re.search(r'.*(True|False)', raw)
  if match is None:
    raise ValueError(f"Cannot extract True/False from the model output: {raw!r}")
  return match.group(1) == 'True'

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: {'prediction_w_knowledge': _to_bool_prediction(x['prediction_w_knowledge'])})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(y_true, y_pred):
  """Compute the classification metrics for the predictions against the true labels.

  Args:
    y_true: True binary labels.
    y_pred: Predicted binary labels, aligned with ``y_true``.

  Returns:
    A dict with the keys 'accuracy', 'precision', 'recall', 'f1' (all three
    scores refer to the positive class), 'macro_f1' and 'micro_f1'.
  """
  return {
      "accuracy": accuracy_score(y_true, y_pred),
      "precision": precision_score(y_true, y_pred, average="binary"),
      "recall": recall_score(y_true, y_pred, average="binary"),
      # F1 of the positive class: the value previously reported under both F1 keys
      "f1": f1_score(y_true, y_pred, average="binary"),
      # Each averaged score now uses the averaging its name announces
      "macro_f1": f1_score(y_true, y_pred, average="macro"),
      "micro_f1": f1_score(y_true, y_pred, average="micro"),
  }

In [None]:
# Compute the metrics
one_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['prediction_w_knowledge'])

In [None]:
one_shot_metrics

{'accuracy': 0.54,
 'precision': 0.5454545454545454,
 'recall': 0.7924528301886793,
 'macro_f1': 0.6461538461538462,
 'micro_f1': 0.6461538461538462}

## **5-Shot Classification**

In [None]:
system_message = build_chat(5) # Builds the prompt with 5 examples

In [None]:
def five_shot_classification(example, system_message=system_message):
  """Classify one response as factual or not with the five-shot chat.

  The knowledge chunks most relevant to the example's prompt are retrieved, a
  user turn is built from them and added after the few-shot chat, and the LLM
  is asked for its verdict.

  Args:
    example: Mapping with the keys 'prompt', 'completion' and 'knowledge'.
    system_message: Few-shot chat placed before the question. The default is
      the list bound to ``system_message`` when this function is defined; it is
      never modified.

  Returns:
    ``{'five_shot_prediction': text}`` where ``text`` is what follows the last
    'Answer:' in the model output, stripped of surrounding whitespace.
  """
  knowledge = ''
  if example['knowledge']: # If the knowledge exists and it is not None...
    top_k = 5 # Number of top relevant chunks to be retrieved
    index = VectorStoreIndex.from_documents([Document(text = example['knowledge'])]) # Indexes the knowledge
    retriever = VectorIndexRetriever(index=index, similarity_top_k = top_k)
    query_engine = RetrieverQueryEngine(retriever=retriever,
                                        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)])
    response = query_engine.query(example['prompt']) # The query is the prompt that generated the response

    # Every retrieved chunk is followed by a blank line
    knowledge = '\n\n' + ''.join(node.text + '\n\n' for node in response.source_nodes[:top_k])

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     Knowledge: {knowledge}
                     LLM response: {example['completion']}"""
  }

  # Build a new list instead of append/pop on the default argument: the few-shot
  # messages stay intact even if generation raises midway.
  messages = [*system_message, question]
  prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) # Chat template readable by the LLM
  result = get_response(prompt)
  return {'five_shot_prediction': result.split('Answer:')[-1].strip()} # Saved in a new column of the dataset

In [None]:
# Function that extracts True or False from the response using a regex

def response_5shot(example):
  """Convert the raw five-shot answer of one example into a boolean.

  Args:
    example: Mapping with a 'five_shot_prediction' key.

  Returns:
    The same ``example``. A string answer containing 'True' or 'False' is
    replaced by the corresponding boolean (on a line with several occurrences
    the last one wins). A string with neither, or a value that is not a string
    (e.g. an already converted boolean), is left unchanged.
  """
  raw = example['five_shot_prediction']
  if not isinstance(raw, str):
    # Already converted: keeps the function safe to apply more than once
    return example

  match = re.search(r'.*(True|False)', raw)
  if match:
    example['five_shot_prediction'] = match.group(1) == 'True'

  return example

In [None]:
# Start the 5 shot prompting

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: five_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Extracts True or False from the response

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: response_5shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Computes the metrics

five_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['five_shot_prediction'])

In [None]:
five_shot_metrics

{'accuracy': 0.55,
 'precision': 0.5454545454545454,
 'recall': 0.9056603773584906,
 'macro_f1': 0.6808510638297872,
 'micro_f1': 0.6808510638297872}

In [None]:
system_message = build_chat(10) # Builds the prompt with 10 examples

In [None]:
def ten_shot_classification(example, system_message=system_message):
  """Classify one response as factual or not with the ten-shot chat.

  The knowledge chunks most relevant to the example's prompt are retrieved, a
  user turn is built from them and added after the few-shot chat, and the LLM
  is asked for its verdict.

  Args:
    example: Mapping with the keys 'prompt', 'completion' and 'knowledge'.
    system_message: Few-shot chat placed before the question. The default is
      the list bound to ``system_message`` when this function is defined; it is
      never modified.

  Returns:
    ``{'ten_shot_prediction': text}`` where ``text`` is what follows the last
    'Answer:' in the model output, stripped of surrounding whitespace.
  """
  knowledge = ''
  if example['knowledge']: # If the knowledge exists and it is not None...
    top_k = 5 # Number of top relevant chunks to be retrieved
    index = VectorStoreIndex.from_documents([Document(text = example['knowledge'])]) # Indexes the knowledge
    retriever = VectorIndexRetriever(index=index, similarity_top_k = top_k)
    query_engine = RetrieverQueryEngine(retriever=retriever,
                                        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)])
    response = query_engine.query(example['prompt']) # The query is the prompt that generated the response

    # Every retrieved chunk is followed by a blank line
    knowledge = '\n\n' + ''.join(node.text + '\n\n' for node in response.source_nodes[:top_k])

  question = {
      'role':'user',
      'content': f"""User query: {example['prompt']}
                     Knowledge: {knowledge}
                     LLM response: {example['completion']}"""
  }

  # Build a new list instead of append/pop on the default argument: the few-shot
  # messages stay intact even if generation raises midway.
  messages = [*system_message, question]
  prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) # Chat template readable by the LLM
  result = get_response(prompt)
  return {'ten_shot_prediction': result.split('Answer:')[-1].strip()} # Saved in a new column of the dataset

In [None]:
# Start the 10 shot prompting

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: ten_shot_classification(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Extracts True or False from the response

def response_10shot(example):
  """Convert the raw ten-shot answer of one example into a boolean.

  Args:
    example: Mapping with a 'ten_shot_prediction' key.

  Returns:
    The same ``example``. A string answer containing 'True' or 'False' is
    replaced by the corresponding boolean (on a line with several occurrences
    the last one wins). A string with neither, or a value that is not a string
    (e.g. an already converted boolean), is left unchanged.
  """
  raw = example['ten_shot_prediction']
  if not isinstance(raw, str):
    # Already converted: keeps the function safe to apply more than once
    return example

  match = re.search(r'.*(True|False)', raw)
  if match:
    example['ten_shot_prediction'] = match.group(1) == 'True'

  return example

In [None]:
# Extracts True or False from the response

hallu_factalign_sample = hallu_factalign_sample.map(lambda x: response_10shot(x))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Computes the metrics

ten_shot_metrics = compute_metrics(hallu_factalign_sample['label'], hallu_factalign_sample['ten_shot_prediction'])

In [None]:
ten_shot_metrics

{'accuracy': 0.59,
 'precision': 0.5652173913043478,
 'recall': 0.9811320754716981,
 'macro_f1': 0.7172413793103448,
 'micro_f1': 0.7172413793103448}

## **Saving dataframes with metrics**

Convert the results into a DataFrame and save it to Google Drive.

In [None]:
path = '/content/drive/MyDrive/Ceccarelli_MasterThesis&Internship/Master Thesis/Results'

In [None]:
# One row per prompting setup, reporting the accuracy it reached
metrics = {
    'classification type' : ['one shot', 'five shot', 'ten shot'],
    'accuracy' : [
        setup_metrics['accuracy']
        for setup_metrics in (one_shot_metrics, five_shot_metrics, ten_shot_metrics)
    ],
}

metrics_df = pd.DataFrame(metrics)
metrics_df

Unnamed: 0,classification type,accuracy
0,one shot,0.54
1,five shot,0.55
2,ten shot,0.59


In [None]:
# Hand the path to pandas so it opens the file with its own newline handling
# (a handle opened in plain "w" mode can produce blank lines between rows on Windows).
metrics_df.to_csv(path + "/metrics_shot_classification_w_knowledge_factalign.csv", index=False)