# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset

In [2]:
from datasets import load_dataset

# Load the "train" split of the `saracandu/hotpotQA_nli` dataset.
# NOTE(review): `trust_remote_code=True` presumably lets the dataset repository's
# own loading code run locally — confirm the repository is trusted.
dataset = load_dataset('saracandu/hotpotQA_nli', split="train", trust_remote_code=True)
# Bare last expression: the notebook displays the dataset summary.
dataset

Downloading data: 100%|██████████████████████████████████████████████████████████████████████████| 375k/375k [00:00<00:00, 868kB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['question', 'answer', 'options', 'first nli', 'second nli', 'type', 'level', 'passages', 'BART1', 'BART2', 'ROBERTA1', 'ROBERTA2'],
    num_rows: 352
})

## 🔹 Select a subset of the true dataset as a test

In [3]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Cleaning helper that removes characters which are not allowed in the prompt text
def clean_text(text):
    """Return `text` with every disallowed character removed.

    Word characters, whitespace and the punctuation marks . , ! ? - : ; ( )
    are kept; each run of any other character is deleted.
    """
    disallowed = re.compile(r"[^\w\s.,!?\-:;()]+")
    return disallowed.sub('', text)

{'question': 'Who was inducted into the Rock and Roll Hall of Fame, David Lee Roth or Cia Berg?',
 'answer': 'David Lee Roth',
 'options': "['Cia Berg', 'David Lee Roth']",
 'first nli': 'David Lee Roth was inducted into the Rock and Roll Hall of Fame before Cia Berg',
 'second nli': 'Cia Berg was inducted into the Rock and Roll Hall of Fame before David Lee Roth',
 'type': 'comparison',
 'level': 'medium',
 'passages': 'Cia Berg (born 2 December 1963), now known as Cia Soro, is a Swedish television presenter and singer. She was at one time the lead singer of the Swedish rock band Whale, who released the single "Hobo Humpin\' Slobo Babe". David Lee Roth (born October 10, 1954) is an American rock vocalist, musician, songwriter, actor, author, and former radio personality. In 2007, he was inducted into the Rock and Roll Hall of Fame.',
 'BART1': 0.885607123374939,
 'BART2': 0.0146357780322432,
 'ROBERTA1': 0.3287698328495025,
 'ROBERTA2': 0.0123543506488204}

In [None]:
# Pull every column used downstream out of the dataset.
# (No sub-sampling happens here: each name holds the complete column.)

# Questions, gold answers, stringified option lists and supporting passages.
first_queries, correct_answers, possibilities, sources = (
    dataset[column] for column in ('question', 'answer', 'options', 'passages')
)

# The two NLI statements stored for every question.
first_nli, second_nli = dataset['first nli'], dataset['second nli']

# Values stored for the first / second NLI statement, one pair per verifier column.
bart1, bart2 = dataset['BART1'], dataset['BART2']
rob1, rob2 = dataset['ROBERTA1'], dataset['ROBERTA2']

# Model loading and dataset selection (for testing purposes)

## ▪️ Load the model:

In [None]:
# do not run this unless necessary!

from huggingface_hub import login
login()

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

# Checkpoint identifier, used for both the tokenizer and the model below.
model_name="nvidia/Llama3-ChatQA-1.5-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as padding token and pad on the right side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype attribute, resolved below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Turn the dtype name into the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Informational only: when float16 + 4-bit is configured and the CUDA device's major
# compute capability is >= 8, print a hint that bfloat16 could be used instead.
# NOTE(review): `torch.cuda.get_device_capability()` presumably needs an available
# CUDA device — confirm behavior on CPU-only machines.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load the pre-trained model with the quantization config above
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## ▪️ Model through `guidance`: 

In [5]:
from guidance import models, select

# Wrap the already-loaded model and tokenizer in a guidance model object.
new_model = models.Transformers(model, tokenizer, temperature=0.0)

# Quick smoke test: append a fixed prompt and a `select` over two fixed words.
new_model + 'Do you want a joke or a poem? A ' + select(['joke', 'poem'])

# Thesis

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [22]:
# Thesis prompt template.
# Input variables: `question`, `options` and `context`.
# Note: `thesisGeneration` passes the parsed list of options (not the raw string) as `options`.

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    System: This is a chat between a user and an AI assistant. The assistant gives helpful, detailed, and polite answers to the user’s questions based on the context. 
    {context}
    User: {question}
    Possible options: {options}.
    Assistant:
"""
)

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [23]:
# LLM chain definition
from operator import itemgetter

# Mapping step of the thesis chain: picks the three prompt variables out of the input dict
augmentation = {field: itemgetter(field) for field in ("question", "options", "context")}

# Pipe the mapping into the prompt template; the result is used through `.invoke(...)`
thesis_chain = augmentation | prompt_template

In [24]:
import ast

def thesisGeneration(query, merged, sources):
    """Generate the thesis: let the guided model pick one option for `query`.

    Args:
        query: the question text.
        merged: string holding a Python list literal with the candidate options,
            e.g. "['Cia Berg', 'David Lee Roth']"; parsed with `ast.literal_eval`.
        sources: context text inserted into the prompt.

    Returns:
        `str(...)` of the guidance model state after the selection (presumably the
        cleaned prompt followed by the chosen option — the caller extracts the
        answer with `extract_answer`).
    """
    options = ast.literal_eval(merged)
    augmented_prompt = thesis_chain.invoke({'question': query, 'options': options, 'context': sources})
    normal_string = clean_text(augmented_prompt.text)
    # Offer every parsed option (cleaned like the prompt), not just the first two,
    # so questions with more or fewer than two options are handled as well.
    choices = [clean_text(option) for option in options]
    ans = new_model + normal_string + select(choices)
    return str(ans)

In [25]:
def extract_answer(text, marker="Assistant:\n"):
    """Return whatever follows the first occurrence of `marker` in `text`.

    Args:
        text: the full generated string (prompt plus completion).
        marker: tag that precedes the answer. The default is the word
            "Assistant:" followed by a line break, i.e. the last line of the
            thesis prompt.

    Returns:
        The text after the marker with surrounding whitespace stripped, or the
        sentinel string "The correct answer could not be found." when the
        marker does not occur in `text`.
    """
    # Locate the marker that closes the prompt
    start_index = text.find(marker)
    if start_index == -1:
        return "The correct answer could not be found."
    # Everything after the marker is the model's answer
    return text[start_index + len(marker):].strip()

## 🔹 Test: how well does the thesis alone perform?

In [28]:
# Thesis answer for every question: generate, then keep only the text after the marker
answers = [
    extract_answer(thesisGeneration(question, possibilities[i], sources[i]))
    for i, question in enumerate(first_queries)
]

In [11]:
answers[:5]

['Arthurs Magazine', 'Jonathan Stark', 'The Wolfhounds', 'yes', 'yes']

# Antithesis - level of entailment

## Listening to the NLI verifier only: 

In [13]:
# Per question, keep the NLI statement with the larger BART value, paired with that value.
# The comparison is strict, so a tie selects the second statement.
bart_answers = [
    [first_nli[i], bart1[i]] if bart1[i] > bart2[i] else [second_nli[i], bart2[i]]
    for i in range(len(first_queries))
]

In [15]:
# Per question, keep the NLI statement with the larger ROBERTA value, paired with that value.
# The comparison is strict, so a tie selects the second statement.
rob_answers = [
    [first_nli[i], rob1[i]] if rob1[i] > rob2[i] else [second_nli[i], rob2[i]]
    for i in range(len(first_queries))
]

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [None]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer, sources):
    """Ask the chat model for an antithesis to a candidate answer.

    Args:
        query: the question text.
        prompt_template: not used inside this function.
        merged: string holding a Python list literal with the options; parsed
            with `ast.literal_eval`.
        candidate_answer: the answer to be discussed (the caller passes the thesis answer).
        sources: context text, passed as `context` to both `format` calls.

    Returns:
        Whatever `query_model` returns for the formatted system/user messages.

    NOTE(review): `system_message`, `user_message` and `query_model` are module-level
    names that this notebook only defines in later cells. Both `user_message`
    templates defined in this notebook contain a `{critique}` placeholder, and no
    `critique` value is supplied here, so `str.format` would raise KeyError with
    those templates — confirm which prompt this function is meant to use.
    """
    # Turn the stringified list of options into a real list
    merged = ast.literal_eval(merged)
    second_answer = query_model(system_message.format(context = sources),
    user_message.format(question=query, options = merged, candidate_answer = candidate_answer, context = sources,), max_length=1024)
    return second_answer

In [None]:
# Antithesis for every question, built from the question, its options, the thesis answer and the sources
ant_answers = [
    antithesisGeneration(question, prompt_template, possibilities[i], answers[i], sources[i])
    for i, question in enumerate(first_queries)
]

In [None]:
ant_answers[:10]

# Pre Synthesis

In [17]:
# Prompt for baseline-2.
# `system_message` contains no format placeholders; `user_message` expects
# `question`, `options`, `candidate_answer` and `critique`.
# Note: a later cell assigns `system_message` / `user_message` again (baseline-3),
# so whichever of the two cells ran last determines the prompt that is used.
system_message = """
    You are an helpful AI assistant.
    You are asked to determine the most correct answer for a given question.
    You have at disposal a first tentative answer (a candidate answer) and another opinion on which should be the correct option according to context (a suggestion).
    
    They could agree on the correct option; in this case, directly output the option on which they agree.
    If instead they disagree, use the context to determine the correct answer for the question, given the set of possible options.
    
    The goal of the assistant is to decree which is the most correct answer to the question between the available options. 
    Answer by explicitly reporting the correct answer to you.
"""


user_message = """
    Question: {question}
    Options: {options}
    Candidate answer: {candidate_answer}
    Suggestion: {critique}
    Which of the candidate answers {options} is the most proper answer for the question?

"""

In [49]:
# Prompt for baseline-3.
# `system_message` contains no format placeholders; `user_message` expects
# `question`, `options`, `candidate_answer` and `critique`.
# Note: this rebinds the `system_message` / `user_message` names that the
# baseline-2 cell also assigns.
system_message = """
    You are an helpful AI assistant.
    You are asked to determine the most correct answer for a given question.
    You have at disposal a first tentative answer (a candidate answer) and another opinion on which should be the correct option (a suggestion).
    
    They could agree on the correct option; in this case, directly output the option on which they agree.
    If instead they disagree, use the context to determine the correct answer for the question, given the set of possible options.
    
    Your goal is to decree which is the most correct answer to the question between the available options. 
    Answer by explicitly reporting the correct answer to the question.
"""


user_message = """
    Question: {question}
    Options: {options}
    Candidate answer: {candidate_answer}
    Suggestion: {critique}
    Which of the candidate answers {options} is the most proper answer for the question {question}?

"""

In [30]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown

# Text-generation pipeline built around the already-loaded model and tokenizer.
# NOTE: this rebinds the name `pipeline`, which an earlier cell imported from
# `transformers` as the factory function; from here on it is the pipeline object.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # decoding defaults (several of them are passed again per call in `query_model`)
    do_sample=False,
    temperature=0.0,
    top_p=0.0,
    repetition_penalty=1.5,
    max_new_tokens=400,
    return_full_text=True,
)

def query_model(
        system_message,
        user_message,
        temperature = 0.0,
        max_length=400
        ):
    """Send one system/user exchange to the module-level `pipeline` and return its text.

    Args:
        system_message: content of the "System" chat message.
        user_message: user text; it is wrapped as
            "Question: " + user_message + " Correct answer:" before being sent.
        temperature: forwarded to the pipeline call as `temperature`.
        max_length: forwarded to the pipeline call as `max_new_tokens`.

    Returns:
        The `generated_text` field of the first returned sequence; the call
        passes `return_full_text=False`.
    """
    wrapped_question = "Question: " + user_message + " Correct answer:"
    chat = [
        {"role": "System", "content": system_message},
        {"role": "User", "content": wrapped_question},
    ]

    tok = pipeline.tokenizer
    # Render the chat as a single prompt string, ending with the generation prompt
    prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # Generation stops at the tokenizer's EOS token or at the "<|eot_id|>" token
    stop_ids = [tok.eos_token_id, tok.convert_tokens_to_ids("<|eot_id|>")]

    generated = pipeline(
        prompt,
        top_p=0.0,
        temperature=temperature,
        eos_token_id=stop_ids,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id,
    )
    return generated[0]['generated_text']

In [20]:
def preSynthGeneration(query, merged, candidate_answer, critique, sources):
    """Ask the chat model to reconcile a candidate answer with a suggestion.

    Args:
        query: the question text.
        merged: string holding a Python list literal with the options; parsed
            with `ast.literal_eval`.
        candidate_answer: first tentative answer (the caller passes the thesis answer).
        critique: the suggestion shown to the model.
        sources: context text, passed as `context` to both `format` calls.

    Returns:
        Whatever `query_model` returns, called with `max_length=1024`.

    NOTE(review): the `system_message` / `user_message` templates defined in this
    notebook have no `{context}` placeholder, so with them `sources` does not end
    up in the prompt — confirm this is intended.
    """
    options = ast.literal_eval(merged)
    system_prompt = system_message.format(context=sources)
    user_prompt = user_message.format(
        question=query,
        options=options,
        candidate_answer=candidate_answer,
        critique=critique,
        context=sources,
    )
    return query_model(system_prompt, user_prompt, max_length=1024)

In [50]:
# Pre-synthesis output for every question; the suggestion is the selected BART entry
pre_answers = [
    preSynthGeneration(question, possibilities[i], answers[i], bart_answers[i], sources[i])
    for i, question in enumerate(first_queries)
]



In [51]:
pre_answers[:10]

[' "Arthurs\' Magazines"',
 'The suggested response indicates that there was some disagreement about who had actually achieved greater success as measured against their peers within professional sports circles - but ultimately it appears clear from both sources presented here today, namely Wikipedia itself alongside other online resources such.',
 '"The wolf hound"',
 "'No'\n\nExplanation:\nPavel was more into mathematics while Levinn worked mainly with computer science so it can't really say that both did similar kind if works",
 'The new pornographer',
 "'New York City'\n\nThe two sources provide conflicting information about whether or not these addresses belong within San Franicsoo territory limits versus those belonging inside NYC boundaries",
 'The suggested response "Ford" indicates that it has been verified as true information about who came into existence earlier than whom',
 'The suggested response "Jane" refers specifically',
 'The two men were indeed famous film-makers who 

# Synthesis

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [52]:
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
Choose the most proper option between {options} that best matches with the suggestion. 

Question: {question}
Context: {critique}
Sources: {context}

Assistant:
"""
)

# Mapping step of the synthesis chain: picks the four prompt variables out of the input dict
augmentation = {
    field: itemgetter(field)
    for field in ("question", "options", "critique", "context")
}

# Pipe the mapping into the prompt template; the result is used through `.invoke(...)`
synthesis_chain = augmentation | prompt_template

## 🔺 Function to generate the synthesis given literally everything

In [53]:
def synthesisGeneration(query, prompt_template, merged, pre_answer, sources):
    """Generate the synthesis: let the guided model pick the final option.

    Args:
        query: the question text.
        prompt_template: not used inside this function (the module-level
            `synthesis_chain` renders the prompt); kept for existing callers.
        merged: string holding a Python list literal with the candidate options;
            parsed with `ast.literal_eval`.
        pre_answer: value inserted as `critique` into the prompt.
        sources: context text inserted into the prompt.

    Returns:
        `str(...)` of the guidance model state after the selection (presumably the
        cleaned prompt followed by the chosen option — the caller extracts the
        answer with `extract_answer`).
    """
    options = ast.literal_eval(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query,
                                               'options': options,
                                               'critique': pre_answer,
                                               'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    # Offer every parsed option (cleaned like the prompt), not just the first two,
    # so questions with more or fewer than two options are handled as well.
    choices = [clean_text(option) for option in options]
    ans = new_model + normal_string + select(choices)
    return str(ans)

In [73]:
# One fixed-wording sentence per gold answer, built around the cleaned answer text
def_answers = [
    f"the correct option is {clean_text(correct_answer)} since the other options is not mentioned in the context"
    for correct_answer in correct_answers
]

In [57]:
def extract_answer(text, marker="assistant:\n"):
    """Return whatever follows the first occurrence of `marker` in `text`.

    Args:
        text: the full generated string (prompt plus completion).
        marker: tag that precedes the answer. The default is the lower-case
            word "assistant:" followed by a line break (presumably because the
            synthesis prompt is lower-cased by `clean_text` before generation —
            confirm).

    Returns:
        The text after the marker with surrounding whitespace stripped, or the
        sentinel string "The correct answer could not be found." when the
        marker does not occur in `text`.
    """
    # Locate the marker that closes the prompt
    start_index = text.find(marker)
    if start_index == -1:
        return "The correct answer could not be found."
    # Everything after the marker is the model's answer
    return text[start_index + len(marker):].strip()

In [74]:
# Synthesis answer for every question; the selected BART entry is passed as the critique
syn_answers = [
    extract_answer(
        synthesisGeneration(question, prompt_template, possibilities[i], bart_answers[i], sources[i])
    )
    for i, question in enumerate(first_queries)
]

In [75]:
syn_answers[:10]

['arthurs magazine',
 'jonathan stark',
 'the wolfhounds',
 'yes',
 'yes',
 'new york city',
 'aleksander ford',
 'yes',
 'director',
 'the saimaa gesture']

# Dataset conversion and performances

In [76]:
# Column name -> values for the results table.
# The 'pre-synthesis' column (pre_answers) is deliberately not included.
df = dict(
    query=first_queries,
    correct=correct_answers,
    thesis=answers,
    antithesis=bart_answers,
    synthesis=syn_answers,
    context=sources,
)

In [77]:
import pandas as pd

# Turn the column dict into a DataFrame (rebinds `df` from dict to DataFrame)
df = pd.DataFrame(data=df)
# Show the first rows
df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis,context
0,"Which magazine was started first, Arthur's Mag...",Arthur's Magazine,Arthurs Magazine,[Arthur's Magazine was started before First fo...,arthurs magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,Jonathan Stark,Jonathan Stark,[Jonathan Stark won more Grand Slam titles tha...,jonathan stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole (the rock b...",The Wolfhounds,The Wolfhounds,[Hole (the rock band that Courtney Love was a ...,the wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,yes,"[Yes, Pavel Urysohn and Leonid Levin were know...",yes,Leonid Anatolievich Levin ( ; Russian: Леони́д...
4,Are both The New Pornographers and Kings of Le...,yes,yes,"[Yes, both The New Pornographers and Kings of ...",yes,Kings of Leon is an American rock band that fo...


In [89]:
# df.to_csv('baseline-true.csv')

In [78]:
# Helper that drops the surrounding square brackets and keeps only the content
def remove_brackets(s):
    """Strip leading and trailing '[', ']' and space characters from `s`."""
    edge_chars = "[] "
    return s.strip(edge_chars)

# Cleaning helper used to normalize answers before comparing them.
# NOTE: this definition replaces the earlier `clean_text` of the notebook; unlike
# that one it also removes apostrophes, quotes and hyphens and lower-cases the text.
def clean_text(text):
    """Return a normalized, lower-cased copy of `text`.

    Step 1 removes every character that is not a word character, whitespace or
    one of . , ! ? ' " - : ; ( ).  Step 2 removes apostrophes, double quotes and
    hyphens.  Step 3 lower-cases the result.
    """
    without_specials = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)
    without_quotes = re.sub(r"['\"-]", '', without_specials)
    return without_quotes.lower()

# Normalize the three answer columns with clean_text so they can be compared as plain strings
for column in ('correct', 'thesis', 'synthesis'):
    df[column] = df[column].apply(clean_text)

df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis,context
0,"Which magazine was started first, Arthur's Mag...",arthurs magazine,arthurs magazine,[Arthur's Magazine was started before First fo...,arthurs magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,jonathan stark,jonathan stark,[Jonathan Stark won more Grand Slam titles tha...,jonathan stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole (the rock b...",the wolfhounds,the wolfhounds,[Hole (the rock band that Courtney Love was a ...,the wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,yes,"[Yes, Pavel Urysohn and Leonid Levin were know...",yes,Leonid Anatolievich Levin ( ; Russian: Леони́д...
4,Are both The New Pornographers and Kings of Le...,yes,yes,"[Yes, both The New Pornographers and Kings of ...",yes,Kings of Leon is an American rock band that fo...


In [79]:
# Count how many thesis answers equal the gold answer and how many do not
which_ones = []  # one description per mismatching row
matches = 0

for index, row in df.iterrows():
    correct_answer = str(row['correct']).strip()
    thesis = str(row['thesis']).strip()

    if correct_answer != thesis:
        which_ones.append("thesis: {}, Correct: {}".format(thesis, correct_answer))
        continue
    matches += 1

# Every row is either a match or recorded in which_ones
non_matches = len(which_ones)

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 260
Number of non-matches: 92


In [80]:
# Count how many synthesis answers equal the gold answer and how many do not
which_ones_syn = []  # one description per mismatching row
matches = 0

for index, row in df.iterrows():
    correct_answer = str(row['correct']).strip()
    synthesis = str(row['synthesis']).strip()

    if correct_answer != synthesis:
        which_ones_syn.append("Synthesis: {}, Correct: {}".format(synthesis, correct_answer))
        continue
    matches += 1

# Every row is either a match or recorded in which_ones_syn
non_matches = len(which_ones_syn)

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 241
Number of non-matches: 111


In [81]:
# Write the results table (query, correct, thesis, antithesis, synthesis, context) to a CSV file
df.to_csv('bart-nli-naive-1.csv')