# Dataset, documents, FAISS; retriever

## üîπ Load the dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset('saracandu/filtered_hotpotQA', split="train", trust_remote_code=True)
dataset

Dataset({
    features: ['question', 'options', 'answer', 'type', 'level', 'selected_passages'],
    num_rows: 352
})

In [2]:
dataset[10]

{'question': 'Who was inducted into the Rock and Roll Hall of Fame, David Lee Roth or Cia Berg?',
 'options': "['Cia Berg', 'David Lee Roth']",
 'answer': 'David Lee Roth',
 'type': 'comparison',
 'level': 'medium',
 'selected_passages': 'Cia Berg (born 2 December 1963), now known as Cia Soro, is a Swedish television presenter and singer. She was at one time the lead singer of the Swedish rock band Whale, who released the single "Hobo Humpin\' Slobo Babe". David Lee Roth (born October 10, 1954) is an American rock vocalist, musician, songwriter, actor, author, and former radio personality. In 2007, he was inducted into the Rock and Roll Hall of Fame.'}

## üîπ Select a subset of the true dataset as a test

In [None]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text(text):
    return re.sub(r"[^\w\s.,!?\-:;()]+", '', text)

In [67]:
N_examples = 200

# select a subset of the queries, just for test:
first_queries = dataset['question'][:N_examples]

# same for correct answers and distractors:
correct_answers = dataset['answer'][:N_examples]
possibilities = dataset['options'][:N_examples]
# and for the sources:
sources = dataset['selected_passages'][:N_examples]

# Model loading and dataset selection (for testing purposes)

## ‚ñ™Ô∏è Upload the model: 

In [None]:
# do not run this unless necessary!

# from huggingface_hub import login
# login()

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

model_name="nvidia/Llama3-ChatQA-1.5-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## ‚ñ™Ô∏è Create a model that allow to work with `guidance`

In [3]:
from guidance import models, select
new_model = models.Transformers(model, tokenizer, temperature=0.0)

  backends.update(_get_backends("networkx.backends"))

{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '

' }}{%- else -%}Assistant: {{ message['content'].strip() + '

' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %} was unable to be loaded directly into guidance.
                        Defaulting to the ChatML format which may not be optimal for the selected model. 
                        For best results, create and pass in a `guidance.ChatTemplate` subclass for your model.


# Thesis

## üîπ Function that generates the output given the prompt, the question and the set of options

In [68]:
import ast

def thesisGeneration(query, options, sources):
    
    options = ast.literal_eval(options)
    first_option = clean_text(options[0])
    second_option = clean_text(options[1])
    
    ans = new_model + f'''\
    Question: {query}. Choose to either {options} given the following supporting
    Context: {sources}
    Choice: {select([first_option, second_option], name="choice")}'''
    
    
    return str(ans)

In [69]:
def extract_answer(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("Choice:")

    
    # Se l'indice √® stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("Choice:")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

## üîπ Test: how well the thesis alone is able to perform?

In [70]:
answers = []
for i in range(N_examples):
    answers.append(extract_answer(thesisGeneration(first_queries[i], possibilities[i], sources[i])))

In [16]:
answers[:5]

['Arthurs Magazine', 'Henri Leconte', 'The Wolfhounds', 'no', 'yes']

# Antithesis

In [71]:
import transformers

pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

In [72]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown

def query_model(
        system_message,
        user_message,
        temperature = 0.0,
        max_length=1024
        ):

    user_message = "Question: " + user_message + " Correct answer:"
    messages = [
        {"role": "System", "content": system_message},
        {"role": "User", "content": user_message},
        ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
        )
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    sequences = pipeline(
        prompt,
        do_sample=False,
        top_p=0.0,
        temperature=temperature,
        #num_return_sequences=1,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id
    )

    answer = sequences[0]['generated_text']

    return answer  

In [73]:
# currently tested:
system_message = """
    You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
    You are also provided with a candidate answer and the context. 
    Your goal is to decree which is the correct answer to the question by explaining why you think so. Use the context to ground your statements.
"""


user_message = """
    Question: {question}?
    Candidate answer: {candidate_answer}
    Context: {context}
    Which of the following options: {options} is the most proper answer for the question? Why? 
    Think step by step but choose only one of the options: {options} by explicitly reporting which one.
"""

In [84]:
system_message = """
    You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
    You are also provided with a candidate answer that you can check given the context.
    Your goal is to decree which is the correct answer to the question by explaining why you think so. Use the context to ground your statements.
"""


user_message = """
    Question: {question}?
    Candidate answer: {candidate_answer}
    Context: {context}
    Which of the following options: {options} is the most proper answer for the question? Why? 
    Think step by step but choose only one of the options: {options} by explicitly reporting which one.
"""

In [19]:
system_message = """
    You are an AI assistant asked to check whether or not a given question was asked correctly given an attempt of answer. 
    For each question there is a set of options from which the answer is taken.
    You can you give suggestions and advices on whether the given answer is correct or not. 
    You can use the context to ground your advice.
"""


user_message = """
    Question: {question}
    Possible options: {options}
    Attempt of answer: {candidate_answer}
    Context: {context}
"""

## üî∏ Function to generate the antithesis given the question, the thesis, the context and the options

In [74]:
def antithesisGeneration(query, merged, candidate_answer, sources):
    merged = ast.literal_eval(merged)
    second_answer = query_model(system_message.format(context = sources),
    user_message.format(question=query, options = merged, candidate_answer = candidate_answer, context = sources,), max_length=1024)
    return second_answer

In [75]:
ant_answers = []
for i in range(N_examples):
    ant_answers.append(antithesisGeneration(first_queries[i], possibilities[i], answers[i], sources[i]))



In [76]:
ant_answers[:10]

['"Arhutrs\' Magzine"\nThe text clearly states that this publication came out earlier than its competitor did',
 'The option "Lecont" would have been better if it had included that information about him winning three grand slam tournaments compared against Starks single victory',
 '"The wolf hound"',
 'The two men were not related or even from similar backgrounds as they lived at different times; therefore it would make sense that their interests did differ somewhat too',
 "The new pornographer isn't american",
 'The two locations mentioned belong to "NEW YORK CITY" as per information available at https://en.wikipedia.org/wiki/100_Park_Avenue',
 "'Alexsnder ford'\nHe has been alive longer than pablotrapera",
 'The reference text states that ‚ÄúFirst For Woman‚Äùis indeed as it claims,a publication aimed at females.The other option,‚ÄùNo‚Äù,implies otherwise.It‚Äôs important therefore,to consider carefully whether or not this statement accurately reflects reality.In light Of all these p

# Synthesis

In [77]:
def_answers = ["the correct option is " + clean_text(correct_answer) for correct_answer in correct_answers]

## üî∫ OPTION 1

In [107]:
from langchain import PromptTemplate
from operator import itemgetter

prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
You have a suggestion on which answer is the most appropriate that you are strongly suggested to follow.
Choose the most proper option between {options} that best matches with the suggestion. You also have sources reported in case you need them. 

Question: {question}
Context: The suggestion that is generated when {candidate_answer} is considered the correct answer is {critique}
Sources: {context}

Assistant:
"""
)

augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

synthesis_chain = augmentation | prompt_template 

In [108]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    merged = ast.literal_eval(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query, 
                                            'options': merged,
                                            'candidate_answer': candidate_answer,
                                            'critique': critique,
                                            'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    ans = new_model + normal_string + select([clean_text(merged[0]), clean_text(merged[1])])
    return str(ans)

In [113]:
def extract_answer(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("assistant:\n")

    
    # Se l'indice √® stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("assistant:\n")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

In [114]:
syn_answers = []
for i in range(N_examples):
    syn_answers.append(extract_answer(
        synthesisGeneration(
            first_queries[i], prompt_template, possibilities[i], answers[i], 
            def_answers[i], sources[i])))

In [115]:
syn_answers[:10]

['arthurs magazine',
 'henri leconte',
 'the wolfhounds',
 'no',
 'yes',
 'new york city',
 'aleksander ford',
 'yes',
 'director',
 'the saimaa gesture']

## üî∫ OPTION 2

In [98]:
import ast

def synthesisGeneration2(query, options, candidate_answer, critique, sources):    
    
    options = ast.literal_eval(options)
    first_option = clean_text(options[0])
    second_option = clean_text(options[1])
    
    ans = new_model + f'''\
    Question: {query}. Choose the most proper option between {options} that best matches with the suggestion.
    You have a suggestion on which answer is the most appropriate that you are strongly suggested to follow.
    Context: The suggestion is {critique}

    You also have sources reported in case you need them. 
    Sources: {sources}
    
    Choice: {select([first_option, second_option], name="choice")}
'''
    
    return str(ans)

In [99]:
def extract_answer2(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("Choice:")

    
    # Se l'indice √® stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("Choice:")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

In [100]:
syn_answers2 = []
for i in range(N_examples):
    syn_answers2.append(extract_answer2(
        synthesisGeneration2(
            first_queries[i], possibilities[i], answers[i], 
            def_answers[i], sources[i])))

In [86]:
syn_answers2[:10]

['Arthurs Magazine',
 'Henri Leconte',
 'The Wolfhounds',
 'no',
 'no',
 'New York City',
 'Aleksander Ford',
 'yes',
 'director',
 'The Saimaa Gesture']

## üî∫ Function to generate the synthesis given literally everything

In [116]:
df = {
    'query': first_queries,
    'correct': correct_answers,
    'thesis': answers,
    'antithesis': ant_answers,
    'synthesis-1': syn_answers,
    'synthesis-2': syn_answers2,
    'context': sources
} 

In [117]:
import pandas as pd

df = pd.DataFrame(df)
df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis-1,synthesis-2,context
0,Which magazine was started first Arthur's Maga...,Arthur's Magazine,Arthurs Magazine,"""Arhutrs' Magzine""\nThe text clearly states th...",arthurs magazine,arthurs magazine,Arthur's Magazine (1844‚Äì1846) was an American ...
1,Which tennis player won more Grand Slam titles...,Jonathan Stark,Henri Leconte,"The option ""Lecont"" would have been better if ...",henri leconte,jonathan stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole, the rock b...",The Wolfhounds,The Wolfhounds,"""The wolf hound""",the wolfhounds,courtney love,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,no,The two men were not related or even from simi...,no,no,Leonid Anatolievich Levin ( ; Russian: –õ–µ–æ–Ω–∏ÃÅ–¥...
4,Are both The New Pornographers and Kings of Le...,no,yes,The new pornographer isn't american,yes,no,Kings of Leon is an American rock band that fo...


In [None]:
df['query'][3]

In [48]:
df.to_csv('baseline-2.csv')

In [118]:
# Funzione per rimuovere le quadre e ottenere solo il contenuto
def remove_brackets(s):
    return s.strip("[] ")

# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text(text):
    text = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)  # Rimuove i caratteri speciali
    text = re.sub(r"['\"-]", '', text)  # Rimuove apostrofi, virgolette e trattini
    text = text.lower()  # Converte in minuscolo
    return text

# Applica la funzione alla colonna 'correct answer'
df['correct'] = df['correct'].apply(clean_text)
df['thesis'] = df['thesis'].apply(clean_text)
df['synthesis-1'] = df['synthesis-1'].apply(clean_text)
df['synthesis-2'] = df['synthesis-2'].apply(clean_text)

df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis-1,synthesis-2,context
0,Which magazine was started first Arthur's Maga...,arthurs magazine,arthurs magazine,"""Arhutrs' Magzine""\nThe text clearly states th...",arthurs magazine,arthurs magazine,Arthur's Magazine (1844‚Äì1846) was an American ...
1,Which tennis player won more Grand Slam titles...,jonathan stark,henri leconte,"The option ""Lecont"" would have been better if ...",henri leconte,jonathan stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole, the rock b...",the wolfhounds,the wolfhounds,"""The wolf hound""",the wolfhounds,courtney love,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,no,The two men were not related or even from simi...,no,no,Leonid Anatolievich Levin ( ; Russian: –õ–µ–æ–Ω–∏ÃÅ–¥...
4,Are both The New Pornographers and Kings of Le...,no,yes,The new pornographer isn't american,yes,no,Kings of Leon is an American rock band that fo...


In [52]:
df['context'][0]

'Arthur\'s Magazine (1844‚Äì1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others. In May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA. The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey. In 2011 the circulation of the magazine was 1,310,696 copies.'

In [119]:
# Conta quante righe combaciano e quante no
matches = 0
non_matches = 0
which_ones = []

for index, row in df.iterrows():
    correct_answer = str(row['correct']).strip()
    thesis = str(row['thesis']).strip()
    
    if correct_answer == thesis:
        matches += 1
    else:
        non_matches += 1
        which_ones.append("thesis: {}, Correct: {}".format(thesis, correct_answer))

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 154
Number of non-matches: 46


In [120]:
# Conta quante righe combaciano e quante no
matches = 0
non_matches = 0
which_ones_syn = []

for index, row in df.iterrows():
    correct_answer = str(row['correct']).strip()
    thesis = str(row['synthesis-1']).strip()
    
    if correct_answer == thesis:
        matches += 1
    else:
        # print("Synthesis: {}, Correct: {}".format(thesis, correct_answer))
        non_matches += 1
        which_ones_syn.append("Synthesis: {}, Correct: {}".format(thesis, correct_answer))

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 144
Number of non-matches: 56


In [121]:
# Conta quante righe combaciano e quante no
matches = 0
non_matches = 0
which_ones_syn = []

for index, row in df.iterrows():
    correct_answer = str(row['correct']).strip()
    thesis = str(row['synthesis-2']).strip()
    
    if correct_answer == thesis:
        matches += 1
    else:
        # print("Synthesis: {}, Correct: {}".format(thesis, correct_answer))
        non_matches += 1
        which_ones_syn.append("Synthesis: {}, Correct: {}".format(thesis, correct_answer))

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 153
Number of non-matches: 47


In [54]:
filtered_dataset = df.loc[~df['correct'].isin(['yes', 'no'])]

In [34]:
180/(180+57)*100

75.9493670886076

In [35]:
171/(171+66)*100

72.15189873417721