# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset
import ast

# Load the train split of the `saracandu/filtered_hotpotQA` dataset.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the dataset repo run locally — confirm it is needed.
dataset = load_dataset('saracandu/filtered_hotpotQA', split="train", trust_remote_code=True)
dataset

Dataset({
    features: ['question', 'options', 'answer', 'type', 'level', 'selected_passages'],
    num_rows: 352
})

In [201]:
# Filter predicate used to drop yes/no questions
def filter_function(example):
    """Tell whether an example should be kept.

    Args:
        example: mapping with an 'answer' entry.

    Returns:
        bool: False when the answer is exactly 'yes' or 'no', True otherwise.
    """
    gold = example['answer']
    is_yes_no = gold == 'yes' or gold == 'no'
    return not is_yes_no

# Apply the filter to the dataset; the result is bound to a new name, `dataset` itself is not reassigned
filtered_dataset = dataset.filter(filter_function)
filtered_dataset

Filter:   0%|          | 0/352 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'options', 'answer', 'type', 'level', 'selected_passages'],
    num_rows: 237
})

In [203]:
# Inspect one record. Note: this indexes the unfiltered `dataset`, not `filtered_dataset`.
dataset[10]

{'question': 'Who was inducted into the Rock and Roll Hall of Fame, David Lee Roth or Cia Berg?',
 'options': "['Cia Berg', 'David Lee Roth']",
 'answer': 'David Lee Roth',
 'type': 'comparison',
 'level': 'medium',
 'selected_passages': 'Cia Berg (born 2 December 1963), now known as Cia Soro, is a Swedish television presenter and singer. She was at one time the lead singer of the Swedish rock band Whale, who released the single "Hobo Humpin\' Slobo Babe". David Lee Roth (born October 10, 1954) is an American rock vocalist, musician, songwriter, actor, author, and former radio personality. In 2007, he was inducted into the Rock and Roll Hall of Fame.'}

# Model loading and dataset selection (for testing purposes)

## ▪️ Load the model:

In [None]:
# do not run this unless necessary!
# Calls huggingface_hub.login() with no arguments.
# NOTE(review): presumably prompts interactively for an access token — confirm before running unattended.

from huggingface_hub import login
login()

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

# Hub identifier of the checkpoint; reused below when loading the model weights
model_name="nvidia/Llama3-ChatQA-1.5-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# (stored as a string; resolved to the torch dtype via getattr below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Turn the dtype name into the actual torch dtype object (e.g. torch.float16)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the config object passed to from_pretrained below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# Purely informational: prints a hint when the device's major compute capability is >= 8;
# nothing in the configuration is changed by this check.
# NOTE(review): torch.cuda.get_device_capability() presumably needs a visible CUDA device — confirm before running on CPU-only hosts.
# NOTE(review): the printed hint talks about training, while this notebook only runs inference.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
# Load the causal LM for `model_name`, applying the quantization settings held in `bnb_config`
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from guidance import models, select

# Wrap the already-loaded model and tokenizer in a guidance model object
new_model = models.Transformers(model, tokenizer)

# Smoke test: append a prompt and let select() pick between the two listed strings
new_model + f'Do you want a joke or a poem? A ' + select(['joke', 'poem'])

## ▪️ Test with `guidance`: 

In [5]:
# Toy multiple-choice prompt used to check the guidance-based option selection
system_message = """You are a multiple-choice question answering assistant.
Choose which of the following options: a star, a planet, a galaxy is the object below.

Object: the sun
"""

# The answer is chosen among the three strings passed to select()
new_model + system_message + select(["a planet", "a galaxy", "a star"])

In [6]:
from langchain import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Prompt for the synthesis test: placeholders are {question}, {candidate_answer}, {critique} and {options}
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
You have a suggestion on which answer is the most appropriate, that is treated as context. Use the suggestion to choose the most proper option.
You also have an attempt of answer that you are suggested to neglect. 

Question: {question}
Attempt: {candidate_answer}
Context: {critique}

The most proper option between {options} is:
"""
)

# Picks the named entries out of the input dict before they reach the template.
# Note: "context" is extracted here although this template has no {context} placeholder.
augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

# Chain: extract fields, then render the prompt
synthesis_chain = augmentation | prompt_template 

In [7]:
# NOTE: no shuffling helper is defined in this cell; it only provides clean_text,
# which removes characters outside a small allowed set from a string
import re
import random

# Cleaning helper that removes invalid characters
def clean_text(text):
    """Return `text` with every disallowed character deleted.

    Kept: word characters, whitespace and the punctuation . , ! ? - : ; ( ).
    Anything else (apostrophes, quotes, dashes other than '-', ...) is
    removed without a replacement character.
    """
    disallowed_run = re.compile(r"[^\w\s.,!?\-:;()]+")
    return disallowed_run.sub('', text)

In [8]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    """Run the synthesis step on one example and return the constrained generation.

    Args:
        query: question text.
        prompt_template: unused; the prompt is rendered through the
            module-level `synthesis_chain`. Kept so existing calls still work.
        merged: list of option strings (this version does not parse a
            stringified list).
        candidate_answer: earlier answer attempt inserted in the prompt.
        critique: suggestion text inserted in the prompt.
        sources: passage text handed to the chain under the 'context' key.

    Returns:
        The object produced by `new_model + prompt + select(merged)`.
        Previously this value was computed and then discarded, so the function
        always returned None; callers that ignore the result are unaffected.
    """
    # Options are expected to be a real list here, so no ast.literal_eval is applied
    augmented_prompt = synthesis_chain.invoke({'question': query,
                                               'options': merged,
                                               'candidate_answer': candidate_answer,
                                               'critique': critique,
                                               'context': sources})

    # Strip characters outside the allowed set before feeding the prompt to the model
    normal_string = clean_text(augmented_prompt.text)
    return new_model + normal_string + select(merged)

In [9]:
# Manual check: the attempt is 'planet' while the critique argues for 'star'
synthesisGeneration('what is the sun', prompt_template, ['star', 'planet'], 'planet', 
                    'the correct answer is: a star since bot an asteroid and a planet are inadequate and not supported by the context', 
                    'The Sun is the star at the center of the Solar System. It is a massive, nearly perfect sphere of hot plasma, heated to incandescence by nuclear fusion reactions in its core, radiating the energy from its surface mainly as visible light and infrared radiation with 10% at ultraviolet energies.')

In [10]:
# Manual check: the attempt is 'First for Women' while the critique names "Arthur's Magazine"
source = """ Arthur\'s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. 
Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others. 
In May 1846 it was merged into "Godey\'s Lady\'s Book. First for Women is a woman\'s magazine published by Bauer Media Group in the USA.  
The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey. In 2011 the circulation of the magazine was 1,310,696 copies.
    """

synthesisGeneration('Which magazine was started first Arthur\'s Magazine or First for Women?', 
                    prompt_template, ['First for Women', 'Arthur\'s Magazine'], 'First for Women', 
                    'the correct answer is Arthur\'s Magazine and the context agrees', 
                    source)

In [11]:
# Manual check: the attempt is the distractor 'Sammardenchia' while the critique says 'Delhi is correct'
source = """ "The Oberoi family is an Indian family that is famous for its involvement in hotels, namely through The Oberoi Group. 
The Oberoi Group is a hotel company with its head office in Delhi. 
Founded in 1934, the company owns and/or operates 30+ luxury hotels and two river cruise ships in six countries, primarily under its Oberoi Hotels & Resorts and Trident Hotels brands.".
    """

synthesisGeneration('The Oberoi family is part of a hotel company that has a head office in what city?', 
                    prompt_template, ['Delhi', 'Sammardenchia'], 'Sammardenchia', 
                    'Delhi is correct', 
                    source)

In [12]:
# Manual check: the attempt is 'Obama' while the critique names 'Nixon'
source = """ Allison Beth Allie Goertz (born March 2, 1991) is an American musician. 
Goertz is known for her satirical songs based on various pop culture topics. Her videos are posted on YouTube under the name of Cossbysweater. 
Subjects of her songs have included the film The Room, the character Milhouse from the television show The Simpsons, and the game Dungeons & Dragons. 
Her style has been compared to that of Bo Burnham. In December 2015, Goertz released a concept album based on the Adult Swim series Rick and Morty, Sad Dance Songs, 
with the album\'s cover emulating the animation and logo of the series.  The album was made possible through Kickstarter. 
She is co-host of Everything's Coming Up Podcast, a Simpsons-focused podcast along with Julia Prescott. 
Milhouse Mussolini van Houten is a fictional character featured in the animated television series The Simpsons, voiced by Pamela Hayden, and created by Matt Groening 
who named the character after President Richard Nixon\'s middle name. Later in the series, it is revealed that Milhouse\'s middle name is Mussolini. "
    """

synthesisGeneration('Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?', 
                    prompt_template, ['Nixon', 'Obama'], 'Obama', 
                    'the correct answer is Nixon', 
                    source)

In [14]:
# Manual check with a contradicting critique: the attempt is 'director' but the critique names 'writer'
source = """ 
Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director best known for the movie Rebel Without a Cause. 
Elia Kazan (born Elias Kazantzoglou September 7, 1909 – September 28, 2003) was a Greek-American director, producer, writer and actor, described by The New York Times as one of the most honored and influential directors in Broadway and Hollywood history.
    """

synthesisGeneration('What profession does Nicholas Ray and Elia Kazan have in common?', 
                    prompt_template, ['director', 'writer'], 'director', 
                    'the correct answer is writer', 
                    source)

## ▪️ Select a subset of the true dataset as a test

In [204]:
# NOTE: no shuffling helper is defined in this cell; it only re-declares clean_text,
# which removes characters outside a small allowed set from a string
import re
import random

# Cleaning helper that removes invalid characters
# (same pattern as the clean_text declared in an earlier cell)
def clean_text(text):
    """Delete from `text` every run of characters outside the allowed set.

    Allowed: word characters, whitespace and . , ! ? - : ; ( ).
    """
    kept_pieces = re.split(r"[^\w\s.,!?\-:;()]+", text)
    return ''.join(kept_pieces)

In [205]:
# Number of examples to evaluate; 237 equals the row count displayed for filtered_dataset,
# so the slices and loops below cover the whole filtered set
N_examples = 237

In [206]:
# select a subset of the queries, just for test:
first_queries = filtered_dataset['question'][:N_examples]

# same for the gold answers and the candidate options
# (each 'options' entry is the string form of a list, parsed later with ast.literal_eval):
correct_answers = filtered_dataset['answer'][:N_examples]
possibilities = filtered_dataset['options'][:N_examples]
# and for the sources (supporting passages used as context):
sources = filtered_dataset['selected_passages'][:N_examples]

# Thesis

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [207]:
# prompt template definition
# requires question, options (a string containing the possible options) and the context as input variables!
# NOTE: rebinds `prompt_template`, replacing the template defined for the earlier synthesis test.
# extract_answer() relies on the "Assistant:" line followed by a newline that ends this template.

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    System: This is a chat between a user and an AI assistant. The assistant gives helpful, detailed, and polite answers to the user’s questions based on the context. 
    {context}
    User: {question}
    Possible options: {options}.
    Assistant:
"""
)

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [208]:
# LLM chain definition
from operator import itemgetter

# Picks the three fields the thesis template needs out of the input dict
# NOTE: rebinds `augmentation`, which an earlier cell defined with more keys
augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"),
                "context": itemgetter("context"), }

# Chain: extract fields, then render the thesis prompt (no model call happens here)
thesis_chain = augmentation | prompt_template 

In [209]:
def thesisGeneration(query, merged, sources):
    """Generate the thesis (first answer attempt) for one question.

    Args:
        query: question text.
        merged: candidate options, either a list/tuple of strings or the
            string form of such a list (as stored in the dataset's
            'options' column).
        sources: passage text inserted as the prompt context.

    Returns:
        str: string form of the guidance result; extract_answer() reads the
        chosen option from the text after the "Assistant:" line.
    """
    # Accept both the stringified list from the dataset and an already-parsed list
    if isinstance(merged, str):
        merged = ast.literal_eval(merged)
    augmented_prompt = thesis_chain.invoke({'question': query, 'options': merged, 'context': sources})
    normal_string = clean_text(augmented_prompt.text)
    # Offer every option, cleaned like the prompt. The original passed only
    # merged[0] and merged[1], which dropped further options and raised
    # IndexError on a single one.
    choices = [clean_text(option) for option in merged]
    ans = new_model + normal_string + select(choices)
    return str(ans)

In [210]:
def extract_answer(text):
    """Return what follows the first "Assistant:" line marker in `text`.

    The marker is the literal "Assistant:" immediately followed by a newline.
    The text after its first occurrence is returned with surrounding
    whitespace stripped. If the marker is absent, a fixed fallback message is
    returned instead.
    """
    marker = "Assistant:\n"
    _, found, tail = text.partition(marker)
    if not found:
        return "The correct answer could not be found."
    return tail.strip()

## 🔹 Test: how well does the thesis alone perform?

In [211]:
# Thesis pass: one constrained generation per example, keeping only the chosen option.
# Results are appended one at a time, so an interrupted run keeps what was computed so far.
answers = []
for i in range(N_examples):
    thesis_output = thesisGeneration(first_queries[i], possibilities[i], sources[i])
    answers.append(extract_answer(thesis_output))

In [212]:
answers[:5]

['First for Women',
 'Jonathan Stark',
 'The Wolfhounds',
 'New York City',
 'Aleksander Ford']

In [16]:
correct_answers[:5]

["Arthur's Magazine", 'Jonathan Stark', 'The Wolfhounds', 'no', 'no']

# Antithesis

In [213]:
import transformers

# Text-generation pipeline used for the free-form antithesis step.
# NOTE: this rebinds the name `pipeline`, shadowing the `pipeline` function imported from transformers earlier.
# NOTE(review): do_sample=False requests non-sampling decoding; temperature and top_p are
#   presumably ignored in that mode — confirm against the transformers generation docs.
# NOTE(review): repetition_penalty=1.5 is a strong setting; the antithesis outputs shown in this
#   notebook contain misspelled names and degenerate token runs — check whether it is the cause.
pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

In [214]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown

def query_model(
        system_message,
        user_message,
        temperature = 0.0,
        max_length=1024
        ):
    """Send a system/user message pair to the module-level `pipeline`.

    Args:
        system_message: text for the system turn.
        user_message: text for the user turn; it is wrapped as
            "Question: <user_message> Correct answer:" before being sent.
        temperature: 0.0 (default) keeps deterministic, non-sampling decoding.
            A value > 0 now enables sampling at that temperature. Previously
            do_sample=False was hard-coded, so this argument had no effect.
        max_length: upper bound on newly generated tokens (forwarded as
            `max_new_tokens`, not as a total-length limit).

    Returns:
        str: the wrapped user message, a space, then the generated text.
        extract_answer_ant() cuts the result at "Correct answer:".
    """
    user_message = "Question: " + user_message + " Correct answer:"
    # NOTE(review): role names are capitalised ("System"/"User"); confirm this is
    # what the tokenizer's chat template expects.
    messages = [
        {"role": "System", "content": system_message},
        {"role": "User", "content": user_message},
        ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
        )
    # Stop on either the regular EOS token or the "<|eot_id|>" token
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # Sampling is enabled only when a positive temperature is requested
    do_sample = temperature is not None and temperature > 0.0
    generation_kwargs = {
        "do_sample": do_sample,
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": pipeline.model.config.eos_token_id,
    }
    if do_sample:
        # top_p=1.0 switches nucleus truncation off so the temperature alone shapes
        # the sampling; a top_p of 0.0 would cancel the effect of the temperature.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = 1.0

    sequences = pipeline(prompt, **generation_kwargs)

    answer = sequences[0]['generated_text']

    return user_message + " " + answer

In [215]:
# Antithesis prompt pieces, filled with str.format() in antithesisGeneration.
# NOTE: rebinds `system_message`, which an earlier demo cell used for a different prompt.
# System part: the only placeholder is {context}.
system_message = """
    This is a chat between a user and an AI assistant.
    The assistant is asked to answer a question given a certain number of candidate options, an attempt of answer
    and the context: {context}
    The assistant has to check which option is more grounded on the context given the question.
"""


# User part: placeholders are {question}, {candidate_answer} and {options}.
# query_model() additionally wraps this text with a leading "Question: " and a trailing " Correct answer:".
user_message = """
    Question: {question}?
    It also has an attempt of answer, that could be right or wrong: {candidate_answer}
    Which of the candidate answers {options} is the most proper answer for the question? Why? 
    Think step by step but choose only one of the options: {options}. You are forbidden to say that it depends.
"""

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [216]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer, sources):
    """Produce the antithesis (critique of the thesis) for one example.

    Args:
        query: question text.
        prompt_template: not used by this function.
        merged: string form of the list of options; parsed with ast.literal_eval.
        candidate_answer: the thesis answer being critiqued.
        sources: passage text inserted into the system message.

    Returns:
        The string returned by query_model (generation capped at 400 new tokens).
    """
    options = ast.literal_eval(merged)
    system_prompt = system_message.format(context=sources)
    user_prompt = user_message.format(
        question=query,
        options=options,
        candidate_answer=candidate_answer,
        context=sources,
    )
    return query_model(system_prompt, user_prompt, max_length=400)

In [217]:
def extract_answer_ant(text):
    """Return what follows the first "Correct answer:" marker in `text`.

    The text after the first occurrence of the marker is returned with
    surrounding whitespace stripped. If the marker is absent, a fixed
    fallback message is returned instead.
    """
    pieces = text.split("Correct answer:", 1)
    if len(pieces) == 2:
        return pieces[1].strip()
    return "The correct answer could not be found."

In [218]:
# Antithesis pass: critique each thesis answer and keep the text after "Correct answer:".
ant_answers = []
for i in range(N_examples):
    antithesis_output = antithesisGeneration(
        first_queries[i], prompt_template, possibilities[i], answers[i], sources[i])
    ant_answers.append(extract_answer_ant(antithesis_output))



In [222]:
ant_answers[:10]

['["Which" means compare]',
 "'Jonathon stark'\n\nThe correct response can vary depending upon whether you consider all grand slam events including mixed double as well",
 '"The wolf hound"',
 'The correct response would have been "Yes", as both buildings mentioned - namely **"the Empire State Bldg."** [sic] at *20 West Street*; Manhattan NY USA / NYC aka N.Y.C., U.S.A.; North America/USA/NY/City Island/Empire state bdlg./Emp.St.Bdg.-Esb-ESB-Esblngbldrgrd-Building-grdyrd-buldr-gryrdbuldrgyrbdlgybrlg-yrlbg-dlbyfjglk-jklhgfdsafghjk-lkjhgfdsa-fdsa-hfgjh-kjlhfdfsdasfsdhfkjsa-, ESB-NYC-Manhattan-Wall St-New Jersey-Central park-Times Square-Midtown South-West Side-East River-Hudson river-South Ferry-Chelsea-Greenwich Village-Lower East side-Uptown-AlphabetCity-RandomIsland-StatenIslnd-FiveBurroughs-TheBronx-Qnsboro-KingsCoBrooklynManhtnRichmondStgeNYC5Bs-islands-of-the-city-nyc-metro-area-new york metro area-east coast-us east-coast-easternecoaste-wind-sunrise-settlesunset-westwind-almanac-t

In [154]:
first_queries[:10]

["Which magazine was started first Arthur's Magazine or First for Women?",
 'Which tennis player won more Grand Slam titles, Henri Leconte or Jonathan Stark?',
 'Which band was founded first, Hole, the rock band that Courtney Love was a frontwoman of, or The Wolfhounds?',
 'Were Pavel Urysohn and Leonid Levin known for the same type of work?',
 'Are both The New Pornographers and Kings of Leon American rock bands?',
 '750 7th Avenue and 101 Park Avenue, are located in which city?',
 'Who was born first, Pablo Trapero or Aleksander Ford?',
 "Are Jane and First for Women both women's magazines?",
 'What profession does Nicholas Ray and Elia Kazan have in common?',
 'Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?']

In [55]:
correct_answers[:5]

["Arthur's Magazine", 'Jonathan Stark', 'The Wolfhounds', 'no', 'no']

# Synthesis

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [219]:
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
You have a suggestion on which answer is the most appropriate that you are strongly suggested to follow.
Choose the most proper option between {options} that best matches with the suggestion. 

Question: {question}
Context: {critique}
Sources: {context}

Assistant:
"""
)

augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

synthesis_chain = augmentation | prompt_template 

## 🔺 Function to generate the synthesis given literally everything

In [220]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    """Generate the synthesis (final answer) for one example.

    Args:
        query: question text.
        prompt_template: unused; the prompt is rendered through the
            module-level `synthesis_chain`. Kept for call compatibility.
        merged: candidate options, either a list/tuple of strings or the
            string form of such a list (as stored in the dataset's
            'options' column).
        candidate_answer: the thesis answer, passed to the chain.
        critique: the antithesis text, rendered under "Context:".
        sources: passage text, rendered under "Sources:".

    Returns:
        str: string form of the guidance result; extract_answer() reads the
        chosen option from the text after the "Assistant:" line.
    """
    # Accept both the stringified list from the dataset and an already-parsed list
    if isinstance(merged, str):
        merged = ast.literal_eval(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query,
                                               'options': merged,
                                               'candidate_answer': candidate_answer,
                                               'critique': critique,
                                               'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    # Offer every option, cleaned like the prompt. The original passed only
    # merged[0] and merged[1], which dropped further options and raised
    # IndexError on a single one.
    choices = [clean_text(option) for option in merged]
    ans = new_model + normal_string + select(choices)
    return str(ans)

In [21]:
# Hand-written critiques built from the gold answers (one per example).
# Fix: a space now separates the answer from "since"; previously the two were fused
# (e.g. "...Arthurs Magazinesince the other...").
def_answers = ["the correct option is " + clean_text(correct_answer)
               + " since the other options is not mentioned in the context" for correct_answer in correct_answers]

In [221]:
# Synthesis pass: combine question, thesis, antithesis and sources, then keep the chosen option.
syn_answers = []
for i in range(N_examples):
    synthesis_output = synthesisGeneration(
        first_queries[i], prompt_template, possibilities[i],
        answers[i], ant_answers[i], sources[i])
    syn_answers.append(extract_answer(synthesis_output))

In [192]:
syn_answers[:10]

['Arthurs Magazine',
 'Jonathan Stark',
 'The Wolfhounds',
 'no',
 'yes',
 'New York City',
 'Aleksander Ford',
 'no',
 'director',
 'The Saimaa Gesture']

In [26]:
correct_answers[:10]

["Arthur's Magazine",
 'Jonathan Stark',
 'The Wolfhounds',
 'no',
 'no',
 'New York City',
 'Aleksander Ford',
 'yes',
 'director',
 'The Saimaa Gesture']

In [27]:
answers[:10]

['First for Women',
 'Jonathan Stark',
 'The Wolfhounds',
 'yes',
 'yes',
 'New York City',
 'Aleksander Ford',
 'yes',
 'director',
 'The Saimaa Gesture']

In [223]:
# Collect the per-example results column by column; turned into a DataFrame in the next cell.
# NOTE: `df` is a plain dict at this point and is rebound to a DataFrame afterwards.
df = {
    'query': first_queries,
    'correct': correct_answers,
    'thesis': answers,
    'antithesis': ant_answers,
    'synthesis': syn_answers,
    'context': sources
} 

In [224]:
import pandas as pd

# Rebind `df` from the dict of columns to a DataFrame.
# NOTE: re-running this cell alone passes the existing DataFrame back into pd.DataFrame.
df = pd.DataFrame(df)
df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis,context
0,Which magazine was started first Arthur's Maga...,Arthur's Magazine,First for Women,"[""Which"" means compare]",Arthurs Magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,Jonathan Stark,Jonathan Stark,'Jonathon stark'\n\nThe correct response can v...,Jonathan Stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole, the rock b...",The Wolfhounds,The Wolfhounds,"""The wolf hound""",The Wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,"750 7th Avenue and 101 Park Avenue, are locate...",New York City,New York City,"The correct response would have been ""Yes"", as...",New York City,101 Park Avenue is a 629 ft tall skyscraper in...
4,"Who was born first, Pablo Trapero or Aleksande...",Aleksander Ford,Aleksander Ford,'Alexsader ford'\nThe correct birth date infor...,Aleksander Ford,Pablo Trapero (Born 4 October 1971) is an Arge...


In [233]:
df['query'][3]

'750 7th Avenue and 101 Park Avenue, are located in which city?'

In [195]:
# Save the results table to 'test-8.csv' (relative path, so it lands in the current working directory).
# NOTE(review): no index argument is passed, so the row index is presumably written as an unnamed first column — confirm.
df.to_csv('test-8.csv')

In [225]:
# Helper that removes square brackets and keeps only the content
def remove_brackets(s):
    """Strip leading and trailing '[', ']' and space characters from `s`.

    Only the two ends are trimmed; brackets inside the string are kept.
    """
    trimmed_chars = "[] "
    return s.lstrip(trimmed_chars).rstrip(trimmed_chars)

# Cleaning helper used to normalise answers before comparing them
# (unlike the earlier clean_text definitions, this one also lower-cases)
def clean_text(text):
    """Normalise `text` for exact-match comparison.

    Steps, in order: delete special characters outside the allowed set,
    delete apostrophes, double quotes and hyphens, then lower-case.
    """
    without_specials = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)
    without_quotes = re.sub(r"['\"-]", '', without_specials)
    return without_quotes.lower()

# Apply the cleaning function to the 'correct', 'thesis' and 'synthesis' columns.
# NOTE: this overwrites the columns in place, so the original casing and punctuation are lost in `df`;
# the 'antithesis' column is left as generated.
df['correct'] = df['correct'].apply(clean_text)
df['thesis'] = df['thesis'].apply(clean_text)
df['synthesis'] = df['synthesis'].apply(clean_text)

df.head()

Unnamed: 0,query,correct,thesis,antithesis,synthesis,context
0,Which magazine was started first Arthur's Maga...,arthurs magazine,first for women,"[""Which"" means compare]",arthurs magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,jonathan stark,jonathan stark,'Jonathon stark'\n\nThe correct response can v...,jonathan stark,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole, the rock b...",the wolfhounds,the wolfhounds,"""The wolf hound""",the wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,"750 7th Avenue and 101 Park Avenue, are locate...",new york city,new york city,"The correct response would have been ""Yes"", as...",new york city,101 Park Avenue is a 629 ft tall skyscraper in...
4,"Who was born first, Pablo Trapero or Aleksande...",aleksander ford,aleksander ford,'Alexsader ford'\nThe correct birth date infor...,aleksander ford,Pablo Trapero (Born 4 October 1971) is an Arge...


In [226]:
# Count how many rows match and how many do not: thesis answer vs. gold answer
# (exact string equality after str() conversion and whitespace stripping)
matches = sum(
    str(gold).strip() == str(predicted).strip()
    for gold, predicted in zip(df['correct'], df['thesis'])
)
non_matches = len(df) - matches

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 180
Number of non-matches: 57


In [228]:
# Count how many rows match and how many do not: synthesis answer vs. gold answer
# (exact string equality after str() conversion and whitespace stripping)
matches = sum(
    str(gold).strip() == str(predicted).strip()
    for gold, predicted in zip(df['correct'], df['synthesis'])
)
non_matches = len(df) - matches

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 164
Number of non-matches: 73
