# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset
import ast

# Load the "train" split of the 'saracandu/msmarco_modified' dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository run its own loading code — confirm the source is trusted.
dataset = load_dataset('saracandu/msmarco_modified', split="train", trust_remote_code=True)
# Bare expression: shows the dataset summary as the cell output.
dataset

Dataset({
    features: ['Unnamed: 0', 'answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2'],
    num_rows: 82326
})

In [2]:
# For every row keep the first passage flagged as selected; rows without any
# selected passage get an empty placeholder so positions stay aligned with `dataset`.
selected_passages = []

for row in dataset:
    # The 'passages' field is parsed from its literal string form into a dict.
    passages_data = ast.literal_eval(row['passages'])
    selection_flags = passages_data['is_selected']
    if 1 in selection_flags:
        first_selected = selection_flags.index(1)
        entry = {
            'is_selected': 1,
            'passage_text': passages_data['passage_text'][first_selected],
        }
    else:
        # No passage is marked as selected: store an empty passage instead.
        entry = {'is_selected': 0, 'passage_text': ''}
    selected_passages.append(entry)

# Make sure there is exactly one entry per row of the original dataset.
assert len(selected_passages) == len(dataset), "Errore: lunghezza dei passaggi selezionati non corrisponde alla lunghezza del dataset originale"

In [3]:
from datasets import Dataset

# Columns copied unchanged from the original dataset; the extracted
# 'selected_passages' column is appended after them.
kept_columns = [
    'answers',
    'query',
    'query_id',
    'query_type',
    'wellFormedAnswers',
    'correct_answer',
    'distractor_1',
    'distractor_2',
]
columns = {name: dataset[name] for name in kept_columns}
columns['selected_passages'] = selected_passages

# Build the new dataset from the column dictionary.
new_dataset = Dataset.from_dict(columns)

In [4]:
def filter_correct_answer(example):
    """Return True unless the example's 'correct_answer' is the literal string '[]'.

    Args:
        example: Mapping with a 'correct_answer' entry.

    Returns:
        bool: False when 'correct_answer' equals '[]', True otherwise.
    """
    answer = example['correct_answer']
    is_empty_list_literal = answer == '[]'
    return not is_empty_list_literal

# Apply the filter function: only rows for which filter_correct_answer returns True are kept.
new_dataset = new_dataset.filter(filter_correct_answer)

# Display the filtered dataset summary to check the result
new_dataset

Filter:   0%|          | 0/82326 [00:00<?, ? examples/s]

Dataset({
    features: ['answers', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2', 'selected_passages'],
    num_rows: 80143
})

# Model loading and dataset selection (for testing purposes)

## ▪️ Load the model:

In [None]:
# do not run this unless necessary!
# Hugging Face Hub authentication; login() is called without arguments
# (presumably prompts for a token interactively — verify before running unattended).

from huggingface_hub import login
login()

In [79]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

# Model identifier shared by the tokenizer here and the model loaded at the end of this cell.
model_name="nvidia/Llama3-ChatQA-1.5-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# (given as the name of a torch attribute; resolved with getattr below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Turn the dtype name into the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# This check only prints a hint; it does not change any setting above.
# NOTE(review): torch.cuda.get_device_capability() is called without first checking
# that a CUDA device is available — confirm this cell is only run on a GPU machine.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
# Load the causal language model with the quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [80]:
from outlines import models

# Wrap the already-loaded model and tokenizer in an outlines `models.Transformers` object;
# `new_model` is what the outlines generators below are built on.
new_model = models.Transformers(model, tokenizer)

In [81]:
import outlines

# Quick check of choice-constrained generation on a one-review sentiment prompt.
# Note: the name `system_message` is rebound later in the notebook for the antithesis prompt.
system_message = """You are a sentiment-labelling assistant.
Is the following review positive or negative?

Review: This restaurant is just awesome!
"""

# Build a generator whose choices are the two labels, then run it on the prompt.
generator = outlines.generate.choice(new_model, ["Positive", "Negative"])
answer = generator(system_message)

In [82]:
print(answer)

Positive


## ▪️ Select a subset of the true dataset as a test

In [65]:
# Number of leading rows of the filtered dataset used in the test runs below.
N_examples = 20

In [66]:
# Take the first N_examples entries of each column needed for the test:
# queries, correct answers, the two distractors and the selected passages (sources).
first_queries, correct_answers, distractors_1, distractors_2, sources = (
    new_dataset[column][:N_examples]
    for column in (
        'query',
        'correct_answer',
        'distractor_1',
        'distractor_2',
        'selected_passages',
    )
)

## ▪️ Merge the true answer and the distractors into a vector, shuffling the order of the elements

In [67]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Cleaning helper: removes characters outside the allowed set
def clean_text(text):
    """Remove every character that is not a word character, whitespace or one of . , ! ? ' " - : ; ( ).

    Args:
        text: The string to clean.

    Returns:
        The input string with all runs of disallowed characters removed
        (for example '$', '[' and ']' are dropped).
    """
    return re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)


# Shuffles the cleaned answer options
def shuffleAnswers(correct_answer, distractor_1, distractor_2, rng=None):
    """Clean the correct answer and the two distractors and return them in random order.

    Args:
        correct_answer: Text of the correct answer.
        distractor_1: Text of the first distractor.
        distractor_2: Text of the second distractor.
        rng: Optional object exposing a ``shuffle(list)`` method, e.g. a seeded
            ``random.Random`` instance, used to make the order reproducible.
            When omitted, the module-level ``random`` generator is used,
            exactly as before this parameter existed.

    Returns:
        A new list with the three cleaned options in shuffled order.
    """
    # Clean each option before shuffling.
    merge_options = [
        clean_text(option)
        for option in (correct_answer, distractor_1, distractor_2)
    ]

    # Shuffle in place with the requested source of randomness.
    shuffler = random if rng is None else rng
    shuffler.shuffle(merge_options)

    return merge_options

# Thesis

In [219]:
# One shuffled list of cleaned options (correct answer + two distractors) per test example.
merged_options = [
    shuffleAnswers(correct_answers[idx], distractors_1[idx], distractors_2[idx])
    for idx in range(N_examples)
]

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [91]:
# prompt template definition (thesis)
# requires the question, the three options (option_a, option_b, option_c) and the context as input variables!
# Note: the name `prompt_template` is rebound later in the notebook for the synthesis prompt.

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    System: This is a chat between a user and an AI assistant. The assistant gives helpful, detailed, and polite answers to the user’s questions based on the context. 
    {context}
    User: {question}
    Possible options: {option_a}, {option_b}, {option_c}.
    Assistant:
"""
)

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [92]:
# LLM chain definition
from operator import itemgetter

# Map every prompt variable to the entry with the same key in the chain input.
augmentation = {
    field: itemgetter(field)
    for field in ("question", "option_a", "option_b", "option_c", "context")
}

# Chain: pick the input fields, then fill in the thesis prompt template.
thesis_chain = augmentation | prompt_template

In [93]:
import outlines

def thesisGeneration(query, merged, sources):
    """Generate the thesis: choose one of three options for a question.

    Args:
        query: The question text.
        merged: Indexable holding the three candidate options at positions 0, 1 and 2.
        sources: Context inserted into the prompt's {context} slot.

    Returns:
        The value produced by the outlines choice generator, which is built
        with the three options as its choices.
    """
    option_a, option_b, option_c = merged[0], merged[1], merged[2]
    prompt_value = thesis_chain.invoke({
        'question': query,
        'option_a': option_a,
        'option_b': option_b,
        'option_c': option_c,
        'context': sources,
    })
    # The rendered prompt is passed through clean_text before generation.
    cleaned_prompt = clean_text(prompt_value.text)
    choose = outlines.generate.choice(new_model, [option_a, option_b, option_c])
    return choose(cleaned_prompt)

## 🔹 Test: how well does the thesis alone perform?

In [94]:
# Generate the thesis for every test example.
# The options come from the per-example lists already stored in `merged_options`.
# Re-shuffling here and assigning the result to `merged_options` would replace
# that list of lists with a single flat list and break the later cell that
# indexes `merged_options[i]`.
answers = []
for i in range(N_examples):
    answers.append(thesisGeneration(first_queries[i], merged_options[i], sources[i]))

Compiling FSM index for all state transitions: 100%|██| 67/67 [00:03<00:00, 21.41it/s]
Compiling FSM index for all state transitions: 100%|█| 166/166 [00:07<00:00, 21.15it/s
Compiling FSM index for all state transitions: 100%|██| 44/44 [00:02<00:00, 21.19it/s]
Compiling FSM index for all state transitions: 100%|█| 186/186 [00:08<00:00, 21.13it/s
Compiling FSM index for all state transitions: 100%|█| 148/148 [00:06<00:00, 21.30it/s
Compiling FSM index for all state transitions: 100%|█| 471/471 [00:22<00:00, 21.41it/s
Compiling FSM index for all state transitions: 100%|█| 282/282 [00:13<00:00, 21.14it/s
Compiling FSM index for all state transitions: 100%|█| 132/132 [00:06<00:00, 21.09it/s
Compiling FSM index for all state transitions: 100%|█| 275/275 [00:12<00:00, 21.18it/s
Compiling FSM index for all state transitions: 100%|█| 439/439 [00:20<00:00, 21.10it/s
Compiling FSM index for all state transitions: 100%|█| 395/395 [00:18<00:00, 20.86it/s
Compiling FSM index for all state transitio

In [95]:
answers

["'Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.'",
 "'Yes'",
 "'Yes'",
 "'11 to 22 per square foot'",
 "'Due to symptoms in the body'",
 "'45,278 and 60,659'",
 "'The most expensive patents are international patents, which can run up to 100,000 or higher.Domestically the costs can be 10,000 or above.'",
 "'140 to 202'",
 "'Is the brand name of a drug called procaine which is a local anaesthetic.'",
 "'Ecosystem'",
 "'Dr. Seuss'",
 "'45 minutes to an hour'",
 "'6-16 a square foot'",
 "'Granite.', 'Granite.'",
 "'4.64 - 6.36'",
 '"A dolphin\'s life span varies according to its environment and species. Although some bottlenose dolphins can reach 40 years of age, their average age is between 15 and 16 years. Forty is an old age for a dolphin -- one making it to 40 is comparable to a human living to be 100."',
 '"In a New York State hermit\'s le

# Antithesis

## 🔸 PromptTemplate definition and a LLMChain for the **antithesis** 

In [99]:
import transformers

# Text-generation pipeline built on the model and tokenizer loaded earlier.
# This rebinds the name `pipeline`, which an earlier cell imported from
# transformers as a function; from here on `pipeline` is this pipeline object.
# NOTE(review): do_sample=False is combined with temperature=0.0 and top_p=0.0 —
# presumably the sampling parameters have no effect in that mode; confirm against
# the transformers generation documentation.
pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

In [112]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown

def query_model(
        system_message,
        user_message,
        temperature = 0.0,
        max_length=1024
        ):
    """Run one system + user exchange through the text-generation pipeline.

    The user message is wrapped as "Question: <user_message> Answer:" before
    the chat template is applied.

    Args:
        system_message: Content of the message sent with role "System".
        user_message: Question text; wrapped as described above.
        temperature: Forwarded to the pipeline call as `temperature`.
        max_length: Forwarded to the pipeline call as `max_new_tokens`.

    Returns:
        The wrapped user message, a single space, and the generated text of
        the first returned sequence, concatenated.
    """
    framed_question = "Question: " + user_message + " Answer:"
    chat = [
        {"role": "System", "content": system_message},
        {"role": "User", "content": framed_question},
    ]

    hf_tokenizer = pipeline.tokenizer
    rendered_prompt = hf_tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Generation stops on either the tokenizer's EOS id or the "<|eot_id|>" token id.
    stop_ids = [
        hf_tokenizer.eos_token_id,
        hf_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = pipeline(
        rendered_prompt,
        do_sample=False,
        top_p=0.0,
        temperature=temperature,
        eos_token_id=stop_ids,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id,
    )

    return framed_question + " " + outputs[0]['generated_text']

In [227]:
# System message template for the antithesis step; expects a `context` value.
# Note: this rebinds `system_message`, which an earlier cell used for the sentiment check.
system_message = """
    This is a chat between a user and an AI assistant. The assistant gives helpful, detailed, and polite answers to the user’s questions based on the context. 
    {context}
"""


# User message template for the antithesis step; expects `question`, `option_a`,
# `option_b`, `option_c` and `candidate_answer` values.
user_message = """
    You are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context.
    Question: {question} 
    The answer has to be one of these: {option_a}, {option_b}, {option_c}. 
    Candidate answer that you need to check: {candidate_answer}
    Consider all the possible options and determine if the candidate is the most appropriate one, given the question and the context.
    If you think so, answer by saying 'Correct'; otherwise, answer 'Incorrect'. Add which option is most proper to you and why.\n
"""

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [228]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer, sources):
    """Generate the antithesis: a critique of the candidate answer.

    Args:
        query: The question text.
        prompt_template: Accepted but not used by this function.
        merged: Indexable holding the three options at positions 0, 1 and 2.
        candidate_answer: The answer to be checked.
        sources: Context used to fill the system message template.

    Returns:
        Whatever query_model returns for the formatted system and user
        messages, called with max_length=400.
    """
    system_prompt = system_message.format(context = sources)
    critique_request = user_message.format(
        question=query,
        option_a=merged[0],
        option_b=merged[1],
        option_c=merged[2],
        candidate_answer=candidate_answer,
        context=sources,
    )
    return query_model(system_prompt, critique_request, max_length=400)

In [224]:
def extract_answer_ant(text):
    """Return the text that follows the first "Answer:" marker.

    Args:
        text: String to search.

    Returns:
        Everything after the first occurrence of "Answer:", with surrounding
        whitespace stripped; or the fixed message
        "The correct answer could not be found." when the marker is absent.
    """
    _, marker, remainder = text.partition("Answer:")
    if not marker:
        # partition() yields an empty separator when "Answer:" does not occur.
        return "The correct answer could not be found."
    return remainder.strip()

In [229]:
# For each test example: generate the antithesis for the thesis answer and keep
# only the text that follows the "Answer:" marker.
ant_answers = [
    extract_answer_ant(
        antithesisGeneration(
            first_queries[idx],
            prompt_template,
            merged_options[idx],
            answers[idx],
            sources[idx],
        )
    )
    for idx in range(N_examples)
]

In [230]:
ant_answers[:20]

['"correct", "results-based accountability".',
 '"correct", because he ran as republican against president carter',
 '"sydneysurroundingsareasneedtime",',
 "'correct'\n\nThe correct response would depend upon what specific information about Honolulu they were looking up?",
 "'correct','due to symptom'\n\nconversion",
 'The correct response would have been "inside the Rib Cage".',
 '"Most Correct"\n\nThis statement directly addresses both domestic (lower) prices as well as internationally high priced ones without any other distractions within this passage',
 '"correct", "the correct response would have been sophicles aeschlylusand euripedes."',
 '\'incorrect\'\n\'A Lapel pin refers specifically to those lapels with holes for attaching buttons; it does NOT include any type of clothing accessory such as brooches (which attach via prongs) nor do we consider "pins" here since this term encompasses many different kinds including safety ones etc...\'',
 '"incorrect", "somatic".',
 "'correct.'

# Synthesis

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [246]:
from langchain import PromptTemplate

# Synthesis prompt template; this rebinds `prompt_template`, previously the thesis template.
# Input variables: context, question, option_a, option_b, option_c, candidate_answer, critique.
prompt_template = PromptTemplate.from_template(
"""
    System: This is a chat between a user and an AI assistant. The assistant gives helpful, detailed, and polite answers to the user’s questions based on the context. 
    {context}
    
    User: You are asked to answer a certain question, given a certain number of options and the context. You are also provided with a candidate answer and its critique: 
    Question: {question}
    Possible options: {option_a}, {option_b}, {option_c}
    Candidate answer: {candidate_answer}
    Correct the candidate answer according to what the critique suggests as the correct option:
    {critique} \n 
    
    Assistant:
"""
)

In [247]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Map every synthesis prompt variable to the entry with the same key in the chain input.
augmentation = {
    field: itemgetter(field)
    for field in (
        "question",
        "option_a",
        "option_b",
        "option_c",
        "candidate_answer",
        "critique",
        "context",
    )
}

# Chain: pick the input fields, then fill in the synthesis prompt template.
synthesis_chain = augmentation | prompt_template

## 🔺 Function to generate the synthesis given the question, the options, the candidate answer (thesis), its critique (antithesis) and the context

In [248]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    """Generate the synthesis: the final choice among the three options.

    Args:
        query: The question text.
        prompt_template: Accepted but not used by this function (the prompt
            comes from `synthesis_chain`).
        merged: Indexable holding the three options at positions 0, 1 and 2.
        candidate_answer: The candidate (thesis) answer.
        critique: The critique (antithesis) of the candidate answer.
        sources: Context inserted into the prompt's {context} slot.

    Returns:
        The value produced by the outlines choice generator, which is built
        with the three options as its choices.
    """
    option_a, option_b, option_c = merged[0], merged[1], merged[2]
    prompt_value = synthesis_chain.invoke({
        'question': query,
        'option_a': option_a,
        'option_b': option_b,
        'option_c': option_c,
        'candidate_answer': candidate_answer,
        'critique': critique,
        'context': sources,
    })

    # The rendered prompt is passed through clean_text before generation.
    cleaned_prompt = clean_text(prompt_value.text)
    choose = outlines.generate.choice(new_model, [option_a, option_b, option_c])
    return choose(cleaned_prompt)

In [249]:
# Generate the synthesis for every test example from the question, a freshly
# shuffled set of options, the thesis answer, its critique and the context.
syn_answers = []
for i in range(N_examples):
    # Keep the shuffle in a per-iteration local name: assigning it to
    # `merged_options` would replace the list of per-example option lists with
    # a single flat list and break any re-run of the cell that indexes it.
    shuffled_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    syn_answers.append(synthesisGeneration(first_queries[i], prompt_template, shuffled_options, answers[i], ant_answers[i], sources[i]))

Compiling FSM index for all state transitions: 100%|█| 207/207 [00:09<00:00, 21.19it/s
Compiling FSM index for all state transitions: 100%|█| 445/445 [00:21<00:00, 21.01it/s
Compiling FSM index for all state transitions: 100%|█| 439/439 [00:21<00:00, 20.83it/s


In [250]:
# Collect the per-example results (query, correct answer and the three stage outputs) column by column.
dataset_new = {'query': first_queries, 'correct answer': correct_answers, 'thesis': answers, 'antithesis': ant_answers, 'synthesis': syn_answers}

In [251]:
import pandas as pd

# Tabulate the results: one column per key of `dataset_new`.
df = pd.DataFrame(dataset_new)

In [252]:
df

Unnamed: 0,query,correct answer,thesis,antithesis,synthesis
0,what is rba,['Results-Based Accountability is a discipline...,'Results-Based Accountability is a disciplined...,"""correct"", ""results-based accountability"".",'Results-Based Accountability is a disciplined...
1,was ronald reagan a democrat,['Yes'],'Yes',"""correct"", because he ran as republican agains...",'Yes'
2,how long do you need for sydney and surroundin...,['20-25 minutes'],'Yes',"""sydneysurroundingsareasneedtime"",",'Yes'
3,price to install tile in shower,['$11 to $22 per square foot'],'11 to 22 per square foot','correct'\n\nThe correct response would depend...,'Honolulu'
4,why conversion observed in body,['Due to symptoms in the body'],'Due to symptoms in the body',"'correct','due to symptom'\n\nconversion",'Due to symptoms in the body'
5,where are the lungs located in the back,['Inside the rib cage.'],"'45,278 and 60,659'","The correct response would have been ""inside t...",'Inside the rib cage.'
6,cost to get a patent,['The most expensive patents are international...,'The most expensive patents are international ...,"""Most Correct""\n\nThis statement directly addr...",'The most expensive patents are international ...
7,best tragedies of ancient greece,"['Sophocles, Aeschylus and Euripides']",'140 to 202',"""correct"", ""the correct response would have be...","'Sophocles, Aeschylus and Euripides'"
8,what is a conifer,['A tree or shrub which produces distinctive c...,'Is the brand name of a drug called procaine w...,'incorrect'\n'A Lapel pin refers specifically ...,'A tree or shrub which produces distinctive co...
9,in animals somatic cells are produced by and g...,['Somatic cells are produced by mitosis and ga...,'Ecosystem',"""incorrect"", ""somatic"".",'Ecosystem'


In [253]:
df['antithesis'][1]

'"correct", because he ran as republican against president carter'

In [254]:
# Save the results table to 'test-2.csv' in the current working directory.
# NOTE(review): to_csv is called with default options, so the row index is presumably written as an extra first column — confirm that is wanted.
df.to_csv('test-2.csv')