# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset
import ast

# Load the "train" split of 'saracandu/msmarco_modified'.
dataset = load_dataset('saracandu/msmarco_modified', split="train", trust_remote_code=True)
# Display the dataset summary (features and number of rows).
dataset

Dataset({
    features: ['Unnamed: 0', 'answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2'],
    num_rows: 82326
})

In [2]:
# For every row keep only the first passage flagged as selected (is_selected == 1).
selected_passages = []

for row in dataset:
    # 'passages' is stored as a string literal; parse it back into a dict of parallel lists.
    passages_data = ast.literal_eval(row['passages'])
    selection_flags = passages_data['is_selected']

    if 1 in selection_flags:
        selected_index = selection_flags.index(1)
        selected_passage = {
            'is_selected': 1,
            'passage_text': passages_data['passage_text'][selected_index],
        }
    else:
        # No passage is flagged as selected: store an empty placeholder so that
        # positions stay aligned with the rows of the original dataset.
        selected_passage = {'is_selected': 0, 'passage_text': ''}

    selected_passages.append(selected_passage)

# Make sure there is exactly one entry per row of the original dataset.
assert len(selected_passages) == len(dataset), "Errore: lunghezza dei passaggi selezionati non corrisponde alla lunghezza del dataset originale"

In [3]:
from datasets import Dataset

# Columns copied unchanged from the original dataset.
carried_columns = (
    'answers',
    'query',
    'query_id',
    'query_type',
    'wellFormedAnswers',
    'correct_answer',
    'distractor_1',
    'distractor_2',
)

# Assemble the column dictionary: the carried-over columns plus the selected passage of each row.
new_dataset = {column: dataset[column] for column in carried_columns}
new_dataset['selected_passages'] = selected_passages

# Build the new dataset from the column dictionary.
new_dataset = Dataset.from_dict(new_dataset)

# Model part (`Llama-2-7b-chat-hf`)

## ▪️ Load the model:

In [None]:
# Log in to the Hugging Face Hub. Do not run this unless necessary!
# (presumably required to download the Llama-2 weights used below — confirm)

from huggingface_hub import login
login()

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

model_name="meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Use the end-of-sequence token as the padding token and pad on the right side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype attribute, resolved below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Resolve the dtype name to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# When float16 compute is selected with 4-bit loading, print a hint if the GPU's
# compute-capability major version is >= 8 (reported here as bfloat16 support).
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load the pre-trained model with the quantization config defined above
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from outlines import models

# Wrap the loaded model and tokenizer in an outlines model object;
# it is the model passed to outlines.generate.choice in the cells below.
new_model = models.Transformers(model, tokenizer)

In [6]:
import outlines

# Quick sanity check of choice-constrained generation on a toy sentiment prompt.
prompt = """You are a sentiment-labelling assistant.
Is the following review positive or negative?

Review: This restaurant is just awesome!
"""

# Build a generator over the two listed labels and run it on the prompt.
generator = outlines.generate.choice(new_model, ["Positive", "Negative"])
answer = generator(prompt)

In [7]:
# Show the label chosen for the toy review.
print(answer)

Positive


## ▪️ Select a subset of the true dataset as a test

In [8]:
# Take the first 200 queries as a small test subset.
first_queries = new_dataset['query'][:200]

In [9]:
# Take the correct answer and the two distractors for the same first 200 rows.
correct_answers = new_dataset['correct_answer'][:200]
distractors_1 = new_dataset['distractor_1'][:200]
distractors_2 = new_dataset['distractor_2'][:200]

In [10]:
# Selected-passage entries (built with the keys 'is_selected' and 'passage_text') for the same first 200 rows.
sources = new_dataset['selected_passages'][:200]

## ▪️ Merge the true answer and the distractors into a vector, shuffling the order of the elements

In [11]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Cleaning helper: drops every character that is not explicitly allowed.
def clean_text(text):
    """Return ``text`` with all disallowed characters removed.

    Word characters, whitespace and the punctuation marks
    . , ! ? ' " - : ; ( ) are kept; any other character is deleted.
    """
    disallowed = re.compile(r'[^\w\s.,!?\'"\-:;()]+')
    return disallowed.sub('', text)

# Builds the shuffled list of answer options.
def shuffleAnswers(correct_answer, distractor_1, distractor_2):
    """Clean the three candidate answers and return them in random order.

    Args:
        correct_answer: Text of the correct answer.
        distractor_1: Text of the first distractor.
        distractor_2: Text of the second distractor.

    Returns:
        A new list with the three texts, each passed through ``clean_text``,
        shuffled in place with ``random.shuffle``.
    """
    merge_options = [
        clean_text(option)
        for option in (correct_answer, distractor_1, distractor_2)
    ]
    random.shuffle(merge_options)
    return merge_options

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [13]:
# Prompt template for the thesis step.
# Input variables: question, option_a, option_b, option_c (the candidate answers) and context.

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. 
    Answer with the correct option only and then stop.
    Given this question: {question} you have to answer given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    Here is context to help: {context} \n
    The correct answer is:
 """
)

In [14]:
# LLM chain definition
from operator import itemgetter

# Map every expected input key to the prompt variable of the same name.
augmentation = {
    key: itemgetter(key)
    for key in ("question", "option_a", "option_b", "option_c", "context")
}

# Thesis chain: pick the inputs, then fill in the prompt template.
thesis_chain = augmentation | prompt_template

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [15]:
import outlines

def thesisGeneration(query, merged, sources):
    """Choose one of the three candidate answers for ``query`` (thesis step).

    Args:
        query: The question text.
        merged: Sequence holding the three candidate answers, in the order in
            which they are inserted into the prompt.
        sources: Context for the prompt. Either the context text itself or a
            selected-passage dict with a 'passage_text' entry, in which case
            only that text is used.

    Returns:
        The output of the outlines choice generator built over the three
        entries of ``merged``.
    """
    # A selected-passage dict would otherwise be rendered into the prompt as its
    # Python repr (including the 'is_selected' flag); keep only the passage text.
    if isinstance(sources, dict):
        context = sources.get('passage_text', sources)
    else:
        context = sources

    augmented_prompt = thesis_chain.invoke({'question': query, 'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 'context': context})
    # The prompt is cleaned with the same rules as the options before generation.
    normal_string = clean_text(augmented_prompt.text)
    options = [merged[0], merged[1], merged[2]]
    generator = outlines.generate.choice(new_model, options)
    answer = generator(normal_string)
    return answer

## 🔹 Test: how well does the thesis alone perform?

In [None]:
# Run the thesis step on every query of the test subset.
# The loop length follows the subset itself instead of a hard-coded 200,
# so it stays correct if the subset size is changed.
answers = []
for i in range(len(first_queries)):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    answers.append(thesisGeneration(first_queries[i], merged_options, sources[i]))

Compiling FSM index for all state transitions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 80.93it/s]
Compiling FSM index for all state transitions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 86.72it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 186/186 [00:02<00:00, 85.71it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [00:03<00:00, 84.51it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 148/148 [00:01<00:00, 86.41it/s]
Compiling FSM index for all state transitions: 100

In [65]:
# Inspect the thesis answers.
answers

["'Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.'",
 "'A contamination which is associated with the food itself and not through other causes of contamination.'",
 "'20-25 minutes'",
 "'11 to 22 per square foot'",
 "'Due to symptoms in the body'"]

## 🔸 PromptTemplate definition and a LLMChain for the **antithesis** 

In [None]:
from langchain.llms import HuggingFacePipeline

# Text-generation pipeline for the antithesis (critique) step, built on the same model and tokenizer.
# NOTE(review): do_sample=False is combined with temperature/top_p; presumably those sampling
# settings have no effect when sampling is disabled — confirm against the transformers generation docs.
# NOTE(review): return_full_text=True presumably makes the output include the prompt, which is what
# extract_answer_ant relies on when it searches for the prompt's final cue — confirm.
antithesis_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

# Wrap the pipeline so that it can be used as the last step of a LangChain chain.
antithesis_llm = HuggingFacePipeline(pipeline=antithesis_pipeline)

In [None]:
from langchain import PromptTemplate
# Prompt template for the antithesis step (rebinds the name prompt_template).
# Input variables: question, option_a, option_b, option_c, candidate_answer and context.
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Given this question: {question} you have to answer given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    The answer that you have to check is {candidate_answer}. 
    Here is context to help: {context} \n
    Answer by saying 'Yes' if it is correct and 'No' otherwise, then explain why you think so.
    Why or why not the answer is correct:
 """
)

In [None]:
# LLM chain definition
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Map every expected input key to the prompt variable of the same name.
augmentation = {
    key: itemgetter(key)
    for key in (
        "question",
        "option_a",
        "option_b",
        "option_c",
        "candidate_answer",
        "context",
    )
}

# Antithesis chain: pick the inputs, fill in the prompt, then run the LLM on it.
antithesis_chain = augmentation | prompt_template | antithesis_llm

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [75]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer, sources):
    """Generate the critique (antithesis) of a candidate answer.

    Args:
        query: The question text.
        prompt_template: Not used by this function (``antithesis_chain`` already
            contains its prompt); kept so that existing callers keep working.
        merged: Sequence holding the three candidate answers.
        candidate_answer: The answer to be checked.
        sources: Context for the prompt. Either the context text itself or a
            selected-passage dict with a 'passage_text' entry, in which case
            only that text is used.

    Returns:
        Whatever ``antithesis_chain.invoke`` returns for these inputs.
    """
    # A selected-passage dict would otherwise be rendered into the prompt as its
    # Python repr (including the 'is_selected' flag); keep only the passage text.
    if isinstance(sources, dict):
        context = sources.get('passage_text', sources)
    else:
        context = sources

    second_answer = antithesis_chain.invoke({'question': query,
                                            'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2],
                                            'candidate_answer': candidate_answer,
                                            'context': context})
    return second_answer

In [78]:
def extract_answer_ant(text):
    """Return what follows the antithesis prompt's final cue in ``text``.

    The text is split at the first occurrence of
    "Why or why not the answer is correct:"; everything after it is returned
    with surrounding whitespace stripped. When the cue is absent, a fixed
    fallback message is returned instead.
    """
    marker = "Why or why not the answer is correct:"
    _, found, remainder = text.partition(marker)
    if not found:
        return "The correct answer could not be found."
    return remainder.strip()

In [92]:
# Run the antithesis step: critique every thesis answer of the test subset.
ant_answers = []
for i in range(len(first_queries)):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    # antithesisGeneration takes the context as its fifth argument, so the
    # selected passage of the row is passed along with the thesis answer.
    critique_output = antithesisGeneration(first_queries[i], prompt_template, merged_options, answers[i], sources[i])
    # Keep only the text that follows the prompt's final cue.
    ant_answers.append(extract_answer_ant(critique_output))



In [82]:
# Inspect the extracted critiques.
ant_answers

['Yes - This option matches exactly how Results Based Accounting (RBA) works according to their website description provided above; they state themselves being an organization focused on helping businesses create better user experience through strategic planning using both tech solutions while also providing evidence based results tracking for accountable decision making purposes within organizations across industries such as healthcare providers etcetera thus align perfectly well together under umbrella term called Digital Transformation Consultants who focus solely upon enhancing customer satisfaction via seamless integration between human centered approach alongside cutting edge technologies designed exclusively around meeting client needs more efficiently than ever before possible!',
 'I would say yes because according to what has been provided in the text "Ronald Regan" as an example for someone who switched from being Democratic party member too republicans.',
 'I agree with your

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [83]:
from langchain import PromptTemplate
# Prompt template for the synthesis step (rebinds the name prompt_template).
# Input variables: question, option_a, option_b, option_c, candidate_answer, critique and context.
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to answer a certain question, given a certain number of candidate options and the context.
    You are also provided with an initial response and its critique, that could enforce or not the first opinion.
    Make a reasonable synthesis of these two opinions, but answer by outputting exactly one of the answer options only.
    Given this question: {question} you have to answer given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    The answer that you have to check is {candidate_answer} and this is its critique: {critique}.
    Here is context to help: {context} \n
    The answer is:
 """
)

In [84]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Map every expected input key to the prompt variable of the same name.
augmentation = {
    key: itemgetter(key)
    for key in (
        "question",
        "option_a",
        "option_b",
        "option_c",
        "candidate_answer",
        "critique",
        "context",
    )
}

# Synthesis chain: pick the inputs, then fill in the prompt template.
synthesis_chain = augmentation | prompt_template

## 🔺 Function to generate the synthesis given the question, the options, the thesis and its critique

In [85]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique):
    """Produce the final (synthesis) answer from the thesis and its critique.

    The context is obtained by calling ``retriever.invoke(query)`` and passing
    the result to ``format_page_content``.
    NOTE(review): neither ``retriever`` nor ``format_page_content`` is defined
    in the visible part of the notebook — confirm they exist before running.

    Args:
        query: The question text.
        prompt_template: Not used by this function (``synthesis_chain`` already
            contains its prompt).
        merged: Sequence holding the three candidate answers.
        candidate_answer: The thesis answer under review.
        critique: The critique of that answer.

    Returns:
        The output of the outlines choice generator built over the three
        entries of ``merged``.
    """
    formatted_context = format_page_content(retriever.invoke(query))

    chain_inputs = {
        'question': query,
        'option_a': merged[0],
        'option_b': merged[1],
        'option_c': merged[2],
        'candidate_answer': candidate_answer,
        'critique': critique,
        'context': formatted_context,
    }
    augmented_prompt = synthesis_chain.invoke(chain_inputs)

    # Clean the rendered prompt first, then build the generator over the three options.
    cleaned_prompt = clean_text(augmented_prompt.text)
    choice_generator = outlines.generate.choice(new_model, [merged[0], merged[1], merged[2]])
    return choice_generator(cleaned_prompt)

In [93]:
# Run the synthesis step on every query of the test subset, combining each
# thesis answer with its critique.
# The loop length follows the subset itself instead of a hard-coded 200,
# so it stays correct if the subset size is changed.
syn_answers = []
for i in range(len(first_queries)):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    syn_answers.append(synthesisGeneration(first_queries[i], prompt_template, merged_options, answers[i], ant_answers[i]))

Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 383/383 [00:04<00:00, 83.29it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 166/166 [00:01<00:00, 83.87it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 302/302 [00:03<00:00, 83.83it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 148/148 [00:01<00:00, 84.32it/s]
Compiling FSM index for all state transitions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 207/207 [00:02<00:00, 82.06it/s]
Compiling FSM index for all state transitions: 100

In [110]:
# Collect the queries, the reference answers and the output of each stage, one list per column.
dataset_new = {'query': first_queries, 'correct answer': correct_answers, 'thesis': answers, 'antithesis': ant_answers, 'synthesis': syn_answers}

In [112]:
import pandas as pd

# Turn the collected columns into a DataFrame.
df = pd.DataFrame(dataset_new)

In [114]:
# Preview the first rows of the results.
df.head()

Unnamed: 0,query,correct answer,thesis,antithesis,synthesis
0,what is rba,['Results-Based Accountability is a discipline...,'Results-Based Accountability is a disciplined...,Yes - This option matches exactly how Results ...,"""'Other Allowance' which is basically to compe..."
1,was ronald reagan a democrat,['Yes'],"'50, 55, 60, 65 and 70 C'","I would say ""no"" because according to the text...","'50, 55, 60, 65 and 70 C'"
2,how long do you need for sydney and surroundin...,['20-25 minutes'],'Yes',I would say yes because in response #3 they sa...,'Yes'
3,price to install tile in shower,['$11 to $22 per square foot'],'11 to 22 per square foot',Yes - This option matches what we know from re...,'11 to 22 per square foot'
4,why conversion observed in body,['Due to symptoms in the body'],'A chief engineer is responsible for all opera...,Yes - This option matches what we know about C...,'A chief engineer is responsible for all opera...


In [117]:
# Save the results to a CSV file.
# NOTE(review): index=False is not passed, so presumably the row index is written as an
# extra unnamed first column — confirm this is intended.
df.to_csv('first_output.csv')