# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset
import ast

dataset = load_dataset('saracandu/msmarco_modified', split="train", trust_remote_code=True)
dataset

Dataset({
    features: ['Unnamed: 0', 'answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2'],
    num_rows: 82326
})

In [2]:
selected_passages = []

for row in dataset:
    passages_data = ast.literal_eval(row['passages'])
    try:
        selected_index = passages_data['is_selected'].index(1)
        selected_passage = {
            'is_selected': 1,
            'passage_text': passages_data['passage_text'][selected_index],
        }
        selected_passages.append(selected_passage)
    except ValueError:
        # Aggiungi un passaggio vuoto se non c'è nessun passaggio selezionato
        selected_passages.append({'is_selected': 0, 'passage_text': ''})

# Assicurati che la lunghezza dei passaggi selezionati corrisponda alla lunghezza del dataset originale
assert len(selected_passages) == len(dataset), "Errore: lunghezza dei passaggi selezionati non corrisponde alla lunghezza del dataset originale"

In [3]:
from datasets import Dataset

# Creazione di un nuovo dizionario con i dati desiderati
new_dataset = {
    'answers': dataset['answers'],
    'query': dataset['query'],
    'query_id': dataset['query_id'],
    'query_type': dataset['query_type'],
    'wellFormedAnswers': dataset['wellFormedAnswers'],
    'correct_answer': dataset['correct_answer'],
    'distractor_1': dataset['distractor_1'],
    'distractor_2': dataset['distractor_2'],
    'selected_passages': selected_passages
}

# Creazione del nuovo dataset
new_dataset = Dataset.from_dict(new_dataset)

In [4]:
def filter_correct_answer(example):
    return example['correct_answer'] != '[]'

# Applica la funzione di filtro
new_dataset = new_dataset.filter(filter_correct_answer)

# Visualizza le prime righe del dataset filtrato per verificare il risultato
new_dataset

Filter:   0%|          | 0/82326 [00:00<?, ? examples/s]

Dataset({
    features: ['answers', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2', 'selected_passages'],
    num_rows: 80143
})

# Model loading and dataset selection (for testing purposes)

## ▪️ Upload the model: 

In [None]:
# do not run this unless necessary!

from huggingface_hub import login
login()

In [5]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

model_name="meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from outlines import models

new_model = models.Transformers(model, tokenizer)

In [7]:
import outlines

prompt = """You are a sentiment-labelling assistant.
Is the following review positive or negative?

Review: This restaurant is just awesome!
"""

generator = outlines.generate.choice(new_model, ["Positive", "Negative"])
answer = generator(prompt)

In [8]:
print(answer)

Positive


## ▪️ Select a subset of the true dataset as a test

In [52]:
N_examples = 100

In [53]:
# select a subset of the queries, just for test:
first_queries = new_dataset['query'][:N_examples]

# same for correct answers and distractors:
correct_answers = new_dataset['correct_answer'][:N_examples]
distractors_1 = new_dataset['distractor_1'][:N_examples]
distractors_2 = new_dataset['distractor_2'][:N_examples]
# and for the sources:
sources = new_dataset['selected_passages'][:N_examples]

## ▪️ Merge the true answer and the distractors into a vector, shuffling the order of the elements

In [54]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text(text):
    return re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)

# Funzione per mescolare le risposte
def shuffleAnswers(correct_answer, distractor_1, distractor_2):
    # Applica la pulizia a ciascun elemento
    correct_answer_cleaned = clean_text(correct_answer)
    distractor_1_cleaned = clean_text(distractor_1)
    distractor_2_cleaned = clean_text(distractor_2)
    
    # Unisci le opzioni pulite
    merge_options = [correct_answer_cleaned, distractor_1_cleaned, distractor_2_cleaned]
    
    # Mescola le opzioni
    random.shuffle(merge_options)
    
    return merge_options

# Thesis

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [55]:
# prompt template definition
# requires question, options (a string containing the possible options) and the context as input variables!

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You are asked to answer a question correctly, given a certain number of options. 
    Given this question: {question} you have to answer (you always have to provide an answe) given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    Here is context to help: {context} \n
    The correct answer is:
 """
)

In [56]:
# LLM chain definition
from operator import itemgetter

augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "context": itemgetter("context"), }

thesis_chain = augmentation | prompt_template 

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [57]:
import outlines

def thesisGeneration(query, merged, sources):
    augmented_prompt = thesis_chain.invoke({'question': query, 'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 'context': sources})
    normal_string = clean_text(augmented_prompt.text)
    options = [merged[0], merged[1], merged[2]]
    generator = outlines.generate.choice(new_model, options)
    answer = generator(normal_string)
    if not answer:
        answer = random.choice(options)
    return answer

## 🔹 Test: how well the thesis alone is able to perform?

In [58]:
answers = []
for i in range(N_examples):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    answers.append(thesisGeneration(first_queries[i], merged_options, sources[i]))

Compiling FSM index for all state transitions: 100%|█| 265/265 [00:03<00:00, 81.69it/s
Compiling FSM index for all state transitions: 100%|█| 222/222 [00:02<00:00, 82.14it/s
Compiling FSM index for all state transitions: 100%|█| 207/207 [00:02<00:00, 81.82it/s
Compiling FSM index for all state transitions: 100%|██| 60/60 [00:00<00:00, 81.28it/s]
Compiling FSM index for all state transitions: 100%|█| 307/307 [00:03<00:00, 81.12it/s
Compiling FSM index for all state transitions: 100%|█| 270/270 [00:03<00:00, 81.75it/s
Compiling FSM index for all state transitions: 100%|█| 103/103 [00:01<00:00, 81.44it/s
Compiling FSM index for all state transitions: 100%|█| 222/222 [00:02<00:00, 81.31it/s
Compiling FSM index for all state transitions: 100%|█| 223/223 [00:02<00:00, 82.19it/s
Compiling FSM index for all state transitions: 100%|█| 446/446 [00:05<00:00, 82.24it/s
Compiling FSM index for all state transitions: 100%|█| 296/296 [00:03<00:00, 82.84it/s
Compiling FSM index for all state transitio

# Antithesis

## 🔸 PromptTemplate definition and a LLMChain for the **antithesis** 

In [59]:
from langchain.llms import HuggingFacePipeline

antithesis_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

antithesis_llm = HuggingFacePipeline(pipeline=antithesis_pipeline)

In [60]:
from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context.
    - question: {question},
    - possible options: {option_a}, {option_b}, {option_c},
    - context: {context}
    - candidate answer: {candidate_answer}
    Begin to answer by saying 'Correct' or 'Incorrect' according to whether you think the candidate answer is the correct one.
    If 'Incorrect', answer by explaining which other option is most proper to you and why. Remember that you have context to help. \n
    Why or why not the answer is correct:
 """
)

In [61]:
# LLM chain definition
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "candidate_answer": itemgetter("candidate_answer"),
                "context": itemgetter("context"), }

antithesis_chain = augmentation | prompt_template | antithesis_llm

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [62]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer, sources):
    
    second_answer = antithesis_chain.invoke({'question': query, 
                                            'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 
                                            'candidate_answer': candidate_answer,
                                            'context': sources})
    return second_answer

In [63]:
def extract_answer_ant(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("Why or why not the answer is correct:")
    
    # Se l'indice è stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("Why or why not the answer is correct:")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

In [64]:
ant_answers = []
for i in range(N_examples):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    ant_answers.append(extract_answer_ant(antithesisGeneration(first_queries[i], prompt_template, merged_options, answers[i], sources[i])))



In [65]:
ant_answers[:6]

['The reason I say it’s incorrect because while they got some parts right like mentioning Results Based accountabilities being about improving life outcomes in general terms but then went off track when talking specifically about how its defined. They mentioned three random things under "other" category without providing clear definition, explanation nor examples related to results based accountable process itself! In contrast your provided passage clearly explains exactly What result base accounts entails making them more credible source than those randomly thrown around term here.. So unfortunately none Of these answers Are actually Correct choices available from this set...',
 'The provided response does match any available information about what constitutes as an appropriate temperature for cooking meatloaf; therefore it can be considered incorrect because none of those temperatures would likely produce edible results when used alone without additional ingredients such as seasoning

# Synthesis

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [66]:
from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You are asked to answer a certain question, given a certain number of options and the context. You are also provided with a first answer and its critique. 
    Change the first answer if the critique says that it's not appropriate, otherwise leave it unaltered: 
    - question: {question},
    - possible options: {option_a}, {option_b}, {option_c},
    - context: {context}
    - candidate answer: {candidate_answer}
    - critique: {critique}
    Correct the initial answer if necessary and return the correct option.
    The new answer is:
 """
)

In [67]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

synthesis_chain = augmentation | prompt_template 

## 🔺 Function to generate the synthesis given literally everything

In [68]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    augmented_prompt = synthesis_chain.invoke({'question': query, 
                                            'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 
                                            'candidate_answer': candidate_answer,
                                            'critique': critique,
                                            'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    options = [merged[0], merged[1], merged[2]]
    generator = outlines.generate.choice(new_model, options)
    final_answer = generator(normal_string)
    
    if not final_answer:
        final_answer = candidate_answer
    return final_answer

In [69]:
syn_answers = []
for i in range(N_examples):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    syn_answers.append(synthesisGeneration(first_queries[i], prompt_template, merged_options, answers[i], ant_answers[i], sources[i]))

Compiling FSM index for all state transitions: 100%|█| 524/524 [00:06<00:00, 81.30it/s
Compiling FSM index for all state transitions: 100%|█| 296/296 [00:03<00:00, 82.35it/s
Compiling FSM index for all state transitions: 100%|██| 61/61 [00:00<00:00, 82.00it/s]
Compiling FSM index for all state transitions: 100%|█| 307/307 [00:03<00:00, 81.58it/s
Compiling FSM index for all state transitions: 100%|█| 237/237 [00:02<00:00, 82.36it/s
Compiling FSM index for all state transitions: 100%|█| 213/213 [00:02<00:00, 81.89it/s
Compiling FSM index for all state transitions: 100%|█| 386/386 [00:04<00:00, 81.30it/s
Compiling FSM index for all state transitions: 100%|█| 558/558 [00:06<00:00, 82.03it/s
Compiling FSM index for all state transitions: 100%|█| 103/103 [00:01<00:00, 81.94it/s
Compiling FSM index for all state transitions: 100%|█| 440/440 [00:05<00:00, 83.00it/s
Compiling FSM index for all state transitions: 100%|█| 169/169 [00:02<00:00, 82.47it/s
Compiling FSM index for all state transitio

In [70]:
dataset_new = {'query': first_queries, 'correct answer': correct_answers, 'thesis': answers, 'antithesis': ant_answers, 'synthesis': syn_answers}

In [71]:
import pandas as pd

df = pd.DataFrame(dataset_new)

In [72]:
df

Unnamed: 0,query,correct answer,thesis,antithesis,synthesis
0,what is rba,['Results-Based Accountability is a discipline...,'Results-Based Accountability is a disciplined...,The reason I say it’s incorrect because while ...,'Results-Based Accountability is a disciplined...
1,was ronald reagan a democrat,['Yes'],"'50, 55, 60, 65 and 70 C'",The provided response does match any available...,"'50, 55, 60, 65 and 70 C'"
2,how long do you need for sydney and surroundin...,['20-25 minutes'],'20-25 minutes',The provided response matches exactly what app...,'Yes'
3,price to install tile in shower,['$11 to $22 per square foot'],'11 to 22 per square foot',"The passage does mention ""Installation costs c...",'11 to 22 per square foot'
4,why conversion observed in body,['Due to symptoms in the body'],'Due to symptoms in the body',"Correct! The passage text explicitly states ""v...",'Due to symptoms in the body'
...,...,...,...,...,...
95,How much will it cost to go to college to beco...,"['Public, four-year colleges cost $7,000 for i...","'Public, four-year colleges cost 7,000 for in-...","Correct! The passage states ""public, four-year...","'Public, four-year colleges cost 7,000 for in-..."
96,trust amendment term,['A document used to change one or more minor ...,'Yes',The response provided in this case should be I...,'A document used to change one or more minor p...
97,what is kuchen,"['Kuchen means cake in German, and refers to a...",'Yes',Correct! The passage explicitly states that “k...,"'Kuchen means cake in German, and refers to a ..."
98,aleve maximum dose,['Two tablets within an 8 to 10 hour period'],'Two tablets within an 8 to 10 hour period',Correct! According to what we know from passag...,'Two tablets within an 8 to 10 hour period'


In [73]:
df.to_csv('test-1.csv')