# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset

# Load the "train" split of the modified MS MARCO dataset; each row carries the query,
# the correct answer and two distractors (see the feature list in the cell output).
# NOTE(review): trust_remote_code=True presumably lets the dataset repository run its
# own loading code locally — confirm the repository is trusted.
dataset = load_dataset('saracandu/msmarco_modified', split="train", trust_remote_code=True)
dataset

Dataset({
    features: ['Unnamed: 0', 'answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2'],
    num_rows: 82326
})

In [2]:
from langchain.document_loaders import HuggingFaceDatasetLoader

# Build LangChain documents whose page_content comes from the 'passage_text' column
loader = HuggingFaceDatasetLoader('saracandu/msmarco_filtered', 'passage_text')
documents = loader.load()
documents[0] # just to check



Document(page_content='"Since 2007, the RBA\'s outstanding reputation has been affected by the \'Securency\' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA\'s employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."')

## 🔹 Turn `documents` into a vector database using FAISS

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# create an instance of the RecursiveCharacterTextSplitter class with specific parameters
# (chunk_size=500 and chunk_overlap=20: chunks of up to 500 characters with a 20-character overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

# 'documents' holds the loaded passages; split them into chunk-sized documents using the text splitter
docs = text_splitter.split_documents(documents)

In [None]:
# choose an embedding method: this sentence-transformers model is the one used
# to embed 'docs' when the FAISS index is built
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",  
)

In [None]:
# embed the documents 'docs' into vectors using the embedding method specified by 'embeddings'
# the result is stored in a FAISS index:
db = FAISS.from_documents(docs, embeddings)

# to avoid computing it each time (since the docs won't change), save the result in the storage
# (the index name records which embedding model produced it)
db.save_local(folder_path="faiss_db", index_name="MSMARCO_FaissIndex_MPNet")

## 🔹 Load the already existing vector database (if it exists)

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model paired with the stored index that is loaded below.
# NOTE(review): the build cells earlier in this notebook embed with
# "multi-qa-mpnet-base-dot-v1" and save "MSMARCO_FaissIndex_MPNet", whereas this cell
# loads "MSMARCO_FaissIndex_MiniLM" with the MiniLM model — presumably that index was
# built in a separate run; confirm it exists and was created with this same model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-MiniLM-L6-dot-v1",  
)

db = FAISS.load_local(
    folder_path="faiss_db", # where to find it
    embeddings=embeddings, # in which "embedding language" it is expressed
    index_name="MSMARCO_FaissIndex_MiniLM", # since the folder contains multiple vector databases, specify its name
    # NOTE(review): explicit opt-in to deserializing the stored files — only load
    # index files that were produced locally / by a trusted source
    allow_dangerous_deserialization=True
)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.6.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





## 🔹 Use it as a `retriever`

**Note:** `'k'` specifies the number of documents to retrieve each time `retriever` is invoked (the cell below sets `'k': 1`, while the discussion that follows refers to `'k'=10`). 
The default type of search performed is `similarity`.

In [4]:
# Expose the FAISS index as a retriever; search_kwargs asks for k=1 document per query
retriever = db.as_retriever(
    search_kwargs={'k': 1,}
) 

Why `'k'=10`? Because MSMARCO assigns to each `(query, answer)` pair 10 text passages, and only 1 or 2 of these are truly relevant. 
In this first step of the analysis I chose not to create `len(dataset)` different vector databases, one for each `(query, answer)` pair, but instead to merge all the passages together and store them in a single vector database. 


**WITH `'k'=10` SOME ANSWERS COME OUT WRONG! LOWERING IT TO `3` OR `4` AVOIDS THIS :)**

# Model part (`Llama-2-7b-chat-hf`)

## ▪️ Load the model: 

In [None]:
# do not run this unless necessary!

from huggingface_hub import login
login()

In [5]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

# Hugging Face Hub identifier shared by the tokenizer and the model loaded below
model_name="meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# pad on the right-hand side of each sequence
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype, resolved below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Turn the dtype name into the actual torch dtype object (torch.float16 here)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the quantization config passed to from_pretrained
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# This is purely informational, so the device is only queried when CUDA is actually
# available: torch.cuda.get_device_capability() raises on a machine without a usable
# GPU, which would otherwise abort the cell before the model is even loaded.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    # Only the major compute-capability number matters for this check
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
# Load the causal language model named by model_name, applying the 4-bit
# quantization settings collected in bnb_config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from outlines import models

new_model = models.Transformers(model, tokenizer)

In [7]:
import outlines

# Smoke test for constrained generation: a toy sentiment prompt whose answer
# is chosen among the two labels handed to outlines.generate.choice
prompt = """You are a sentiment-labelling assistant.
Is the following review positive or negative?

Review: This restaurant is just awesome!
"""

# Build a generator restricted to the listed choices, then run it on the prompt
generator = outlines.generate.choice(new_model, ["Positive", "Negative"])
answer = generator(prompt)

In [8]:
print(answer)

Positive


## ▪️ Select a subset of the true dataset as a test

In [9]:
# select a subset of the queries (the first 200 rows), just for test:
first_queries = dataset['query'][:200]
# first_queries

In [10]:
# same for correct answers and distractors: the same first 200 rows are taken,
# so position i in each list refers to the same row as first_queries[i]
correct_answers = dataset['correct_answer'][:200]
distractors_1 = dataset['distractor_1'][:200]
distractors_2 = dataset['distractor_2'][:200]

In [17]:
correct_answers[0]

"['Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.']"

## ▪️ Merge the true answer and the distractors into a vector, shuffling the order of the elements

In [11]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import re
import random

# Cleaning helper that strips characters outside an allowed set
def clean_text(text):
    """Return ``text`` with every disallowed character removed.

    Word characters, whitespace and the punctuation ``. , ! ? ' " - : ; ( )``
    are kept; each run of any other character (square brackets, for example)
    is deleted.
    """
    disallowed_run = re.compile(r'[^\w\s.,!?\'"\-:;()]+')
    return disallowed_run.sub('', text)

# Helper that shuffles the answer options
def shuffleAnswers(correct_answer, distractor_1, distractor_2):
    """Clean the three candidate answers and return them in random order.

    Each argument is passed through ``clean_text``; the cleaned strings are
    collected in a new list, shuffled in place with ``random.shuffle`` and
    returned. The position of the correct answer in the result is not tracked.
    """
    merge_options = [
        clean_text(candidate)
        for candidate in (correct_answer, distractor_1, distractor_2)
    ]
    random.shuffle(merge_options)
    return merge_options

## ▪️ Function to properly format the retrieved documents

In [12]:
# auxiliary function to format properly the output of the retrieval step

def format_page_content(documents):
    """
    Render the retrieved documents as a numbered plain-text list.

    Only the ``page_content`` of each document is used. Leading and trailing
    space characters are stripped from it, and every document becomes one
    ``[i]: content`` line (numbering starts at 1, each line ends with a newline).
    An empty input yields an empty string.
    """
    numbered_lines = (
        f"[{position}]: {doc.page_content.strip(' ')}\n"
        for position, doc in enumerate(documents, start=1)
    )
    return "".join(numbered_lines)

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [13]:
# prompt template definition
# requires question, options (a string containing the possible options) and the context as input variables!

from langchain import PromptTemplate
# Thesis prompt: the placeholders are question, option_a, option_b, option_c and context.
# NOTE(review): the name prompt_template is rebound by later cells (antithesis, synthesis),
# so the cell that builds thesis_chain has to run while this definition is the current one.
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. 
    Answer with the correct option only and then stop.
    Given this question: {question} you have to answer given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    Here is context to help: {context} \n
    The correct answer is:
 """
)

In [14]:
# LLM chain definition
from operator import itemgetter

# Mapping that pulls exactly the keys required by the thesis prompt out of the input dict
augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "context": itemgetter("context"), }

# No LLM is attached here: invoking the chain returns the filled-in prompt value,
# whose .text is later handed to the constrained generator
thesis_chain = augmentation | prompt_template 

In [15]:
thesis_chain.invoke({'question': 'query', 'option_a': 'a', 'option_b': 'b', 'option_c': 'c', 'context': 'something'})

StringPromptValue(text="\n    You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. \n    Answer with the correct option only and then stop.\n    Given this question: query you have to answer given only one of the following options: a, b, c. \n\n    Here is context to help: something \n\n    The correct answer is:\n ")

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [16]:
import outlines

def thesisGeneration(query, merged):
    """Produce the thesis: the model's pick among three options for ``query``.

    Parameters:
        query: question text, used both for retrieval and inside the prompt.
        merged: indexable holding the three answer options at positions 0, 1 and 2.

    Returns:
        The value returned by the ``outlines.generate.choice`` generator, which is
        built over exactly those three options and run on the cleaned prompt text.
    """
    # Retrieve supporting passages and render them as a numbered list
    context_block = format_page_content(retriever.invoke(query))

    # Fill the thesis prompt template with the question, the options and the context
    prompt_inputs = {
        'question': query,
        'option_a': merged[0],
        'option_b': merged[1],
        'option_c': merged[2],
        'context': context_block,
    }
    prompt_value = thesis_chain.invoke(prompt_inputs)
    prompt_text = clean_text(prompt_value.text)

    # Constrain generation to the three candidate options
    choose_option = outlines.generate.choice(new_model, [merged[0], merged[1], merged[2]])
    return choose_option(prompt_text)

## 🔹 Test: how well does the thesis alone perform?

In [26]:
# Thesis pass over the 200 test queries: one chosen option per query.
# The options are reshuffled on every call (no random seed is set in this notebook).
answers = []
for i in range(200):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    thesis_answer = thesisGeneration(first_queries[i], merged_options)
    answers.append(thesis_answer)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Unknown attribute 'append' of type ListType[Tuple(array([unichr x 2], 1d, A), array(int64, 1d, A))]

File "../nlp-env/lib64/python3.9/site-packages/numba/typed/typedlist.py", line 82:
def _append(l, item):
    l.append(item)
    ^

During: typing of get attribute at /orfeo/cephfs/home/dssc/scandu00/nlp-env/lib64/python3.9/site-packages/numba/typed/typedlist.py (82)

File "../nlp-env/lib64/python3.9/site-packages/numba/typed/typedlist.py", line 82:
def _append(l, item):
    l.append(item)
    ^


In [65]:
answers

["'Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.'",
 "'A contamination which is associated with the food itself and not through other causes of contamination.'",
 "'20-25 minutes'",
 "'11 to 22 per square foot'",
 "'Due to symptoms in the body'"]

## 🔸 PromptTemplate definition and a LLMChain for the **antithesis** 

In [23]:
from langchain.llms import HuggingFacePipeline

# Free-text generation pipeline used for the antithesis (critique) step; it reuses
# the already loaded model and tokenizer.
# NOTE(review): do_sample=False is combined with temperature/top_p, which are sampling
# settings — presumably they have no effect here; confirm and consider dropping them.
antithesis_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    # the output includes the prompt; extract_answer_ant searches the returned text
    # for the prompt's closing line to locate where the critique starts
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

# Wrap the pipeline so it can be used as the last step of a LangChain chain
antithesis_llm = HuggingFacePipeline(pipeline=antithesis_pipeline)

In [24]:
from langchain import PromptTemplate
# Antithesis prompt: the placeholders are question, candidate_answer, option_a, option_b,
# option_c and context. Its last line ("Why or why not the answer is correct:") is the
# marker that extract_answer_ant looks for in the generated text.
# NOTE(review): this rebinds the name prompt_template used earlier for the thesis prompt.
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Given this question: {question} you have to check is the following candidate answer: {candidate_answer}.
    You have to answer by saying 'Yes' if it is correct and 'No' otherwise, then explain why you think so. 
    Think carefully of the all the possible options: {option_a}, {option_b}, {option_c} and say if one of them seems to you more appropriate than the candidate.
    Here is context to help: {context} \n
    Why or why not the answer is correct:
 """
)

In [25]:
# LLM chain definition
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Mapping that pulls the keys required by the antithesis prompt out of the input dict
# (this rebinds the name 'augmentation' used for the thesis chain)
augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "candidate_answer": itemgetter("candidate_answer"),
                "context": itemgetter("context"), }

# Unlike the thesis chain, this one ends with the LLM, so invoking it yields generated text
antithesis_chain = augmentation | prompt_template | antithesis_llm

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [21]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer):
    """Generate the antithesis (critique) of a candidate answer.

    Parameters:
        query: question text, used both for retrieval and inside the prompt.
        prompt_template: accepted but not used by the body; the prompt is the one
            already baked into the global ``antithesis_chain``.
        merged: indexable holding the three answer options at positions 0, 1 and 2.
        candidate_answer: the answer under review (the thesis).

    Returns:
        Whatever ``antithesis_chain.invoke`` returns for these inputs.
    """
    # Retrieve supporting passages and render them as a numbered list
    context_block = format_page_content(retriever.invoke(query))

    chain_inputs = {
        'question': query,
        'option_a': merged[0],
        'option_b': merged[1],
        'option_c': merged[2],
        'candidate_answer': candidate_answer,
        'context': context_block,
    }
    return antithesis_chain.invoke(chain_inputs)

In [19]:
def extract_answer_ant(text):
    """Return the part of ``text`` that follows the antithesis prompt marker.

    The marker is the literal "Why or why not the answer is correct:". Everything
    after its first occurrence is returned with surrounding whitespace stripped.
    When the marker is absent, a fixed fallback message is returned instead.
    """
    marker = "Why or why not the answer is correct:"
    # partition splits at the first occurrence; an empty separator means "not found"
    _, separator, remainder = text.partition(marker)
    if not separator:
        return "The correct answer could not be found."
    return remainder.strip()

In [22]:
# Antithesis pass: critique the thesis answer of each of the 200 test queries.
# Requires 'answers' (thesis pass) and 'antithesis_chain' to exist already.
ant_answers = []
for i in range(200):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    raw_critique = antithesisGeneration(first_queries[i], prompt_template, merged_options, answers[i])
    # keep only the text generated after the prompt's closing marker
    ant_answers.append(extract_answer_ant(raw_critique))

NameError: name 'antithesis_chain' is not defined

In [82]:
ant_answers

['Yes - This option matches exactly how Results Based Accounting (RBA) works according to their website description provided above; they state themselves being an organization focused on helping businesses create better user experience through strategic planning using both tech solutions while also providing evidence based results tracking for accountable decision making purposes within organizations across industries such as healthcare providers etcetera thus align perfectly well together under umbrella term called Digital Transformation Consultants who focus solely upon enhancing customer satisfaction via seamless integration between human centered approach alongside cutting edge technologies designed exclusively around meeting client needs more efficiently than ever before possible!',
 'I would say yes because according to what has been provided in the text "Ronald Regan" as an example for someone who switched from being Democratic party member too republicans.',
 'I agree with your

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [None]:
from langchain import PromptTemplate
# Synthesis prompt: the placeholders are question, option_a, option_b, option_c,
# candidate_answer, critique and context.
# NOTE(review): this rebinds the name prompt_template once more; the chain built
# right after this cell picks up this definition.
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to answer a certain question, given a certain number of candidate options and the context.
    You are also provided with an initial response and its critique, that could enforce or not the first opinion.
    Make a reasonable synthesis of these two opinions, but answer by outputting exactly one of the answer options only.
    Given this question: {question} you have to answer given only one of the following options: {option_a}, {option_b}, {option_c}. \n
    The answer that you have to check is {candidate_answer} and this is its critique: {critique}. 
    If the critique is not happy with the previous answer, correct it with another of the available options.
    Here is context to help: {context} \n
    The answer is:
 """
)

In [None]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Mapping that pulls the keys required by the synthesis prompt out of the input dict
# (this rebinds the name 'augmentation' once more)
augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

# No LLM is attached: the chain only fills in the prompt; generation is done
# separately by the constrained generator in synthesisGeneration
synthesis_chain = augmentation | prompt_template 

## 🔺 Function to generate the synthesis given literally everything

In [None]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique):
    """Produce the synthesis: a final pick among the three options.

    Parameters:
        query: question text, used both for retrieval and inside the prompt.
        prompt_template: accepted but not used by the body; the prompt is the one
            already baked into the global ``synthesis_chain``.
        merged: indexable holding the three answer options at positions 0, 1 and 2.
        candidate_answer: the initial answer (thesis).
        critique: the critique of that answer (antithesis).

    Returns:
        The value returned by the ``outlines.generate.choice`` generator, which is
        built over exactly those three options and run on the cleaned prompt text.
    """
    # Retrieve supporting passages and render them as a numbered list
    context_block = format_page_content(retriever.invoke(query))

    prompt_inputs = {
        'question': query,
        'option_a': merged[0],
        'option_b': merged[1],
        'option_c': merged[2],
        'candidate_answer': candidate_answer,
        'critique': critique,
        'context': context_block,
    }
    prompt_value = synthesis_chain.invoke(prompt_inputs)
    prompt_text = clean_text(prompt_value.text)

    # Constrain generation to the three candidate options
    choose_option = outlines.generate.choice(new_model, [merged[0], merged[1], merged[2]])
    return choose_option(prompt_text)

In [None]:
# Synthesis pass: combine thesis and antithesis into a final answer for each
# of the 200 test queries. Requires 'answers' and 'ant_answers' to be filled.
syn_answers = []
for i in range(200):
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    final_choice = synthesisGeneration(first_queries[i], prompt_template, merged_options, answers[i], ant_answers[i])
    syn_answers.append(final_choice)

In [None]:
# Gather the per-query results column-wise (query, gold answer and the three stages),
# ready to be turned into a DataFrame
dataset_new = {'query': first_queries, 'correct answer': correct_answers, 'thesis': answers, 'antithesis': ant_answers, 'synthesis': syn_answers}

In [None]:
import pandas as pd

df = pd.DataFrame(dataset_new)

In [None]:
df.head()

In [117]:
df.to_csv('first_output.csv')