# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset containing the tuples `(query, correct_answer, distractor_1, distractor_2)` and the one containing the documents

In [1]:
from datasets import load_dataset

dataset = load_dataset('saracandu/msmarco_modified', split="train", trust_remote_code=True)
dataset

Dataset({
    features: ['Unnamed: 0', 'answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'correct_answer', 'distractor_1', 'distractor_2'],
    num_rows: 82326
})

In [2]:
from langchain.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader('saracandu/msmarco_filtered', 'passage_text')
documents = loader.load()
documents[0] # just to check



Document(page_content='"Since 2007, the RBA\'s outstanding reputation has been affected by the \'Securency\' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA\'s employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."')

## 🔹 Turn `documents` into a vector database using FAISS

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# create an instance of the RecursiveCharacterTextSplitter class with specific parameters
# (it splits text into chunks of 50 characters each with a 20-character overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

# 'documents' holds the text you want to split, split the text into documents using the text splitter
docs = text_splitter.split_documents(documents)

In [None]:
# choose an embedding method
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  
)

In [None]:
# embed the documents 'docs' into vectors using the embedding method specified by 'embedding'
# the result is stored in a FAISS index:
db = FAISS.from_documents(docs, embeddings)

# to avoid computing it each time (since the docs won't change), save the result in the storage
db.save_local(folder_path="faiss_db", index_name="MSMARCO_FaissIndex")

## 🔹 Upload the already existing vector database (if it exists)

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  
)

db = FAISS.load_local(
    folder_path="faiss_db", # where to find it
    embeddings=embeddings, # in which "embedding language" it is expressed
    index_name="MSMARCO_FaissIndex", # since the folder contains multiple vector databases, specify its name
    allow_dangerous_deserialization=True
)

## 🔹 Use it as a `retriever`

**Note:** `'k'=10` specifies the number of documents to retrieve each time `retrieved` is invoked. 
The default type of search performed is `similarity`.

In [6]:
retriever = db.as_retriever(
    search_kwargs={'k': 10,}
) 

Why `'k'=10`? Because MSMARCO assigns to each `(query, answer)` pair 10 text passages, and only 1 or 2 of these are truly relevant. 
In this first step of analysis I chose not to create `len(dataset)` different vector databases, one for each `(query, answer)` pair, but instead to merge all the passages together and store them into an unique vector database. 


**SE `'k'=10` SBAGLIA ALCUNE RISPOSTE! SE LO ABBASSI A `3` O A `4` NO :)**

# Model part (`Llama-2-7b-chat-hf`)

## 🔹 Upload the model: 

In [None]:
# do not run this unless necessary!

from huggingface_hub import login
login()

In [7]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

#################################################################
# Tokenizer
#################################################################

model_name="meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 🔹 Pipeline for **thesis** generation

In [26]:
from langchain.llms import HuggingFacePipeline

response_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation", # deve matchare la scheda del modello HF o dà errore
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

## 🔹 Select a subset of the true dataset as a test

In [27]:
# select a subset of the queries, just for test:
first_queries = dataset['query'][:5]
first_queries

['what is rba',
 'was ronald reagan a democrat',
 'how long do you need for sydney and surrounding areas',
 'price to install tile in shower',
 'why conversion observed in body']

In [28]:
# same for correct answers and distractors:
correct_answers = dataset['correct_answer'][:5]
distractors_1 = dataset['distractor_1'][:5]
distractors_2 = dataset['distractor_2'][:5]

## 🔹 Merge the true answer and the distractors into a vector, shuffling the order of the elements

In [29]:
# shuffles the order of the vector containing the correct answer and the two distractors
# returns another vector, shuffled
import random

def shuffleAnswers(correct_answer, distractor_1, distractor_2):
    merge_options = [correct_answer, distractor_1, distractor_2]
    random.shuffle(merge_options)
    return merge_options

## 🔹 Function to format them properly the retrieved documents

In [30]:
# auxiliary function to format properly the output of the retrieval step

def format_page_content(documents):
    """
    Formats the list of retrieved documents such that 'page_content', 'Documents', 'metadata' 
    words are removed and just the true content is kept.
    """
    formatted_output = ""
    for i, doc in enumerate(documents, start=1):
        content = doc.page_content.strip(" ")
        formatted_output += f"[{i}]: {content}\n"
    return formatted_output

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [31]:
# prompt template definition
# requires question, options (a string containing the possible options) and the context as input variables!

from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. 
    Answer with the correct option only and then stop.
    Given this question: {question} \n
    You must answer by choosing only one option above these: {option_a}, {option_b}, {option_c}. \n
    Here is context to help: {context} \n
    The correct answer is:
 """
)

In [32]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "context": itemgetter("context"), }

thesis_chain = augmentation | prompt_template | response_generation_llm

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [33]:
def thesisGeneration(query, prompt_template, merged):
    documents_retrieved = retriever.invoke(query)
    formatted_context = format_page_content(documents_retrieved)
    
    given_answer = thesis_chain.invoke({'question': query, 
                                     'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 
                                     'context': formatted_context})
    return given_answer

In [34]:
# extract the true answer (i.e. remove the unnecessary)

def extract_answer(text):
    # trova l'indice in cui inizia il testo "the correct answer is:"
    start_index = text.find("The correct answer is:") + len("The correct answer is:")
    # estrai il testo dopo "The correct answer is:"
    correct_answer_text = text[start_index:].strip()
    
    return correct_answer_text

## 🔹 Test: how well the thesis alone is able to perform?

In [39]:
answers = []
for i in range(5):
    print(f"True answer: {correct_answers[i]}")
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    answers.append(extract_answer(thesisGeneration(first_queries[i], prompt_template, merged_options)))
    print(f"Given answer: {extract_answer(thesisGeneration(first_queries[i], prompt_template, merged_options))}")
    print('****************')

True answer: ['Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.']




Given answer: Results-Based Accountability
****************
True answer: ['Yes']




Given answer: Option B - No
****************
True answer: ['20-25 minutes']




Given answer: ['Longer']
****************
True answer: ['$11 to $22 per square foot']




Given answer: They are fatty acids that have one double bond in the fatty acid chain with all of the remainder carbon atoms being single-bonded..
****************
True answer: ['Due to symptoms in the body']




Given answer: Due to symtions in th ebody
****************


In [40]:
answers

['Results-Based Accountability',
 'Option B - No',
 "['Longer']",
 'They are fatty acids that have one double bond in the fatty acid chain with all of the remainder carbon atoms being single-bonded..',
 'Due to symtions in th ebody']

## 🔹 Pipeline for **antithesis** generation

In [42]:
response_check_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.5,
    return_full_text=True,
    max_new_tokens=400,
    top_p=0.0
)

response_check_llm = HuggingFacePipeline(pipeline=response_check_pipeline)

## 🔹 PromptTemplate definition and a LLMChain for the **antithesis** 

In [43]:
from langchain import PromptTemplate
prompt_template = PromptTemplate.from_template(
"""
    You're a helpful assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: {question} \n 
    These are the possible options: {option_a}, {option_b}, {option_c} \n.
    The answer that you have to check is {candidate_answer}. 
    Here is context to help: {context} \n
    Is the answer correct? Why or why not?
 """
)

In [44]:
# LLM chain definition

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

augmentation = {"question": itemgetter("question"),
                "option_a": itemgetter("option_a"), 
                "option_b": itemgetter("option_b"),
                "option_c": itemgetter("option_c"),
                "candidate_answer": itemgetter("candidate_answer"),
                "context": itemgetter("context"), }

antithesis_chain = augmentation | prompt_template | response_check_llm

## 🔹 Function to generate the antithesis given the question, the thesis, the context and the options

In [45]:
def antithesisGeneration(query, prompt_template, merged, candidate_answer):
    documents_retrieved = retriever.invoke(query)
    formatted_context = format_page_content(documents_retrieved)
    
    second_answer = antithesis_chain.invoke({'question': query, 
                                            'option_a': merged[0], 'option_b': merged[1], 'option_c': merged[2], 
                                            'candidate_answer': candidate_answer,
                                            'context': formatted_context})
    return second_answer

In [46]:
ant_answers = []
for i in range(5):
    print(f"True answer: {correct_answers[i]}")
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    ant_answers.append(extract_answer(antithesisGeneration(first_queries[i], prompt_template, merged_options, answers[i])))
    print(f"Given answer: {extract_answer(antithesisGeneration(first_queries[i], prompt_template, merged_options, answers[i]))}")
    print('****************')

True answer: ['Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.']




Given answer: assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: what is rba 
 
    These are the possible options: ['Webbed feet'], ['Results-Based Accountability is a disciplined way of thinking and taking action that communities can use to improve the lives of children, youth, families, adults and the community as a whole.'], ["'Other Allowance' which is basically to compensate all and any kind of allowances which is required to be paid at different regions/localities of the various project sites."] 
.
    The answer that you have to check is Results-Based Accountability. 
    Here is context to help: [1]: "Get To Know Us. RBA is a digital and technology consultancy with roots in strategy, design and technology. Our team of specialists help progressive companies deliver modern digital experiences backed



Given answer: assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: was ronald reagan a democrat 
 
    These are the possible options: ['A contamination which is associated with the food itself and not through other causes of contamination.'], ['50, 55, 60, 65 and 70 °C'], ['Yes'] 
.
    The answer that you have to check is Option B - No. 
    Here is context to help: [1]: "From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic 'malaise' of the 1970s and the presidency of Jimmy Carter (ev



Given answer: assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: how long do you need for sydney and surrounding areas 
 
    These are the possible options: ['Yes'], ['20-25 minutes'], ['Oatmeal, beans, apples, pears, barley and prunes.'] 
.
    The answer that you have to check is ['Longer']. 
    Here is context to help: [1]: a Few-sydney area recommendations on where to get). them"
[2]: Highway it is 969km from Sydney to Brisbane. That will take 48 and a half days travelling 20km per day. If you cover 30km per day it's over 32 days and 40km a day will get you there in just over 24"
[3]: "The Sydney central business district, Sydney harbour and outer suburbs from the West. North Sydney 's commercial district. The extensive area covered by urban Sydney is formally divided into more than 300 suburbs for a



Given answer: assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: price to install tile in shower 
 
    These are the possible options: ['$11 to $22 per square foot'], ['They are fatty acids that have one double bond in the fatty acid chain with all of the remainder carbon atoms being single-bonded.'], ['Honolulu'] 
.
    The answer that you have to check is They are fatty acids that have one double bond in the fatty acid chain with all of the remainder carbon atoms being single-bonded... 
    Here is context to help: [1]: "1 For tile shower installation, you\u2019re probably looking at a minimum cost of at least $1,500, although intricate, labor-heavy designs and add-ons (such as new faucets and a tub) could easy bring the price up to $5,000 or more."
[2]: "In regards to tile installation costs, consumers



Given answer: assistant and you are asked to check whether or not a question was answered correctly, given a certain number of candidate options and the context. 
    Answer whether and why you think the answer is correct.
    Given this question: why conversion observed in body 
 
    These are the possible options: ['Due to symptoms in the body'], ['A chief engineer is responsible for all operations and maintenance that has to do with any and all engineering equipment throughout the entire ship.', 'The chief engineer is responsible for the technical supervision of the development, production or operation of an engineering project for a multinational corporation, a major company or a government institution.'], ['Nigeria/Cameroon'] 
.
    The answer that you have to check is Due to symtions in th ebody. 
    Here is context to help: [1]: "Conversion disorder is a type of somatoform disorder where physical symptoms or signs are present that cannot be explained by a medical condition. Ve



In [None]:
for i in range(5):
    print(f"True answer: {correct_answers[i]}")
    merged_options = shuffleAnswers(correct_answers[i], distractors_1[i], distractors_2[i])
    print(f"Given answer: {extract_answer(antithesisGeneration(first_queries[i], prompt_template, merged_options))}")
    print('****************')

In [None]:
import re

# Use regular expression to extract the correct answer
correct_answer_match = re.search(r"The correct answer is:\n\s*\([A-Z]\)", given_answer)

if correct_answer_match:
    correct_answer = correct_answer_match.group(1)
    print(f"The correct answer is: {correct_answer}")
else:
    print("Correct answer not found.")

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

augmentation = {"options": itemgetter("options"), "context": itemgetter("context"), 
                "question": itemgetter("question"), "candidate_answer": itemgetter("candidate_answer")}
llm_chain = augmentation | prompt_template | response_check_llm

In [None]:
llm_chain.invoke({'question': "What is the name of the potion allowing to change appearance?", 
                  'options': "A. Felix Felicis; B. Polyjuice Potion; C. Amortentia", 
                  'context': formatted_context,'candidate_answer': 'B'})

In [None]:
Il problema è che non hai ancora capito come formattare

##  Transformers pipeline (and zero-shot):

In [None]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. Answer with the correct option only and then stop."},
    {"role": "user", "content": "Given this question: Who is Taylor Swift? \n you must answer by choosing only one option above these: A. a snowboard; B. a cat; C. a singer. The correct answer is:"},
]

prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False, 
        add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])


##  Transformers AutoModelForCausalLM  (and zero-shot)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You're a helpful assistant and you are asked to answer a question correctly, given a certain number of options. Answer with the correct option only and then stop."},
    {"role": "user", "content": "Given this question: Who is Taylor Swift? \n you must answer by choosing only one option above these: A. a snowboard; B. a cat; C. a singer. The correct answer is:"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))