# RAG Evaluation
- Original nb from https://huggingface.co/learn/cookbook/en/rag_evaluation#evaluating-rag-performance.   Going fully open source, local, optimized for speed with exl2

<img src="https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/RAG_workflow.png" height="700">

In [1]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import os
import sys, os
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Get the current working directory
cwd = os.getcwd()

# Add the parent directory to the system path
sys.path.append(os.path.dirname(cwd))
from scripts import utils
pd.set_option("display.max_colwidth", None)

# 1. Build a synthetic dataset for evaluation

### 1.1. Prepare source documents

- Drop REFERENCES onward, respecting previous discoveries that these just introduce noise to retrieval.  Must process new docs the same way!

In [2]:
os.getcwd()

'/home/mainuser/Desktop/LLMs/RagOverArXiv/nbs'

In [3]:
#%%writefile -a ../scripts/get_vector_store.py

import os
import PyPDF2
from PyPDF2 import PdfReader
from pathlib import Path
FILES_PATH = Path('../data/pdfs_ws_mrkp_test/pdfs')
FILES = list(FILES_PATH.glob('*.pdf'))
FILES[0], len(FILES)

reader = PdfReader(os.path.expanduser(FILES[0]))
pages = reader.pages
documents = []
for page in pages:
  documents.append(page.extract_text())


def load_pdf_to_string(pdf_path):
    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as file:
        # Create a PDF file reader object
        pdf_reader = PyPDF2.PdfReader(file)

        # Initialize an empty string to hold the text
        text = ''

        # Loop through each page and extract the text
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            references_index= page_text.upper().find('\nREFERENCES\n')
            if references_index != -1:
              page_text = page_text[:references_index]
              text += page_text
              return text
            text += page_text
    return text

# Use the function to load a PDF into a string
text = load_pdf_to_string(os.path.expanduser(FILES[0]))
def get_title(pdf_path): return os.path.expanduser(pdf_path).split('/')[-1]

all_docs_and_titles = [(load_pdf_to_string(os.path.expanduser(pdf_path)),get_title(pdf_path)) for pdf_path in FILES]

all_docs = [doc[0] for doc in all_docs_and_titles]
all_titles = [doc[1] for doc in all_docs_and_titles]

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document 


CHUNK_SIZE = 2000 #try 2000 next
CHUNK_OVERLAP = 200 #try 200 next

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    length_function=len,
)

docs_processed  = [text_splitter.split_documents([Document(page_content=doc, metadata={'filename':FILES[idx].name,'title':utils.get_title(FILES[idx].name.split('_')[0],use_logging=False)})]) 
         for idx,doc in enumerate(all_docs)]


# docs_processed = []
# for i in range(len(FILES)):
#     doc = text_splitter.create_documents([FILES[i].read_text()],metadatas=[{'filename':FILES[i].name,'title':utils.get_title(FILES[i].name.split('_')[0])}])
#     docs_processed.extend(doc)

In [4]:
def get_docs_from_pdf(files):
    all_docs = [load_pdf_to_string(os.path.expanduser(pdf_path)) for  pdf_path in files]
    docs_processed  = [text_splitter.split_documents([Document(page_content=doc, metadata={'filename':files[idx].name,'title':utils.get_title(files[idx].name.split('_')[0],use_logging=False)})]) 
            for idx,doc in enumerate(all_docs)]
    docs_processed = [txt for doc in docs_processed for txt in doc]
    return docs_processed
docs_processed = get_docs_from_pdf(FILES)

In [5]:
len(docs_processed)

262

In [7]:
pd.DataFrame([d.metadata['title'] for d in docs_processed],columns=['title']).value_counts()

tilte                                                                                     
SequenceMatch: Imitation Learning for Autoregressive Sequence Modelling  with Backtracking    69
Self-RAG: Learning to Retrieve, Generate, and Critique through  Self-Reflection               26
Tree of Thoughts: Deliberate Problem Solving with Large Language Models                       26
RewardBench: Evaluating Reward Models for Language Modeling                                   25
How to Train Data-Efficient LLMs                                                              22
KTO: Model Alignment as Prospect Theoretic Optimization                                       22
LLM Augmented LLMs: Expanding Capabilities through Composition                                21
ORPO: Monolithic Preference Optimization without Reference Model                              21
RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder  Language Models          20
TinyLlama: An Open-Source Small Lang

- Make embeddings and save to vector store
- Currently taking from Part3_Metadata+ArXivExplore_single_source nb, should tune

In [8]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")

#embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
embed_model_id = 'mixedbread-ai/mxbai-embed-large-v1'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)


from langchain.schema.document import Document

# for index, pdf in enumerate(docs_processed):
#    pdf.metadata['vs_index'] = index
#    content = docs_processed[index]
#    if index == 0:
#        vector_store = FAISS.from_documents([content], embedder)
#    else:
#       vector_store_i = FAISS.from_documents([content], embedder)
#       vector_store.merge_from(vector_store_i)

# vector_store
docs = docs_processed
for i, doc in enumerate(docs):
        doc.metadata['vs_index'] = i
        if i == 0:
            vector_store = FAISS.from_documents([doc], embedder)
        else:
            vector_store_i = FAISS.from_documents([doc], embedder)
            vector_store.merge_from(vector_store_i)


In [10]:
vs_indices = [doc.metadata['vs_index'] for doc in docs_processed]
for page in docs_processed:
  print(page.page_content)
  print(page.metadata)

Tree of Thoughts: Deliberate Problem Solving
with Large Language Models
Shunyu Yao
Princeton UniversityDian Yu
Google DeepMindJeffrey Zhao
Google DeepMindIzhak Shafran
Google DeepMind
Thomas L. Griffiths
Princeton UniversityYuan Cao
Google DeepMindKarthik Narasimhan
Princeton University
Abstract
Language models are increasingly being deployed for general problem solving
across a wide range of tasks, but are still confined to token-level, left-to-right
decision-making processes during inference. This means they can fall short in
tasks that require exploration, strategic lookahead, or where initial decisions play
a pivotal role. To surmount these challenges, we introduce a new framework for
language model inference, “Tree of Thoughts” (ToT), which generalizes over the
popular “Chain of Thought” approach to prompting language models, and enables
exploration over coherent units of text (“thoughts”) that serve as intermediate steps
toward problem solving. ToT allows LMs to perform deliberat

#### TODO: Once have the pipeline working and have a baseline, look into https://huggingface.co/spaces/mteb/leaderboard.  SFR-Embedding-Mistral or something along those lines may work much better.

In [11]:
vector_store.save_local('../data/rag_index_dir')

### 1.2. Setup agents for question generation

- HF used [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) for QA couple generation because it it has excellent performance in leaderboards such as [Chatbot Arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard).  Used Mixtral-8x7b-4bit exl2, and it did not appear significantly better than Mistral, so using Mistral for speed but may come back to this.

#### TODO: Tried Mixtral4Bit, visually perhaps a bit better, but overfits on 'deep question'. Perhaps return to this after looking at embedding model

In [13]:
from exllamav2 import *
from exllamav2.generator import *
import sys, torch


generator_config = ExLlamaV2Config()
generator_config.model_dir = "../../MiStralInference"
generator_config.prepare()

generator_model = ExLlamaV2(generator_config)
cache = ExLlamaV2Cache(generator_model, lazy = True)

print("Loading model...")
generator_model.load_autosplit(cache)

generator_tokenizer = ExLlamaV2Tokenizer(generator_config)
generator_llm = ExLlamaV2StreamingGenerator(generator_model, cache, generator_tokenizer)
generator_llm.set_stop_conditions([generator_tokenizer.eos_token_id])
generator_settings = ExLlamaV2Sampler.Settings()
generator_settings.temperature = 0.85
generator_settings.top_k = 50
generator_settings.top_p = 0.8
generator_settings.token_repetition_penalty = 1.01
#generator_settings.disallow_tokens(generator_tokenizer, [generator_tokenizer.eos_token_id])
# see if commenting out the above solved the endless generation issue (did not have with stream generator)

Loading model...


In [14]:
#Working except eos
from transformers import Pipeline
from ragatouille import RAGPretrainedModel
from typing import Optional, List, Tuple
from langchain.docstore.document import Document
import time

RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
from langchain.docstore.document import Document as LangchainDocument
def call_llm(
    question: str,
    generator: ExLlamaV2StreamingGenerator,
    tokenizer: ExLlamaV2Tokenizer,
    settings:ExLlamaV2Sampler.Settings,
    max_new_tokens = 512
) -> Tuple[str, List[LangchainDocument]]:

    max_new_tokens = max_new_tokens

    generator.warmup()
    output = generator.generate_simple(f"<s>[INST] {question} [/INST]", settings, max_new_tokens, seed = 1234)
    return output


call_llm(question="How can I get my cat to like me?", generator=generator_llm,tokenizer=generator_tokenizer,settings=generator_settings,max_new_tokens=1024)

"<s>[INST] How can I get my cat to like me? [/INST] 1. Spend time with your cat: Cats enjoy spending time with their owners, so take some time to play with them, cuddle with them, and give them attention.\n2. Provide food and shelter: Make sure your cat has a comfortable place to sleep, eat, and drink water.\n3. Use positive reinforcement: Reward your cat with treats, praise, and affection when they behave well around you.\n4. Be patient: Cats can take time to warm up to new people, so be patient and give them time to get used to you.\n5. Show respect: Cats are independent creatures, so it's important to show respect for their boundaries and allow them to make their own choices.\n6. Keep your environment clean: Cats are clean animals, so make sure their litter box is clean and accessible at all times.\n7. Provide toys and scratching posts: Cats love to play and scratch, so provide them with toys and scratching posts to keep them entertained.\n8. Be consistent: Consistency is key when i

In [15]:
QA_generation_prompt = """
Your task is to write a deep factual or conceptual question and an answer given a context.
Your deep question should be unambigiously answerable from the context.
Your deep question should be formulated in the same style as questions people reading advanced LLM papers would ask.
This means that your question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Deep question: (your deep question)
Answer: (your answer to the deep question)

Now here is the context.

Context: {context}\n
Output:::"""

Now let's generate our QA couples.
For this example, we generate only 10 QA couples and will load the rest from the Hub.

But for your specific knowledge base, given that you want to get at least ~100 test samples, and accounting for the fact that we will filter out around half of these with our critique agents later on, you should generate much more, in the >200 samples.

In [25]:
import random
from tqdm import tqdm
N_GENERATIONS = 10  # We intentionally generate only 10 QA couples here for cost and time considerations

print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    # output_QA_couple = call_llm(
    #     llm_client, QA_generation_prompt.format(context=sampled_context.page_content)
    # )
    output_QA_couple = call_llm(question=QA_generation_prompt.format(context=sampled_context.page_content), generator=generator_llm,tokenizer=generator_tokenizer,settings=generator_settings,
                                max_new_tokens=1024)
    try:
        question = output_QA_couple.split("Deep question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        #assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sampled_context.page_content,
                "question": question,
                "answer": answer,
                "source_doc": sampled_context.metadata["title"],
            }
        )
    except:
        continue

Generating 5 QA couples...


100%|██████████| 5/5 [00:10<00:00,  2.06s/it]


In [26]:
outputs[:2]

[{'context': 'reasoning or code. Full reasoning results are included in Tab. 11.\nEvaluating across Safety Metrics Tab. 6 (full results in Tab. 10 in Appendix) compares dif-\nferent reward models across different safety categories, indicating challenges on striking a bal-\nance between refusing too much or not refusing. Models, such as zephyr-7b-beta and\nzephyr-7b-gemma-v0.1 show how a model focused on helpfulness without a strong notion of\nsafety will score poorly on the should-refuse subsets of the safety section, but highly on XSTest\nShould Respond . Other models, namely those at the top of the overall leaderboard, clearly in-\nclude safety information in the training process andmaintain strong performance on trick questions\nthat could induce false refusals ( XSTest Should Respond ). Finally, the third option is also repre-\nsented in models – those that score highly on prompts that they should refuse and poorly on those\nthey should not, indicating a model that is likely to fal

In [27]:
import pandas as pd
pd.set_option('display.max_colwidth',800)
display(pd.DataFrame(outputs).head(2))

Unnamed: 0,context,question,answer,source_doc
0,"reasoning or code. Full reasoning results are included in Tab. 11.\nEvaluating across Safety Metrics Tab. 6 (full results in Tab. 10 in Appendix) compares dif-\nferent reward models across different safety categories, indicating challenges on striking a bal-\nance between refusing too much or not refusing. Models, such as zephyr-7b-beta and\nzephyr-7b-gemma-v0.1 show how a model focused on helpfulness without a strong notion of\nsafety will score poorly on the should-refuse subsets of the safety section, but highly on XSTest\nShould Respond . Other models, namely those at the top of the overall leaderboard, clearly in-\nclude safety information in the training process andmaintain strong performance on trick questions\nthat could induce false refusals ( XSTest Should Respond ). Finally,...","Based on the provided context, how does the R EWARD BENCH evaluate models' safety behavior and what is its significance in identifying potential false refusal queries?\n\n","The R EWARD BENCH evaluates models' safety behavior by assessing their performance on different subsets of the safety section and their ability to distinguish between prompts they should refuse and those they should not. Models that score highly on prompts they should refuse and poorly on those they should not are likely to falsely refuse queries. The significance of the R EWARD BENCH lies in its ability to quickly identify potential false refusal queries, especially when trained with DPO. This can be especially useful in the context of reward models, where striking a balance between refusing too much or not refusing is a challenge.",RewardBench: Evaluating Reward Models for Language Modeling
1,"derman et al., 2023) and Llama- {7B, 13B, 30B }(Touvron\net al., 2023). This permits us to see how LLM alignment\nscales within a model family (Llama-2 lacks a 30B model,\nhence our use of Llama). Later experiments ( §4.2) are done\non Mistral-7B and its derivatives (Jiang et al., 2023). The\nmodels were trained on a combination of Anthropic HH\n(Ganguli et al., 2022), OpenAssistant (K ¨opf et al., 2023),\nand SHP (Ethayarajh et al., 2022).\nAll models were aligned under identical settings on the\nsame data (e.g., same effective batch size, same optimizer,\netc.), save for hyperparameters unique to them. Similar to\nRafailov et al. (2023), the target sequences for SFT are a\nsubset of the generations used to subsequently align the\nmodel; however, for a more realistic SFT setup, we do ...","Which existing alignment methods outperform non-HALOs in generating high-quality responses, and how do the HALO-aligned Llama-{13B, 30B} models compare to the SFT target sequences in terms of helpfulness, harmlessness, and conciseness?\n\n","Among existing alignment methods, the HALOs (DPO and our offline PPO variant) generally outperform non-HALOs (SLiC and CSFT). However, the gap is only significant (p < 0.05) at 13B+ model sizes. The HALO-aligned Llama-{13B, 30B} models are able to match or exceed the generation quality of SFT target sequences, which are drawn directly from the alignment dataset. Up to a scale of 7B parameters, virtually all of the gains from LLM alignment come from the SFT stage. When judging whether the aligned model's response was better than the SFT target for the given input with respect to helpfulness, harmlessness, and conciseness, a now standard practice is to use a standard analysis that considers the SFT target as a desirable output for x, but not necessarily the best output, meaning that it c...",KTO: Model Alignment as Prospect Theoretic Optimization


### 1.3. Setup critique agents

In [28]:
#%%writefile -a ../scripts/critique_qa.py
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [29]:
#%%writefile -a ../scripts/critique_qa.py
print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(question=question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]), 
                                generator=generator_llm,
                                tokenizer=generator_tokenizer,settings=generator_settings,
                                max_new_tokens=1024),
        "relevance": call_llm(question=question_relevance_critique_prompt.format(question=output["question"]), 
                                generator=generator_llm,
                                tokenizer=generator_tokenizer,settings=generator_settings,
                                max_new_tokens=1024),
                    
        "standalone": call_llm(question=question_standalone_critique_prompt.format(question=output["question"]),
                                generator=generator_llm,
                                tokenizer=generator_tokenizer,settings=generator_settings,
                                max_new_tokens=1024)
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                # int(evaluation.split("Total rating: ")[-1].strip()),
                (evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval,
                }
            )
    except Exception as e:
        #print("\033[91m" + f"EVALUATION:" + "\033[0m")
        #print(evaluations)
        #print("\033[91m" + f"EXCEPTION: {e}" + "\033[0m")
        continue

Generating critique for each QA couple...


100%|██████████| 5/5 [00:21<00:00,  4.38s/it]


In [30]:
evaluations

{'groundedness': "<s>[INST] \nYou will be given a context and a question.\nYour task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.\nGive your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.\n\nProvide your answer as follows:\n\nAnswer:::\nEvaluation: (your rationale for the rating, as a text)\nTotal rating: (your rating, as a number between 1 and 5)\n\nYou MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.\n\nNow here are the question and context.\n\nQuestion: In a multi-turn instruction-following task, how does the performance of Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) compare to that of larger or proprietary models such as Llama-2-Chat (70B) and Claude V1?\n\n\n\nContext: Mistral (7B) with single-turn conversation dataset,\nUltraFeedback, a

Now let us filter out bad questions based on our critique agent scores:

In [31]:
outputs[0].keys()

dict_keys(['context', 'question', 'answer', 'source_doc', 'groundedness_score', 'groundedness_eval', 'relevance_score', 'relevance_eval', 'standalone_score', 'standalone_eval'])

In [32]:
#%%writefile -a ../scripts/critique_qa.py
import pandas as pd

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,"Based on the provided context, how does the R EWARD BENCH evaluate models' safety behavior and what is its significance in identifying potential false refusal queries?\n\n","The R EWARD BENCH evaluates models' safety behavior by assessing their performance on different subsets of the safety section and their ability to distinguish between prompts they should refuse and those they should not. Models that score highly on prompts they should refuse and poorly on those they should not are likely to falsely refuse queries. The significance of the R EWARD BENCH lies in its ability to quickly identify potential false refusal queries, especially when trained with DPO. This can be especially useful in the context of reward models, where striking a balance between refusing too much or not refusing is a challenge.","4.5\n\nThe context provides a clear explanation of how R EWARD BENCH evaluates models' safety behavior and its significance in identifying potential false refusal queries. The evaluation considers multiple safety categories and provides examples of how different models perform on these categories. Additionally, the context explains how R EWARD BENCH is designed to avoid length bias, which is a common issue in RLHF and reward models. Overall, the context provides a comprehensive understanding of how R EWARD BENCH evaluates models' safety behavior and its significance in identifying potential false refusal queries.","4.5\n\nThe R EWARD BENCH is a very useful tool for machine learning developers building NLP applications with the Hugging Face ecosystem. It provides a comprehensive evaluation of models' safety behavior and can help identify potential false refusal queries, which are an important issue in many NLP applications. The combination of rule-based and machine learning-based approaches used by the R EWARD BENCH also makes it a very versatile tool that can be used with a wide range of NLP models. Overall, the R EWARD BENCH is an extremely useful tool for machine learning developers building NLP applications with the Hugging Face ecosystem.",5.0
1,"Which existing alignment methods outperform non-HALOs in generating high-quality responses, and how do the HALO-aligned Llama-{13B, 30B} models compare to the SFT target sequences in terms of helpfulness, harmlessness, and conciseness?\n\n","Among existing alignment methods, the HALOs (DPO and our offline PPO variant) generally outperform non-HALOs (SLiC and CSFT). However, the gap is only significant (p < 0.05) at 13B+ model sizes. The HALO-aligned Llama-{13B, 30B} models are able to match or exceed the generation quality of SFT target sequences, which are drawn directly from the alignment dataset. Up to a scale of 7B parameters, virtually all of the gains from LLM alignment come from the SFT stage. When judging whether the aligned model's response was better than the SFT target for the given input with respect to helpfulness, harmlessness, and conciseness, a now standard practice is to use a standard analysis that considers the SFT target as a desirable output for x, but not necessarily the best output, meaning that it can be improved upon by an aligned model.",,,
2,What specific changes is Toyota implementing to its EV business environment and how does it differ from other companies that have recently embraced a hybrid fleet?\n\n,"Toyota is implementing changes to its executives' areas of responsibility that require disclosure of further changes to its policies and practices as the larger EV business evolves. The company has not had a hybrid supply chain for years and hybrid research is not allowed inside its laboratories. These changes will only come with the launch of a new integrated EV product division, which has not been finalized yet. Additionally, Toyota's ""Reuturistic and Singular Communication Systems"" are the industry's most ambitious ""big guy"" partnerships, and the company is opening the first car dealerships in the United States as the name suggests. There is no plan to extend out its RAV4 EV program beyond the University of Arizona system in Reno. Consumers are more willing to hop on the electric car bandwagon than at any time in history, and the shift to EV mandates has become what almost everyone has been dying to do for decades.",4.5,"4. This question is moderately useful for machine learning developers building NLP applications with the Hugging Face ecosystem, as it requires them to analyze and compare specific changes implemented by Toyota in its EV business environment with those of other companies that have recently embraced a hybrid fleet. However, the question could be more useful if it provided more specific details or criteria for evaluating the changes implemented by each company, or if it focused on a specific aspect of EV adoption, such as charging infrastructure or government policies.",
3,What is the estimated local density of an example using the kernel sum approach and the Densi-LLM method?\n\n,"The estimated local density of an example using the kernel sum approach and the Densi-LLM method is calculated by taking the kernel sum of the scores of all examples in the dataset. The kernel is a function that determines the similarity between two examples based on their embeddings. The smoothing parameter λ controls the scale of the points' effects and helps reduce the complexity from O(N^2) to O(NlogN). The method resembles that of Coleman et al. (2022), except that it uses a two-pass sampling algorithm with stronger theoretical guarantees and performs the density estimation in the latent space of the model.",,,
4,"In a multi-turn instruction-following task, how does the performance of Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) compare to that of larger or proprietary models such as Llama-2-Chat (70B) and Claude V1?\n\n","In the multi-turn instruction-following task, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) achieved comparable results to larger or proprietary models such as Llama-2-Chat (70B) and Claude V1. Specifically, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) scored 7.23 and 7.32 in MT-Bench without being exposed to the multi-turn conversation dataset.",4,,


In [33]:
#%%writefile -a ../scripts/critique_qa.py
generated_questions['groundedness_score']=generated_questions['groundedness_score'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
generated_questions['relevance_score']=generated_questions['relevance_score'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
generated_questions['standalone_score']=generated_questions['groundedness_score'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

In [34]:
generated_questions.head(2)

Unnamed: 0,context,question,answer,source_doc,groundedness_score,groundedness_eval,relevance_score,relevance_eval,standalone_score,standalone_eval
0,"reasoning or code. Full reasoning results are included in Tab. 11.\nEvaluating across Safety Metrics Tab. 6 (full results in Tab. 10 in Appendix) compares dif-\nferent reward models across different safety categories, indicating challenges on striking a bal-\nance between refusing too much or not refusing. Models, such as zephyr-7b-beta and\nzephyr-7b-gemma-v0.1 show how a model focused on helpfulness without a strong notion of\nsafety will score poorly on the should-refuse subsets of the safety section, but highly on XSTest\nShould Respond . Other models, namely those at the top of the overall leaderboard, clearly in-\nclude safety information in the training process andmaintain strong performance on trick questions\nthat could induce false refusals ( XSTest Should Respond ). Finally, the third option is also repre-\nsented in models – those that score highly on prompts that they should refuse and poorly on those\nthey should not, indicating a model that is likely to falsely refusal queries (for example, the Qwen\nchat models). These three behavior modes being represented indicates that R EWARD BENCH can\nbe used as a quick check of the safety behavior of a candidate model, especially when trained with\nDPO (as it will not need further RL training like the classifier models).\nEvaluating Length Bias Given the results showing length bias in RLHF and reward models (Sing-\nhal et al., 2023), we designed R EWARD BENCH so that the chosen responses are either a similar\nlength or shorter than the rejected responses. For example, the AlpacaEval Length subset is de-\nsigned to differentiate between other Chat subsets by having notably different models capabilities\nwith the same average length (results in Tab. 8). In this case, the results are lower than other easy\n10chat subsets, but 90% plus accuracy is achieved by over 10 models – far above random for most\nmodels. Though, more detailed statistical tests are needed to fully understand this, as this only tests","Based on the provided context, how does the R EWARD BENCH evaluate models' safety behavior and what is its significance in identifying potential false refusal queries?\n\n","The R EWARD BENCH evaluates models' safety behavior by assessing their performance on different subsets of the safety section and their ability to distinguish between prompts they should refuse and those they should not. Models that score highly on prompts they should refuse and poorly on those they should not are likely to falsely refuse queries. The significance of the R EWARD BENCH lies in its ability to quickly identify potential false refusal queries, especially when trained with DPO. This can be especially useful in the context of reward models, where striking a balance between refusing too much or not refusing is a challenge.",RewardBench: Evaluating Reward Models for Language Modeling,4.5,"R EWARD BENCH evaluates models' safety behavior by comparing their performance on different safety categories, such as should-refuse subsets, trick questions, and prompts that they should refuse. The evaluation considers how well the models balance refusing too much or not refusing, and how well they include safety information in the training process. Additionally, R EWARD BENCH is designed to avoid length bias by choosing responses that are either similar in length or shorter than the rejected responses.\n\n",4.5,"The R EWARD BENCH is a benchmark for evaluating the safety behavior of NLP models. It measures how well a model can correctly identify potential false refusal queries, which are queries that are intended to be accepted but are incorrectly marked as rejected. The R EWARD BENCH uses a combination of rule-based and machine learning-based approaches to evaluate models' safety behavior.\n\n",4.5,"The REWARD BENCH evaluates models' safety behavior by measuring their performance on a set of tasks that are designed to test their ability to recognize and respond to potentially harmful or malicious inputs. The benchmark assesses a model's ability to correctly identify and reject inputs that could cause harm to users or systems. The significance of this evaluation in identifying potential false refusal queries lies in the fact that false refusal queries can occur when a model is not able to correctly identify and reject potentially harmful inputs. By evaluating models' safety behavior using the REWARD BENCH, researchers can identify models that are more likely to generate false refusal queries and take steps to improve their performance.\n\n"
1,"derman et al., 2023) and Llama- {7B, 13B, 30B }(Touvron\net al., 2023). This permits us to see how LLM alignment\nscales within a model family (Llama-2 lacks a 30B model,\nhence our use of Llama). Later experiments ( §4.2) are done\non Mistral-7B and its derivatives (Jiang et al., 2023). The\nmodels were trained on a combination of Anthropic HH\n(Ganguli et al., 2022), OpenAssistant (K ¨opf et al., 2023),\nand SHP (Ethayarajh et al., 2022).\nAll models were aligned under identical settings on the\nsame data (e.g., same effective batch size, same optimizer,\netc.), save for hyperparameters unique to them. Similar to\nRafailov et al. (2023), the target sequences for SFT are a\nsubset of the generations used to subsequently align the\nmodel; however, for a more realistic SFT setup, we do not\nnecessarily set the most preferred generation to be the target\n(with the exception of HH, since the dispreferred output in\nthat dataset is often harmful). Then we used GPT-4-0613\n4Model Alignment as Prospect Theoretic Optimization\nFigure 3. Among existing alignment methods, the HALOs (DPO and our offline PPO variant) generally outperform non-HALOs (SLiC\nand CSFT), though the gap is only significant (p < 0.05)at 13B+ model sizes. In fact, only the HALO-aligned Llama- {13B, 30B }\nmodels are able to match or exceed the generation quality of SFT target sequences, which are drawn directly from the alignment dataset.\nIt is also worth noting that up to a scale of 7B parameters, virtually all of the gains from LLM alignment come from the SFT stage.\nto judge whether the aligned model’s response was bet-\nter than the SFT target for the given input with respect to\nhelpfulness, harmlessness, and conciseness, a now standard\npractice (Zheng et al., 2023; Li et al., 2023).3Note that\nwhile the SFT target is considered a desirable output for x,\nit is by no means the best output, meaning that it can be\nimproved upon by an aligned model.\nIn Figure 3, we see the results of this analysis:","Which existing alignment methods outperform non-HALOs in generating high-quality responses, and how do the HALO-aligned Llama-{13B, 30B} models compare to the SFT target sequences in terms of helpfulness, harmlessness, and conciseness?\n\n","Among existing alignment methods, the HALOs (DPO and our offline PPO variant) generally outperform non-HALOs (SLiC and CSFT). However, the gap is only significant (p < 0.05) at 13B+ model sizes. The HALO-aligned Llama-{13B, 30B} models are able to match or exceed the generation quality of SFT target sequences, which are drawn directly from the alignment dataset. Up to a scale of 7B parameters, virtually all of the gains from LLM alignment come from the SFT stage. When judging whether the aligned model's response was better than the SFT target for the given input with respect to helpfulness, harmlessness, and conciseness, a now standard practice is to use a standard analysis that considers the SFT target as a desirable output for x, but not necessarily the best output, meaning that it can be improved upon by an aligned model.",KTO: Model Alignment as Prospect Theoretic Optimization,,,,,,


In [None]:
#%%writefile -a ../scripts/critique_qa.py
generated_questions.to_csv("../data/pdfs_ws_mrkp_test/generated_questions_pdf_raw.csv", index=False)

In [35]:
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,"Based on the provided context, how does the R EWARD BENCH evaluate models' safety behavior and what is its significance in identifying potential false refusal queries?\n\n","The R EWARD BENCH evaluates models' safety behavior by assessing their performance on different subsets of the safety section and their ability to distinguish between prompts they should refuse and those they should not. Models that score highly on prompts they should refuse and poorly on those they should not are likely to falsely refuse queries. The significance of the R EWARD BENCH lies in its ability to quickly identify potential false refusal queries, especially when trained with DPO. This can be especially useful in the context of reward models, where striking a balance between refusing too much or not refusing is a challenge.",4.5,4.5,4.5
1,"Which existing alignment methods outperform non-HALOs in generating high-quality responses, and how do the HALO-aligned Llama-{13B, 30B} models compare to the SFT target sequences in terms of helpfulness, harmlessness, and conciseness?\n\n","Among existing alignment methods, the HALOs (DPO and our offline PPO variant) generally outperform non-HALOs (SLiC and CSFT). However, the gap is only significant (p < 0.05) at 13B+ model sizes. The HALO-aligned Llama-{13B, 30B} models are able to match or exceed the generation quality of SFT target sequences, which are drawn directly from the alignment dataset. Up to a scale of 7B parameters, virtually all of the gains from LLM alignment come from the SFT stage. When judging whether the aligned model's response was better than the SFT target for the given input with respect to helpfulness, harmlessness, and conciseness, a now standard practice is to use a standard analysis that considers the SFT target as a desirable output for x, but not necessarily the best output, meaning that it can be improved upon by an aligned model.",,,
2,What specific changes is Toyota implementing to its EV business environment and how does it differ from other companies that have recently embraced a hybrid fleet?\n\n,"Toyota is implementing changes to its executives' areas of responsibility that require disclosure of further changes to its policies and practices as the larger EV business evolves. The company has not had a hybrid supply chain for years and hybrid research is not allowed inside its laboratories. These changes will only come with the launch of a new integrated EV product division, which has not been finalized yet. Additionally, Toyota's ""Reuturistic and Singular Communication Systems"" are the industry's most ambitious ""big guy"" partnerships, and the company is opening the first car dealerships in the United States as the name suggests. There is no plan to extend out its RAV4 EV program beyond the University of Arizona system in Reno. Consumers are more willing to hop on the electric car bandwagon than at any time in history, and the shift to EV mandates has become what almost everyone has been dying to do for decades.",4.5,4.0,4.5
3,What is the estimated local density of an example using the kernel sum approach and the Densi-LLM method?\n\n,"The estimated local density of an example using the kernel sum approach and the Densi-LLM method is calculated by taking the kernel sum of the scores of all examples in the dataset. The kernel is a function that determines the similarity between two examples based on their embeddings. The smoothing parameter λ controls the scale of the points' effects and helps reduce the complexity from O(N^2) to O(NlogN). The method resembles that of Coleman et al. (2022), except that it uses a two-pass sampling algorithm with stronger theoretical guarantees and performs the density estimation in the latent space of the model.",,,
4,"In a multi-turn instruction-following task, how does the performance of Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) compare to that of larger or proprietary models such as Llama-2-Chat (70B) and Claude V1?\n\n","In the multi-turn instruction-following task, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) achieved comparable results to larger or proprietary models such as Llama-2-Chat (70B) and Claude V1. Specifically, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) scored 7.23 and 7.32 in MT-Bench without being exposed to the multi-turn conversation dataset.",4.0,,4.0


In [36]:
#%%writefile -a ../scripts/critique_qa.py
for col in ["groundedness_score", "relevance_score", "standalone_score"]:
    generated_questions[col] = generated_questions[col].fillna(generated_questions[["groundedness_score", "relevance_score", "standalone_score"]].min(axis=1))

In [37]:
#%%writefile -a ../scripts/critique_qa.py
generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 3.0)
    & (generated_questions["relevance_score"] >= 3.0)
    & (generated_questions["standalone_score"] >= 3.0)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,"Based on the provided context, how does the R EWARD BENCH evaluate models' safety behavior and what is its significance in identifying potential false refusal queries?\n\n","The R EWARD BENCH evaluates models' safety behavior by assessing their performance on different subsets of the safety section and their ability to distinguish between prompts they should refuse and those they should not. Models that score highly on prompts they should refuse and poorly on those they should not are likely to falsely refuse queries. The significance of the R EWARD BENCH lies in its ability to quickly identify potential false refusal queries, especially when trained with DPO. This can be especially useful in the context of reward models, where striking a balance between refusing too much or not refusing is a challenge.",4.5,4.5,4.5
2,What specific changes is Toyota implementing to its EV business environment and how does it differ from other companies that have recently embraced a hybrid fleet?\n\n,"Toyota is implementing changes to its executives' areas of responsibility that require disclosure of further changes to its policies and practices as the larger EV business evolves. The company has not had a hybrid supply chain for years and hybrid research is not allowed inside its laboratories. These changes will only come with the launch of a new integrated EV product division, which has not been finalized yet. Additionally, Toyota's ""Reuturistic and Singular Communication Systems"" are the industry's most ambitious ""big guy"" partnerships, and the company is opening the first car dealerships in the United States as the name suggests. There is no plan to extend out its RAV4 EV program beyond the University of Arizona system in Reno. Consumers are more willing to hop on the electric car bandwagon than at any time in history, and the shift to EV mandates has become what almost everyone has been dying to do for decades.",4.5,4.0,4.5
4,"In a multi-turn instruction-following task, how does the performance of Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) compare to that of larger or proprietary models such as Llama-2-Chat (70B) and Claude V1?\n\n","In the multi-turn instruction-following task, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) achieved comparable results to larger or proprietary models such as Llama-2-Chat (70B) and Claude V1. Specifically, Mistral- ORPO -α(7B) and Mistral- ORPO -β(7B) scored 7.23 and 7.32 in MT-Bench without being exposed to the multi-turn conversation dataset.",4.0,4.0,4.0


In [None]:
%#%writefile -a ../scripts/critique_qa.py
generated_questions.to_csv("../data/pdfs_ws_mrkp_test/generated_questions_pdf_filtered.csv", index=False)

- Go through the 181 rows remaining post dropping missing vals and missing value imputation visually, keep the better 120ish questions
    - Dropped questions that were off-target for learning about LLMs, relied on the reference section, or mentioned the context.

In [None]:
#eval_dataset = pd.read_csv("../data/generated_questions_filtered_w_human.csv")
eval_dataset = generated_questions

In [None]:
eval_dataset.shape

# 2. Build our RAG System

### 2.1. Preprocessing documents to build our vector database

### 2.2. Retriever - embeddings 🗂️

### 2.3. Reader - LLM 💬

In [38]:
#%%writefile -a ../scripts/answer_w_rag_for_eval.py
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
store = LocalFileStore("./cache/")

#embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
embed_model_id = 'mixedbread-ai/mxbai-embed-large-v1'
core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.load_local('../data/rag_index_dir', embedder,allow_dangerous_deserialization=True)

In [39]:
#%%writefile -a ../scripts/answer_w_rag_for_eval.py
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [41]:
#%%writefile -a ../scripts/answer_w_rag_for_eval.py
from exllamav2 import *
from exllamav2.generator import *
import sys, torch


reader_config = ExLlamaV2Config()
reader_config.model_dir = "../../ZephyrInference"
reader_config.prepare()

reader_model = ExLlamaV2(reader_config)
cache = ExLlamaV2Cache(reader_model, lazy = True)

print("Loading model...")
reader_model.load_autosplit(cache)

reader_tokenizer = ExLlamaV2Tokenizer(reader_config)
reader_llm = ExLlamaV2StreamingGenerator(reader_model, cache, reader_tokenizer)
#reader_llm.set_stop_conditions([reader_tokenizer.eos_token_id])
reader_settings = ExLlamaV2Sampler.Settings()
reader_settings.temperature = 0.85
reader_settings.top_k = 30
reader_settings.top_p = 0.8
reader_settings.token_repetition_penalty = 1.03

Loading model...


In [56]:
#%%writefile -a ../scripts/answer_w_rag_for_eval.py
from ragatouille import RAGPretrainedModel
from typing import Optional, List, Tuple
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
from langchain.docstore.document import Document as LangchainDocument
def answer_with_rag(
    question: str,
    generator: ExLlamaV2StreamingGenerator,
   # tokenizer: ExLlamaV2Tokenizer,
    settings:ExLlamaV2Sampler.Settings,
    max_new_tokens = 512,
    knowledge_index: FAISS = vector_store,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 10, #30,
    num_docs_final: int = 5,
) -> Tuple[str, List[LangchainDocument]]:
    # Gather documents with retriever
    print("=> Retrieving documents...")
    embedding_vector = core_embeddings_model.embed_query(question)
    relevant_docs = knowledge_index.similarity_search_by_vector(embedding_vector, k = num_retrieved_docs)#num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    #Optionally rerank results
    if reranker:
        print("=> Reranking documents...")
        relevant_docs = RERANKER.rerank(question, relevant_docs, k=num_docs_final)
        #print(f"Type is : {type(relevant_docs[0])}")
        print(dir(relevant_docs[0]))
        print(relevant_docs[0])
        relevant_docs = [doc['content'] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]
    print(f'Len of relevant_docs: {len(relevant_docs)}')
    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

   
    generator.warmup()
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    answer = generator.generate_simple(final_prompt, 
    settings, max_new_tokens, seed = 1234)
    # print(answer)
    return answer,relevant_docs


answer, relevant_docs = answer_with_rag(question="What is the difference between RAG and self-RAG?", generator=reader_llm,settings=reader_settings,max_new_tokens=512,reranker =True)

=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 22.80it/s]

['__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']
{'content': 'SELF-RAGis a framework that enhances the quality and factuality of an LLM through retrieval and\nself-reflection, without sacrificing LLM’s original creativity and versatility. Our end-to-end training\nlets an LM Mgenerate text informed by retrieved passages, if needed, and criticize the output by\nlearning to generate special tokens. These reflection tokens (Table 1) signal the need for retrieval\nor conf




In [55]:
prompt_end_idx=answer.find('<|assistant|>')
answer[prompt_end_idx+14:]

'Self-RAG, introduced in the paper "SELF-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection," is an improvement over the ad hoc approach of Retrieval-Augmented Generation (RAG) for enhancing the factuality and quality of large language models (LLMs). While RAG indiscriminately retrieves and incorporates a fixed number of passages regardless of their relevance or necessity, Self-RAG trains a single LLM to adaptively retrieve passages on demand and generates and reflects on retrieved passages and its own generations using special tokens called reflection tokens. Self-RAG enables a customizable decoding algorithm to satisfy hard or soft constraints and improves overall performance, factuality, and verifiability for various tasks. Empirical results show that Self-RAG significantly outperforms pre-trained and instruction-tuned LLMs and conventional RAG approaches for reasoning, long-form generation, and fact verification tasks.'

- OK, Zephyr seems to work well, under 4s/question with exl2.  Will try to setup reranker, then onto generating questions and relevant docs.

In [None]:
#%%writefile -a ../scripts/answer_w_rag_for_eval.py
from ragatouille import RAGPretrainedModel
from typing import Optional, List, Tuple
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
from langchain.docstore.document import Document as LangchainDocument
def answer_with_rag(
    question: str,
    generator: ExLlamaV2StreamingGenerator,
   # tokenizer: ExLlamaV2Tokenizer,
    settings:ExLlamaV2Sampler.Settings,
    max_new_tokens = 512,
    knowledge_index: FAISS = vector_store,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 10, #30,
    num_docs_final: int = 5,
) -> Tuple[str, List[LangchainDocument]]:
    # Gather documents with retriever
    print("=> Retrieving documents...")
    embedding_vector = core_embeddings_model.embed_query(question)
    relevant_docs = knowledge_index.similarity_search_by_vector(embedding_vector, k = num_retrieved_docs)#num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text


    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]


    relevant_docs = relevant_docs[:num_retrieved_docs]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

   
    generator.warmup()
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    answer = generator.generate_simple(final_prompt, 
    settings, max_new_tokens, seed = 1234)
    return answer,relevant_docs


answer, relevant_docs = answer_with_rag(question="What is the difference between RAG and self-RAG?", generator=reader_llm,settings=reader_settings,max_new_tokens=512,reranker = RERANKER)

In [None]:
type(relevant_docs)

# 3. Benchmarking the RAG system

In [None]:
'kaist-ai/prometheus-13b-v1.0'

In [None]:
from langchain_core.vectorstores import VectorStore
import pandas as pd

In [None]:
#eval_dataset = pd.read_csv("../data/generated_questions_filtered_w_human.csv")

In [None]:
eval_dataset.head(1)

In [None]:
#%%writefile -a ../scripts/run_rag_tests.py
from collections import namedtuple
def run_rag_tests(
    dataset: pd.DataFrame,
    llm: ExLlamaV2StreamingGenerator,
    knowledge_index: VectorStore,
    #output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = False,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""

    dataset_copy = dataset.copy(deep=True)
    dataset_copy['retrieved_docs'] = None
    for example_row in tqdm(dataset_copy.iterrows()):
        index, example = example_row
        question = example["question"]
        if dataset_copy.loc[index,'retrieved_docs']: #already retrieved
            print(f"Continue for {index} since already processed")
            continue

        generated_answer, relevant_docs =  answer_with_rag(question, knowledge_index=knowledge_index, generator=llm,settings=reader_settings,max_new_tokens=512,reranker = reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        dataset_copy.at[index,'retrieved_docs'] = relevant_docs
        dataset_copy.loc[index,'true_answer'] = dataset_copy.loc[index,'answer']
        dataset_copy.loc[index,'generated_answer'] = generated_answer


        if test_settings:
            dataset_copy["test_settings"] = test_settings
    return dataset_copy #INDENTED ON PURPOSE, TEST RUN!


In [None]:
eval_dataset.columns

In [None]:
#%%writefile -a ../scripts/run_rag_tests.py
ds_rag = run_rag_tests(eval_dataset,reader_llm,vector_store,reranker = None,test_settings='MistralQs-all_MiniLM_L6_v2Embed-ZephyrRead-2000x200chunks-NoRerank')

In [None]:
ds_rag.to_csv("../data/pdfs_ws_mrkp_test/MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank.csv", index=False)

In [None]:
#ds_rag.head(1)

In [None]:
import pandas as pd
df_rag = pd.read_csv("../data/pdfs_ws_mrkp_test/MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank.csv")

In [None]:
from exllamav2 import *
from exllamav2.generator import *

judge_config = ExLlamaV2Config()
judge_config.model_dir = "../PrometheusEval"
#judge_config.model_dir = '../Mixtral4bit'
judge_config.prepare()

judge_model = ExLlamaV2(judge_config)
cache = ExLlamaV2Cache(judge_model, lazy = True)

print("Loading model...")
judge_model.load_autosplit(cache)

judge_tokenizer = ExLlamaV2Tokenizer(judge_config)
judge_llm = ExLlamaV2StreamingGenerator(judge_model, cache, judge_tokenizer)
#judge_llm.set_stop_conditions([judge_tokenizer.eos_token_id])
judge_settings = ExLlamaV2Sampler.Settings()
judge_settings.temperature = 1.0
# judge_settings.top_k = 30
# judge_settings.top_p = 0.8
# judge_settings.token_repetition_penalty = 1.03

In [None]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [None]:
evaluation_prompt_template

In [None]:
import re

def evaluate_answers(
    answer_path: str,
    eval_chat_model:ExLlamaV2StreamingGenerator,
    settings:ExLlamaV2Sampler.Settings,
    evaluation_prompt: str
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = pd.read_csv(answer_path)
    for example_row in tqdm(answers.iterrows()):
        index, example = example_row
        if f"eval_score" in example:
            continue

        eval_prompt = evaluation_prompt.format(
            instruction=example["question"],
            response=example["generated_answer"],
            reference_answer=example["true_answer"],
        )

        eval_chat_model.warmup()
        
        eval_result = eval_chat_model.generate_simple(eval_prompt, 
        settings, num_tokens=1024, seed = 1234) #max_new_tokens=1024,
        feedback = re.search(r'###Feedback:\s*(.*)',eval_result,re.DOTALL).group(1)
        try:
            #score = re.search(r'(\d+)', feedback).group(1)
            score = re.search(r'overall score is (\d)', feedback).group(1)
        except AttributeError:
            score = 'NaN'
        answers.loc[index,f"eval_score"] = score
        answers.loc[index,f"eval_feedback"] = feedback
        print(f'Score: {score}')
        print(f'Feedback: {feedback}')
    return answers #INDENTED ON PURPOSE, TEST RUN!
        # with open(answer_path, "w") as f:
        #     json.dump(answers, f)

In [None]:
#%%writefile -a ../scripts/run_rag_tests.py
# temp=evaluate_answers(answer_path='../data/pdfs_ws_mrkp_test/MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank.csv',
#                  eval_chat_model=judge_llm,settings=judge_settings,evaluation_prompt=EVALUATION_PROMPT) # SHOULD BE evaluation_prompt_template !!!
import os
from tqdm.auto import tqdm
temp=evaluate_answers(answer_path='../data/pdfs_ws_mrkp_test/MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank.csv',
                 eval_chat_model=judge_llm,settings=judge_settings,evaluation_prompt=evaluation_prompt_template) # SHOULD BE evaluation_prompt_template !!!

In [None]:
#%%writefile -a ../scripts/run_rag_tests.py
temp.to_csv("../data/pdfs_ws_mrkp_test/MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank-Evaluated.csv", index=False)

In [None]:
temp.head()

In [None]:
temp.columns

In [None]:
#%%writefile -a ../scripts/run_rag_tests.py
import matplotlib.pyplot as plt
temp.eval_score.sort_values().hist()
plt.title("Pdf-MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank");
plt.savefig('../data/pdfs_ws_mrkp_test/Pdf-MistralQs-mxbai_embed-ZephyrRead-2000x200chunks-NoRerank-Evaluated.png')

In [None]:
if not os.path.exists("./output"):
    os.mkdir("./output")

for chunk_size in [200]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG...")
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

### Inspect results

In [None]:
import os
os.getcwd()

In [None]:
import glob
import pandas as pd
#outputs = []
#RagOverArXiv/data/pdfs_ws_mrkp_test/eval_outputs
pdf_files = glob.glob("../data/pdfs_ws_mrkp_test/eval_outputs/pdf*.csv")
txt_files = glob.glob("../data/pdfs_ws_mrkp_test/eval_outputs/txt*.csv")
all_files = pdf_files+txt_files
result = pd.DataFrame()#index=list(all_files.index))
for file_path in all_files:
    print(file_path)
    output = pd.read_csv(file_path)
    print(output.columns)
    output["settings"] = file_path.split('/')[-1]
    result = pd.concat([result,output])
print(result.shape)
pd.set_option('display.max_colwidth',500)
result[['eval_score','eval_feedback']].head(20)
#result.eval_score.dtype

In [None]:
print(result['eval_score'].unique())

In [None]:
result.eval_score.isna()

In [None]:
import numpy as np
result["eval_score"] = result["eval_score"].apply(
    lambda x: x if not pd.isna(x) else 1
)
result["eval_score"] = (result["eval_score"] - 1) / 4

In [None]:
result[['eval_score','eval_feedback']].head(20)

In [None]:
average_scores = result.groupby("settings")["eval_score"].mean()
average_scores.sort_values()