diff --git a/Makefile b/Makefile
index b273ec94d..135942143 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,8 @@ docs-site: ## Build and serve documentation
 	@sphinx-build -nW --keep-going -j 4 -b html $(GIT_ROOT)/docs/ $(GIT_ROOT)/docs/_build/html
 	@python -m http.server --directory $(GIT_ROOT)/docs/_build/html
 watch-docs: ## Build and watch documentation
-	sphinx-autobuild docs docs/_build/html --watch $(GIT_ROOT)/src/ --ignore ".ipynb"
+	rm -rf $(GIT_ROOT)/docs/_build/html $(GIT_ROOT)/docs/_build/jupyter_execute
+	sphinx-autobuild docs docs/_build/html --watch $(GIT_ROOT)/src/ --ignore "_build" --open-browser
 
 # Benchmarks
 run-benchmarks-eval: ## Run benchmarks for Evaluation
diff --git a/docs/alfred.py b/docs/alfred.py
new file mode 100644
index 000000000..5ea41a560
--- /dev/null
+++ b/docs/alfred.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import os
+from collections import namedtuple
+import asyncio
+from tqdm.asyncio import tqdm
+import typing as t
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain.prompts import ChatPromptTemplate
+
+File = namedtuple("File", "name content")
+
+
+def get_files(path: str, ext: str) -> list:
+    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(ext)]
+
+
+def load_docs(path: str) -> t.List[File]:
+    files = [*get_files(path, ".md")]
+    docs = []
+    for file in files:
+        with open(file, "r") as f:
+            docs.append(File(file, f.read()))
+    return docs
+
+
+async def fix_doc_with_llm(doc: File, llm: BaseChatModel) -> File:
+    prompt = """\
+Fix the grammar and spelling mistakes in the following text.
+Please keep the markdown format intact when reformatting it.
+Do not make any changes to the parts of the text that are formatting or additional metadata for the core text in markdown.
+The target audience for this is developers, so keep the tone serious and to the point without any marketing terms.
+The output text should be in .md format.
+
+text: {text}
+"""
+    fix_docs_prompt = ChatPromptTemplate.from_messages(
+        [
+            (prompt),
+        ]
+    )
+    # ask the LLM for the corrected text and wrap it back into a File
+    fixed_doc = await llm.ainvoke(fix_docs_prompt.format_messages(text=doc.content))
+    return File(doc.name, fixed_doc.content)
+
+
+async def main(docs: t.List[File], llm: BaseChatModel):
+    fix_doc_routines = [fix_doc_with_llm(doc, llm) for doc in docs]
+    return await tqdm.gather(*fix_doc_routines)
+
+
+if __name__ == "__main__":
+    """
+    Helpful assistant for documentation review and more (hopefully in the future).
+    """
+    gpt4 = ChatOpenAI(model="gpt-4")
+    docs = load_docs("./getstarted/")
+    fix_docs = asyncio.run(main(docs, gpt4))
+    for doc in fix_docs:
+        with open(doc.name, "w") as f:
+            f.write(doc.content)
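For context, a minimal sketch of previewing what `alfred.py` would do to a single page, instead of letting it rewrite files in place, could look like the following; it assumes `OPENAI_API_KEY` is set, that it is run from the `docs/` directory next to `alfred.py`, and the file path is only an example:

```python
# Sketch: preview alfred.py's suggested rewrite for one file without writing it back.
import asyncio

from langchain_openai.chat_models import ChatOpenAI

from alfred import File, fix_doc_with_llm

llm = ChatOpenAI(model="gpt-4")
path = "getstarted/install.md"  # example page; any .md file works
with open(path, "r") as f:
    doc = File(path, f.read())

fixed = asyncio.run(fix_doc_with_llm(doc, llm))
print(fixed.content)  # inspect the proposed text before overwriting anything
```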
diff --git a/docs/community/index.md b/docs/community/index.md
index 5b23ea666..e7ad35248 100644
--- a/docs/community/index.md
+++ b/docs/community/index.md
@@ -1,5 +1,5 @@
 (community)=
-# Community ❀️
+# ❀️ Community
 
 **"Alone we can do so little; together we can do so much."
 - Helen Keller**
diff --git a/docs/concepts/index.md b/docs/concepts/index.md
index f91e82d1f..fc62a9a18 100644
--- a/docs/concepts/index.md
+++ b/docs/concepts/index.md
@@ -1,5 +1,5 @@
 (core-concepts)=
-# Core Concepts
+# πŸ“š Core Concepts
 
 :::{toctree}
 :caption: Concepts
 :hidden:
diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md
index df4029620..2086c1fc3 100644
--- a/docs/getstarted/evaluation.md
+++ b/docs/getstarted/evaluation.md
@@ -1,36 +1,29 @@
 (get-started-evaluation)=
-# Evaluation
+# Evaluating Using Your Test Set
 
-Welcome to the ragas quickstart. We're going to get you up and running with ragas as quickly as you can so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.
+Once your test set is ready (whether you've created your own or used the [synthetic test set generation module](get-started-testset-generation)), it's time to evaluate your RAG pipeline. This guide helps you set up Ragas as quickly as possible so that you can focus on improving your Retrieval Augmented Generation pipelines while the library ensures that your changes are improving the entire pipeline.
 
-to kick things of lets start with the data
-
-:::{note}
-Are you using Azure OpenAI endpoints? Then checkout [this quickstart
-guide](../howtos/customisations/azure-openai.ipynb)
-:::
-
-```bash
-pip install ragas
-```
-
-Ragas also uses OpenAI for running some metrics so make sure you have your openai key ready and available in your environment
+This guide uses OpenAI for running some metrics, so ensure you have your OpenAI key ready and available in your environment.
 
 ```python
 import os
 
 os.environ["OPENAI_API_KEY"] = "your-openai-key"
 ```
 
+:::{note}
+By default, these metrics use OpenAI's API to compute the score. If you're using them, ensure that you've set the `OPENAI_API_KEY` environment variable with your API key. You can also use other LLMs for evaluation; check the [LLM guide](../howtos/customisations/llms.ipynb) to learn more.
+:::
+
+Let's begin with the data.
+
 ## The Data
 
-For this tutorial we are going to use an example dataset from one of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/). The dataset has the following columns.
+For this tutorial, we'll use an example dataset from one of the baselines we created for the [Amnesty QA](https://huggingface.co/datasets/explodinggradients/amnesty_qa) dataset. The dataset contains the following columns:
 
 - question: `list[str]` - These are the questions your RAG pipeline will be evaluated on.
 - answer: `list[str]` - The answer generated from the RAG pipeline and given to the user.
-- contexts: `list[list[str]]` - The contexts which were passed into the LLM to answer the question.
-- ground_truths: `list[list[str]]` - The ground truth answer to the questions. (only required if you are using context_recall)
-
-Ideally your list of questions should reflect the questions your users give, including those that you have been problematic in the past.
+- contexts: `list[list[str]]` - The contexts which were passed into the LLM to answer the question.
+- ground_truth: `list[str]` - The ground truth answer to the questions.
+
+An ideal test data set should contain samples that closely mirror your real-world use case.
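If you are bringing your own samples rather than the example dataset below, a minimal sketch of shaping them into a Hugging Face `Dataset` with the columns listed above might look like this; the strings are placeholders, and the `answer` column holds whatever your pipeline generated:

```python
# Sketch: hand-rolled evaluation data; replace the placeholder values with your own samples.
from datasets import Dataset

data_samples = {
    "question": ["When was the first Super Bowl played?"],
    "answer": ["The first Super Bowl was played on January 15, 1967."],
    "contexts": [["The First AFL-NFL World Championship Game was played on January 15, 1967."]],
    "ground_truth": ["The first Super Bowl was held on January 15, 1967."],
}
your_dataset = Dataset.from_dict(data_samples)
```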
 
 ```{code-block} python
 :caption: import sample dataset
@@ -42,19 +35,19 @@ amnesty_qa
 ```
 
 :::{seealso}
-See [testset generation](./testset_generation.md) to learn how to generate your own synthetic data for evaluation.
+See [test set generation](./testset_generation.md) to learn how to generate your own `Question/Context/Ground_Truth` triplets for evaluation.
 :::
 
 ## Metrics
 
-Ragas provides you with a few metrics to evaluate the different aspects of your RAG systems namely
+Ragas provides several metrics to evaluate various aspects of your RAG system:
 
-1. Retriever: offers `context_precision` and `context_recall` which give you the measure of the performance of your retrieval system.
-2. Generator (LLM): offers `faithfulness` which measures hallucinations and `answer_relevancy` which measures how to the point the answers are to the question.
+1. Retriever: Offers `context_precision` and `context_recall`, which measure the performance of your retrieval system.
+2. Generator (LLM): Provides `faithfulness`, which measures hallucinations, and `answer_relevancy`, which measures how relevant the answers are to the question.
 
-The harmonic mean of these 4 aspects gives you the **ragas score** which is a single measure of the performance of your QA system across all the important aspects.
+There are many other metrics available in Ragas; check the [metrics guide](ragas-metrics) to learn more.
 
-now lets import these metrics and understand more about what they denote
+Now, let's import these metrics and understand more about what they denote.
 
 ```{code-block} python
 :caption: import metrics
@@ -65,21 +58,18 @@ from ragas.metrics import (
     context_precision,
 )
 ```
 
-here you can see that we are using 4 metrics, but what do they represent?
+Here we're using four metrics, but what do they represent?
 
-1. faithfulness - the factual consistency of the answer to the context base on the question.
-2. context_precision - a measure of how relevant the retrieved context is to the question. Conveys quality of the retrieval pipeline.
-3. answer_relevancy - a measure of how relevant the answer is to the question
-4. context_recall: measures the ability of the retriever to retrieve all the necessary information needed to answer the question.
+1. `faithfulness` - Measures the factual consistency of the answer to the context, based on the question.
+2. `context_precision` - Measures how relevant the retrieved context is to the question, conveying the quality of the retrieval pipeline.
+3. `answer_relevancy` - Measures how relevant the answer is to the question.
+4. `context_recall` - Measures the retriever's ability to retrieve all the information required to answer the question.
-
-:::{note}
-by default these metrics are using OpenAI's API to compute the score. If you using this metric make sure you set the environment key `OPENAI_API_KEY` with your API key. You can also try other LLMs for evaluation, check the [llm guide](../howtos/customisations/llms.ipynb) to learn more
-:::
+To explore other metrics, check the [metrics guide](ragas-metrics).
 
 ## Evaluation
 
-Running the evaluation is as simple as calling evaluate on the `Dataset` with the metrics of your choice.
+Running the evaluation is as simple as calling `evaluate` on the `Dataset` with your chosen metrics.
 
 ```{code-block} python
 :caption: evaluate using sample dataset
@@ -97,9 +87,9 @@ result = evaluate(
 result
 ```
 
-and there you have it, all the scores you need.
+There you have it, all the scores you need.
 
-Now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!
+If you want to delve deeper into the results and identify examples where your pipeline performed poorly or exceptionally well, you can convert it into a pandas DataFrame and use your standard analytics tools!
 
 ```{code-block} python
 :caption: export results
@@ -110,6 +100,6 @@ df.head()
 ```
 
 quickstart-outputs
 
-And thats it!
+That's all!
 
-If you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁
+If you have any suggestions, feedback, or issues, please share them in the [issue section](https://github.com/explodinggradients/ragas/issues). We value your input.
\ No newline at end of file
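To make the DataFrame analysis mentioned in the evaluation guide above concrete, a short sketch along these lines could surface the weakest samples; it assumes the exported DataFrame has one column per metric, named after the metrics used:

```python
# Sketch: rank evaluated samples by a metric to find the ones that need attention.
df = result.to_pandas()

# lowest-faithfulness answers first; adjust the column to whichever metric you care about
worst = df.sort_values("faithfulness").head(5)
print(worst[["question", "faithfulness", "answer_relevancy"]])
```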
diff --git a/docs/getstarted/index.md b/docs/getstarted/index.md
index 73bda2d90..55a3d9836 100644
--- a/docs/getstarted/index.md
+++ b/docs/getstarted/index.md
@@ -1,49 +1,44 @@
 (get-started)=
-# Get Started
+# πŸš€ Get Started
 
 :::{toctree}
 :maxdepth: 1
 :hidden:
 install.md
-evaluation.md
 testset_generation.md
+evaluation.md
 monitoring.md
 :::
 
-Welcome to the Ragas tutorials! These beginner-friendly tutorials will guide you
-through the fundamentals of working with Ragas. These tutorials do assume basic
-knowledge of Python and Retrieval Augmented Generation (RAG) pipelines.
+Welcome to the Ragas tutorials! If you're new to Ragas, the Get Started guides will walk you through the fundamentals of working with Ragas. These tutorials assume basic knowledge of Python and Retrieval Augmented Generation (RAG) pipelines.
 
-Before you go further make sure you have [Ragas installed](./install.md)!
+Before you proceed, ensure that you have [Ragas installed](./install.md)!
 
 :::{note}
-The tutorials only give you on overview of what you can do with ragas and the
-basic skill you need to use it. If you want an in-depth explanation of the
-core-concepts behind Ragas, check out the [Core Concepts](../concepts/index.md) page. You can also checkout the [How-to Guides](../howtos/index.md) if you want to specific applications of Ragas.
+The tutorials only provide an overview of what you can accomplish with Ragas and the basic skills needed to use it effectively. For an in-depth explanation of the core concepts behind Ragas, check out the [Core Concepts](../concepts/index.md) page. You can also explore the [How-to Guides](../howtos/index.md) for specific applications of Ragas.
 :::
 
+If you have any questions about Ragas, feel free to join and ask in the `#questions` channel in our Discord community.
 
-If you have any questions about Ragas, feel free to join and ask in the
-`#questions` channel in our discord community ❀ .
-
-Let’s get started! 🏁
+Let's get started!
 
-:::{card} Ragas Metrics and Evaluation
-:link: get-started-evaluation
+:::{card} Generate a Synthetic Test Set
+:link: get-started-testset-generation
 :link-type: ref
-How to use the Ragas Metrics to evaluate your RAG pipelines.
+Learn how to generate `Question/Context/Ground_Truth` triplets to get started.
 :::
 
-:::{card} Synthetic Test data Generation
-:link: get-started-testset-generation
+:::{card} Evaluate Using Your Test Set
+:link: get-started-evaluation
 :link-type: ref
-How to generate test set to assess your RAG pipelines
+Find out how to evaluate your RAG pipeline using your test set (your own dataset or a synthetic one).
 :::
 
-:::{card} Monitoring
+
+:::{card} Monitor Your RAG in Production
 :link: get-started-monitoring
 :link-type: ref
-How to monitor your RAG systems in production.
-:::
+Discover how to monitor the performance and quality of your RAG application in production.
+:::
\ No newline at end of file
diff --git a/docs/getstarted/install.md b/docs/getstarted/install.md
index 9e9f4e317..712f746e0 100644
--- a/docs/getstarted/install.md
+++ b/docs/getstarted/install.md
@@ -1,20 +1,23 @@
-# Install
+# Installation
+
+To get started, install Ragas using `pip` with the following command:
 
-You can install ragas with
 ```bash
 pip install ragas
 ```
 
-If you want to install the latest version (from the main branch)
+If you'd like to experiment with the latest features, install the most recent version from the main branch:
+
 ```bash
 pip install git+https://github.com/explodinggradients/ragas.git
 ```
 
-If you are looking to contribute and make changes to the code, make sure you
-clone the repo and install it as [editable
-install](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs).
+If you're planning to contribute and make modifications to the code, ensure that you clone the repository and set it up as an [editable install](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs).
+
 ```bash
 git clone https://github.com/explodinggradients/ragas.git
 cd ragas
 pip install -e .
 ```
+
+Next, let's construct a [synthetic test set](get-started-testset-generation) using your own data. If you've brought your own test set, you can learn how to [evaluate it](get-started-evaluation) using Ragas.
\ No newline at end of file
diff --git a/docs/getstarted/monitoring.md b/docs/getstarted/monitoring.md
index 37593c6bf..37f10b393 100644
--- a/docs/getstarted/monitoring.md
+++ b/docs/getstarted/monitoring.md
@@ -1,27 +1,30 @@
 (get-started-monitoring)=
-# Monitoring
+# Monitor Your RAG in Production
 
-Maintaining the quality and performance of an LLM application in a production environment can be challenging. Ragas provides with basic building blocks that you can use for production quality monitoring, offering valuable insights into your application's performance. This is achieved by constructing custom, smaller, more cost-effective, and faster models.
+Maintaining the quality and performance of a RAG application in a production environment is challenging. Ragas currently provides the essential building blocks for production-quality monitoring, offering valuable insights into your application's performance. However, we are also working towards a more advanced production monitoring solution by addressing three key areas:
+
+1. How to ensure the distribution of your production dataset remains consistent with your test set.
+2. How to effectively extract insights from the explicit and implicit signals your users provide to infer the quality of your RAG application and identify areas that require attention.
+3. How to construct custom, smaller, more cost-effective, and faster models for evaluation and advanced test set generation.
 
 :::{note}
-This is feature is still in beta access. You can requests for
-[**early access**](https://calendly.com/shahules/30min) to try it out.
+We are still developing and gathering feedback for upcoming releases. You can request
+[**early access**](https://calendly.com/shahules/30min) to try it out, or share the challenges you face in this area. We would love to hear your thoughts.
 :::
 
-The Ragas metrics can also be used with other LLM observability tools like
-[Langsmith](https://www.langchain.com/langsmith) and
-[Langfuse](https://langfuse.com/) to get model-based feedback about various
-aspects of you application like those mentioned below
+In addition, you can use the Ragas metrics with other LLM observability tools like:
 
-:::{seealso}
-[Langfuse Integration](../howtos/integrations/langfuse.ipynb) to see Ragas
-monitoring in action within the Langfuse dashboard and how to set it up
-:::
+- [Langsmith](../howtos/integrations/langsmith.ipynb)
+- [Phoenix (Arize)](../howtos/integrations/ragas-arize.ipynb)
+- [Langfuse](../howtos/integrations/langfuse.ipynb)
+- [OpenLayer](https://openlayer.com/)
+
+These tools can provide model-based feedback about various aspects of your application, such as the ones mentioned below:
 
 ## Aspects to Monitor
 
-1. Faithfulness: This feature assists in identifying and quantifying instances of hallucinations.
-2. Bad retrieval: This feature helps identify and quantify poor context retrievals.
-3. Bad response: This feature helps in recognizing and quantifying evasive, harmful, or toxic responses.
-4. Bad format: This feature helps in detecting and quantifying responses with incorrect formatting.
-5. Custom use-case: For monitoring other critical aspects that are specific to your use case. [Talk to founders](https://calendly.com/shahules/30min)
+1. Faithfulness: Helps identify and quantify instances of hallucination.
+2. Bad Retrieval: Helps identify and quantify poor context retrievals.
+3. Bad Response: Helps recognize and quantify evasive, harmful, or toxic responses.
+4. Bad Format: Helps detect and quantify responses with incorrect formatting.
+5. Custom Use-Case: For monitoring other critical aspects specific to your use case, [talk to the founders](https://calendly.com/shahules/30min).
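As a rough illustration of this kind of model-based feedback, a single production interaction can be scored with one of the open-source metrics from the evaluation guide; this is only a sketch, and the sample values are placeholders:

```python
# Sketch: score one production interaction with a single Ragas metric.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness

sample = Dataset.from_dict({
    "question": ["What is the refund window?"],                      # user query from your logs
    "answer": ["You can request a refund within 30 days."],          # model response
    "contexts": [["Refunds are accepted within 30 days of purchase."]],  # retrieved chunks
})
score = evaluate(sample, metrics=[faithfulness])
print(score)  # feed this into your observability tool of choice
```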
diff --git a/docs/getstarted/testset_generation.md b/docs/getstarted/testset_generation.md
index 8735b3d12..5fc0e4073 100644
--- a/docs/getstarted/testset_generation.md
+++ b/docs/getstarted/testset_generation.md
@@ -1,7 +1,7 @@
 (get-started-testset-generation)=
-# Synthetic test data generation
+# Generate a Synthetic Test Set
 
-This tutorial is designed to help you create a synthetic evaluation dataset for assessing your RAG pipeline. To achieve this, we will utilize open-ai models, so please ensure you have your OpenAI API key ready and accessible within your environment.
+This tutorial guides you through creating a synthetic evaluation dataset for assessing your RAG pipeline. For this purpose, we will use OpenAI models, so ensure that your OpenAI API key is accessible within your environment.
 
 ```{code-block} python
 import os
@@ -11,7 +11,7 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key"
 ```
 
 ## Documents
 
-To begin, we require a collection of documents to generate synthetic Question/Context/Answer samples. Here, we will employ the langchain document loader to load documents.
+First, we need a collection of documents to generate synthetic `Question/Context/Ground_Truth` samples. For this, we'll use the LangChain document loader to load documents.
 
 ```{code-block} python
 :caption: Load documents from directory
@@ -21,9 +21,9 @@ documents = loader.load()
 ```
 
 :::{note}
-Each Document object contains a metadata dictionary, which can be used to store additional information about the document which can be accessed with `Document.metadata`. Please ensure that the metadata dictionary contains a key called `file_name` as this will be used in the generation process. The `file_name` attribute in metadata is used to identify chunks belonging to the same document. For example, pages belonging to the same research publication can be identifies using filename.
+Each Document object contains a metadata dictionary, which can be used to store additional information about the document, accessible via `Document.metadata`. Ensure that the metadata dictionary includes a key called `file_name`, as it will be used in the generation process. The `file_name` attribute in metadata is used to identify chunks belonging to the same document. For instance, pages belonging to the same research publication can be identified using the filename.
 
-An example of how to do this is shown below.
+Here's an example of how to do this:
 
 ```{code-block} python
 for document in documents:
@@ -31,11 +31,11 @@
 ```
 :::
 
-At this point, we have a set of documents at our disposal, which will serve as the basis for creating synthetic Question/Context/Answer triplets.
+At this point, we have a set of documents ready to serve as the foundation for generating synthetic `Question/Context/Ground_Truth` samples.
 
 ## Data Generation
 
-We will now import and use Ragas' `Testsetgenerator` to promptly generate a synthetic test set from the loaded documents.
+Now, we'll import and use Ragas' `TestsetGenerator` to quickly generate a synthetic test set from the loaded documents.
 
 ```{code-block} python
 :caption: Create 10 samples using default configuration
@@ -49,12 +49,12 @@ generator = TestsetGenerator.with_openai()
 testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
 ```
 
-Subsequently, we can export the results into a Pandas DataFrame.
+Then, we can export the results into a Pandas DataFrame.
 
-```{code-block}
+```{code-block} python
 :caption: Export to Pandas
 testset.to_pandas()
 ```
 
 test-outputs
\ No newline at end of file
diff --git a/docs/howtos/index.md b/docs/howtos/index.md
index 4586ce894..878d64111 100644
--- a/docs/howtos/index.md
+++ b/docs/howtos/index.md
@@ -1,5 +1,5 @@
 (how-to-guides)=
-# How-to Guides
+# πŸ› οΈ How-to Guides
 
 The how-to guides offer a more comprehensive overview of all the tools Ragas
diff --git a/docs/references/index.rst b/docs/references/index.rst
index 866c72f88..ac29beb22 100644
--- a/docs/references/index.rst
+++ b/docs/references/index.rst
@@ -1,5 +1,5 @@
 .. _references:
 
-References
+πŸ“– References
 ==========
 
 Reference documents for the ``ragas`` package.