diff --git a/docs/integrations/langchain.ipynb b/docs/integrations/langchain.ipynb index 2e1b2f0d5..a87b3dbd1 100644 --- a/docs/integrations/langchain.ipynb +++ b/docs/integrations/langchain.ipynb @@ -25,17 +25,6 @@ "nest_asyncio.apply()" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8333f65e", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, { "cell_type": "markdown", "id": "842e32dc", @@ -62,7 +51,9 @@ "\n", "llm = ChatOpenAI()\n", "qa_chain = RetrievalQA.from_chain_type(\n", - " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True,\n", + " llm,\n", + " retriever=index.vectorstore.as_retriever(),\n", + " return_source_documents=True,\n", ")" ] }, @@ -115,15 +106,17 @@ "]\n", "\n", "eval_answers = [\n", - " \"8,804,000\", # incorrect answer\n", - " \"Queens\", # incorrect answer\n", + " \"8,804,000\", # incorrect answer\n", + " \"Queens\", # incorrect answer\n", " \"New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.\",\n", " \"New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.\",\n", - " 'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. 
It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.',\n", + " \"The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.\",\n", "]\n", "\n", - "examples = [{\"query\": q, \"ground_truths\": [eval_answers[i]]} \n", - " for i, q in enumerate(eval_questions)]" + "examples = [\n", + " {\"query\": q, \"ground_truths\": [eval_answers[i]]}\n", + " for i, q in enumerate(eval_questions)\n", + "]" ] }, { @@ -196,7 +189,12 @@ "outputs": [], "source": [ "from ragas.langchain.evalchain import RagasEvaluatorChain\n", - "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall\n", + "from ragas.metrics import (\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_relevancy,\n", + " context_recall,\n", + ")\n", "\n", "# create evaluation chains\n", "faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)\n", @@ -469,10 +467,15 @@ "from langchain.smith import RunEvalConfig, run_on_dataset\n", "\n", "evaluation_config = RunEvalConfig(\n", - " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain, context_recall_chain],\n", + " custom_evaluators=[\n", + " faithfulness_chain,\n", + " answer_rel_chain,\n", + " context_rel_chain,\n", + " context_recall_chain,\n", + " ],\n", " prediction_key=\"result\",\n", ")\n", - " \n", + "\n", "result = run_on_dataset(\n", " client,\n", " dataset_name,\n", diff --git a/docs/integrations/llamaindex.ipynb 
b/docs/integrations/llamaindex.ipynb index 5ffb84761..f07a5f8a5 100644 --- a/docs/integrations/llamaindex.ipynb +++ b/docs/integrations/llamaindex.ipynb @@ -1,490 +1,473 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d2451aff", - "metadata": {}, - "source": [ - "# Evaluating LlamaIndex\n", - "\n", - "[LlamaIndex](https://github.com/jerryjliu/llama_index) is a data framework for LLM applications to ingest, structure, and access private or domain-specific data. Makes it super easy to connect LLMs with your own data. But in order to figure out the best configuration for llamaIndex and your data you need a object measure of the performance. This is where ragas comes in. Ragas will help you evaluate your `QueryEngine` and gives you the confidence to tweak the configuration to get hightest score.\n", - "\n", - "This guide assumes you have familarity with the LlamaIndex framework." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "f0e3f9ab", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "37a9b094", - "metadata": {}, - "outputs": [], - "source": [ - "# attach to the same event-loop\n", - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "id": "abaf6538", - "metadata": {}, - "source": [ - "## Building the `VectorStoreIndex` and `QueryEngine`\n", - "\n", - "To start lets build an `VectorStoreIndex` over the New York Citie's [wikipedia page](https://en.wikipedia.org/wiki/New_York_City) as an example and use ragas to evaluate it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "37c4a1cb", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n", - "\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "id": "9019540b", - "metadata": {}, - "source": [ - "load the data, build the `VectorStoreIndex` and create the `QueryEngine`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "85e75230", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader(\"./nyc_wikipedia/\").load_data()\n", - "vector_index = VectorStoreIndex.from_documents(\n", - " documents, service_context=ServiceContext.from_defaults(chunk_size=512)\n", - ")\n", - "\n", - "query_engine = vector_index.as_query_engine()" - ] - }, - { - "cell_type": "markdown", - "id": "13d676c0", - "metadata": {}, - "source": [ - "Lets try an sample question to see if it is working" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a25026c2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "New York City was named in honor of the Duke of York, who would become King James II of England. In 1664, King Charles II appointed the Duke as proprietor of the former territory of New Netherland, including the city of New Amsterdam, when England seized it from Dutch control. The city was then renamed New York in his honor.\n" - ] - } - ], - "source": [ - "response_vector = query_engine.query(\"How did New York City get its name?\")\n", - "\n", - "print(response_vector)" - ] - }, - { - "cell_type": "markdown", - "id": "b678501e", - "metadata": {}, - "source": [ - "## Evaluating with Ragas\n", - "\n", - "Now that we have a `QueryEngine` for the `VectorStoreIndex` we can use the llama_index integration Ragas has to evaluate it. 
\n", - "\n", - "In order to run an evaluation with Ragas and LlamaIndex you need 3 things\n", - "\n", - "1. LlamaIndex `QueryEngine`: what we will be evaluating\n", - "2. Metrics: Ragas defines a set of metrics that can measure different aspects of the `QueryEngine`. The available metrics and their meaning can be found [here](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)\n", - "3. Questions: A list of questions that ragas will test the `QueryEngine` against. " - ] - }, - { - "cell_type": "markdown", - "id": "145109ad", - "metadata": {}, - "source": [ - "first lets generate the questions. Ideally you should use that you see in production so that the distribution of question with which we evaluate matches the distribution of questions seen in production. This ensures that the scores reflect the performance seen in production.\n", - "\n", - "We're using the `DatasetGenerator` from LlamaIndex for this." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "751dc988", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from llama_index.evaluation import DatasetGenerator\n", - "\n", - "question_generator = DatasetGenerator.from_documents(documents)\n", - "# generate 5 question\n", - "eval_questions = question_generator.generate_questions_from_nodes(5)\n", - "\n", - "len(eval_questions)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d1203c9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['What is the population of New York City as of 2020?',\n", - " 'Which city is the second-largest in the United States after New York City?',\n", - " 'What is the geographical and demographic center of the Northeast megalopolis?',\n", - " 'How many people live within 250 miles of New York City?',\n", - " 'What is the largest metropolitan economy in the world as of 
2021?']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# lets see the questions\n", - "eval_questions" - ] - }, - { - "cell_type": "markdown", - "id": "843bddb8", - "metadata": {}, - "source": [ - "Now lets import the metrics we will be using to evaluate" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "9875132a", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy\n", - "from ragas.metrics.critique import harmfulness\n", - "\n", - "metrics = [faithfulness, answer_relevancy, context_relevancy, harmfulness]" - ] - }, - { - "cell_type": "markdown", - "id": "8ae4a2d1", - "metadata": {}, - "source": [ - "Finally lets run the evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "05633cc2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [faithfulness]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.48s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [answer_relevancy]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Map: 0%| | 0/5 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionanswercontextsfaithfulnessanswer_relevancycontext_ relevancyharmfulness
0What is the population of New York City as of ...\\nThe population of New York City as of 2020 i...[Aeromedical Staging Squadron, and a military ...1.00.9040.1174090
1Which city is the second-largest in the United...\\nLos Angeles is the second-largest city in th...[New York, often called New York City or NYC, ...1.00.9320.0358470
2What is the geographical and demographic cente...\\nNew York City is the geographical and demogr...[New York, often called New York City or NYC, ...1.00.9130.1048230
3How many people live within 250 miles of New Y...\\nOver 58 million people live within 250 miles...[Aeromedical Staging Squadron, and a military ...1.00.8930.1228700
4What is the largest metropolitan economy in th...\\nThe largest metropolitan economy in the worl...[New York, often called New York City or NYC, ...1.00.9110.0441900
\n", - "" - ], - "text/plain": [ - " question \\\n", - "0 What is the population of New York City as of ... \n", - "1 Which city is the second-largest in the United... \n", - "2 What is the geographical and demographic cente... \n", - "3 How many people live within 250 miles of New Y... \n", - "4 What is the largest metropolitan economy in th... \n", - "\n", - " answer \\\n", - "0 \\nThe population of New York City as of 2020 i... \n", - "1 \\nLos Angeles is the second-largest city in th... \n", - "2 \\nNew York City is the geographical and demogr... \n", - "3 \\nOver 58 million people live within 250 miles... \n", - "4 \\nThe largest metropolitan economy in the worl... \n", - "\n", - " contexts faithfulness \\\n", - "0 [Aeromedical Staging Squadron, and a military ... 1.0 \n", - "1 [New York, often called New York City or NYC, ... 1.0 \n", - "2 [New York, often called New York City or NYC, ... 1.0 \n", - "3 [Aeromedical Staging Squadron, and a military ... 1.0 \n", - "4 [New York, often called New York City or NYC, ... 
1.0 \n", - "\n", - " answer_relevancy context_ relevancy harmfulness \n", - "0 0.904 0.117409 0 \n", - "1 0.932 0.035847 0 \n", - "2 0.913 0.104823 0 \n", - "3 0.893 0.122870 0 \n", - "4 0.911 0.044190 0 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.to_pandas()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } + "cells": [ + { + "cell_type": "markdown", + "id": "d2451aff", + "metadata": {}, + "source": [ + "# Evaluating LlamaIndex\n", + "\n", + "[LlamaIndex](https://github.com/jerryjliu/llama_index) is a data framework for LLM applications to ingest, structure, and access private or domain-specific data. It makes it super easy to connect LLMs with your own data. But in order to figure out the best configuration for LlamaIndex and your data, you need an objective measure of the performance. This is where ragas comes in. Ragas will help you evaluate your `QueryEngine` and give you the confidence to tweak the configuration to get the highest score.\n", + "\n", + "This guide assumes you have familiarity with the LlamaIndex framework."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "37a9b094", + "metadata": {}, + "outputs": [], + "source": [ + "# attach to the same event-loop\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "abaf6538", + "metadata": {}, + "source": [ + "## Building the `VectorStoreIndex` and `QueryEngine`\n", + "\n", + "To start, let's build a `VectorStoreIndex` over New York City's [Wikipedia page](https://en.wikipedia.org/wiki/New_York_City) as an example and use ragas to evaluate it." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "37c4a1cb", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "9019540b", + "metadata": {}, + "source": [ + "Load the data, build the `VectorStoreIndex`, and create the `QueryEngine`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "85e75230", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader(\"./nyc_wikipedia/\").load_data()\n", + "vector_index = VectorStoreIndex.from_documents(\n", + " documents, service_context=ServiceContext.from_defaults(chunk_size=512)\n", + ")\n", + "\n", + "query_engine = vector_index.as_query_engine()" + ] + }, + { + "cell_type": "markdown", + "id": "13d676c0", + "metadata": {}, + "source": [ + "Let's try a sample question to see if it is working." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a25026c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "New York City was named in honor of the Duke of York, who would become King James II of England.
In 1664, King Charles II appointed the Duke as proprietor of the former territory of New Netherland, including the city of New Amsterdam, when England seized it from Dutch control. The city was then renamed New York in his honor.\n" + ] + } + ], + "source": [ + "response_vector = query_engine.query(\"How did New York City get its name?\")\n", + "\n", + "print(response_vector)" + ] + }, + { + "cell_type": "markdown", + "id": "b678501e", + "metadata": {}, + "source": [ + "## Evaluating with Ragas\n", + "\n", + "Now that we have a `QueryEngine` for the `VectorStoreIndex` we can use the llama_index integration Ragas has to evaluate it. \n", + "\n", + "In order to run an evaluation with Ragas and LlamaIndex you need 3 things\n", + "\n", + "1. LlamaIndex `QueryEngine`: what we will be evaluating\n", + "2. Metrics: Ragas defines a set of metrics that can measure different aspects of the `QueryEngine`. The available metrics and their meaning can be found [here](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)\n", + "3. Questions: A list of questions that ragas will test the `QueryEngine` against. " + ] + }, + { + "cell_type": "markdown", + "id": "145109ad", + "metadata": {}, + "source": [ + "First, let's generate the questions. Ideally, you should use questions that you see in production so that the distribution of questions with which we evaluate matches the distribution of questions seen in production. This ensures that the scores reflect the performance seen in production, but to start off we'll be using a few example questions."
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "751dc988", + "metadata": {}, + "outputs": [], + "source": [ + "eval_questions = [\n", + " \"What is the population of New York City as of 2020?\",\n", + " \"Which borough of New York City has the highest population?\",\n", + " \"What is the economic significance of New York City?\",\n", + " \"How did New York City get its name?\",\n", + " \"What is the significance of the Statue of Liberty in New York City?\",\n", + "]\n", + "\n", + "eval_answers = [\n", + " \"8,804,000\", # incorrect answer\n", + " \"Queens\", # incorrect answer\n", + " \"New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.\",\n", + " \"New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.\",\n", + " \"The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. 
It has since become an iconic landmark and a global symbol of cultural diversity and freedom.\",\n", + "]\n", + "\n", + "eval_answers = [[a] for a in eval_answers]" + ] + }, + { + "cell_type": "markdown", + "id": "843bddb8", + "metadata": {}, + "source": [ + "Now lets import the metrics we will be using to evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9875132a", + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.metrics import (\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_relevancy,\n", + " context_recall,\n", + ")\n", + "from ragas.metrics.critique import harmfulness\n", + "\n", + "metrics = [\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_relevancy,\n", + " harmfulness,\n", + " context_recall,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "8ae4a2d1", + "metadata": {}, + "source": [ + "Finally lets run the evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "05633cc2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [faithfulness]\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [01:12<00:00, 72.16s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_relevancy]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.74s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_ relevancy]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:39<00:00, 39.72s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "evaluating with [harmfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.26s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_recall]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:31<00:00, 31.83s/it]\n" + ] + } + ], + "source": [ + "from ragas.llama_index import evaluate\n", + "\n", + "result = evaluate(query_engine, metrics, eval_questions, eval_answers)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f927a943", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ragas_score': 0.4150, 'faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_ relevancy': 0.1622, 'harmfulness': 0.0000, 'context_recall': 1.0000}\n" + ] + } + ], + "source": [ + "# final scores\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "878b6b82", + "metadata": {}, + "source": [ + "You can convert into a pandas dataframe to run more analysis on it." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b96311e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswercontextsground_truthsfaithfulnessanswer_relevancycontext_ relevancyharmfulnesscontext_recall
0What is the population of New York City as of ...\\nThe population of New York City as of 2020 i...[Aeromedical Staging Squadron, and a military ...[8,804,000]1.00.9999990.16134501.0
1Which borough of New York City has the highest...\\nThe borough of Manhattan has the highest pop...[co-extensive with New York County, the boroug...[Queens]0.00.9985280.04634201.0
2What is the economic significance of New York ...\\nNew York City is a major global economic cen...[health care and life sciences, medical techno...[New York City's economic significance is vast...1.00.9039370.40788001.0
3How did New York City get its name?\\nNew York City was named in honor of the Duke...[a US$1 billion research and education center ...[New York City got its name when it came under...1.00.9298090.05719501.0
4What is the significance of the Statue of Libe...\\nThe Statue of Liberty is a symbol of the Uni...[(stylized I ❤ NY) is both a logo and a song t...[The Statue of Liberty in New York City holds ...0.50.9426810.13844901.0
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 What is the population of New York City as of ... \n", + "1 Which borough of New York City has the highest... \n", + "2 What is the economic significance of New York ... \n", + "3 How did New York City get its name? \n", + "4 What is the significance of the Statue of Libe... \n", + "\n", + " answer \\\n", + "0 \\nThe population of New York City as of 2020 i... \n", + "1 \\nThe borough of Manhattan has the highest pop... \n", + "2 \\nNew York City is a major global economic cen... \n", + "3 \\nNew York City was named in honor of the Duke... \n", + "4 \\nThe Statue of Liberty is a symbol of the Uni... \n", + "\n", + " contexts \\\n", + "0 [Aeromedical Staging Squadron, and a military ... \n", + "1 [co-extensive with New York County, the boroug... \n", + "2 [health care and life sciences, medical techno... \n", + "3 [a US$1 billion research and education center ... \n", + "4 [(stylized I ❤ NY) is both a logo and a song t... \n", + "\n", + " ground_truths faithfulness \\\n", + "0 [8,804,000] 1.0 \n", + "1 [Queens] 0.0 \n", + "2 [New York City's economic significance is vast... 1.0 \n", + "3 [New York City got its name when it came under... 1.0 \n", + "4 [The Statue of Liberty in New York City holds ... 
0.5 \n", + "\n", + " answer_relevancy context_ relevancy harmfulness context_recall \n", + "0 0.999999 0.161345 0 1.0 \n", + "1 0.998528 0.046342 0 1.0 \n", + "2 0.903937 0.407880 0 1.0 \n", + "3 0.929809 0.057195 0 1.0 \n", + "4 0.942681 0.138449 0 1.0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.to_pandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/docs/quickstart.ipynb b/docs/quickstart.ipynb index 89804059f..e6ca7edb5 100644 --- a/docs/quickstart.ipynb +++ b/docs/quickstart.ipynb @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "b658e02f", "metadata": {}, "outputs": [ @@ -87,7 +87,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6c415f76ed4f4c969f87986ee05f2fb1", + "model_id": "e481f1b6ae824149aaf5afe96330fda3", "version_major": 2, "version_minor": 0 }, @@ -109,7 +109,7 @@ "})" ] }, - "execution_count": 1, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -141,12 +141,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "id": "f17bcf9d", "metadata": {}, "outputs": [], "source": [ - "from ragas.metrics import context_relevancy, answer_relevancy, faithfulness, context_recall\n", + "from ragas.metrics import (\n", + " context_relevancy,\n", + " answer_relevancy,\n", + " faithfulness,\n", + " context_recall,\n", + ")\n", "from ragas.metrics.critique import harmfulness" ] }, @@ -180,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": 10, "id": "22eb6f97", "metadata": {}, "outputs": [ @@ -195,7 +200,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|████████████████████████████████████████████████████████████| 2/2 [05:28<00:00, 164.33s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:06<00:00, 6.57s/it]\n" ] }, { @@ -209,7 +214,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|████████████████████████████████████████████████████████████| 2/2 [09:24<00:00, 282.03s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:28<00:00, 28.82s/it]\n" ] }, { @@ -223,7 +228,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|█████████████████████████████████████████████████████████████| 2/2 [01:22<00:00, 41.37s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.53s/it]\n" ] }, { @@ -237,7 +242,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|████████████████████████████████████████████████████████████| 2/2 [13:02<00:00, 391.15s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:24<00:00, 24.13s/it]\n" ] }, { @@ -251,16 +256,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|█████████████████████████████████████████████████████████████| 2/2 [02:10<00:00, 65.37s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.31s/it]\n" ] }, { "data": { "text/plain": [ - "{'ragas_score': 0.4400, 'context_ relevancy': 0.2339, 'faithfulness': 0.7689, 'answer_relevancy': 0.9260, 'context_recall': 0.4107, 'harmfulness': 0.0000}" + "{'ragas_score': 0.3482, 'context_ relevancy': 0.1296, 'faithfulness': 0.8889, 'answer_relevancy': 0.9285, 'context_recall': 0.6370, 'harmfulness': 0.0000}" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -269,8 +274,14 @@ "from ragas import evaluate\n", 
"\n", "result = evaluate(\n", - " fiqa_eval[\"baseline\"],\n", - " metrics=[context_relevancy, faithfulness, answer_relevancy, context_recall, harmfulness],\n", + " fiqa_eval[\"baseline\"].select(range(3)),\n", + " metrics=[\n", + " context_relevancy,\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_recall,\n", + " harmfulness,\n", + " ],\n", ")\n", "\n", "result" diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 6d0c80896..2cb96b862 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -8,7 +8,11 @@ from ragas._analytics import EvaluationEvent, track from ragas.metrics.base import Metric from ragas.metrics.critique import AspectCritique -from ragas.validation import validate_column_dtypes, validate_evaluation_modes +from ragas.validation import ( + remap_column_names, + validate_column_dtypes, + validate_evaluation_modes, +) def evaluate( @@ -72,19 +76,22 @@ def evaluate( raise ValueError("Provide dataset!") if metrics is None: - from ragas.metrics import answer_relevancy, context_relevancy, faithfulness + from ragas.metrics import ( + answer_relevancy, + context_recall, + context_relevancy, + faithfulness, + ) + + metrics = [answer_relevancy, context_relevancy, faithfulness, context_recall] - metrics = [answer_relevancy, context_relevancy, faithfulness] + # remap column names from the dataset + dataset = remap_column_names(dataset, column_map) # validation validate_evaluation_modes(dataset, metrics) validate_column_dtypes(dataset) - # select columns from the dataset - dataset = dataset.from_dict( - {column_map[name]: dataset[column_map[name]] for name in dataset.column_names} - ) - # run the evaluation on dataset with different metrics # initialize all the models in the metrics [m.init_model() for m in metrics] diff --git a/src/ragas/langchain/__init__.py b/src/ragas/langchain/__init__.py index 07570a8fd..039bc0cc8 100644 --- a/src/ragas/langchain/__init__.py +++ b/src/ragas/langchain/__init__.py @@ -1 +1,3 @@ 
from ragas.langchain.evalchain import RagasEvaluatorChain + +__all__ = ["RagasEvaluatorChain"] diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py index 2daeb9c14..fb19d9404 100644 --- a/src/ragas/llama_index/evaluation.py +++ b/src/ragas/llama_index/evaluation.py @@ -16,6 +16,7 @@ def evaluate( query_engine: BaseQueryEngine, metrics: list[Metric], questions: list[str], + ground_truths: t.Optional[list[str]] = None, ) -> Result: """ Run evaluation of llama_index QueryEngine with different metrics @@ -28,6 +29,8 @@ def evaluate( The ragas metrics to use for evaluation. questions : list[str] List of questions to evaluate on + ground_truths : list[str], optional + List of ground_truths answer to the question to evaluate on. Returns ------- @@ -79,14 +82,14 @@ def evaluate( for r in responses: answers.append(r.response) contexts.append([c.node.get_content() for c in r.source_nodes]) - - ds = Dataset.from_dict( - { - "question": questions, - "answer": answers, - "contexts": contexts, - } - ) + dataset_dict = { + "question": questions, + "answer": answers, + "contexts": contexts, + } + if ground_truths is not None: + dataset_dict["ground_truths"] = ground_truths + ds = Dataset.from_dict(dataset_dict) result = ragas_evaluate(ds, metrics) return result diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index b385e9e26..fbb2e17f3 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,6 +1,6 @@ from ragas.metrics.answer_relevance import AnswerRelevancy, answer_relevancy -from ragas.metrics.context_relevance import ContextRelevancy, context_relevancy from ragas.metrics.context_recall import ContextRecall, context_recall +from ragas.metrics.context_relevance import ContextRelevancy, context_relevancy from ragas.metrics.critique import AspectCritique from ragas.metrics.faithfulnes import Faithfulness, faithfulness @@ -13,5 +13,5 @@ "context_relevancy", "AspectCritique", "ContextRecall", - 
"context_recall" + "context_recall", ] diff --git a/src/ragas/validation.py b/src/ragas/validation.py index 50c1487d8..ee2f3d379 100644 --- a/src/ragas/validation.py +++ b/src/ragas/validation.py @@ -5,6 +5,16 @@ from ragas.metrics.base import EvaluationMode, Metric +def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset: + """ + Remap the column names in case dataset uses different column names + """ + inverse_column_map = {v: k for k, v in column_map.items()} + return dataset.from_dict( + {inverse_column_map[name]: dataset[name] for name in dataset.column_names} + ) + + def validate_column_dtypes(ds: Dataset): for column_names in ["question", "answer"]: if column_names in ds.features: