From adc5e6a827aba49ccd604372da6a7e4560cf7732 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Mon, 8 Sep 2025 19:36:23 -0700 Subject: [PATCH 1/7] add page for responsible ai with .net --- .openpublishing.redirection.ai.json | 41 +++++++++++++++---- .../evaluate-ai-response.md | 0 .../evaluate-safety.md | 0 .../evaluate-with-reporting.md | 0 .../libraries.md} | 6 +-- docs/ai/evaluation/responsible-ai.md | 37 +++++++++++++++++ .../snippets/evaluate-ai-responses/MyTests.cs | 0 .../evaluate-ai-responses/TestAI.csproj | 0 .../EvaluateResponseSafety.csproj | 0 .../evaluate-safety/MSTestSettings.cs | 0 .../snippets/evaluate-safety/MyTests.cs | 0 .../evaluate-with-reporting/MyTests.cs | 0 .../TestAIWithReporting.csproj | 0 .../WordCountEvaluator.cs | 0 docs/ai/toc.yml | 10 +++-- 15 files changed, 80 insertions(+), 14 deletions(-) rename docs/ai/{quickstarts => evaluation}/evaluate-ai-response.md (100%) rename docs/ai/{tutorials => evaluation}/evaluate-safety.md (100%) rename docs/ai/{tutorials => evaluation}/evaluate-with-reporting.md (100%) rename docs/ai/{conceptual/evaluation-libraries.md => evaluation/libraries.md} (95%) create mode 100644 docs/ai/evaluation/responsible-ai.md rename docs/ai/{quickstarts => evaluation}/snippets/evaluate-ai-responses/MyTests.cs (100%) rename docs/ai/{quickstarts => evaluation}/snippets/evaluate-ai-responses/TestAI.csproj (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-safety/EvaluateResponseSafety.csproj (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-safety/MSTestSettings.cs (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-safety/MyTests.cs (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-with-reporting/MyTests.cs (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-with-reporting/TestAIWithReporting.csproj (100%) rename docs/ai/{tutorials => evaluation}/snippets/evaluate-with-reporting/WordCountEvaluator.cs (100%) diff --git a/.openpublishing.redirection.ai.json b/.openpublishing.redirection.ai.json index 2cda8aab7cd31..4752ef8fe52be 100644 --- a/.openpublishing.redirection.ai.json +++ b/.openpublishing.redirection.ai.json @@ -2,15 +2,22 @@ "redirections": [ { "source_path_from_root": "/docs/ai/ai-extensions.md", - "redirect_url": "/dotnet/ai/microsoft-extensions-ai" + "redirect_url": "/dotnet/ai/microsoft-extensions-ai", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/conceptual/agents.md", "redirect_url": "/dotnet/ai" }, + { + "source_path_from_root": "/docs/ai/conceptual/evaluation-libraries.md", + "redirect_url": "/dotnet/ai/evaluation/libraries", + "redirect_document_id": true + }, { "source_path_from_root": "/docs/ai/get-started/dotnet-ai-overview.md", - "redirect_url": "/dotnet/ai/overview" + "redirect_url": "/dotnet/ai/overview", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/how-to/app-service-db-auth.md", @@ -24,6 +31,11 @@ "source_path_from_root": "/docs/ai/how-to/work-with-local-models.md", "redirect_url": "/dotnet/ai" }, + { + "source_path_from_root": "/docs/ai/quickstarts/evaluate-ai-response.md", + "redirect_url": "/dotnet/ai/evaluation/evaluate-ai-response", + "redirect_document_id": true + }, { "source_path_from_root": "/docs/ai/quickstarts/get-started-azure-openai.md", "redirect_url": "/dotnet/ai/quickstarts/build-chat-app" @@ -38,7 +50,8 @@ }, { "source_path_from_root": "/docs/ai/quickstarts/quickstart-assistants.md", - "redirect_url": 
"/dotnet/ai/quickstarts/create-assistant" + "redirect_url": "/dotnet/ai/quickstarts/create-assistant", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/quickstarts/quickstart-azure-openai-tool.md", @@ -46,19 +59,33 @@ }, { "source_path_from_root": "/docs/ai/quickstarts/quickstart-local-ai.md", - "redirect_url": "/dotnet/ai/quickstarts/chat-local-model" + "redirect_url": "/dotnet/ai/quickstarts/chat-local-model", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/quickstarts/quickstart-openai-generate-images.md", - "redirect_url": "/dotnet/ai/quickstarts/generate-images" + "redirect_url": "/dotnet/ai/quickstarts/generate-images", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/quickstarts/quickstart-openai-summarize-text.md", - "redirect_url": "/dotnet/ai/quickstarts/prompt-model" + "redirect_url": "/dotnet/ai/quickstarts/prompt-model", + "redirect_document_id": true }, { "source_path_from_root": "/docs/ai/tutorials/llm-eval.md", - "redirect_url": "/dotnet/ai/quickstarts/evaluate-ai-response" + "redirect_url": "/dotnet/ai/quickstarts/evaluate-ai-response", + "redirect_document_id": true + }, + { + "source_path_from_root": "/docs/ai/tutorials/evaluate-safety.md", + "redirect_url": "/dotnet/ai/evaluation/evaluate-safety", + "redirect_document_id": true + }, + { + "source_path_from_root": "/docs/ai/tutorials/evaluate-with-reporting.md", + "redirect_url": "/dotnet/ai/evaluation/evaluate-with-reporting", + "redirect_document_id": true } ] } diff --git a/docs/ai/quickstarts/evaluate-ai-response.md b/docs/ai/evaluation/evaluate-ai-response.md similarity index 100% rename from docs/ai/quickstarts/evaluate-ai-response.md rename to docs/ai/evaluation/evaluate-ai-response.md diff --git a/docs/ai/tutorials/evaluate-safety.md b/docs/ai/evaluation/evaluate-safety.md similarity index 100% rename from docs/ai/tutorials/evaluate-safety.md rename to docs/ai/evaluation/evaluate-safety.md diff --git a/docs/ai/tutorials/evaluate-with-reporting.md b/docs/ai/evaluation/evaluate-with-reporting.md similarity index 100% rename from docs/ai/tutorials/evaluate-with-reporting.md rename to docs/ai/evaluation/evaluate-with-reporting.md diff --git a/docs/ai/conceptual/evaluation-libraries.md b/docs/ai/evaluation/libraries.md similarity index 95% rename from docs/ai/conceptual/evaluation-libraries.md rename to docs/ai/evaluation/libraries.md index 377248b45fa97..68df8deeaaa38 100644 --- a/docs/ai/conceptual/evaluation-libraries.md +++ b/docs/ai/evaluation/libraries.md @@ -6,14 +6,14 @@ ms.date: 07/24/2025 --- # The Microsoft.Extensions.AI.Evaluation libraries -The Microsoft.Extensions.AI.Evaluation libraries simplify the process of evaluating the quality and accuracy of responses generated by AI models in .NET intelligent apps. Various metrics measure aspects like relevance, truthfulness, coherence, and completeness of the responses. Evaluations are crucial in testing, because they help ensure that the AI model performs as expected and provides reliable and accurate results. +The Microsoft.Extensions.AI.Evaluation libraries simplify the process of evaluating the quality and safety of responses generated by AI models in .NET intelligent apps. Various quality metrics measure aspects like relevance, truthfulness, coherence, and completeness of the responses. Safety metrics measure aspects like hate and unfairness, violence, and sexual content. 
Evaluations are crucial in testing, because they help ensure that the AI model performs as expected and provides reliable and accurate results. The evaluation libraries, which are built on top of the [Microsoft.Extensions.AI abstractions](../microsoft-extensions-ai.md), are composed of the following NuGet packages: - [📦 Microsoft.Extensions.AI.Evaluation](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) – Defines the core abstractions and types for supporting evaluation. - [📦 Microsoft.Extensions.AI.Evaluation.NLP](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains [evaluators](#nlp-evaluators) that evaluate the similarity of an LLM's response text to one or more reference responses using natural language processing (NLP) metrics. These evaluators aren't LLM or AI-based; they use traditional NLP techniques such as text tokenization and n-gram analysis to evaluate text similarity. - [📦 Microsoft.Extensions.AI.Evaluation.Quality](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) – Contains [evaluators](#quality-evaluators) that assess the quality of LLM responses in an app according to metrics such as relevance and completeness. These evaluators use the LLM directly to perform evaluations. -- [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) – Contains [evaluators](#safety-evaluators), such as the `ProtectedMaterialEvaluator` and `ContentHarmEvaluator`, that use the [Azure AI Foundry](/azure/ai-foundry/) Evaluation service to perform evaluations. +- [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) – Contains [evaluators](#safety-evaluators), such as the `ProtectedMaterialEvaluator` and `ContentHarmEvaluator`, that use the [Azure AI Foundry](/azure/ai-foundry/) evaluation service to perform evaluations. - [📦 Microsoft.Extensions.AI.Evaluation.Reporting](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) – Contains support for caching LLM responses, storing the results of evaluations, and generating reports from that data. - [📦 Microsoft.Extensions.AI.Evaluation.Reporting.Azure](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the reporting library with an implementation for caching LLM responses and storing the evaluation results in an [Azure Storage](/azure/storage/common/storage-introduction) container. - [📦 Microsoft.Extensions.AI.Evaluation.Console](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) – A command-line tool for generating reports and managing evaluation data. @@ -60,7 +60,7 @@ NLP evaluators evaluate the quality of an LLM response by comparing it to a refe ### Safety evaluators -Safety evaluators check for presence of harmful, inappropriate, or unsafe content in a response. They rely on the Azure AI Foundry Evaluation service, which uses a model that's fine tuned to perform evaluations. +Safety evaluators check for presence of harmful, inappropriate, or unsafe content in a response. They rely on the Azure AI Foundry evaluation service, which uses a model that's fine tuned to perform evaluations. 
| Evaluator type | Metric | Description | |---------------------------------------------------------------------------|--------------------|-------------| diff --git a/docs/ai/evaluation/responsible-ai.md b/docs/ai/evaluation/responsible-ai.md new file mode 100644 index 0000000000000..713d3400e78cd --- /dev/null +++ b/docs/ai/evaluation/responsible-ai.md @@ -0,0 +1,37 @@ +--- +title: Responsible AI with .NET +description: Learn what responsible AI is and how you can use .NET to evaluate the safety of your AI apps. +ms.date: 09/08/2025 +ai-usage: ai-assisted +--- + +# Responsible AI with .NET + +*Responsible AI* refers to the practice of designing, developing, and deploying artificial intelligence systems in a way that is ethical, transparent, and aligned with human values. It emphasizes fairness, accountability, privacy, and safety to ensure that AI technologies benefit individuals and society as a whole. As AI becomes increasingly integrated into applications and decision-making processes, prioritizing responsible AI is of utmost importance. + +Microsoft has identified [six principles](https://www.microsoft.com/ai/responsible-ai) for responsible AI: + +- Fairness +- Reliability and safety +- Privacy and security +- Inclusiveness +- Transparency +- Accountability + +If you're building an AI app with .NET, the [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) package provides evaluators to help ensure that the responses your app generates meet the standards for responsible AI. These safety evaluators use the [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) to perform evaluations. They include metrics for hate and unfairness, groundedness, ungrounded inference of human attributes, and the presence of: + +- Protected material +- Self-harm content +- Sexual content +- Violent content +- Vulnerable code +- Indirect attacks + +For more information about the safety evaluators, see [Safety evaluators](libraries.md#safety-evaluators). To get started with the Microsoft.Extensions.AI.Evaluation.Safety evaluators, see [Tutorial: Evaluate response safety with caching and reporting](evaluate-safety.md). 
+ +## See also + +- [Responsible AI at Microsoft](https://www.microsoft.com/ai/responsible-ai) +- [Training: Embrace responsible AI principles and practices](/training/modules/embrace-responsible-ai-principles-practices/) +- [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) +- [Azure AI Content Safety](/azure/ai-services/content-safety/overview) diff --git a/docs/ai/quickstarts/snippets/evaluate-ai-responses/MyTests.cs b/docs/ai/evaluation/snippets/evaluate-ai-responses/MyTests.cs similarity index 100% rename from docs/ai/quickstarts/snippets/evaluate-ai-responses/MyTests.cs rename to docs/ai/evaluation/snippets/evaluate-ai-responses/MyTests.cs diff --git a/docs/ai/quickstarts/snippets/evaluate-ai-responses/TestAI.csproj b/docs/ai/evaluation/snippets/evaluate-ai-responses/TestAI.csproj similarity index 100% rename from docs/ai/quickstarts/snippets/evaluate-ai-responses/TestAI.csproj rename to docs/ai/evaluation/snippets/evaluate-ai-responses/TestAI.csproj diff --git a/docs/ai/tutorials/snippets/evaluate-safety/EvaluateResponseSafety.csproj b/docs/ai/evaluation/snippets/evaluate-safety/EvaluateResponseSafety.csproj similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-safety/EvaluateResponseSafety.csproj rename to docs/ai/evaluation/snippets/evaluate-safety/EvaluateResponseSafety.csproj diff --git a/docs/ai/tutorials/snippets/evaluate-safety/MSTestSettings.cs b/docs/ai/evaluation/snippets/evaluate-safety/MSTestSettings.cs similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-safety/MSTestSettings.cs rename to docs/ai/evaluation/snippets/evaluate-safety/MSTestSettings.cs diff --git a/docs/ai/tutorials/snippets/evaluate-safety/MyTests.cs b/docs/ai/evaluation/snippets/evaluate-safety/MyTests.cs similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-safety/MyTests.cs rename to docs/ai/evaluation/snippets/evaluate-safety/MyTests.cs diff --git a/docs/ai/tutorials/snippets/evaluate-with-reporting/MyTests.cs b/docs/ai/evaluation/snippets/evaluate-with-reporting/MyTests.cs similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-with-reporting/MyTests.cs rename to docs/ai/evaluation/snippets/evaluate-with-reporting/MyTests.cs diff --git a/docs/ai/tutorials/snippets/evaluate-with-reporting/TestAIWithReporting.csproj b/docs/ai/evaluation/snippets/evaluate-with-reporting/TestAIWithReporting.csproj similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-with-reporting/TestAIWithReporting.csproj rename to docs/ai/evaluation/snippets/evaluate-with-reporting/TestAIWithReporting.csproj diff --git a/docs/ai/tutorials/snippets/evaluate-with-reporting/WordCountEvaluator.cs b/docs/ai/evaluation/snippets/evaluate-with-reporting/WordCountEvaluator.cs similarity index 100% rename from docs/ai/tutorials/snippets/evaluate-with-reporting/WordCountEvaluator.cs rename to docs/ai/evaluation/snippets/evaluate-with-reporting/WordCountEvaluator.cs diff --git a/docs/ai/toc.yml b/docs/ai/toc.yml index 6f87e87e4daf3..e867cf33701cb 100644 --- a/docs/ai/toc.yml +++ b/docs/ai/toc.yml @@ -83,16 +83,18 @@ items: href: /azure/ai-services/openai/how-to/risks-safety-monitor?toc=/dotnet/ai/toc.json&bc=/dotnet/ai/toc.json - name: Evaluation items: + - name: Responsible AI with .NET + href: evaluation/responsible-ai.md - name: The Microsoft.Extensions.AI.Evaluation libraries - href: conceptual/evaluation-libraries.md + href: evaluation/libraries.md - name: Tutorials items: - name: "Quickstart: Evaluate 
the quality of a response" - href: quickstarts/evaluate-ai-response.md + href: evaluation/evaluate-ai-response.md - name: "Evaluate response quality with caching and reporting" - href: tutorials/evaluate-with-reporting.md + href: evaluation/evaluate-with-reporting.md - name: "Evaluate response safety with caching and reporting" - href: tutorials/evaluate-safety.md + href: evaluation/evaluate-safety.md - name: Advanced items: - name: Sample interface implementations From a6e642dde6d4ae0caf0a5b6cada578655ea0daae Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Mon, 8 Sep 2025 19:41:55 -0700 Subject: [PATCH 2/7] replace redirects --- docs/ai/evaluation/evaluate-ai-response.md | 4 ++-- docs/ai/evaluation/evaluate-safety.md | 4 ++-- docs/ai/evaluation/libraries.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ai/evaluation/evaluate-ai-response.md b/docs/ai/evaluation/evaluate-ai-response.md index e81bbea6c0e7f..b6d8bf0a79ea5 100644 --- a/docs/ai/evaluation/evaluate-ai-response.md +++ b/docs/ai/evaluation/evaluate-ai-response.md @@ -10,7 +10,7 @@ ms.topic: quickstart In this quickstart, you create an MSTest app to evaluate the quality of a chat response from an OpenAI model. The test app uses the [Microsoft.Extensions.AI.Evaluation](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) libraries. > [!NOTE] -> This quickstart demonstrates the simplest usage of the evaluation API. Notably, it doesn't demonstrate use of the [response caching](../conceptual/evaluation-libraries.md#cached-responses) and [reporting](../conceptual/evaluation-libraries.md#reporting) functionality, which are important if you're authoring unit tests that run as part of an "offline" evaluation pipeline. The scenario shown in this quickstart is suitable in use cases such as "online" evaluation of AI responses within production code and logging scores to telemetry, where caching and reporting aren't relevant. For a tutorial that demonstrates the caching and reporting functionality, see [Tutorial: Evaluate a model's response with response caching and reporting](../tutorials/evaluate-with-reporting.md) +> This quickstart demonstrates the simplest usage of the evaluation API. Notably, it doesn't demonstrate use of the [response caching](/dotnet/ai/evaluation/libraries#cached-responses) and [reporting](/dotnet/ai/evaluation/libraries#reporting) functionality, which are important if you're authoring unit tests that run as part of an "offline" evaluation pipeline. The scenario shown in this quickstart is suitable in use cases such as "online" evaluation of AI responses within production code and logging scores to telemetry, where caching and reporting aren't relevant. For a tutorial that demonstrates the caching and reporting functionality, see [Tutorial: Evaluate a model's response with response caching and reporting](/dotnet/ai/evaluation/evaluate-with-reporting) ## Prerequisites @@ -103,4 +103,4 @@ If you no longer need them, delete the Azure OpenAI resource and GPT-4 model dep ## Next steps - Evaluate the responses from different OpenAI models. -- Add response caching and reporting to your evaluation code. For more information, see [Tutorial: Evaluate a model's response with response caching and reporting](../tutorials/evaluate-with-reporting.md). +- Add response caching and reporting to your evaluation code. 
For more information, see [Tutorial: Evaluate a model's response with response caching and reporting](/dotnet/ai/evaluation/evaluate-with-reporting). diff --git a/docs/ai/evaluation/evaluate-safety.md b/docs/ai/evaluation/evaluate-safety.md index 6ae7fc152e7d6..6fc8a6d07b40b 100644 --- a/docs/ai/evaluation/evaluate-safety.md +++ b/docs/ai/evaluation/evaluate-safety.md @@ -110,7 +110,7 @@ Complete the following steps to create an MSTest project. > [!NOTE] > This code example passes the LLM as `originalChatClient` to . The reason to include the LLM chat client here is to enable getting a chat response from the LLM, and notably, to enable response caching for it. (If you don't want to cache the LLM's response, you can create a separate, local to fetch the response from the LLM.) Instead of passing a , if you already have a for an LLM from another reporting configuration, you can pass that instead, using the overload. > - > Similarly, if you configure both [LLM-based evaluators](../conceptual/evaluation-libraries.md#quality-evaluators) and [Azure AI Foundry Evaluation service–based evaluators](../conceptual/evaluation-libraries.md#safety-evaluators) in the reporting configuration, you also need to pass the LLM to . Then it returns a that can talk to both types of evaluators. + > Similarly, if you configure both [LLM-based evaluators](/dotnet/ai/evaluation/libraries#quality-evaluators) and [Azure AI Foundry Evaluation service–based evaluators](/dotnet/ai/evaluation/libraries#safety-evaluators) in the reporting configuration, you also need to pass the LLM to . Then it returns a that can talk to both types of evaluators. 1. Add a method to define the [chat options](xref:Microsoft.Extensions.AI.ChatOptions) and ask the model for a response to a given question. @@ -148,6 +148,6 @@ To generate a report to view the evaluation results, see [Generate a report](eva This tutorial covers the basics of evaluating content safety. As you create your test suite, consider the following next steps: -- Configure additional evaluators, such as the [quality evaluators](../conceptual/evaluation-libraries.md#quality-evaluators). For an example, see the AI samples repo [quality and safety evaluation example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example10_RunningQualityAndSafetyEvaluatorsTogether.cs). +- Configure additional evaluators, such as the [quality evaluators](/dotnet/ai/evaluation/libraries#quality-evaluators). For an example, see the AI samples repo [quality and safety evaluation example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example10_RunningQualityAndSafetyEvaluatorsTogether.cs). - Evaluate the content safety of generated images. For an example, see the AI samples repo [image response example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example09_RunningSafetyEvaluatorsAgainstResponsesWithImages.cs). - In real-world evaluations, you might not want to validate individual results, since the LLM responses and evaluation scores can vary over time as your product (and the models used) evolve. You might not want individual evaluation tests to fail and block builds in your CI/CD pipelines when this happens. 
Instead, in such cases, it might be better to rely on the generated report and track the overall trends for evaluation scores across different scenarios over time (and only fail individual builds in your CI/CD pipelines when there's a significant drop in evaluation scores across multiple different tests). diff --git a/docs/ai/evaluation/libraries.md b/docs/ai/evaluation/libraries.md index 68df8deeaaa38..4e501c65e3080 100644 --- a/docs/ai/evaluation/libraries.md +++ b/docs/ai/evaluation/libraries.md @@ -86,7 +86,7 @@ The library contains support for storing evaluation results and generating repor :::image type="content" source="../media/ai-extensions/pipeline-report.jpg" lightbox="../media/ai-extensions/pipeline-report.jpg" alt-text="Screenshot of an AI evaluation report in an Azure DevOps pipeline."::: -The `dotnet aieval` tool, which ships as part of the `Microsoft.Extensions.AI.Evaluation.Console` package, includes functionality for generating reports and managing the stored evaluation data and cached responses. For more information, see [Generate a report](../tutorials/evaluate-with-reporting.md#generate-a-report). +The `dotnet aieval` tool, which ships as part of the `Microsoft.Extensions.AI.Evaluation.Console` package, includes functionality for generating reports and managing the stored evaluation data and cached responses. For more information, see [Generate a report](/dotnet/ai/evaluation/evaluate-with-reporting#generate-a-report). ## Configuration From 062648e912c4095bba99d0a4e701394ae740d246 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Mon, 8 Sep 2025 19:42:47 -0700 Subject: [PATCH 3/7] move media file --- .../media/evaluation-report.png | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/ai/{tutorials => evaluation}/media/evaluation-report.png (100%) diff --git a/docs/ai/tutorials/media/evaluation-report.png b/docs/ai/evaluation/media/evaluation-report.png similarity index 100% rename from docs/ai/tutorials/media/evaluation-report.png rename to docs/ai/evaluation/media/evaluation-report.png From 47fb6de98a5ab03026035a9de9b805df9b479aa1 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Mon, 8 Sep 2025 19:50:11 -0700 Subject: [PATCH 4/7] use relative links --- docs/ai/evaluation/evaluate-ai-response.md | 4 ++-- docs/ai/evaluation/evaluate-safety.md | 4 ++-- docs/ai/evaluation/libraries.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ai/evaluation/evaluate-ai-response.md b/docs/ai/evaluation/evaluate-ai-response.md index b6d8bf0a79ea5..5814bd26bbc9d 100644 --- a/docs/ai/evaluation/evaluate-ai-response.md +++ b/docs/ai/evaluation/evaluate-ai-response.md @@ -10,7 +10,7 @@ ms.topic: quickstart In this quickstart, you create an MSTest app to evaluate the quality of a chat response from an OpenAI model. The test app uses the [Microsoft.Extensions.AI.Evaluation](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) libraries. > [!NOTE] -> This quickstart demonstrates the simplest usage of the evaluation API. Notably, it doesn't demonstrate use of the [response caching](/dotnet/ai/evaluation/libraries#cached-responses) and [reporting](/dotnet/ai/evaluation/libraries#reporting) functionality, which are important if you're authoring unit tests that run as part of an "offline" evaluation pipeline. 
The scenario shown in this quickstart is suitable in use cases such as "online" evaluation of AI responses within production code and logging scores to telemetry, where caching and reporting aren't relevant. For a tutorial that demonstrates the caching and reporting functionality, see [Tutorial: Evaluate a model's response with response caching and reporting](/dotnet/ai/evaluation/evaluate-with-reporting) +> This quickstart demonstrates the simplest usage of the evaluation API. Notably, it doesn't demonstrate use of the [response caching](libraries.md#cached-responses) and [reporting](libraries.md#reporting) functionality, which are important if you're authoring unit tests that run as part of an "offline" evaluation pipeline. The scenario shown in this quickstart is suitable in use cases such as "online" evaluation of AI responses within production code and logging scores to telemetry, where caching and reporting aren't relevant. For a tutorial that demonstrates the caching and reporting functionality, see [Tutorial: Evaluate a model's response with response caching and reporting](evaluate-with-reporting.md) ## Prerequisites @@ -103,4 +103,4 @@ If you no longer need them, delete the Azure OpenAI resource and GPT-4 model dep ## Next steps - Evaluate the responses from different OpenAI models. -- Add response caching and reporting to your evaluation code. For more information, see [Tutorial: Evaluate a model's response with response caching and reporting](/dotnet/ai/evaluation/evaluate-with-reporting). +- Add response caching and reporting to your evaluation code. For more information, see [Tutorial: Evaluate a model's response with response caching and reporting](evaluate-with-reporting.md). diff --git a/docs/ai/evaluation/evaluate-safety.md b/docs/ai/evaluation/evaluate-safety.md index 6fc8a6d07b40b..7a0c686130802 100644 --- a/docs/ai/evaluation/evaluate-safety.md +++ b/docs/ai/evaluation/evaluate-safety.md @@ -110,7 +110,7 @@ Complete the following steps to create an MSTest project. > [!NOTE] > This code example passes the LLM as `originalChatClient` to . The reason to include the LLM chat client here is to enable getting a chat response from the LLM, and notably, to enable response caching for it. (If you don't want to cache the LLM's response, you can create a separate, local to fetch the response from the LLM.) Instead of passing a , if you already have a for an LLM from another reporting configuration, you can pass that instead, using the overload. > - > Similarly, if you configure both [LLM-based evaluators](/dotnet/ai/evaluation/libraries#quality-evaluators) and [Azure AI Foundry Evaluation service–based evaluators](/dotnet/ai/evaluation/libraries#safety-evaluators) in the reporting configuration, you also need to pass the LLM to . Then it returns a that can talk to both types of evaluators. + > Similarly, if you configure both [LLM-based evaluators](libraries.md#quality-evaluators) and [Azure AI Foundry Evaluation service–based evaluators](libraries.md#safety-evaluators) in the reporting configuration, you also need to pass the LLM to . Then it returns a that can talk to both types of evaluators. 1. Add a method to define the [chat options](xref:Microsoft.Extensions.AI.ChatOptions) and ask the model for a response to a given question. @@ -148,6 +148,6 @@ To generate a report to view the evaluation results, see [Generate a report](eva This tutorial covers the basics of evaluating content safety. 
As you create your test suite, consider the following next steps: -- Configure additional evaluators, such as the [quality evaluators](/dotnet/ai/evaluation/libraries#quality-evaluators). For an example, see the AI samples repo [quality and safety evaluation example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example10_RunningQualityAndSafetyEvaluatorsTogether.cs). +- Configure additional evaluators, such as the [quality evaluators](libraries.md#quality-evaluators). For an example, see the AI samples repo [quality and safety evaluation example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example10_RunningQualityAndSafetyEvaluatorsTogether.cs). - Evaluate the content safety of generated images. For an example, see the AI samples repo [image response example](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/reporting/ReportingExamples.Example09_RunningSafetyEvaluatorsAgainstResponsesWithImages.cs). - In real-world evaluations, you might not want to validate individual results, since the LLM responses and evaluation scores can vary over time as your product (and the models used) evolve. You might not want individual evaluation tests to fail and block builds in your CI/CD pipelines when this happens. Instead, in such cases, it might be better to rely on the generated report and track the overall trends for evaluation scores across different scenarios over time (and only fail individual builds in your CI/CD pipelines when there's a significant drop in evaluation scores across multiple different tests). diff --git a/docs/ai/evaluation/libraries.md b/docs/ai/evaluation/libraries.md index 4e501c65e3080..9033644d6130a 100644 --- a/docs/ai/evaluation/libraries.md +++ b/docs/ai/evaluation/libraries.md @@ -86,7 +86,7 @@ The library contains support for storing evaluation results and generating repor :::image type="content" source="../media/ai-extensions/pipeline-report.jpg" lightbox="../media/ai-extensions/pipeline-report.jpg" alt-text="Screenshot of an AI evaluation report in an Azure DevOps pipeline."::: -The `dotnet aieval` tool, which ships as part of the `Microsoft.Extensions.AI.Evaluation.Console` package, includes functionality for generating reports and managing the stored evaluation data and cached responses. For more information, see [Generate a report](/dotnet/ai/evaluation/evaluate-with-reporting#generate-a-report). +The `dotnet aieval` tool, which ships as part of the `Microsoft.Extensions.AI.Evaluation.Console` package, includes functionality for generating reports and managing the stored evaluation data and cached responses. For more information, see [Generate a report](evaluate-with-reporting.md#generate-a-report). 
## Configuration From b5ca5025f95e40ffe2cb1b7a340514f62412d43c Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Thu, 11 Sep 2025 13:45:36 -0700 Subject: [PATCH 5/7] respond to feedback --- .openpublishing.redirection.ai.json | 2 +- docs/ai/evaluation/responsible-ai.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.openpublishing.redirection.ai.json b/.openpublishing.redirection.ai.json index 4752ef8fe52be..2234d13b121d9 100644 --- a/.openpublishing.redirection.ai.json +++ b/.openpublishing.redirection.ai.json @@ -74,7 +74,7 @@ }, { "source_path_from_root": "/docs/ai/tutorials/llm-eval.md", - "redirect_url": "/dotnet/ai/quickstarts/evaluate-ai-response", + "redirect_url": "/dotnet/ai/evaluation/evaluate-ai-response", "redirect_document_id": true }, { diff --git a/docs/ai/evaluation/responsible-ai.md b/docs/ai/evaluation/responsible-ai.md index 713d3400e78cd..4a6103d2e530b 100644 --- a/docs/ai/evaluation/responsible-ai.md +++ b/docs/ai/evaluation/responsible-ai.md @@ -18,14 +18,14 @@ Microsoft has identified [six principles](https://www.microsoft.com/ai/responsib - Transparency - Accountability -If you're building an AI app with .NET, the [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) package provides evaluators to help ensure that the responses your app generates meet the standards for responsible AI. These safety evaluators use the [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) to perform evaluations. They include metrics for hate and unfairness, groundedness, ungrounded inference of human attributes, and the presence of: +If you're building an AI app with .NET, the [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) package provides evaluators to help ensure that the responses, both text and image, that your app generates meet the standards for responsible AI. The evaluators can also detect problematic content in user input. These safety evaluators use the [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) to perform evaluations. They include metrics for hate and unfairness, groundedness, ungrounded inference of human attributes, and the presence of: - Protected material - Self-harm content - Sexual content - Violent content -- Vulnerable code -- Indirect attacks +- Vulnerable code (text-based only) +- Indirect attacks (text-based only) For more information about the safety evaluators, see [Safety evaluators](libraries.md#safety-evaluators). To get started with the Microsoft.Extensions.AI.Evaluation.Safety evaluators, see [Tutorial: Evaluate response safety with caching and reporting](evaluate-safety.md). 
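The Microsoft.Extensions.AI.Evaluation.Safety evaluators described in the responsible-ai.md content above don't score responses locally; they send the conversation to the Azure AI Foundry evaluation service and read back severity metrics. A minimal sketch of that flow follows. It's illustrative only: the `ContentSafetyServiceConfiguration` constructor parameters, the `ToChatConfiguration` helper, and the placeholder subscription, resource group, and project values are assumptions about the package's shape rather than confirmed API details; only `ContentHarmEvaluator` and `ProtectedMaterialEvaluator` are named by the page itself.

```csharp
using Azure.Identity;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.Safety;

// Connection details for the Azure AI Foundry evaluation service.
// The constructor shape and the placeholder values are assumptions for illustration.
var serviceConfiguration = new ContentSafetyServiceConfiguration(
    credential: new DefaultAzureCredential(),
    subscriptionId: "<subscription-id>",
    resourceGroupName: "<resource-group>",
    projectName: "<ai-foundry-project>");

// Turn the service configuration into a ChatConfiguration the evaluators can use.
ChatConfiguration chatConfiguration = serviceConfiguration.ToChatConfiguration();

// ContentHarmEvaluator covers the hate and unfairness, self-harm, sexual,
// and violence metrics mentioned in the page.
IEvaluator evaluator = new ContentHarmEvaluator();

// Evaluate a hard-coded conversation turn instead of calling an LLM,
// to keep the sketch self-contained.
var messages = new List<ChatMessage>
{
    new(ChatRole.User, "Tell me about the history of the bicycle.")
};
var response = new ChatResponse(
    new ChatMessage(ChatRole.Assistant, "The first bicycles appeared in the early 19th century..."));

EvaluationResult result = await evaluator.EvaluateAsync(messages, response, chatConfiguration);

// Each content-harm metric reports a severity score; lower values indicate safer content.
foreach (EvaluationMetric metric in result.Metrics.Values)
{
    Console.WriteLine($"{metric.Name}: {(metric as NumericMetric)?.Value}");
}
```

In a real test suite, the same `ChatConfiguration` would typically be plugged into a reporting configuration so that responses are cached and results flow into the generated report, which is what the linked safety tutorial sets up.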
From 2608f7f051eeb2159c16d40252b20693a61d1384 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Thu, 11 Sep 2025 13:55:07 -0700 Subject: [PATCH 6/7] fix build warniung --- .openpublishing.redirection.ai.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.openpublishing.redirection.ai.json b/.openpublishing.redirection.ai.json index 2234d13b121d9..b1215f1dbffff 100644 --- a/.openpublishing.redirection.ai.json +++ b/.openpublishing.redirection.ai.json @@ -74,8 +74,7 @@ }, { "source_path_from_root": "/docs/ai/tutorials/llm-eval.md", - "redirect_url": "/dotnet/ai/evaluation/evaluate-ai-response", - "redirect_document_id": true + "redirect_url": "/dotnet/ai/evaluation/evaluate-ai-response" }, { "source_path_from_root": "/docs/ai/tutorials/evaluate-safety.md", From 7e6dd703c223598ccca181924786b20708f9b242 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Thu, 11 Sep 2025 13:57:36 -0700 Subject: [PATCH 7/7] tweak --- docs/ai/evaluation/responsible-ai.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ai/evaluation/responsible-ai.md b/docs/ai/evaluation/responsible-ai.md index 4a6103d2e530b..d2279d30b61ea 100644 --- a/docs/ai/evaluation/responsible-ai.md +++ b/docs/ai/evaluation/responsible-ai.md @@ -18,7 +18,7 @@ Microsoft has identified [six principles](https://www.microsoft.com/ai/responsib - Transparency - Accountability -If you're building an AI app with .NET, the [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) package provides evaluators to help ensure that the responses, both text and image, that your app generates meet the standards for responsible AI. The evaluators can also detect problematic content in user input. These safety evaluators use the [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) to perform evaluations. They include metrics for hate and unfairness, groundedness, ungrounded inference of human attributes, and the presence of: +If you're building an AI app with .NET, the [📦 Microsoft.Extensions.AI.Evaluation.Safety](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) package provides evaluators to help ensure that the responses your app generates, both text and image, meet the standards for responsible AI. The evaluators can also detect problematic content in user input. These safety evaluators use the [Azure AI Foundry evaluation service](/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) to perform evaluations. They include metrics for hate and unfairness, groundedness, ungrounded inference of human attributes, and the presence of: - Protected material - Self-harm content