From 8c6ddf54adf91e2fbf00034fef413ccfde3769d6 Mon Sep 17 00:00:00 2001 From: Jason Dai Date: Fri, 5 Apr 2024 16:30:45 -0700 Subject: [PATCH] feat: Vertex Rapid Evaluation SDK and Prompt Template for Vertex Prompt Management Public Preview PiperOrigin-RevId: 622310413 --- setup.py | 7 + tests/unit/vertexai/test_evaluation.py | 440 +++++++++++++ vertexai/preview/evaluation/__init__.py | 37 ++ vertexai/preview/evaluation/_base.py | 79 +++ vertexai/preview/evaluation/_eval_tasks.py | 411 ++++++++++++ vertexai/preview/evaluation/_evaluation.py | 567 ++++++++++++++++ vertexai/preview/evaluation/constants.py | 180 ++++++ .../preview/evaluation/metrics/__init__.py | 29 + vertexai/preview/evaluation/metrics/_base.py | 61 ++ .../metrics/_instance_evaluation.py | 603 ++++++++++++++++++ .../preview/evaluation/prompt_template.py | 84 +++ vertexai/preview/evaluation/utils.py | 176 +++++ 12 files changed, 2674 insertions(+) create mode 100644 tests/unit/vertexai/test_evaluation.py create mode 100644 vertexai/preview/evaluation/__init__.py create mode 100644 vertexai/preview/evaluation/_base.py create mode 100644 vertexai/preview/evaluation/_eval_tasks.py create mode 100644 vertexai/preview/evaluation/_evaluation.py create mode 100644 vertexai/preview/evaluation/constants.py create mode 100644 vertexai/preview/evaluation/metrics/__init__.py create mode 100644 vertexai/preview/evaluation/metrics/_base.py create mode 100644 vertexai/preview/evaluation/metrics/_instance_evaluation.py create mode 100644 vertexai/preview/evaluation/prompt_template.py create mode 100644 vertexai/preview/evaluation/utils.py diff --git a/setup.py b/setup.py index 23a2d5ef59..2149e83c13 100644 --- a/setup.py +++ b/setup.py @@ -145,6 +145,11 @@ "pydantic < 3", ] +rapid_evaluation_extra_require = [ + "nest_asyncio >= 1.0.0, < 1.6.0", + "pandas >= 1.0.0, < 2.2.0", +] + full_extra_require = list( set( tensorboard_extra_require @@ -162,6 +167,7 @@ + preview_extra_require + ray_extra_require + reasoning_engine_extra_require + + rapid_evaluation_extra_require ) ) testing_extra_require = ( @@ -246,6 +252,7 @@ "ray": ray_extra_require, "ray_testing": ray_testing_extra_require, "reasoningengine": reasoning_engine_extra_require, + "rapid_evaluation": rapid_evaluation_extra_require, }, python_requires=">=3.8", classifiers=[ diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py new file mode 100644 index 0000000000..c330506792 --- /dev/null +++ b/tests/unit/vertexai/test_evaluation.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
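For orientation, a minimal sketch of the API surface this patch adds, assuming only the `EvalTask` interface documented in `_eval_tasks.py` below; the project ID, dataset rows, and metric selection are illustrative placeholders.

```
# Requires the new extra: pip install "google-cloud-aiplatform[rapid_evaluation]"
import pandas as pd
import vertexai
from vertexai.preview import evaluation

vertexai.init(project="my-project", location="us-central1")  # illustrative project

# Bring-your-own-prediction: model responses are already present in the dataset.
eval_dataset = pd.DataFrame(
    {
        "response": ["Paris is the capital of France.", "The answer is 42."],
        "reference": ["Paris is the capital of France.", "42"],
    }
)
eval_task = evaluation.EvalTask(
    dataset=eval_dataset,
    metrics=["exact_match", "bleu", "rouge_l_sum"],
)
result = eval_task.evaluate()
print(result.summary_metrics)  # e.g. row_count, exact_match/mean, exact_match/std
print(result.metrics_table)    # per-instance scores as a pandas DataFrame
```

Passing `experiment=` to `EvalTask` and `experiment_run_name=` to `evaluate()` additionally logs the summary metrics to Vertex AI Experiments, as exercised by the tests that follow.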
+# +from unittest import mock + +from google.cloud import aiplatform +import vertexai +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform.metadata import metadata +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) +from google.cloud.aiplatform_v1beta1.types import ( + evaluation_service as gapic_evaluation_service_types, +) +from vertexai.preview import evaluation +from vertexai.preview.evaluation import utils +import pandas as pd +import pytest + + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_METRICS = [ + "exact_match", + "bleu", + "rouge_1", + "rouge_2", + "rouge_l", + "rouge_l_sum", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_helpfulness", + "question_answering_correctness", +] +_TEST_EVAL_DATASET = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + "context": ["test", "context"], + "instruction": ["test", "instruction"], + } +) +_TEST_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame( + { + "reference": ["test", "ref"], + "context": ["test", "context"], + "instruction": ["test", "instruction"], + } +) + +_TEST_JSONL_FILE_CONTENT = """{"prompt": "prompt", "reference": "reference"}\n +{"prompt":"test", "reference": "test"}\n +""" +_TEST_CSV_FILE_CONTENT = """reference,context,instruction\ntest,test,test\n +text,text,text\n +""" + + +_MOCK_EXACT_MATCH_RESULT = [ + gapic_evaluation_service_types.EvaluateInstancesResponse( + exact_match_results=gapic_evaluation_service_types.ExactMatchResults( + exact_match_metric_values=[ + gapic_evaluation_service_types.ExactMatchMetricValue(score=1.0), + ] + ) + ), + gapic_evaluation_service_types.EvaluateInstancesResponse( + exact_match_results=gapic_evaluation_service_types.ExactMatchResults( + exact_match_metric_values=[ + gapic_evaluation_service_types.ExactMatchMetricValue(score=0.0), + ] + ) + ), +] + +_MOCK_FLUENCY_RESULT = [ + gapic_evaluation_service_types.EvaluateInstancesResponse( + fluency_result=gapic_evaluation_service_types.FluencyResult( + score=5, explanation="explanation", confidence=1.0 + ) + ), + gapic_evaluation_service_types.EvaluateInstancesResponse( + fluency_result=gapic_evaluation_service_types.FluencyResult( + score=4, explanation="explanation", confidence=0.5 + ) + ), +] + + +@pytest.fixture +def mock_async_event_loop(): + with mock.patch("asyncio.get_event_loop") as mock_async_event_loop: + yield mock_async_event_loop + + +@pytest.fixture +def mock_experiment_tracker(): + with mock.patch.object( + metadata, "_experiment_tracker", autospec=True + ) as mock_experiment_tracker: + yield mock_experiment_tracker + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluation: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_create_eval_task(self): + test_experiment = "test_experiment_name" + test_content_column_name = "test_content_column_name" + test_reference_column_name = "test_reference_column_name" + test_response_column_name = "test_response_column_name" + + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, + metrics=_TEST_METRICS, + experiment=test_experiment, + content_column_name=test_content_column_name, + 
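+            # The overridden column names are stored on the EvalTask and checked by the assertions below.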
reference_column_name=test_reference_column_name, + response_column_name=test_response_column_name, + ) + + assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET) + assert test_eval_task.metrics == _TEST_METRICS + assert test_eval_task.experiment == test_experiment + assert test_eval_task.content_column_name == test_content_column_name + assert test_eval_task.reference_column_name == test_reference_column_name + assert test_eval_task.response_column_name == test_response_column_name + + def test_evaluate_saved_response(self, mock_async_event_loop): + eval_dataset = _TEST_EVAL_DATASET + test_metrics = _TEST_METRICS + mock_summary_metrics = { + "row_count": 2, + "mock_metric/mean": 0.5, + "mock_metric/std": 0.5, + } + mock_metrics_table = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + "mock_metric": [1.0, 0.0], + } + ) + mock_async_event_loop.return_value.run_until_complete.return_value = ( + mock_summary_metrics, + mock_metrics_table, + ) + + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics == mock_summary_metrics + assert test_result.metrics_table.equals(mock_metrics_table) + + @pytest.mark.parametrize("api_transport", ["grpc", "rest"]) + def test_compute_automatic_metrics(self, api_transport): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + api_transport=api_transport, + ) + eval_dataset = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + } + ) + test_metrics = ["exact_match"] + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + mock_metric_results = _MOCK_EXACT_MATCH_RESULT + with mock.patch.object( + target=gapic_evaluation_services.EvaluationServiceAsyncClient, + attribute="evaluate_instances", + side_effect=mock_metric_results, + ): + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics["row_count"] == 2 + assert test_result.summary_metrics["exact_match/mean"] == 0.5 + assert test_result.summary_metrics["exact_match/std"] == pytest.approx(0.7, 0.1) + assert list(test_result.metrics_table.columns.values) == [ + "response", + "reference", + "exact_match", + ] + assert test_result.metrics_table[["response", "reference"]].equals(eval_dataset) + assert list(test_result.metrics_table["exact_match"].values) == [1.0, 0.0] + + @pytest.mark.parametrize("api_transport", ["grpc", "rest"]) + def test_compute_pointwise_metrics(self, api_transport): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + api_transport=api_transport, + ) + eval_dataset = pd.DataFrame( + { + "response": ["test", "text"], + } + ) + test_metrics = ["fluency"] + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + mock_metric_results = _MOCK_FLUENCY_RESULT + with mock.patch.object( + target=gapic_evaluation_services.EvaluationServiceAsyncClient, + attribute="evaluate_instances", + side_effect=mock_metric_results, + ): + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics["row_count"] == 2 + assert test_result.summary_metrics["fluency/mean"] == 4.5 + assert test_result.summary_metrics["fluency/std"] == pytest.approx(0.7, 0.1) + assert set(test_result.metrics_table.columns.values) == set( + [ + "response", + "fluency", + "fluency/explanation", + "fluency/confidence", + ] + ) + assert test_result.metrics_table[["response"]].equals(eval_dataset) + assert 
list(test_result.metrics_table["fluency"].values) == [5, 4] + assert list(test_result.metrics_table["fluency/explanation"].values) == [ + "explanation", + "explanation", + ] + assert list(test_result.metrics_table["fluency/confidence"].values) == [ + 1.0, + 0.5, + ] + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluationErrors: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_evaluate_empty_metrics(self): + test_eval_task = evaluation.EvalTask(dataset=_TEST_EVAL_DATASET, metrics=[]) + with pytest.raises(ValueError, match="Metrics cannot be empty."): + test_eval_task.evaluate() + + def test_evaluate_invalid_metrics(self): + metric_name = "invalid_metric" + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, metrics=[metric_name] + ) + with pytest.raises( + ValueError, match=f"Metric name: {metric_name} not supported." + ): + test_eval_task.evaluate() + + def test_evaluate_invalid_experiment_run_name(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, metrics=_TEST_METRICS + ) + with pytest.raises(ValueError, match="Experiment is not set"): + test_eval_task.evaluate(experiment_run_name="invalid_experiment_run_name") + + with pytest.raises(ValueError, match="Experiment is not set"): + test_eval_task.display_runs() + + def test_evaluate_experiment_name_already_exists(self, mock_experiment_tracker): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, + metrics=_TEST_METRICS, + experiment="test_eval_experiment_name", + ) + mock_experiment_tracker.experiment_run.return_value = "experiment_run_1" + with pytest.raises(ValueError, match="Experiment run already exists"): + test_eval_task.evaluate(experiment_run_name="experiment_run_2") + + def test_evaluate_invalid_dataset_content_column(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE, + metrics=_TEST_METRICS, + ) + with pytest.raises(KeyError, match="Required column `content` not found"): + test_eval_task.evaluate(model=mock.MagicMock()) + + def test_evaluate_invalid_prompt_template_placeholder(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE, + metrics=_TEST_METRICS, + ) + with pytest.raises(ValueError, match="Failed to complete prompt template"): + test_eval_task.evaluate( + prompt_template="test_prompt_template {invalid_placeholder}", + ) + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluationUtils: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_create_evaluation_service_async_client(self): + client = utils.create_evaluation_service_async_client() + assert isinstance(client, utils._EvaluationServiceAsyncClientWithOverride) + + def test_load_dataset_from_dataframe(self): + data = {"col1": [1, 2], "col2": ["a", "b"]} + df = pd.DataFrame(data) + loaded_df = utils.load_dataset(df) + assert loaded_df.equals(df) + + def test_load_dataset_from_dict(self): + data = {"col1": [1, 2], "col2": ["a", "b"]} + loaded_df = utils.load_dataset(data) + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == data + + def test_load_dataset_from_gcs_jsonl(self): + source = "gs://test_bucket/test_file.jsonl" + with mock.patch.object( + utils, + "_read_gcs_file_contents", + 
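+            # The GCS read is stubbed out so only the JSONL parsing inside load_dataset is exercised.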
return_value=_TEST_JSONL_FILE_CONTENT, + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == { + "prompt": ["prompt", "test"], + "reference": ["reference", "test"], + } + + def test_load_dataset_from_gcs_csv(self): + source = "gs://test_bucket/test_file.csv" + with mock.patch.object( + utils, "_read_gcs_file_contents", return_value=_TEST_CSV_FILE_CONTENT + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == { + "reference": ["test", "text"], + "context": ["test", "text"], + "instruction": ["test", "text"], + } + + def test_load_dataset_from_bigquery(self): + source = "bq://project-id.dataset.table_name" + with mock.patch.object( + utils, "_load_bigquery", return_value=_TEST_EVAL_DATASET + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.equals(_TEST_EVAL_DATASET) + + +class TestPromptTemplate: + def test_init(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + assert prompt_template.template == template_str + + def test_get_placeholders(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + assert prompt_template.placeholders == {"name", "day"} + + def test_format(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + completed_prompt = prompt_template.assemble(name="John", day="Monday") + assert str(completed_prompt) == "Hello, John! Today is Monday." + + def test_format_missing_placeholder(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + completed_prompt = prompt_template.assemble() + assert str(completed_prompt) == "Hello, {name}!" + assert prompt_template.placeholders == {"name"} + + def test_partial_format(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + partially_completed_prompt = prompt_template.assemble(name="John") + + assert isinstance(partially_completed_prompt, evaluation.PromptTemplate) + assert str(partially_completed_prompt) == "Hello, John! Today is {day}." + assert partially_completed_prompt.placeholders == {"day"} + + completed_prompt = partially_completed_prompt.assemble(day="Monday") + assert str(completed_prompt) == "Hello, John! Today is Monday." + + def test_str(self): + template_str = "Hello, world!" + prompt_template = evaluation.PromptTemplate(template_str) + assert str(prompt_template) == template_str + + def test_repr(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + assert repr(prompt_template) == f"PromptTemplate('{template_str}')" diff --git a/vertexai/preview/evaluation/__init__.py b/vertexai/preview/evaluation/__init__.py new file mode 100644 index 0000000000..67895b4377 --- /dev/null +++ b/vertexai/preview/evaluation/__init__.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Rapid GenAI Evaluation Module.""" + +from vertexai.preview.evaluation import _base +from vertexai.preview.evaluation import _eval_tasks +from vertexai.preview.evaluation import metrics +from vertexai.preview.evaluation import prompt_template + + +EvalResult = _base.EvalResult +EvalTask = _eval_tasks.EvalTask +CustomMetric = metrics.CustomMetric +make_metric = metrics.make_metric +PromptTemplate = prompt_template.PromptTemplate + +__all__ = [ + "CustomMetric", + "EvalResult", + "EvalTask", + "make_metric", + "PromptTemplate", +] diff --git a/vertexai/preview/evaluation/_base.py b/vertexai/preview/evaluation/_base.py new file mode 100644 index 0000000000..588e1e6eac --- /dev/null +++ b/vertexai/preview/evaluation/_base.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Base classes for evaluation.""" + + +import dataclasses +from typing import Dict, List, Optional, Union, TYPE_CHECKING + +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) + +if TYPE_CHECKING: + import pandas as pd + + +@dataclasses.dataclass +class EvaluationRunConfig: + """Evaluation Run Configurations. + + Attributes: + dataset: The dataset to evaluate. + metrics: The list of metric names to evaluate, or a metrics bundle for an + evaluation task, or custom metric instances. + column_map: The dictionary of column name overrides in the dataset. + client: The asynchronous evaluation client. + """ + + dataset: "pd.DataFrame" + metrics: List[Union[str, metrics_base.CustomMetric]] + column_map: Dict[str, str] + client: gapic_evaluation_services.EvaluationServiceAsyncClient + + def validate_dataset_column(self, column_name: str) -> None: + """Validates that the column names in the column map are in the dataset. + + Args: + column_name: The column name to validate. + + Raises: + KeyError: If any of the column names are not in the dataset. + """ + if self.column_map.get(column_name, column_name) not in self.dataset.columns: + raise KeyError( + f"Required column `{self.column_map.get(column_name, column_name)}`" + " not found in the eval dataset. The columns in the provided dataset" + f" are {self.dataset.columns}." + ) + + +@dataclasses.dataclass +class EvalResult: + """Evaluation result. + + Attributes: + summary_metrics: The summary evaluation metrics for an evaluation run. + metrics_table: A table containing eval inputs, ground truth, and metrics per + row. 
+ """ + + summary_metrics: Dict[str, float] + metrics_table: Optional["pd.DataFrame"] = None diff --git a/vertexai/preview/evaluation/_eval_tasks.py b/vertexai/preview/evaluation/_eval_tasks.py new file mode 100644 index 0000000000..cf86d12710 --- /dev/null +++ b/vertexai/preview/evaluation/_eval_tasks.py @@ -0,0 +1,411 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union +import uuid + +from google.api_core import exceptions +import vertexai +from google.cloud.aiplatform import base +from google.cloud.aiplatform.metadata import metadata +from vertexai import generative_models +from vertexai.preview.evaluation import _base as eval_base +from vertexai.preview.evaluation import _evaluation +from vertexai.preview.evaluation import utils +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) + +if TYPE_CHECKING: + import pandas as pd + +# pylint: disable=g-import-not-at-top +try: + from IPython import display as IPython_display +except ImportError: + IPython_display = None + +_LOGGER = base.Logger(__name__) + +EvalResult = eval_base.EvalResult +GenerativeModel = generative_models.GenerativeModel + + +class EvalTask: + """A class representing an EvalTask. + + An Evaluation Tasks is defined to measure the model's ability to perform a + certain task in response to specific prompts or inputs. Evaluation tasks must + contain an evaluation dataset, and a list of metrics to evaluate. Evaluation + tasks help developers compare propmpt templates, track experiments, compare + models and their settings, and assess the quality of the model's generated + text. + + Dataset details: + Default dataset column names: + * content_column_name: "content" + * reference_column_name: "reference" + * response_column_name: "response" + Requirement for different use cases: + * Bring your own prediction: A `response` column is required. Response + column name can be customized by providing `response_column_name` + parameter. + * Without prompt template: A column representing the input prompt to the + model is required. If `content_column_name` is not specified, the + eval dataset requires `content` column by default. The response + column is not used if present and new responses from the model are + generated with the content column and used for evaluation. + * With prompt template: Dataset must contain column names corresponding to + the placeholder names in the prompt template. For example, if prompt + template is "Instruction: {instruction}, context: {context}", the + dataset must contain `instruction` and `context` column. + + Metrics Details: + The supported metrics, metric bundle descriptions, grading rubrics, and + the required input fields can be found on the Vertex AI public + documentation. + + Usage: + 1. To perform bring your own prediction evaluation, provide the model + responses in the response column in the dataset. 
The response column name + is "response" by default, or specify `response_column_name` parameter to + customize. + + ``` + eval_dataset = pd.DataFrame({ + "reference": [...], + "response" : [...], + }) + eval_task = EvalTask( + dataset=eval_dataset, + metrics=["bleu", "rouge_l_sum", "coherence", "fluency"], + experiment="my-experiment", + ) + eval_result = eval_task.evaluate( + experiment_run_name="eval-experiment-run" + ) + ``` + + 2. To perform evaluation with built-in Gemini model inference, specify the + `model` parameter with a GenerativeModel instance. The default query + column name to the model is `content`. + + ``` + eval_dataset = pd.DataFrame({ + "reference": [...], + "content" : [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["exact_match", "bleu", "rouge_1", "rouge_2", + "rouge_l_sum"], + experiment="my-experiment", + ).evaluate( + model=GenerativeModel("gemini-pro"), + experiment_run_name="gemini-pro-eval-run" + ) + ``` + + 3. If a `prompt_template` is specified, the `content` column is not required. + Prompts can be assembled from the evaluation dataset, and all placeholder + names must be present in the dataset columns. + ``` + eval_dataset = pd.DataFrame({ + "context" : [...], + "instruction": [...], + "reference" : [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["summarization_quality"], + ).evaluate( + model=model, + prompt_template="{instruction}. Article: {context}. Summary:", + ) + ``` + + 4. To perform evaluation with custom model inference, specify the `model` + parameter with a custom prediction function. The `content` column in the + dataset is used to generate predictions with the custom model function for + evaluation. + + ``` + def custom_model_fn(input: str) -> str: + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": input} + ] + ) + return response.choices[0].message.content + + eval_dataset = pd.DataFrame({ + "content" : [...], + "reference": [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["text_generation_similarity","text_generation_quality"], + experiment="my-experiment", + ).evaluate( + model=custom_model_fn, + experiment_run_name="gpt-eval-run" + ) + ``` + """ + + _resource_noun = "evalTasks" + + def __init__( + self, + *, + dataset: Union["pd.DataFrame", str, Dict[str, Any]], + metrics: List[ + Union[ + Literal[ + "exact_match", + "bleu", + "rouge_1", + "rouge_2", + "rouge_l", + "rouge_l_sum", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_helpfulness", + "question_answering_correctness", + "text_generation_similarity", + "text_generation_quality", + "text_generation_instruction_following", + "text_generation_safety", + "text_generation_factuality", + "summarization_pointwise_reference_free", + "qa_pointwise_reference_free", + "qa_pointwise_reference_based", + "tool_call_quality", + ], + metrics_base.CustomMetric, + ] + ], + experiment: Optional[str] = None, + content_column_name: str = "content", + reference_column_name: str = "reference", + response_column_name: str = "response", + ): + """Initializes an EvalTask. + + Args: + dataset: The dataset to be evaluated. + Supports the following dataset formats: + * pandas.DataFrame: Used directly for evaluation. + * Dict: Converted to a pandas DataFrame before evaluation. 
+ * str: Interpreted as a file path or URI. Supported formats include: + * Local JSONL or CSV files: Loaded from the local filesystem. + * GCS JSONL or CSV files: Loaded from Google Cloud Storage + (e.g., 'gs://bucket/data.csv'). + * BigQuery table URI: Loaded from Google Cloud BigQuery + (e.g., 'bq://project-id.dataset.table_name'). + metrics: The list of metrics names to be evaluated, or a metrics + bundle for an evaluation task, or custom metric instances. + experiment: The name of the experiment to log the evaluations to. + content_column_name: The column name of content in the dataset to send to + the model. If not set, default to `content`. + reference_column_name: The column name of ground truth in the dataset. If + not set, default to `reference`. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + """ + self.dataset = utils.load_dataset(dataset) + self.metrics = metrics + self.experiment = experiment + self.content_column_name = content_column_name + self.reference_column_name = reference_column_name + self.response_column_name = response_column_name + + def _evaluate_with_experiment( + self, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + experiment_run_name: Optional[str] = None, + response_column_name: str = "response", + ) -> EvalResult: + """Runs an evaluation for the EvalTask with an experiment. + + Args: + model: A GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: The prompt template to use for the evaluation. If not + set, the prompt template that was used to create the EvalTask will be + used. + experiment_run_name: The name of the experiment run to log the evaluation + to if an experiment is set for this EvalTask. If not provided, a random + unique experiment run name is used. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + + Returns: + The evaluation result. + """ + self._validate_experiment_run() + with vertexai.preview.start_run(experiment_run_name): + self._log_eval_experiment_param(model, prompt_template) + eval_result = _evaluation.evaluate( + dataset=self.dataset, + metrics=self.metrics, + model=model, + prompt_template=prompt_template, + content_column_name=self.content_column_name, + reference_column_name=self.reference_column_name, + response_column_name=response_column_name or self.response_column_name, + ) + try: + vertexai.preview.log_metrics(eval_result.summary_metrics) + except (ValueError, TypeError, exceptions.InvalidArgument) as e: + _LOGGER.warning(f"Experiment metrics logging failed: {str(e)}") + return eval_result + + def evaluate( + self, + *, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + experiment_run_name: Optional[str] = None, + response_column_name: str = "response", + ) -> EvalResult: + """Runs an evaluation for the EvalTask. + + Args: + model: A GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: The prompt template to use for the evaluation. If not + set, the prompt template that was used to create the EvalTask will be + used. 
+ experiment_run_name: The name of the experiment run to log the evaluation + to if an experiment is set for this EvalTask. If not provided, a random + unique experiment run name is used. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + + Returns: + The evaluation result. + """ + global_experiment_name = metadata._experiment_tracker.experiment_name + if experiment_run_name and not self.experiment and not global_experiment_name: + raise ValueError( + "Experiment is not set. Please initialize EvalTask with an" + " experiment, or initialize a global experiment with " + "`vertexai.init(experiment='experiment_name')`for logging this" + " evaluation run." + ) + + experiment_run_name = experiment_run_name or f"{uuid.uuid4()}" + + if self.experiment and global_experiment_name: + metadata._experiment_tracker.set_experiment( + experiment=self.experiment, backing_tensorboard=False + ) + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + metadata._experiment_tracker.set_experiment( + experiment=global_experiment_name, backing_tensorboard=False + ) + elif self.experiment and not global_experiment_name: + metadata._experiment_tracker.set_experiment( + experiment=self.experiment, backing_tensorboard=False + ) + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + metadata._experiment_tracker.reset() + elif not self.experiment and global_experiment_name: + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + else: + eval_result = _evaluation.evaluate( + dataset=self.dataset, + metrics=self.metrics, + model=model, + prompt_template=prompt_template, + content_column_name=self.content_column_name, + reference_column_name=self.reference_column_name, + response_column_name=response_column_name or self.response_column_name, + ) + return eval_result + + def _validate_experiment_run(self) -> None: + """Checks if an experiment run already exists.""" + if metadata._experiment_tracker.experiment_run: + raise ValueError( + "Experiment run already exists. Please specify the name of the" + " experiment run to assign current session with in this evaluate" + " method." + ) + + def _log_eval_experiment_param( + self, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + ) -> None: + """Logs variable input parameters of an evaluation to an experiment run.""" + model_metadata = {} + + if prompt_template is not None: + model_metadata.update({"prompt_template": prompt_template}) + + if isinstance(model, GenerativeModel): + model_metadata.update( + { + "model_name": model._model_name, + } + ) + + if model._generation_config and isinstance(model._generation_config, dict): + # TODO(b/311221071): support logging GenerationConfig type. + model_metadata.update(**model._generation_config) + + if model._safety_settings and isinstance(model._safety_settings, dict): + # TODO(b/311221071): support logging List[SafetySetting] type. 
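+                # Log safety settings as a {category name: threshold name} mapping of plain strings so they can be recorded as experiment parameters.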
+ safety_settings = model._safety_settings + safety_settings_as_str = { + category.name: threshold.name + for category, threshold in safety_settings.items() + } + model_metadata.update(safety_settings_as_str) + + if model_metadata: + _LOGGER.info(f"Logging Rapid Eval experiment metadata: {model_metadata}") + try: + vertexai.preview.log_params(model_metadata) + except (ValueError, TypeError) as e: + _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}") + + def display_runs(self): + """Displays experiment runs associated with this EvalTask.""" + if not self.experiment: + raise ValueError("Experiment is not set.") + elif IPython_display: + IPython_display.display(vertexai.preview.get_experiment_df(self.experiment)) diff --git a/vertexai/preview/evaluation/_evaluation.py b/vertexai/preview/evaluation/_evaluation.py new file mode 100644 index 0000000000..1c24664060 --- /dev/null +++ b/vertexai/preview/evaluation/_evaluation.py @@ -0,0 +1,567 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import asyncio +import collections +from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING, Tuple, Union, Callable + +from google.cloud.aiplatform import base +from google.cloud.aiplatform_v1beta1.types import ( + content as gapic_content_types, +) +from vertexai import generative_models +from vertexai.preview.evaluation import _base as evaluation_base +from vertexai.preview.evaluation import constants +from vertexai.preview.evaluation import ( + prompt_template as prompt_template_base, +) +from vertexai.preview.evaluation import utils +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) +from vertexai.preview.evaluation.metrics import ( + _instance_evaluation, +) + + +if TYPE_CHECKING: + import pandas as pd + +_LOGGER = base.Logger(__name__) +_METRICS_BUNDLE_TO_METRIC_NAMES = { + constants.MetricBundle.TEXT_GENERATION_SIMILARITY: ( + constants.Metric.EXACT_MATCH, + constants.Metric.BLEU, + constants.Metric.ROUGE_1, + constants.Metric.ROUGE_2, + constants.Metric.ROUGE_L, + constants.Metric.ROUGE_L_SUM, + ), + constants.MetricBundle.TEXT_GENERATION_QUALITY: ( + constants.Metric.COHERENCE, + constants.Metric.FLUENCY, + ), + constants.MetricBundle.TOOL_CALL_QUALITY: ( + constants.Metric.TOOL_CALL_VALID, + constants.Metric.TOOL_NAME_MATCH, + constants.Metric.TOOL_PARAMETER_KEY_MATCH, + constants.Metric.TOOL_PARAMETER_KV_MATCH, + ), + constants.MetricBundle.TEXT_GENERATION_INSTRUCTION_FOLLOWING: ( + constants.Metric.FULFILLMENT, + ), + constants.MetricBundle.TEXT_GENERATION_SAFETY: (constants.Metric.SAFETY,), + constants.MetricBundle.TEXT_GENERATION_FACTUALITY: (constants.Metric.GROUNDEDNESS,), + constants.MetricBundle.SUMMARIZATION_POINTWISE_REFERENCE_FREE: ( + constants.Metric.SUMMARIZATION_QUALITY, + constants.Metric.SUMMARIZATION_HELPFULNESS, + constants.Metric.SUMMARIZATION_VERBOSITY, + ), + constants.MetricBundle.QA_POINTWISE_REFERENCE_FREE: ( + constants.Metric.QUESTION_ANSWERING_QUALITY, + 
constants.Metric.QUESTION_ANSWERING_RELEVANCE, + constants.Metric.QUESTION_ANSWERING_HELPFULNESS, + ), + constants.MetricBundle.QA_POINTWISE_REFERENCE_BASED: ( + constants.Metric.QUESTION_ANSWERING_CORRECTNESS, + ), +} +_SUCCESSFUL_FINISH_REASONS = [ + gapic_content_types.Candidate.FinishReason.STOP, + # Many responses have this finish reason + gapic_content_types.Candidate.FinishReason.FINISH_REASON_UNSPECIFIED, +] + + +def _replace_metric_bundle_with_metrics( + metrics_list: List[Union[str, metrics_base.CustomMetric]], +) -> List[str]: + """Replaces metric bundles with corresponding metrics. + + Args: + metrics_list: The original list containing metrics bundle names. + + Returns: + The modified metrics list containing only metric names. + """ + modified_list = [] + + for item in metrics_list: + if item in _METRICS_BUNDLE_TO_METRIC_NAMES.keys(): + modified_list.extend(_METRICS_BUNDLE_TO_METRIC_NAMES[item]) + else: + modified_list.append(item) + + return modified_list + + +def _compute_custom_metrics( + row_dict: Dict[str, Any], + custom_metrics: List[metrics_base.CustomMetric], +) -> Dict[str, Any]: + """Computes custom metrics for a row. + + Args: + row_dict: A dictionary of an instance in the eval dataset. + custom_metrics: A list of CustomMetrics. + + Returns: + A dictionary of an instance containing custom metric results. + + Raises: + KeyError: If the custom metric function does not return a valid output. + """ + for custom_metric in custom_metrics: + metric_output = custom_metric.metric_function(row_dict) + if custom_metric.name in metric_output: + row_dict[custom_metric.name] = metric_output[custom_metric.name] + else: + raise KeyError( + f"Custom metric score `{custom_metric.name}` not found in the metric" + f" output {metric_output}. Please make sure the custom metric" + " function is valid, and the output dictionary uses" + f" `{custom_metric.name}` as the key for metric value." + ) + # Include additional metric results like explanation. + for key, value in metric_output.items(): + if key != custom_metric.name: + row_dict[f"{custom_metric.name}/{key}"] = value + return row_dict + + +def _separate_custom_metrics( + metrics: List[str], +) -> Tuple[List[str], List[metrics_base.CustomMetric],]: + """Separates the metrics list into API and custom metrics.""" + custom_metrics = [] + api_metrics = [] + for metric in metrics: + if isinstance(metric, metrics_base.CustomMetric): + custom_metrics.append(metric) + else: + api_metrics.append(metric) + return api_metrics, custom_metrics + + +def _compute_summary_metrics( + evaluation_run_config: evaluation_base.EvaluationRunConfig, + metrics_table: "pd.DataFrame", +) -> Dict[str, Any]: + """Computes summary metrics. + + Args: + evaluation_run_config: Evaluation Run Configurations. + metrics_table: A dataframe containing per-instance metrics results. + + Returns: + A dictionary containing summary metrics results and statistics. + """ + summary_metrics = {} + summary_metrics[constants.MetricResult.ROW_COUNT_KEY] = metrics_table.shape[0] + for metric in evaluation_run_config.metrics: + try: + # TODO(b/325078638): implement additional aggregate methods. + summary_metrics[f"{str(metric)}/mean"] = metrics_table.loc[ + :, str(metric) + ].mean() + summary_metrics[f"{str(metric)}/std"] = metrics_table.loc[ + :, str(metric) + ].std() + except (ValueError, KeyError): + _LOGGER.warning( + f"Failed to compute metric statistics for {metric}. This metric" + " output contains error from the Autorater." 
+ ) + continue + return summary_metrics + + +def _generate_response_from_gemini( + model: generative_models.GenerativeModel, prompt: str +) -> str: + """Generates a response from the Gemini model. + + Args: + model: The Gemini model instance. + prompt: The prompt to send to the model. + + Returns: + The response from the model. + + Raises: + RuntimeError: If the prompt or the response for the prompt is blocked for + safety reasons. + """ + response = model.generate_content(prompt) + try: + if not response.candidates: + raise RuntimeError( + f"The model response was blocked due to {response._raw_response.prompt_feedback.block_reason.name}.\n" + f"Block reason message: {response._raw_response.prompt_feedback.block_reason_message}.\n" + "The input prompt may be blocked for safety reasons.", + f"Prompt: {prompt}.", + ) + else: + candidate = response.candidates[0] + if candidate.finish_reason not in _SUCCESSFUL_FINISH_REASONS: + raise RuntimeError( + "The model response did not complete successfully.\n" + f"Finish reason: {candidate.finish_reason}.\n" + f"Finish message: {candidate.finish_message}.\n" + f"Safety ratings: {candidate.safety_ratings}.\n" + "Please adjust the model safety_settings, or try a different prompt." + ) + return response.candidates[0].content.parts[0].text + except Exception: + raise RuntimeError( + "Failed to generate response candidates from the Gemini model.\n" + f"Response: {response}.\n" + f"Prompt: {prompt}." + ) + + +def _generate_response_from_gemini_model( + model: generative_models.GenerativeModel, + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> None: + """Generates responses from the Gemini model. + + Args: + model: The Gemini model instance. + evaluation_run_config: Evaluation Run Configurations. + """ + if ( + constants.Dataset.COMPLETED_PROMPT_COLUMN + in evaluation_run_config.dataset.columns + ): + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ].apply( + lambda x: _generate_response_from_gemini(model, x) + ) + else: + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + evaluation_run_config.column_map[constants.Dataset.CONTENT_COLUMN] + ].apply( + lambda x: _generate_response_from_gemini(model, x) + ) + + +def _generate_response_from_custom_model_fn( + model_fn: Callable[[str], str], + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> None: + """Generates responses from a custom model function. + + Args: + model_fn: The custom model function. + evaluation_run_config: Evaluation Run Configurations. + """ + try: + if ( + constants.Dataset.COMPLETED_PROMPT_COLUMN + in evaluation_run_config.dataset.columns + ): + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ].apply( + model_fn + ) + else: + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + evaluation_run_config.column_map[constants.Dataset.CONTENT_COLUMN] + ].apply( + model_fn + ) + except (ValueError, IndexError) as e: + _LOGGER.warning(f"Failed to generate response from model function: {e}") + + +def _check_placeholder_columns_exist( + dataset: "pd.DataFrame", placeholder_names_set: Set[str] +) -> None: + """Checks if all placeholder names exist in the dataset columns. + + Args: + dataset: The dataset to evaluate.
+ placeholder_names_set: A set of placeholder names. + + Raises: + ValueError: If any placeholder names do not exist in the dataset columns + or the prompt template is invalid. + """ + actual_column_names_set = set(dataset.columns) + if not placeholder_names_set.issubset(actual_column_names_set): + missing_columns = placeholder_names_set - actual_column_names_set + raise ValueError( + "Failed to complete prompt template: The following column(s) are" + f" missing: {', '.join(missing_columns)}" + ) + + +def _complete_prompt_for_dataset( + evaluation_run_config: evaluation_base.EvaluationRunConfig, prompt_template: str +) -> None: + """Adds a column in dataset for completed prompts from placeholder columns. + + Args: + evaluation_run_config: Evaluation Run Configurations. + prompt_template: A prompt template string with placeholders that can be + formatted with dataset columns. + + Returns: + The completed prompt template string to send to the model. + + Raises: + ValueError: If any placeholder names do not exist in the dataset columns + or the prompt template is invalid. + """ + prompt_template = prompt_template_base.PromptTemplate(prompt_template) + _check_placeholder_columns_exist( + evaluation_run_config.dataset, prompt_template.placeholders + ) + + try: + evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ] = evaluation_run_config.dataset.apply( + lambda row: str( + prompt_template.assemble( + **row[list(prompt_template.placeholders)].astype(str).to_dict(), + ) + ), + axis=1, + ) + except Exception as e: + raise ValueError(f"Failed to complete prompt: {e}") from e + + +def _parse_metric_results_to_dataframe( + instance_df: "pd.DataFrame", results: Dict[str, Any] +) -> Dict[str, Any]: + """Parses metric results to a pandas dataframe. + + Args: + instance_df: A dataframe containing per-instance metrics results. + results: A dictionary containing metric results. + + Returns: + A dataframe containing per-instance metrics results. Each metric result + can contain metric score, explanation, and confidence. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + metrics_table = pd.DataFrame(dict(zip(instance_df.columns, instance_df.values.T))) + + for metric_name, metric_results in results.items(): + scores = [ + result.get(constants.MetricResult.SCORE_KEY) for result in metric_results + ] + if ( + metric_name + in constants.Metric.MODEL_BASED_METRIC_LIST + + constants.Metric.PAIRWISE_METRIC_LIST + ): + explanations = [ + result.get(constants.MetricResult.EXPLANATION_KEY) + for result in metric_results + ] + confidences = [ + result.get(constants.MetricResult.CONFIDENCE_KEY) + for result in metric_results + ] + metrics_table[ + f"{metric_name}/{constants.MetricResult.EXPLANATION_KEY}" + ] = explanations + metrics_table[ + f"{metric_name}/{constants.MetricResult.CONFIDENCE_KEY}" + ] = confidences + + metrics_table[metric_name] = scores + + return metrics_table + + +async def _compute_metrics( + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> Tuple[Dict[str, Any], "pd.DataFrame"]: + """Computes the metrics for the dataset. + + Args: + evaluation_run_config: Evaluation Run Configurations. + + Returns: + The evaluation results for the input metrics. + + Raises: + RuntimeError: The number of responses does not match the number of metrics. 
+ """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + + api_metrics, custom_metrics = _separate_custom_metrics( + evaluation_run_config.metrics + ) + instance_list = [] + tasks_by_metric = collections.defaultdict(list) + for _, row in evaluation_run_config.dataset.iterrows(): + row_dict = _compute_custom_metrics(row.to_dict(), custom_metrics) + + instance_list.append(row_dict) + + for metric_name in api_metrics: + task = asyncio.create_task( + _instance_evaluation.evaluate_instances_async( + client=evaluation_run_config.client, + request=_instance_evaluation.build_request( + metric_name=metric_name, + row_dict=row_dict, + evaluation_run_config=evaluation_run_config, + ), + ) + ) + tasks_by_metric[metric_name].append(task) + + results_dict = { + metric_name: await asyncio.gather(*tasks) + for metric_name, tasks in tasks_by_metric.items() + } + + instance_df = pd.DataFrame.from_dict(instance_list) + metrics_table = _parse_metric_results_to_dataframe(instance_df, results_dict) + + summary_metrics = _compute_summary_metrics(evaluation_run_config, metrics_table) + return summary_metrics, metrics_table + + +def evaluate( + dataset: "pd.DataFrame", + metrics: List[Union[str, metrics_base.CustomMetric]], + *, + model: Optional[ + Union[generative_models.GenerativeModel, Callable[[str], str]] + ] = None, + prompt_template: Optional[str] = None, + content_column_name: str = "content", + reference_column_name: str = "reference", + response_column_name: str = "response", + context_column_name: str = "context", + instruction_column_name: str = "instruction", +) -> evaluation_base.EvalResult: + """Runs the evaluation for metrics. + + Args: + dataset: The dataset to evaluate. + metrics: The list of metrics names to evaluate, or a metrics bundle for an + evaluation task, or custom metric instances. + model: The GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: A prompt template string compatible with `PromptTemplate` + class with placeholders that can be formatted with dataset columns to + create completed prompts. The placeholders can be represented in curly + braces `{placeholder}`, and must be included in the dataset columns if + specified. The placeholder names cannot contain spaces. + content_column_name: The column name of content in the dataset to send to + the model. If not set, default to `content`. + reference_column_name: The column name of ground truth in the dataset. If + not set, default to `reference`. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + context_column_name: The column name of summary context in the dataset. If + not set, default to `context`. + instruction_column_name: The column name of the instruction prompt in the + dataset. If not set, default to `instruction`. + + Returns: + EvalResult with summary metrics and a metrics table for per-instance + metrics. 
+ """ + + if not metrics: + raise ValueError("Metrics cannot be empty.") + + evaluation_run_config = evaluation_base.EvaluationRunConfig( + dataset=dataset, + metrics=_replace_metric_bundle_with_metrics(metrics), + column_map={ + constants.Dataset.CONTENT_COLUMN: content_column_name, + constants.Dataset.REFERENCE_COLUMN: reference_column_name, + constants.Dataset.MODEL_RESPONSE_COLUMN: response_column_name, + constants.Dataset.CONTEXT_COLUMN: context_column_name, + constants.Dataset.INSTRUCTION_COLUMN: instruction_column_name, + }, + client=utils.create_evaluation_service_async_client(), + ) + + if prompt_template: + _complete_prompt_for_dataset(evaluation_run_config, prompt_template) + + if model: + if prompt_template: + evaluation_run_config.validate_dataset_column( + constants.Dataset.COMPLETED_PROMPT_COLUMN + ) + else: + evaluation_run_config.validate_dataset_column( + constants.Dataset.CONTENT_COLUMN + ) + + if isinstance(model, generative_models.GenerativeModel): + _generate_response_from_gemini_model(model, evaluation_run_config) + elif callable(model): + _generate_response_from_custom_model_fn(model, evaluation_run_config) + else: + evaluation_run_config.validate_dataset_column( + constants.Dataset.MODEL_RESPONSE_COLUMN + ) + if set(evaluation_run_config.metrics).intersection( + set(constants.Metric.AUTOMATIC_METRIC_LIST) + ): + evaluation_run_config.validate_dataset_column( + constants.Dataset.REFERENCE_COLUMN + ) + + if asyncio.get_event_loop().is_running(): + asyncio.set_event_loop(asyncio.new_event_loop()) + loop = asyncio.get_event_loop() + + summary_metrics, metrics_table = loop.run_until_complete( + _compute_metrics(evaluation_run_config) + ) + + return evaluation_base.EvalResult( + summary_metrics=summary_metrics, metrics_table=metrics_table + ) diff --git a/vertexai/preview/evaluation/constants.py b/vertexai/preview/evaluation/constants.py new file mode 100644 index 0000000000..d25bdd7655 --- /dev/null +++ b/vertexai/preview/evaluation/constants.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Constants for evaluation.""" +import dataclasses + + +@dataclasses.dataclass(frozen=True) +class Metric: + """Namespace for Metrics.""" + + # Automatic Metrics. + EXACT_MATCH = "exact_match" + BLEU = "bleu" + ROUGE_1 = "rouge_1" + ROUGE_2 = "rouge_2" + ROUGE_L = "rouge_l" + ROUGE_L_SUM = "rouge_l_sum" + TOOL_CALL_VALID = "tool_call_valid" + TOOL_NAME_MATCH = "tool_name_match" + TOOL_PARAMETER_KEY_MATCH = "tool_parameter_key_match" + TOOL_PARAMETER_KV_MATCH = "tool_parameter_kv_match" + # Model-based Pointwise Metrics. 
+ COHERENCE = "coherence" + FLUENCY = "fluency" + SAFETY = "safety" + GROUNDEDNESS = "groundedness" + FULFILLMENT = "fulfillment" + RESPONSE_RECALL = "response_recall" + SUMMARIZATION_QUALITY = "summarization_quality" + SUMMARIZATION_HELPFULNESS = "summarization_helpfulness" + SUMMARIZATION_VERBOSITY = "summarization_verbosity" + QUESTION_ANSWERING_QUALITY = "question_answering_quality" + QUESTION_ANSWERING_RELEVANCE = "question_answering_relevance" + QUESTION_ANSWERING_HELPFULNESS = "question_answering_helpfulness" + QUESTION_ANSWERING_CORRECTNESS = "question_answering_correctness" + RAG_CONTEXT_RECALL = "rag_context_recall" + # Side-by-side(SxS) Pairwise Metrics. + PAIRWISE_SUMMARIZATION_QUALITY = "pairwise_summarization_quality" + PAIRWISE_QUESTION_ANSWERING_QUALITY = "pairwise_question_answering_quality" + + AUTOMATIC_METRIC_LIST = ( + EXACT_MATCH, + BLEU, + ROUGE_1, + ROUGE_2, + ROUGE_L, + ROUGE_L_SUM, + TOOL_CALL_VALID, + TOOL_NAME_MATCH, + TOOL_PARAMETER_KEY_MATCH, + TOOL_PARAMETER_KV_MATCH, + ) + MODEL_BASED_METRIC_LIST = ( + COHERENCE, + FLUENCY, + SAFETY, + GROUNDEDNESS, + FULFILLMENT, + RESPONSE_RECALL, + SUMMARIZATION_QUALITY, + SUMMARIZATION_HELPFULNESS, + SUMMARIZATION_VERBOSITY, + QUESTION_ANSWERING_QUALITY, + QUESTION_ANSWERING_RELEVANCE, + QUESTION_ANSWERING_HELPFULNESS, + QUESTION_ANSWERING_CORRECTNESS, + RAG_CONTEXT_RECALL, + ) + PAIRWISE_METRIC_LIST = ( + PAIRWISE_SUMMARIZATION_QUALITY, + PAIRWISE_QUESTION_ANSWERING_QUALITY, + ) + + +@dataclasses.dataclass(frozen=True) +class MetricResult: + ROW_COUNT_KEY = "row_count" + SCORE_KEY = "score" + EXPLANATION_KEY = "explanation" + CONFIDENCE_KEY = "confidence" + PAIRWISE_CHOICE_KEY = "pairwise_choice" + + # Automatic Metrics. + EXACT_MATCH_RESULTS = "exact_match_results" + BLEU_RESULTS = "bleu_results" + ROUGE_RESULTS = "rouge_results" + TOOL_CALL_VALID_RESULTS = "tool_call_valid_results" + TOOL_NAME_MATCH_RESULTS = "tool_name_match_results" + TOOL_PARAMETER_KEY_MATCH_RESULTS = "tool_parameter_key_match_results" + TOOL_PARAMETER_KV_MATCH_RESULTS = "tool_parameter_kv_match_results" + # Model-based Pointwise Metrics. + COHERENCE_RESULT = "coherence_result" + FLUENCY_RESULT = "fluency_result" + SAFETY_RESULT = "safety_result" + GROUNDEDNESS_RESULT = "groundedness_result" + FULFILLMENT_RESULT = "fulfillment_result" + RESPONSE_RECALL_RESULT = "response_recall_result" + SUMMARIZATION_QUALITY_RESULT = "summarization_quality_result" + SUMMARIZATION_HELPFULNESS_RESULT = "summarization_helpfulness_result" + SUMMARIZATION_VERBOSITY_RESULT = "summarization_verbosity_result" + QUESTION_ANSWERING_QUALITY_RESULT = "question_answering_quality_result" + QUESTION_ANSWERING_RELEVANCE_RESULT = "question_answering_relevance_result" + QUESTION_ANSWERING_HELPFULNESS_RESULT = "question_answering_helpfulness_result" + QUESTION_ANSWERING_CORRECTNESS_RESULT = "question_answering_correctness_result" + RAG_CONTEXT_RECALL_RESULT = "rag_context_recall_result" + # Side-by-side(SxS) Pairwise Metrics. 
+ PAIRWISE_SUMMARIZATION_QUALITY_RESULT = "pairwise_summarization_quality_result" + PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT = ( + "pairwise_question_answering_quality_result" + ) + + AUTOMATIC_METRIC_RESULTS_LIST = ( + EXACT_MATCH_RESULTS, + BLEU_RESULTS, + ROUGE_RESULTS, + TOOL_CALL_VALID_RESULTS, + TOOL_NAME_MATCH_RESULTS, + TOOL_PARAMETER_KEY_MATCH_RESULTS, + TOOL_PARAMETER_KV_MATCH_RESULTS, + ) + MODEL_BASED_METRIC_RESULT_LIST = ( + COHERENCE_RESULT, + FLUENCY_RESULT, + SAFETY_RESULT, + GROUNDEDNESS_RESULT, + FULFILLMENT_RESULT, + RESPONSE_RECALL_RESULT, + SUMMARIZATION_QUALITY_RESULT, + SUMMARIZATION_HELPFULNESS_RESULT, + SUMMARIZATION_VERBOSITY_RESULT, + QUESTION_ANSWERING_QUALITY_RESULT, + QUESTION_ANSWERING_RELEVANCE_RESULT, + QUESTION_ANSWERING_HELPFULNESS_RESULT, + QUESTION_ANSWERING_CORRECTNESS_RESULT, + RAG_CONTEXT_RECALL_RESULT, + ) + PAIRWISE_METRIC_RESULT_LIST = ( + PAIRWISE_SUMMARIZATION_QUALITY_RESULT, + PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT, + ) + + +@dataclasses.dataclass(frozen=True) +class MetricBundle: + """Namespace for MetricBundle.""" + + TEXT_GENERATION_SIMILARITY = "text_generation_similarity" + TEXT_GENERATION_QUALITY = "text_generation_quality" + TOOL_CALL_QUALITY = "tool_call_quality" + TEXT_GENERATION_INSTRUCTION_FOLLOWING = "text_generation_instruction_following" + TEXT_GENERATION_SAFETY = "text_generation_safety" + TEXT_GENERATION_FACTUALITY = "text_generation_factuality" + SUMMARIZATION_POINTWISE_REFERENCE_FREE = "summarization_pointwise_reference_free" + QA_POINTWISE_REFERENCE_FREE = "qa_pointwise_reference_free" + QA_POINTWISE_REFERENCE_BASED = "qa_pointwise_reference_based" + + +@dataclasses.dataclass(frozen=True) +class Dataset: + COMPLETED_PROMPT_COLUMN = "completed_prompt" + MODEL_RESPONSE_COLUMN = "response" + BASELINE_MODEL_RESPONSE_COLUMN = "baseline_model_response" + CONTEXT_COLUMN = "context" + REFERENCE_COLUMN = "reference" + CONTENT_COLUMN = "content" + INSTRUCTION_COLUMN = "instruction" diff --git a/vertexai/preview/evaluation/metrics/__init__.py b/vertexai/preview/evaluation/metrics/__init__.py new file mode 100644 index 0000000000..94d768a030 --- /dev/null +++ b/vertexai/preview/evaluation/metrics/__init__.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Evaluation Metrics Module.""" + +from vertexai.preview.evaluation.metrics import ( + _base, +) + +CustomMetric = _base.CustomMetric +make_metric = _base.make_metric + +__all__ = [ + "CustomMetric", + "make_metric", +] diff --git a/vertexai/preview/evaluation/metrics/_base.py b/vertexai/preview/evaluation/metrics/_base.py new file mode 100644 index 0000000000..35ab69aec5 --- /dev/null +++ b/vertexai/preview/evaluation/metrics/_base.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any, Callable, Dict
+
+
+class CustomMetric:
+    """The custom evaluation metric.
+
+    Attributes:
+      name: The name of the metric.
+      metric_function: The evaluation function. Must use the dataset row/instance
+        as the metric_function input. Returns per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as key.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        metric_function: Callable[
+            [Dict[str, Any]],
+            Dict[str, Any],
+        ],
+    ):
+        """Initializes the evaluation metric."""
+        self.name = name
+        self.metric_function = metric_function
+
+    def __str__(self):
+        return self.name
+
+
+def make_metric(
+    name: str, metric_function: Callable[[Dict[str, Any]], Dict[str, Any]]
+) -> CustomMetric:
+    """Makes a custom metric.
+
+    Args:
+      name: The name of the metric.
+      metric_function: The evaluation function. Must use the dataset row/instance
+        as the metric_function input. Returns per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as key.
+
+    Returns:
+      A CustomMetric instance that can be passed to the evaluate() function.
+    """
+    return CustomMetric(name, metric_function)
diff --git a/vertexai/preview/evaluation/metrics/_instance_evaluation.py b/vertexai/preview/evaluation/metrics/_instance_evaluation.py
new file mode 100644
index 0000000000..dc7a8ddf2b
--- /dev/null
+++ b/vertexai/preview/evaluation/metrics/_instance_evaluation.py
@@ -0,0 +1,603 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Library for Metrics Computation with Evaluation Service Async Client."""
+
+from typing import Any, Dict
+
+from google import api_core
+from google.cloud.aiplatform import base
+from google.cloud.aiplatform import initializer
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services,
+)
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_evaluation_service_types,
+)
+from vertexai.preview.evaluation import (
+    _base as eval_base,
+)
+from vertexai.preview.evaluation import constants
+
+from google.protobuf import json_format
+
+_LOGGER = base.Logger(__name__)
+_METRIC_NAME_TO_METRIC_SPEC = {
+    # Automatic Metrics.
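`make_metric` above wraps a per-instance callable into a `CustomMetric`: the callable receives one dataset row as a dictionary and must return a dictionary keyed by the metric name. A small, self-contained sketch (how the resulting metric is passed into an evaluation run is not shown in this hunk, so only the construction is illustrated):

```python
from typing import Any, Dict

from vertexai.preview.evaluation.metrics import make_metric


def _response_length(instance: Dict[str, Any]) -> Dict[str, Any]:
    # Score each row by the character length of its response; the result is
    # keyed by the metric name, as the CustomMetric docstring requires.
    return {"response_length": len(instance.get("response", ""))}


response_length_metric = make_metric(
    name="response_length",
    metric_function=_response_length,
)

print(response_length_metric)  # "response_length" via CustomMetric.__str__
print(response_length_metric.metric_function({"response": "Hello!"}))
# {'response_length': 6}
```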
+ constants.Metric.EXACT_MATCH: (gapic_evaluation_service_types.ExactMatchSpec()), + constants.Metric.BLEU: gapic_evaluation_service_types.BleuSpec(), + constants.Metric.ROUGE_1: gapic_evaluation_service_types.RougeSpec( + rouge_type="rouge1" + ), + constants.Metric.ROUGE_2: gapic_evaluation_service_types.RougeSpec( + rouge_type="rouge2" + ), + constants.Metric.ROUGE_L: gapic_evaluation_service_types.RougeSpec( + rouge_type="rougeL" + ), + constants.Metric.ROUGE_L_SUM: gapic_evaluation_service_types.RougeSpec( + rouge_type="rougeLsum" + ), + constants.Metric.TOOL_CALL_VALID: ( + gapic_evaluation_service_types.ToolCallValidSpec() + ), + constants.Metric.TOOL_NAME_MATCH: ( + gapic_evaluation_service_types.ToolNameMatchSpec() + ), + constants.Metric.TOOL_PARAMETER_KV_MATCH: ( + gapic_evaluation_service_types.ToolParameterKVMatchSpec() + ), + constants.Metric.TOOL_PARAMETER_KEY_MATCH: ( + gapic_evaluation_service_types.ToolParameterKeyMatchSpec() + ), + # Model-based Pointwise Metrics. + constants.Metric.FLUENCY: gapic_evaluation_service_types.FluencySpec(), + constants.Metric.COHERENCE: gapic_evaluation_service_types.CoherenceSpec(), + constants.Metric.SAFETY: gapic_evaluation_service_types.SafetySpec(), + constants.Metric.GROUNDEDNESS: (gapic_evaluation_service_types.GroundednessSpec()), + constants.Metric.FULFILLMENT: (gapic_evaluation_service_types.FulfillmentSpec()), + constants.Metric.SUMMARIZATION_QUALITY: ( + gapic_evaluation_service_types.SummarizationQualitySpec() + ), + constants.Metric.SUMMARIZATION_HELPFULNESS: ( + gapic_evaluation_service_types.SummarizationHelpfulnessSpec() + ), + constants.Metric.SUMMARIZATION_VERBOSITY: ( + gapic_evaluation_service_types.SummarizationVerbositySpec() + ), + constants.Metric.QUESTION_ANSWERING_QUALITY: ( + gapic_evaluation_service_types.QuestionAnsweringQualitySpec() + ), + constants.Metric.QUESTION_ANSWERING_RELEVANCE: ( + gapic_evaluation_service_types.QuestionAnsweringRelevanceSpec() + ), + constants.Metric.QUESTION_ANSWERING_CORRECTNESS: ( + gapic_evaluation_service_types.QuestionAnsweringCorrectnessSpec( + use_reference=True + ) + ), + constants.Metric.QUESTION_ANSWERING_HELPFULNESS: ( + gapic_evaluation_service_types.QuestionAnsweringHelpfulnessSpec() + ), + # Side-by-side(SxS) Pairwise Metrics. + constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: ( + gapic_evaluation_service_types.PairwiseSummarizationQualitySpec() + ), + constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: ( + gapic_evaluation_service_types.PairwiseQuestionAnsweringQualitySpec() + ), +} + + +def build_request( + metric_name: str, + row_dict: Dict[str, Any], + evaluation_run_config: eval_base.EvaluationRunConfig, +) -> gapic_evaluation_service_types.EvaluateInstancesRequest: + """Builds a metric instance and form the request for the evaluation service. + + Args: + metric_name: The name of the metric to evaluate. + row_dict: An eval dataset instance in a dictionary. + evaluation_run_config: Evaluation Run Configurations. + + Returns: + A single EvaluateInstancesRequest. + """ + project = initializer.global_config.project + location = initializer.global_config.location + if not project or not location: + raise ValueError( + "No project or location specified. Please run `vertexai.init()` to" + " provide these parameters." 
+ ) + location_path = ( + gapic_evaluation_services.EvaluationServiceAsyncClient.common_location_path( + project, location + ) + ) + + if metric_name not in _METRIC_NAME_TO_METRIC_SPEC: + raise ValueError(f"Metric name: {metric_name} not supported.") + metric_spec = _METRIC_NAME_TO_METRIC_SPEC[metric_name] + column_map = evaluation_run_config.column_map + prediction = row_dict.get( + column_map.get(constants.Dataset.MODEL_RESPONSE_COLUMN), "" + ) + baseline_prediction = row_dict.get( + column_map.get(constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN), "" + ) + reference = row_dict.get(column_map.get(constants.Dataset.REFERENCE_COLUMN), "") + context = row_dict.get(column_map.get(constants.Dataset.CONTEXT_COLUMN), "") + instruction = row_dict.get(column_map.get(constants.Dataset.INSTRUCTION_COLUMN), "") + + # Automatic Metrics. + if metric_name == constants.Metric.EXACT_MATCH: + instance = gapic_evaluation_service_types.ExactMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ExactMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + exact_match_input=instance, + ) + if metric_name == constants.Metric.BLEU: + instance = gapic_evaluation_service_types.BleuInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.BleuInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + bleu_input=instance, + ) + if metric_name in ( + constants.Metric.ROUGE_1, + constants.Metric.ROUGE_2, + constants.Metric.ROUGE_L, + constants.Metric.ROUGE_L_SUM, + ): + instance = gapic_evaluation_service_types.RougeInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.RougeInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + rouge_input=instance, + ) + if metric_name == constants.Metric.TOOL_CALL_VALID: + instance = gapic_evaluation_service_types.ToolCallValidInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolCallValidInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_call_valid_input=instance, + ) + if metric_name == constants.Metric.TOOL_NAME_MATCH: + instance = gapic_evaluation_service_types.ToolNameMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolNameMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_name_match_input=instance, + ) + if metric_name == constants.Metric.TOOL_PARAMETER_KEY_MATCH: + instance = gapic_evaluation_service_types.ToolParameterKeyMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolParameterKeyMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_parameter_key_match_input=instance, + ) + if metric_name == constants.Metric.TOOL_PARAMETER_KV_MATCH: + instance = gapic_evaluation_service_types.ToolParameterKVMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolParameterKVMatchInstance( 
+ prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_parameter_kv_match_input=instance, + ) + # Model-based Pointwise Metrics. + if metric_name == constants.Metric.COHERENCE: + coherence_input = gapic_evaluation_service_types.CoherenceInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.CoherenceInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + coherence_input=coherence_input, + ) + if metric_name == constants.Metric.FLUENCY: + fluency_input = gapic_evaluation_service_types.FluencyInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.FluencyInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + fluency_input=fluency_input, + ) + if metric_name == constants.Metric.SAFETY: + safety_input = gapic_evaluation_service_types.SafetyInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SafetyInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + safety_input=safety_input, + ) + if metric_name == constants.Metric.GROUNDEDNESS: + groundedness_input = gapic_evaluation_service_types.GroundednessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.GroundednessInstance( + prediction=prediction, context=context + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + groundedness_input=groundedness_input, + ) + if metric_name == constants.Metric.FULFILLMENT: + fulfillment_input = gapic_evaluation_service_types.FulfillmentInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.FulfillmentInstance( + prediction=prediction, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + fulfillment_input=fulfillment_input, + ) + if metric_name == constants.Metric.RESPONSE_RECALL: + raise NotImplementedError("Response recall is not implemented.") + if metric_name == constants.Metric.SUMMARIZATION_QUALITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + summarization_quality_input = ( + gapic_evaluation_service_types.SummarizationQualityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationQualityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_quality_input=summarization_quality_input, + ) + if metric_name == constants.Metric.SUMMARIZATION_HELPFULNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + summarization_helpfulness_input = gapic_evaluation_service_types.SummarizationHelpfulnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationHelpfulnessInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_helpfulness_input=summarization_helpfulness_input, + ) + if metric_name == constants.Metric.SUMMARIZATION_VERBOSITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. 
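Each branch above packs a single row into the metric-specific GAPIC input type and wraps it in an `EvaluateInstancesRequest`. For reference, roughly what the "fluency" branch produces for one row, with a placeholder location path (the real code derives the path from `vertexai.init()` and looks the spec up in `_METRIC_NAME_TO_METRIC_SPEC`):

```python
from google.cloud.aiplatform_v1beta1.types import (
    evaluation_service as gapic_evaluation_service_types,
)

request = gapic_evaluation_service_types.EvaluateInstancesRequest(
    location="projects/my-project/locations/us-central1",  # placeholder path
    fluency_input=gapic_evaluation_service_types.FluencyInput(
        metric_spec=gapic_evaluation_service_types.FluencySpec(),
        instance=gapic_evaluation_service_types.FluencyInstance(
            prediction="The quick brown fox jumps over the lazy dog."
        ),
    ),
)
```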
+ summarization_verbosity_input = ( + gapic_evaluation_service_types.SummarizationVerbosityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationVerbosityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_verbosity_input=summarization_verbosity_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_QUALITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_quality_input = gapic_evaluation_service_types.QuestionAnsweringQualityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringQualityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_quality_input=question_answering_quality_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_HELPFULNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_helpfulness_input = gapic_evaluation_service_types.QuestionAnsweringHelpfulnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringHelpfulnessInstance( + prediction=prediction, + context=context, + instruction=instruction, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_helpfulness_input=question_answering_helpfulness_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_RELEVANCE: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_relevance_input = gapic_evaluation_service_types.QuestionAnsweringRelevanceInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringRelevanceInstance( + prediction=prediction, + context=context, + instruction=instruction, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_relevance_input=question_answering_relevance_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_CORRECTNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_correctness_input = gapic_evaluation_service_types.QuestionAnsweringCorrectnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringCorrectnessInstance( + prediction=prediction, + context=context, + instruction=instruction, + reference=reference, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_correctness_input=question_answering_correctness_input, + ) + if metric_name == constants.Metric.RAG_CONTEXT_RECALL: + raise NotImplementedError("RAG context recall is not implemented.") + # Side-by-side(SxS) Pairwise Metrics. + if metric_name == constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: + raise NotImplementedError("Pairwise summarization quality is not implemented.") + if metric_name == constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: + raise NotImplementedError( + "Pairwise question answering quality is not implemented." 
+        )
+
+
+def _parse_autometric_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the automatic metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score of the metric.
+    """
+    for value in metric_result_dict.values():
+        # Only single instance requests are used by SDK.
+        return {
+            constants.MetricResult.SCORE_KEY: value[0].get(
+                constants.MetricResult.SCORE_KEY
+            )
+        }
+
+
+def _parse_pointwise_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the pointwise metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score, explanation, confidence of the
+      metric.
+    """
+    return {
+        constants.MetricResult.SCORE_KEY: metric_result_dict.get(
+            constants.MetricResult.SCORE_KEY
+        ),
+        constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
+            constants.MetricResult.EXPLANATION_KEY
+        ),
+        constants.MetricResult.CONFIDENCE_KEY: metric_result_dict.get(
+            constants.MetricResult.CONFIDENCE_KEY
+        ),
+    }
+
+
+def _parse_pairwise_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the pairwise metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score, explanation, confidence of the
+      metric.
+    """
+    return {
+        # TODO(b/330598854): handle pairwise choice.
+        constants.MetricResult.PAIRWISE_CHOICE_KEY: metric_result_dict.get(
+            constants.MetricResult.PAIRWISE_CHOICE_KEY,
+            gapic_evaluation_service_types.PairwiseChoice.PAIRWISE_CHOICE_UNSPECIFIED,
+        ),
+        constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
+            constants.MetricResult.EXPLANATION_KEY
+        ),
+        constants.MetricResult.CONFIDENCE_KEY: metric_result_dict.get(
+            constants.MetricResult.CONFIDENCE_KEY
+        ),
+    }
+
+
+def _handle_response(
+    response: gapic_evaluation_service_types.EvaluateInstancesResponse,
+) -> Dict[str, Any]:
+    """Handles the response from the evaluation service.
+
+    Args:
+      response: The response from the evaluation service.
+
+    Returns:
+      The metric score of the evaluation.
+    """
+    metric_type = response._pb.WhichOneof("evaluation_results")
+
+    # Automatic Metrics.
+    if metric_type == constants.MetricResult.EXACT_MATCH_RESULTS:
+        metric_result = response.exact_match_results
+    elif metric_type == constants.MetricResult.BLEU_RESULTS:
+        metric_result = response.bleu_results
+    elif metric_type == constants.MetricResult.ROUGE_RESULTS:
+        metric_result = response.rouge_results
+    elif metric_type == constants.MetricResult.TOOL_CALL_VALID_RESULTS:
+        metric_result = response.tool_call_valid_results
+    elif metric_type == constants.MetricResult.TOOL_NAME_MATCH_RESULTS:
+        metric_result = response.tool_name_match_results
+    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KEY_MATCH_RESULTS:
+        metric_result = response.tool_parameter_key_match_results
+    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KV_MATCH_RESULTS:
+        metric_result = response.tool_parameter_kv_match_results
+    # Model-based Pointwise Metrics.
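The three parsers above flatten the per-metric result protos (already converted to dictionaries with `json_format.MessageToDict`) into plain score dictionaries; automatic metrics keep only the first value because requests are built one instance at a time. A sketch with made-up values to show the resulting shapes (these are private helpers, used here purely for illustration):

```python
from vertexai.preview.evaluation.metrics import _instance_evaluation

pointwise = _instance_evaluation._parse_pointwise_results(
    {"score": 4.0, "explanation": "Mostly fluent.", "confidence": 0.8}
)
# -> {'score': 4.0, 'explanation': 'Mostly fluent.', 'confidence': 0.8}

automatic = _instance_evaluation._parse_autometric_results(
    {"exact_match_metric_values": [{"score": 1.0}]}
)
# -> {'score': 1.0}
```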
+ elif metric_type == constants.MetricResult.COHERENCE_RESULT: + metric_result = response.coherence_result + elif metric_type == constants.MetricResult.FULFILLMENT_RESULT: + metric_result = response.fulfillment_result + elif metric_type == constants.MetricResult.FLUENCY_RESULT: + metric_result = response.fluency_result + elif metric_type == constants.MetricResult.SAFETY_RESULT: + metric_result = response.safety_result + elif metric_type == constants.MetricResult.GROUNDEDNESS_RESULT: + metric_result = response.groundedness_result + elif metric_type == constants.MetricResult.RESPONSE_RECALL_RESULT: + metric_result = response.response_recall_result + elif metric_type == constants.MetricResult.SUMMARIZATION_QUALITY_RESULT: + metric_result = response.summarization_quality_result + elif metric_type == constants.MetricResult.SUMMARIZATION_HELPFULNESS_RESULT: + metric_result = response.summarization_helpfulness_result + elif metric_type == constants.MetricResult.SUMMARIZATION_VERBOSITY_RESULT: + metric_result = response.summarization_verbosity_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_QUALITY_RESULT: + metric_result = response.question_answering_quality_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_RELEVANCE_RESULT: + metric_result = response.question_answering_relevance_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_HELPFULNESS_RESULT: + metric_result = response.question_answering_helpfulness_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_CORRECTNESS_RESULT: + metric_result = response.question_answering_correctness_result + elif metric_type == constants.MetricResult.RAG_CONTEXT_RECALL_RESULT: + metric_result = response.rag_context_recall_result + # Side-by-side(SxS) Pairwise Metrics. + elif metric_type == constants.MetricResult.PAIRWISE_SUMMARIZATION_QUALITY_RESULT: + metric_result = response.pairwise_summarization_quality_result + elif ( + metric_type == constants.MetricResult.PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT + ): + metric_result = response.pairwise_question_answering_quality_result + else: + raise ValueError(f"Unknown metric type: {metric_type}") + + metric_result_dict = json_format.MessageToDict( + metric_result._pb, preserving_proto_field_name=True + ) + + if metric_type in constants.MetricResult.AUTOMATIC_METRIC_RESULTS_LIST: + result = _parse_autometric_results(metric_result_dict) + elif metric_type in constants.MetricResult.MODEL_BASED_METRIC_RESULT_LIST: + result = _parse_pointwise_results(metric_result_dict) + elif metric_type in constants.MetricResult.PAIRWISE_METRIC_RESULT_LIST: + result = _parse_pairwise_results(metric_result_dict) + else: + raise ValueError(f"Unknown metric type: {metric_type}") + return result + + +async def evaluate_instances_async( + client: gapic_evaluation_services.EvaluationServiceAsyncClient, + request: gapic_evaluation_service_types.EvaluateInstancesRequest, +): + """Evaluates an instance asynchronously. + + Args: + client: The client to use for evaluation. + request: An EvaluateInstancesRequest. + + Returns: + The metric score of the evaluation. 
+ """ + + response = await client.evaluate_instances( + request=request, + retry=api_core.retry_async.AsyncRetry( + initial=0.250, + maximum=90.0, + multiplier=1.45, + deadline=600.0, + predicate=api_core.retry.if_exception_type( + api_core.exceptions.Aborted, + api_core.exceptions.DeadlineExceeded, + api_core.exceptions.InternalServerError, + api_core.exceptions.ResourceExhausted, + api_core.exceptions.ServiceUnavailable, + api_core.exceptions.Unknown, + api_core.exceptions.Cancelled, + ), + ), + ) + return _handle_response(response) diff --git a/vertexai/preview/evaluation/prompt_template.py b/vertexai/preview/evaluation/prompt_template.py new file mode 100644 index 0000000000..14b0f6bd6a --- /dev/null +++ b/vertexai/preview/evaluation/prompt_template.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import string +from typing import Set + + +class PromptTemplate: + """A prompt template for creating prompts with placeholders. + + The `PromptTemplate` class allows users to define a template string with + placeholders represented in curly braces `{placeholder}`. The placeholder + names cannot contain spaces. These placeholders can be replaced with specific + values using the `assemble` method, providing flexibility in generating + dynamic prompts. + + Example Usage: + + ``` + template_str = "Hello, {name}! Today is {day}. How are you?" + prompt_template = PromptTemplate(template_str) + completed_prompt = prompt_template.assemble(name="John", day="Monday") + print(completed_prompt) + ``` + + Attributes: + template: The template string containing placeholders for replacement. + placeholders: A set of placeholder names from the template string. + """ + + def __init__(self, template: str): + """Initializes the PromptTemplate with a given template. + + Args: + template: The template string with placeholders. Placeholders should be + represented in curly braces `{placeholder}`. + """ + self.template = str(template) + self.placeholders = self._get_placeholders() + + def _get_placeholders(self) -> Set[str]: + """Extracts and return a set of placeholder names from the template.""" + return set( + field_name + for _, field_name, _, _ in string.Formatter().parse(self.template) + if field_name is not None + ) + + def assemble(self, **kwargs) -> "PromptTemplate": + """Replaces only the provided placeholders in the template with specific values. + + Args: + **kwargs: Keyword arguments where keys are placeholder names and values + are the replacements. + + Returns: + A new PromptTemplate instance with the updated template string. 
+ """ + replaced_values = { + key: kwargs.get(key, "{" + key + "}") for key in self.placeholders + } + new_template = self.template.format(**replaced_values) + return PromptTemplate(new_template) + + def __str__(self) -> str: + """Returns the template string.""" + return self.template + + def __repr__(self) -> str: + """Returns a string representation of the PromptTemplate.""" + return f"PromptTemplate('{self.template}')" diff --git a/vertexai/preview/evaluation/utils.py b/vertexai/preview/evaluation/utils.py new file mode 100644 index 0000000000..c85caba690 --- /dev/null +++ b/vertexai/preview/evaluation/utils.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import io +import os +from typing import Any, Dict, Optional, Union, TYPE_CHECKING + +from google.cloud import bigquery +from google.cloud import storage +from google.cloud.aiplatform import compat +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform import utils +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) + +if TYPE_CHECKING: + import pandas as pd + +_BQ_PREFIX = "bq://" +_GCS_PREFIX = "gs://" + + +class _EvaluationServiceAsyncClientWithOverride(utils.ClientWithOverride): + _is_temporary = False + _default_version = compat.V1BETA1 + _version_map = ( + ( + compat.V1BETA1, + gapic_evaluation_services.EvaluationServiceAsyncClient, + ), + ) + + +def create_evaluation_service_async_client( + api_base_path_override: Optional[str] = None, +) -> _EvaluationServiceAsyncClientWithOverride: + """Creates an aync client for the evaluation service. + + Args: + api_base_path_override: Optional. Override default api base path. + + Returns: + Instantiated Vertex AI EvaluationService async client with optional overrides. + """ + return initializer.global_config.create_client( + client_class=_EvaluationServiceAsyncClientWithOverride, + location_override=initializer.global_config.location, + api_base_path_override=api_base_path_override, + ) + + +def load_dataset(source: Union[str, "pd.DataFrame", Dict[str, Any]]) -> "pd.DataFrame": + """Loads dataset from various sources into a DataFrame. + + Args: + source: The data source. Can be the following formats: + - pd.DataFrame: Used directly for evaluation. + - dict: Converted to a pandas DataFrame before evaluation. + - str: Interpreted as a file path or URI. Supported formats include: + * Local JSONL or CSV files: Loaded from the local filesystem. + * GCS JSONL or CSV files: Loaded from Google Cloud Storage + (e.g., 'gs://bucket/data.csv'). + * BigQuery table URI: Loaded from Google Cloud BigQuery + (e.g., 'bq://project-id.dataset.table_name'). + + Returns: + The dataset in pandas DataFrame format. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. 
Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + + if isinstance(source, pd.DataFrame): + return source.copy() + elif isinstance(source, dict): + return pd.DataFrame(source) + elif isinstance(source, str): + if source.startswith(_BQ_PREFIX): + return _load_bigquery(source[len(_BQ_PREFIX) :]) + + _, extension = os.path.splitext(source) + file_type = extension.lower()[1:] + + if file_type == "jsonl": + return _load_jsonl(source) + elif file_type == "csv": + return _load_csv(source) + else: + raise ValueError(f"Unsupported file type: {file_type}") + else: + raise TypeError( + "Unsupported dataset type. Must be DataFrame, dictionary, or" " filepath." + ) + + +def _load_jsonl(filepath: str) -> "pd.DataFrame": + """Loads data from a JSONL file into a DataFrame.""" + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + if filepath.startswith(_GCS_PREFIX): + file_contents = _read_gcs_file_contents(filepath) + return pd.read_json(file_contents, lines=True) + else: + with open(filepath, "r") as f: + return pd.read_json(f, lines=True) + + +def _load_csv(filepath: str) -> "pd.DataFrame": + """Loads data from a CSV file into a DataFrame.""" + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + if filepath.startswith(_GCS_PREFIX): + file_contents = _read_gcs_file_contents(filepath) + return pd.read_csv(io.StringIO(file_contents), encoding="utf-8") + else: + return pd.read_csv(filepath, encoding="utf-8") + + +def _load_bigquery(table_id: str) -> "pd.DataFrame": + """Loads data from a BigQuery table into a DataFrame.""" + + client = bigquery.Client( + project=initializer.global_config.project, + credentials=initializer.global_config.credentials, + ) + table = client.get_table(table_id) + return client.list_rows(table).to_dataframe() + + +def _read_gcs_file_contents(filepath: str) -> str: + """Reads the contents of a file from Google Cloud Storage. + + Args: + filepath: The GCS file path (e.g., 'gs://bucket_name/file.csv') + + Returns: + The contents of the file. + """ + + client = storage.Client( + project=initializer.global_config.project, + credentials=initializer.global_config.credentials, + ) + bucket_name, blob_path = filepath[len(_GCS_PREFIX) :].split("/", 1) + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(blob_path) + return blob.download_as_string().decode("utf-8")
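Taken together, the utilities in this patch compose into a small end-to-end flow: load a dataset from a local file, GCS, or BigQuery, optionally assemble a prompt template in stages, and hand both to an evaluation task. A hedged sketch with placeholder project and bucket names, assuming the JSONL file already carries the columns the chosen metrics need (instruction, context, response) and that `EvalTask` accepts a dataset, a metric list, and an optional experiment name:

```python
import vertexai
from vertexai.preview import evaluation
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.prompt_template import PromptTemplate

vertexai.init(project="my-project", location="us-central1")  # placeholders

# load_dataset() accepts a DataFrame, a dict, or a local/GCS/BigQuery URI.
eval_df = utils.load_dataset("gs://my-bucket/eval/summarization.jsonl")  # placeholder URI

# assemble() replaces only the placeholders it is given, so a template can be
# filled in stages; the unfilled {context} placeholder survives.
template = PromptTemplate("Instruction: {instruction}\nContext: {context}\nSummary:")
partially_filled = template.assemble(instruction="Summarize the context below.")
print(partially_filled.placeholders)  # {'context'}

eval_task = evaluation.EvalTask(
    dataset=eval_df,
    metrics=["summarization_quality", "rouge_l_sum"],
    experiment="summarization-eval",  # optional experiment name
)
```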