From 6727fb3761e91d32359e13b1c28494d2f16d165a Mon Sep 17 00:00:00 2001 From: Jason Dai Date: Thu, 10 Jul 2025 20:07:44 -0700 Subject: [PATCH] feat: GenAI SDK client(evals) - Add Generate Rubrics API config and internal method PiperOrigin-RevId: 781795016 --- .../replays/test_internal_generate_rubrics.py | 170 +++++++++++++ vertexai/_genai/evals.py | 188 ++++++++++++++ vertexai/_genai/types.py | 240 ++++++++++++++++++ 3 files changed, 598 insertions(+) create mode 100644 tests/unit/vertexai/genai/replays/test_internal_generate_rubrics.py diff --git a/tests/unit/vertexai/genai/replays/test_internal_generate_rubrics.py b/tests/unit/vertexai/genai/replays/test_internal_generate_rubrics.py new file mode 100644 index 0000000000..ce8e24138c --- /dev/null +++ b/tests/unit/vertexai/genai/replays/test_internal_generate_rubrics.py @@ -0,0 +1,170 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# pylint: disable=protected-access,bad-continuation,missing-function-docstring + + +from tests.unit.vertexai.genai.replays import pytest_helper +from vertexai._genai import types + +_TEST_RUBRIC_GENERATION_PROMPT = """SPECIAL INSTRUCTION: think silently. Silent thinking token budget: 16384. + +You are a teacher who is responsible for scoring a student\'s response to a prompt. In order to score that response, you must write down a rubric for each prompt. That rubric states what properties the response must have in order to be a valid response to the prompt. Properties are weighted by importance via the "importance" field. + +Rubric requirements: +- Properties either exist or don\'t exist. +- Properties can be either implicit in the prompt or made explicit by the prompt. +- Make sure to always include the correct expected human language as one of the properties. If the prompt asks for code, the programming language should be covered by a separate property. +- The correct expected language may be explicit in the text of the prompt but is usually simply implicit in the prompt itself. +- Be as comprehensive as possible with the list of properties in the rubric. +- All properties in the rubric must be in English, regardless of the language of the prompt. +- Rubric properties should not specify correct answers in their descriptions, e.g. to math and factoid questions if the prompt calls for such an answer. Rather, it should check that the response contains an answer and optional supporting evidence if relevant, and assume some other process will later validate correctness. A rubric property should however call out any false premises present in the prompt. + +About importance: +- Most properties will be of medium importance by default. +- Properties of high importance are critical to be fulfilled in a good response. +- Properties of low importance are considered optional or supplementary nice-to-haves. + +You will see prompts in many different languages, not just English. For each prompt you see, you will write down this rubric in JSON format. 
+ +IMPORTANT: Never respond to the prompt given. Only write a rubric. + +Example: +What is the tallest building in the world? + +```json +{ + "criteria":[ + { + "rubric_id": "00001", + "property": "The response is in English.", + "type": "LANGUAGE:PRIMARY_RESPONSE_LANGUAGE", + "importance": "high" + }, + { + "rubric_id": "00002", + "property": "Contains the name of the tallest building in the world.", + "type": "QA_ANSWER:FACTOID", + "importance": "high" + }, + { + "rubric_id": "00003", + "property": "Contains the exact height of the tallest building.", + "type": "QA_SUPPORTING_EVIDENCE:HEIGHT", + "importance": "low" + }, + { + "rubric_id": "00004", + "property": "Contains the location of the tallest building.", + "type": "QA_SUPPORTING_EVIDENCE:LOCATION", + "importance": "low" + }, + ... + ] +} +``` + +Write me a letter to my HOA asking them to reconsider the fees they are asking me to pay because I haven\'t mowed my lawn on time. I have been very busy at work. +```json +{ + "criteria": [ + { + "rubric_id": "00001", + "property": "The response is in English.", + "type": "LANGUAGE:PRIMARY_RESPONSE_LANGUAGE", + "importance": "high" + }, + { + "rubric_id": "00002", + "property": "The response is formatted as a letter.", + "type": "FORMAT_REQUIREMENT:FORMAL_LETTER", + "importance": "medium" + }, + { + "rubric_id": "00003", + "property": "The letter is addressed to the Homeowners Association (HOA).", + "type": "CONTENT_REQUIREMENT:ADDRESSEE", + "importance": "medium" + }, + { + "rubric_id": "00004", + "property": "The letter explains that the sender has not mowed their lawn on time.", + "type": "CONTENT_REQUIREMENT:BACKGROUND_CONTEXT:TARDINESS", + "importance": "medium" + }, + { + "rubric_id": "00005", + "property": "The letter provides a reason for not mowing the lawn, specifically being busy at work.", + "type": "CONTENT_REQUIREMENT:EXPLANATION:EXCUSE:BUSY", + "importance": "medium" + }, + { + "rubric_id": "00006", + "property": "The letter discusses that the sender has been in compliance until now.", + "type": "OPTIONAL_CONTENT:SUPPORTING_EVIDENCE:COMPLIANCE", + "importance": "low" + }, + { + "rubric_id": "00007", + "property": "The letter requests that the HOA reconsider the fees associated with not mowing the lawn on time.", + "type": "CONTENT_REQUIREMENT:REQUEST:FEE_WAIVER", + "importance": "high" + }, + { + "rubric_id": "00008", + "property": "The letter maintains a polite and respectful tone.", + "type": "CONTENT_REQUIREMENT:FORMALITY:FORMAL", + "importance": "high" + }, + { + "rubric_id": "00009", + "property": "The letter includes a closing (e.g., \'Sincerely\') and the sender\'s name.", + "type": "CONTENT_REQUIREMENT:SIGNATURE", + "importance": "medium" + } + ] +} +``` + +Now write a rubric for the following user prompt. Remember to write only the rubric, NOT response to the prompt. 
+ +User prompt: +{prompt}""" + + +def test_internal_method_generate_rubrics(client): + """Tests the internal _generate_rubrics method.""" + test_contents = [ + types.Content( + parts=[ + types.Part( + text="Generate a short story about a friendly dragon.", + ), + ], + ) + ] + response = client.evals._generate_rubrics( + contents=test_contents, + rubric_generation_spec=types.RubricGenerationSpec( + prompt_template=_TEST_RUBRIC_GENERATION_PROMPT, + ), + ) + assert len(response.generated_rubrics) >= 1 + + +pytestmark = pytest_helper.setup( + file=__file__, + globals_for_file=globals(), + test_method="evals._generate_rubrics", +) diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 1913d7fb71..d06087deed 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -664,6 +664,65 @@ def _EvaluateInstancesRequestParameters_to_vertex( return to_object +def _RubricGenerationSpec_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["prompt_template"]) is not None: + setv( + to_object, + ["promptTemplate"], + getv(from_object, ["prompt_template"]), + ) + + if getv(from_object, ["generator_model_config"]) is not None: + setv( + to_object, + ["model_config"], + getv(from_object, ["generator_model_config"]), + ) + + if getv(from_object, ["rubric_content_type"]) is not None: + setv( + to_object, + ["rubricContentType"], + getv(from_object, ["rubric_content_type"]), + ) + + if getv(from_object, ["rubric_type_ontology"]) is not None: + setv( + to_object, + ["rubricTypeOntology"], + getv(from_object, ["rubric_type_ontology"]), + ) + + return to_object + + +def _GenerateInstanceRubricsRequest_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["contents"]) is not None: + setv(to_object, ["contents"], getv(from_object, ["contents"])) + + if getv(from_object, ["rubric_generation_spec"]) is not None: + setv( + to_object, + ["rubricGenerationSpec"], + _RubricGenerationSpec_to_vertex( + getv(from_object, ["rubric_generation_spec"]), to_object + ), + ) + + if getv(from_object, ["config"]) is not None: + setv(to_object, ["config"], getv(from_object, ["config"])) + + return to_object + + def _EvaluateInstancesResponse_from_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -790,6 +849,21 @@ def _EvaluateInstancesResponse_from_vertex( return to_object +def _GenerateInstanceRubricsResponse_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["generatedRubrics"]) is not None: + setv( + to_object, + ["generated_rubrics"], + getv(from_object, ["generatedRubrics"]), + ) + + return to_object + + class Evals(_api_module.BaseModule): def _evaluate_instances( self, @@ -869,6 +943,62 @@ def _evaluate_instances( self._api_client._verify_response(return_value) return return_value + def _generate_rubrics( + self, + *, + contents: list[genai_types.ContentOrDict], + rubric_generation_spec: types.RubricGenerationSpecOrDict, + config: Optional[types.RubricGenerationConfigOrDict] = None, + ) -> types.GenerateInstanceRubricsResponse: + """Generates rubrics for a given prompt.""" + + parameter_model = types._GenerateInstanceRubricsRequest( + contents=contents, + 
rubric_generation_spec=rubric_generation_spec, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _GenerateInstanceRubricsRequest_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = ":generateInstanceRubrics".format_map(request_url_dict) + else: + path = ":generateInstanceRubrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. + request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = self._api_client.request("post", path, request_dict, http_options) + + response_dict = "" if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _GenerateInstanceRubricsResponse_from_vertex(response_dict) + + return_value = types.GenerateInstanceRubricsResponse._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + def run(self) -> types.EvaluateInstancesResponse: """Evaluates an instance of a model. @@ -1133,6 +1263,64 @@ async def _evaluate_instances( self._api_client._verify_response(return_value) return return_value + async def _generate_rubrics( + self, + *, + contents: list[genai_types.ContentOrDict], + rubric_generation_spec: types.RubricGenerationSpecOrDict, + config: Optional[types.RubricGenerationConfigOrDict] = None, + ) -> types.GenerateInstanceRubricsResponse: + """Generates rubrics for a given prompt.""" + + parameter_model = types._GenerateInstanceRubricsRequest( + contents=contents, + rubric_generation_spec=rubric_generation_spec, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _GenerateInstanceRubricsRequest_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = ":generateInstanceRubrics".format_map(request_url_dict) + else: + path = ":generateInstanceRubrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. 
+ request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = await self._api_client.async_request( + "post", path, request_dict, http_options + ) + + response_dict = "" if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _GenerateInstanceRubricsResponse_from_vertex(response_dict) + + return_value = types.GenerateInstanceRubricsResponse._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + async def batch_evaluate( self, *, diff --git a/vertexai/_genai/types.py b/vertexai/_genai/types.py index 951d662f4a..5067131bbd 100644 --- a/vertexai/_genai/types.py +++ b/vertexai/_genai/types.py @@ -218,6 +218,17 @@ class Language(_common.CaseInSensitiveEnum): """Python >= 3.10, with numpy and simpy available.""" +class RubricContentType(_common.CaseInSensitiveEnum): + """Specifies the type of rubric content to generate.""" + + PROPERTY = "PROPERTY" + """Generate rubrics based on properties.""" + NL_QUESTION_ANSWER = "NL_QUESTION_ANSWER" + """Generate rubrics in an NL question answer format.""" + PYTHON_CODE_ASSERTION = "PYTHON_CODE_ASSERTION" + """Generate rubrics in a unit test format.""" + + class GenerateMemoriesResponseGeneratedMemoryAction(_common.CaseInSensitiveEnum): """The action to take.""" @@ -234,6 +245,19 @@ class GenerateMemoriesResponseGeneratedMemoryAction(_common.CaseInSensitiveEnum) """The memory was deleted.""" +class Importance(_common.CaseInSensitiveEnum): + """Importance level of the rubric.""" + + IMPORTANCE_UNSPECIFIED = "IMPORTANCE_UNSPECIFIED" + """Importance is not specified.""" + HIGH = "HIGH" + """High importance.""" + MEDIUM = "MEDIUM" + """Medium importance.""" + LOW = "LOW" + """Low importance.""" + + class BleuInstance(_common.BaseModel): """Bleu instance.""" @@ -2082,6 +2106,222 @@ class EvaluateInstancesResponseDict(TypedDict, total=False): ] +class RubricGenerationSpec(_common.BaseModel): + """Spec for generating rubrics.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Template for the prompt used to generate rubrics. + The details should be updated based on the most-recent recipe requirements.""", + ) + generator_model_config: Optional[AutoraterConfig] = Field( + default=None, + description="""Configuration for the model used in rubric generation. + Configs including sampling count and base model can be specified here. + Flipping is not supported for rubric generation.""", + ) + rubric_content_type: Optional[RubricContentType] = Field( + default=None, + description="""The type of rubric content to be generated.""", + ) + rubric_type_ontology: Optional[list[str]] = Field( + default=None, + description="""An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""", + ) + + +class RubricGenerationSpecDict(TypedDict, total=False): + """Spec for generating rubrics.""" + + prompt_template: Optional[str] + """Template for the prompt used to generate rubrics. 
+ The details should be updated based on the most-recent recipe requirements.""" + + generator_model_config: Optional[AutoraterConfigDict] + """Configuration for the model used in rubric generation. + Configs including sampling count and base model can be specified here. + Flipping is not supported for rubric generation.""" + + rubric_content_type: Optional[RubricContentType] + """The type of rubric content to be generated.""" + + rubric_type_ontology: Optional[list[str]] + """An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""" + + +RubricGenerationSpecOrDict = Union[RubricGenerationSpec, RubricGenerationSpecDict] + + +class RubricGenerationConfig(_common.BaseModel): + """Config for generating rubrics.""" + + http_options: Optional[HttpOptions] = Field( + default=None, description="""Used to override HTTP request options.""" + ) + + +class RubricGenerationConfigDict(TypedDict, total=False): + """Config for generating rubrics.""" + + http_options: Optional[HttpOptionsDict] + """Used to override HTTP request options.""" + + +RubricGenerationConfigOrDict = Union[RubricGenerationConfig, RubricGenerationConfigDict] + + +class _GenerateInstanceRubricsRequest(_common.BaseModel): + """Parameters for generating rubrics.""" + + contents: Optional[list[genai_types.Content]] = Field( + default=None, + description="""The prompt to generate rubrics from. For single-turn queries, this is a single instance. For multi-turn queries, this is a repeated field that contains conversation history + latest request.""", + ) + rubric_generation_spec: Optional[RubricGenerationSpec] = Field( + default=None, + description="""Specification for how the rubrics should be generated.""", + ) + config: Optional[RubricGenerationConfig] = Field(default=None, description="""""") + + +class _GenerateInstanceRubricsRequestDict(TypedDict, total=False): + """Parameters for generating rubrics.""" + + contents: Optional[list[genai_types.Content]] + """The prompt to generate rubrics from. For single-turn queries, this is a single instance. For multi-turn queries, this is a repeated field that contains conversation history + latest request.""" + + rubric_generation_spec: Optional[RubricGenerationSpecDict] + """Specification for how the rubrics should be generated.""" + + config: Optional[RubricGenerationConfigDict] + """""" + + +_GenerateInstanceRubricsRequestOrDict = Union[ + _GenerateInstanceRubricsRequest, _GenerateInstanceRubricsRequestDict +] + + +class RubricContentProperty(_common.BaseModel): + """Defines criteria based on a specific property.""" + + description: Optional[str] = Field( + default=None, + description="""Description of the property being evaluated. + Example: "The model's response is grammatically correct." """, + ) + + +class RubricContentPropertyDict(TypedDict, total=False): + """Defines criteria based on a specific property.""" + + description: Optional[str] + """Description of the property being evaluated. + Example: "The model's response is grammatically correct." 
""" + + +RubricContentPropertyOrDict = Union[RubricContentProperty, RubricContentPropertyDict] + + +class RubricContent(_common.BaseModel): + """Content of the rubric, defining the testable criteria.""" + + property: Optional[RubricContentProperty] = Field( + default=None, + description="""Evaluation criteria based on a specific property.""", + ) + + +class RubricContentDict(TypedDict, total=False): + """Content of the rubric, defining the testable criteria.""" + + property: Optional[RubricContentPropertyDict] + """Evaluation criteria based on a specific property.""" + + +RubricContentOrDict = Union[RubricContent, RubricContentDict] + + +class Rubric(_common.BaseModel): + """Message representing a single testable criterion for evaluation. + + One input prompt could have multiple rubrics. + """ + + rubric_id: Optional[str] = Field( + default=None, + description="""Required. Unique identifier for the rubric. + This ID is used to refer to this rubric, e.g., in RubricVerdict.""", + ) + content: Optional[RubricContent] = Field( + default=None, + description="""Required. The actual testable criteria for the rubric.""", + ) + type: Optional[str] = Field( + default=None, + description="""Optional. A type designator for the rubric, which can inform how it's + evaluated or interpreted by systems or users. + It's recommended to use consistent, well-defined, upper snake_case strings. + Examples: "SUMMARIZATION_QUALITY", "SAFETY_HARMFUL_CONTENT", + "INSTRUCTION_ADHERENCE".""", + ) + importance: Optional[Importance] = Field( + default=None, + description="""Optional. The relative importance of this rubric.""", + ) + + +class RubricDict(TypedDict, total=False): + """Message representing a single testable criterion for evaluation. + + One input prompt could have multiple rubrics. + """ + + rubric_id: Optional[str] + """Required. Unique identifier for the rubric. + This ID is used to refer to this rubric, e.g., in RubricVerdict.""" + + content: Optional[RubricContentDict] + """Required. The actual testable criteria for the rubric.""" + + type: Optional[str] + """Optional. A type designator for the rubric, which can inform how it's + evaluated or interpreted by systems or users. + It's recommended to use consistent, well-defined, upper snake_case strings. + Examples: "SUMMARIZATION_QUALITY", "SAFETY_HARMFUL_CONTENT", + "INSTRUCTION_ADHERENCE".""" + + importance: Optional[Importance] + """Optional. The relative importance of this rubric.""" + + +RubricOrDict = Union[Rubric, RubricDict] + + +class GenerateInstanceRubricsResponse(_common.BaseModel): + """Response for generating rubrics.""" + + generated_rubrics: Optional[list[Rubric]] = Field( + default=None, description="""A list of generated rubrics.""" + ) + + +class GenerateInstanceRubricsResponseDict(TypedDict, total=False): + """Response for generating rubrics.""" + + generated_rubrics: Optional[list[RubricDict]] + """A list of generated rubrics.""" + + +GenerateInstanceRubricsResponseOrDict = Union[ + GenerateInstanceRubricsResponse, GenerateInstanceRubricsResponseDict +] + + class OptimizeConfig(_common.BaseModel): """Config for Prompt Optimizer."""