From 8c6ddf54adf91e2fbf00034fef413ccfde3769d6 Mon Sep 17 00:00:00 2001 From: Jason Dai Date: Fri, 5 Apr 2024 16:30:45 -0700 Subject: [PATCH] feat: Vertex Rapid Evaluation SDK and Prompt Template for Vertex Prompt Management Public Preview PiperOrigin-RevId: 622310413 --- setup.py | 7 + tests/unit/vertexai/test_evaluation.py | 440 +++++++++++++ vertexai/preview/evaluation/__init__.py | 37 ++ vertexai/preview/evaluation/_base.py | 79 +++ vertexai/preview/evaluation/_eval_tasks.py | 411 ++++++++++++ vertexai/preview/evaluation/_evaluation.py | 567 ++++++++++++++++ vertexai/preview/evaluation/constants.py | 180 ++++++ .../preview/evaluation/metrics/__init__.py | 29 + vertexai/preview/evaluation/metrics/_base.py | 61 ++ .../metrics/_instance_evaluation.py | 603 ++++++++++++++++++ .../preview/evaluation/prompt_template.py | 84 +++ vertexai/preview/evaluation/utils.py | 176 +++++ 12 files changed, 2674 insertions(+) create mode 100644 tests/unit/vertexai/test_evaluation.py create mode 100644 vertexai/preview/evaluation/__init__.py create mode 100644 vertexai/preview/evaluation/_base.py create mode 100644 vertexai/preview/evaluation/_eval_tasks.py create mode 100644 vertexai/preview/evaluation/_evaluation.py create mode 100644 vertexai/preview/evaluation/constants.py create mode 100644 vertexai/preview/evaluation/metrics/__init__.py create mode 100644 vertexai/preview/evaluation/metrics/_base.py create mode 100644 vertexai/preview/evaluation/metrics/_instance_evaluation.py create mode 100644 vertexai/preview/evaluation/prompt_template.py create mode 100644 vertexai/preview/evaluation/utils.py diff --git a/setup.py b/setup.py index 23a2d5ef59..2149e83c13 100644 --- a/setup.py +++ b/setup.py @@ -145,6 +145,11 @@ "pydantic < 3", ] +rapid_evaluation_extra_require = [ + "nest_asyncio >= 1.0.0, < 1.6.0", + "pandas >= 1.0.0, < 2.2.0", +] + full_extra_require = list( set( tensorboard_extra_require @@ -162,6 +167,7 @@ + preview_extra_require + ray_extra_require + reasoning_engine_extra_require + + rapid_evaluation_extra_require ) ) testing_extra_require = ( @@ -246,6 +252,7 @@ "ray": ray_extra_require, "ray_testing": ray_testing_extra_require, "reasoningengine": reasoning_engine_extra_require, + "rapid_evaluation": rapid_evaluation_extra_require, }, python_requires=">=3.8", classifiers=[ diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py new file mode 100644 index 0000000000..c330506792 --- /dev/null +++ b/tests/unit/vertexai/test_evaluation.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
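For orientation, a minimal sketch of the API surface this patch adds, assuming only the `EvalTask` interface documented in `_eval_tasks.py` below; the project ID, dataset rows, and metric selection are illustrative placeholders.

```
# Requires the new extra: pip install "google-cloud-aiplatform[rapid_evaluation]"
import pandas as pd
import vertexai
from vertexai.preview import evaluation

vertexai.init(project="my-project", location="us-central1")  # illustrative project

# Bring-your-own-prediction: model responses are already present in the dataset.
eval_dataset = pd.DataFrame(
    {
        "response": ["Paris is the capital of France.", "The answer is 42."],
        "reference": ["Paris is the capital of France.", "42"],
    }
)
eval_task = evaluation.EvalTask(
    dataset=eval_dataset,
    metrics=["exact_match", "bleu", "rouge_l_sum"],
)
result = eval_task.evaluate()
print(result.summary_metrics)  # e.g. row_count, exact_match/mean, exact_match/std
print(result.metrics_table)    # per-instance scores as a pandas DataFrame
```

Passing `experiment=` to `EvalTask` and `experiment_run_name=` to `evaluate()` additionally logs the summary metrics to Vertex AI Experiments, as exercised by the tests that follow.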
+# +from unittest import mock + +from google.cloud import aiplatform +import vertexai +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform.metadata import metadata +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) +from google.cloud.aiplatform_v1beta1.types import ( + evaluation_service as gapic_evaluation_service_types, +) +from vertexai.preview import evaluation +from vertexai.preview.evaluation import utils +import pandas as pd +import pytest + + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_METRICS = [ + "exact_match", + "bleu", + "rouge_1", + "rouge_2", + "rouge_l", + "rouge_l_sum", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_helpfulness", + "question_answering_correctness", +] +_TEST_EVAL_DATASET = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + "context": ["test", "context"], + "instruction": ["test", "instruction"], + } +) +_TEST_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame( + { + "reference": ["test", "ref"], + "context": ["test", "context"], + "instruction": ["test", "instruction"], + } +) + +_TEST_JSONL_FILE_CONTENT = """{"prompt": "prompt", "reference": "reference"}\n +{"prompt":"test", "reference": "test"}\n +""" +_TEST_CSV_FILE_CONTENT = """reference,context,instruction\ntest,test,test\n +text,text,text\n +""" + + +_MOCK_EXACT_MATCH_RESULT = [ + gapic_evaluation_service_types.EvaluateInstancesResponse( + exact_match_results=gapic_evaluation_service_types.ExactMatchResults( + exact_match_metric_values=[ + gapic_evaluation_service_types.ExactMatchMetricValue(score=1.0), + ] + ) + ), + gapic_evaluation_service_types.EvaluateInstancesResponse( + exact_match_results=gapic_evaluation_service_types.ExactMatchResults( + exact_match_metric_values=[ + gapic_evaluation_service_types.ExactMatchMetricValue(score=0.0), + ] + ) + ), +] + +_MOCK_FLUENCY_RESULT = [ + gapic_evaluation_service_types.EvaluateInstancesResponse( + fluency_result=gapic_evaluation_service_types.FluencyResult( + score=5, explanation="explanation", confidence=1.0 + ) + ), + gapic_evaluation_service_types.EvaluateInstancesResponse( + fluency_result=gapic_evaluation_service_types.FluencyResult( + score=4, explanation="explanation", confidence=0.5 + ) + ), +] + + +@pytest.fixture +def mock_async_event_loop(): + with mock.patch("asyncio.get_event_loop") as mock_async_event_loop: + yield mock_async_event_loop + + +@pytest.fixture +def mock_experiment_tracker(): + with mock.patch.object( + metadata, "_experiment_tracker", autospec=True + ) as mock_experiment_tracker: + yield mock_experiment_tracker + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluation: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_create_eval_task(self): + test_experiment = "test_experiment_name" + test_content_column_name = "test_content_column_name" + test_reference_column_name = "test_reference_column_name" + test_response_column_name = "test_response_column_name" + + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, + metrics=_TEST_METRICS, + experiment=test_experiment, + content_column_name=test_content_column_name, + 
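+            # The overridden column names are stored on the EvalTask and checked by the assertions below.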
reference_column_name=test_reference_column_name, + response_column_name=test_response_column_name, + ) + + assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET) + assert test_eval_task.metrics == _TEST_METRICS + assert test_eval_task.experiment == test_experiment + assert test_eval_task.content_column_name == test_content_column_name + assert test_eval_task.reference_column_name == test_reference_column_name + assert test_eval_task.response_column_name == test_response_column_name + + def test_evaluate_saved_response(self, mock_async_event_loop): + eval_dataset = _TEST_EVAL_DATASET + test_metrics = _TEST_METRICS + mock_summary_metrics = { + "row_count": 2, + "mock_metric/mean": 0.5, + "mock_metric/std": 0.5, + } + mock_metrics_table = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + "mock_metric": [1.0, 0.0], + } + ) + mock_async_event_loop.return_value.run_until_complete.return_value = ( + mock_summary_metrics, + mock_metrics_table, + ) + + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics == mock_summary_metrics + assert test_result.metrics_table.equals(mock_metrics_table) + + @pytest.mark.parametrize("api_transport", ["grpc", "rest"]) + def test_compute_automatic_metrics(self, api_transport): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + api_transport=api_transport, + ) + eval_dataset = pd.DataFrame( + { + "response": ["test", "text"], + "reference": ["test", "ref"], + } + ) + test_metrics = ["exact_match"] + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + mock_metric_results = _MOCK_EXACT_MATCH_RESULT + with mock.patch.object( + target=gapic_evaluation_services.EvaluationServiceAsyncClient, + attribute="evaluate_instances", + side_effect=mock_metric_results, + ): + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics["row_count"] == 2 + assert test_result.summary_metrics["exact_match/mean"] == 0.5 + assert test_result.summary_metrics["exact_match/std"] == pytest.approx(0.7, 0.1) + assert list(test_result.metrics_table.columns.values) == [ + "response", + "reference", + "exact_match", + ] + assert test_result.metrics_table[["response", "reference"]].equals(eval_dataset) + assert list(test_result.metrics_table["exact_match"].values) == [1.0, 0.0] + + @pytest.mark.parametrize("api_transport", ["grpc", "rest"]) + def test_compute_pointwise_metrics(self, api_transport): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + api_transport=api_transport, + ) + eval_dataset = pd.DataFrame( + { + "response": ["test", "text"], + } + ) + test_metrics = ["fluency"] + test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics) + mock_metric_results = _MOCK_FLUENCY_RESULT + with mock.patch.object( + target=gapic_evaluation_services.EvaluationServiceAsyncClient, + attribute="evaluate_instances", + side_effect=mock_metric_results, + ): + test_result = test_eval_task.evaluate() + + assert test_result.summary_metrics["row_count"] == 2 + assert test_result.summary_metrics["fluency/mean"] == 4.5 + assert test_result.summary_metrics["fluency/std"] == pytest.approx(0.7, 0.1) + assert set(test_result.metrics_table.columns.values) == set( + [ + "response", + "fluency", + "fluency/explanation", + "fluency/confidence", + ] + ) + assert test_result.metrics_table[["response"]].equals(eval_dataset) + assert 
list(test_result.metrics_table["fluency"].values) == [5, 4] + assert list(test_result.metrics_table["fluency/explanation"].values) == [ + "explanation", + "explanation", + ] + assert list(test_result.metrics_table["fluency/confidence"].values) == [ + 1.0, + 0.5, + ] + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluationErrors: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_evaluate_empty_metrics(self): + test_eval_task = evaluation.EvalTask(dataset=_TEST_EVAL_DATASET, metrics=[]) + with pytest.raises(ValueError, match="Metrics cannot be empty."): + test_eval_task.evaluate() + + def test_evaluate_invalid_metrics(self): + metric_name = "invalid_metric" + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, metrics=[metric_name] + ) + with pytest.raises( + ValueError, match=f"Metric name: {metric_name} not supported." + ): + test_eval_task.evaluate() + + def test_evaluate_invalid_experiment_run_name(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, metrics=_TEST_METRICS + ) + with pytest.raises(ValueError, match="Experiment is not set"): + test_eval_task.evaluate(experiment_run_name="invalid_experiment_run_name") + + with pytest.raises(ValueError, match="Experiment is not set"): + test_eval_task.display_runs() + + def test_evaluate_experiment_name_already_exists(self, mock_experiment_tracker): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET, + metrics=_TEST_METRICS, + experiment="test_eval_experiment_name", + ) + mock_experiment_tracker.experiment_run.return_value = "experiment_run_1" + with pytest.raises(ValueError, match="Experiment run already exists"): + test_eval_task.evaluate(experiment_run_name="experiment_run_2") + + def test_evaluate_invalid_dataset_content_column(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE, + metrics=_TEST_METRICS, + ) + with pytest.raises(KeyError, match="Required column `content` not found"): + test_eval_task.evaluate(model=mock.MagicMock()) + + def test_evaluate_invalid_prompt_template_placeholder(self): + test_eval_task = evaluation.EvalTask( + dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE, + metrics=_TEST_METRICS, + ) + with pytest.raises(ValueError, match="Failed to complete prompt template"): + test_eval_task.evaluate( + prompt_template="test_prompt_template {invalid_placeholder}", + ) + + +@pytest.mark.usefixtures("google_auth_mock") +class TestEvaluationUtils: + def setup_method(self): + vertexai.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + ) + + def teardown_method(self): + initializer.global_pool.shutdown(wait=True) + + def test_create_evaluation_service_async_client(self): + client = utils.create_evaluation_service_async_client() + assert isinstance(client, utils._EvaluationServiceAsyncClientWithOverride) + + def test_load_dataset_from_dataframe(self): + data = {"col1": [1, 2], "col2": ["a", "b"]} + df = pd.DataFrame(data) + loaded_df = utils.load_dataset(df) + assert loaded_df.equals(df) + + def test_load_dataset_from_dict(self): + data = {"col1": [1, 2], "col2": ["a", "b"]} + loaded_df = utils.load_dataset(data) + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == data + + def test_load_dataset_from_gcs_jsonl(self): + source = "gs://test_bucket/test_file.jsonl" + with mock.patch.object( + utils, + "_read_gcs_file_contents", + 
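+            # The GCS read is stubbed out so only the JSONL parsing inside load_dataset is exercised.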
return_value=_TEST_JSONL_FILE_CONTENT, + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == { + "prompt": ["prompt", "test"], + "reference": ["reference", "test"], + } + + def test_load_dataset_from_gcs_csv(self): + source = "gs://test_bucket/test_file.csv" + with mock.patch.object( + utils, "_read_gcs_file_contents", return_value=_TEST_CSV_FILE_CONTENT + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.to_dict("list") == { + "reference": ["test", "text"], + "context": ["test", "text"], + "instruction": ["test", "text"], + } + + def test_load_dataset_from_bigquery(self): + source = "bq://project-id.dataset.table_name" + with mock.patch.object( + utils, "_load_bigquery", return_value=_TEST_EVAL_DATASET + ): + loaded_df = utils.load_dataset(source) + + assert isinstance(loaded_df, pd.DataFrame) + assert loaded_df.equals(_TEST_EVAL_DATASET) + + +class TestPromptTemplate: + def test_init(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + assert prompt_template.template == template_str + + def test_get_placeholders(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + assert prompt_template.placeholders == {"name", "day"} + + def test_format(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + completed_prompt = prompt_template.assemble(name="John", day="Monday") + assert str(completed_prompt) == "Hello, John! Today is Monday." + + def test_format_missing_placeholder(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + completed_prompt = prompt_template.assemble() + assert str(completed_prompt) == "Hello, {name}!" + assert prompt_template.placeholders == {"name"} + + def test_partial_format(self): + template_str = "Hello, {name}! Today is {day}." + prompt_template = evaluation.PromptTemplate(template_str) + partially_completed_prompt = prompt_template.assemble(name="John") + + assert isinstance(partially_completed_prompt, evaluation.PromptTemplate) + assert str(partially_completed_prompt) == "Hello, John! Today is {day}." + assert partially_completed_prompt.placeholders == {"day"} + + completed_prompt = partially_completed_prompt.assemble(day="Monday") + assert str(completed_prompt) == "Hello, John! Today is Monday." + + def test_str(self): + template_str = "Hello, world!" + prompt_template = evaluation.PromptTemplate(template_str) + assert str(prompt_template) == template_str + + def test_repr(self): + template_str = "Hello, {name}!" + prompt_template = evaluation.PromptTemplate(template_str) + assert repr(prompt_template) == f"PromptTemplate('{template_str}')" diff --git a/vertexai/preview/evaluation/__init__.py b/vertexai/preview/evaluation/__init__.py new file mode 100644 index 0000000000..67895b4377 --- /dev/null +++ b/vertexai/preview/evaluation/__init__.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Rapid GenAI Evaluation Module.""" + +from vertexai.preview.evaluation import _base +from vertexai.preview.evaluation import _eval_tasks +from vertexai.preview.evaluation import metrics +from vertexai.preview.evaluation import prompt_template + + +EvalResult = _base.EvalResult +EvalTask = _eval_tasks.EvalTask +CustomMetric = metrics.CustomMetric +make_metric = metrics.make_metric +PromptTemplate = prompt_template.PromptTemplate + +__all__ = [ + "CustomMetric", + "EvalResult", + "EvalTask", + "make_metric", + "PromptTemplate", +] diff --git a/vertexai/preview/evaluation/_base.py b/vertexai/preview/evaluation/_base.py new file mode 100644 index 0000000000..588e1e6eac --- /dev/null +++ b/vertexai/preview/evaluation/_base.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Base classes for evaluation.""" + + +import dataclasses +from typing import Dict, List, Optional, Union, TYPE_CHECKING + +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) + +if TYPE_CHECKING: + import pandas as pd + + +@dataclasses.dataclass +class EvaluationRunConfig: + """Evaluation Run Configurations. + + Attributes: + dataset: The dataset to evaluate. + metrics: The list of metric names to evaluate, or a metrics bundle for an + evaluation task, or custom metric instances. + column_map: The dictionary of column name overrides in the dataset. + client: The asynchronous evaluation client. + """ + + dataset: "pd.DataFrame" + metrics: List[Union[str, metrics_base.CustomMetric]] + column_map: Dict[str, str] + client: gapic_evaluation_services.EvaluationServiceAsyncClient + + def validate_dataset_column(self, column_name: str) -> None: + """Validates that the column names in the column map are in the dataset. + + Args: + column_name: The column name to validate. + + Raises: + KeyError: If any of the column names are not in the dataset. + """ + if self.column_map.get(column_name, column_name) not in self.dataset.columns: + raise KeyError( + f"Required column `{self.column_map.get(column_name, column_name)}`" + " not found in the eval dataset. The columns in the provided dataset" + f" are {self.dataset.columns}." + ) + + +@dataclasses.dataclass +class EvalResult: + """Evaluation result. + + Attributes: + summary_metrics: The summary evaluation metrics for an evaluation run. + metrics_table: A table containing eval inputs, ground truth, and metrics per + row. 
+ """ + + summary_metrics: Dict[str, float] + metrics_table: Optional["pd.DataFrame"] = None diff --git a/vertexai/preview/evaluation/_eval_tasks.py b/vertexai/preview/evaluation/_eval_tasks.py new file mode 100644 index 0000000000..cf86d12710 --- /dev/null +++ b/vertexai/preview/evaluation/_eval_tasks.py @@ -0,0 +1,411 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union +import uuid + +from google.api_core import exceptions +import vertexai +from google.cloud.aiplatform import base +from google.cloud.aiplatform.metadata import metadata +from vertexai import generative_models +from vertexai.preview.evaluation import _base as eval_base +from vertexai.preview.evaluation import _evaluation +from vertexai.preview.evaluation import utils +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) + +if TYPE_CHECKING: + import pandas as pd + +# pylint: disable=g-import-not-at-top +try: + from IPython import display as IPython_display +except ImportError: + IPython_display = None + +_LOGGER = base.Logger(__name__) + +EvalResult = eval_base.EvalResult +GenerativeModel = generative_models.GenerativeModel + + +class EvalTask: + """A class representing an EvalTask. + + An Evaluation Tasks is defined to measure the model's ability to perform a + certain task in response to specific prompts or inputs. Evaluation tasks must + contain an evaluation dataset, and a list of metrics to evaluate. Evaluation + tasks help developers compare propmpt templates, track experiments, compare + models and their settings, and assess the quality of the model's generated + text. + + Dataset details: + Default dataset column names: + * content_column_name: "content" + * reference_column_name: "reference" + * response_column_name: "response" + Requirement for different use cases: + * Bring your own prediction: A `response` column is required. Response + column name can be customized by providing `response_column_name` + parameter. + * Without prompt template: A column representing the input prompt to the + model is required. If `content_column_name` is not specified, the + eval dataset requires `content` column by default. The response + column is not used if present and new responses from the model are + generated with the content column and used for evaluation. + * With prompt template: Dataset must contain column names corresponding to + the placeholder names in the prompt template. For example, if prompt + template is "Instruction: {instruction}, context: {context}", the + dataset must contain `instruction` and `context` column. + + Metrics Details: + The supported metrics, metric bundle descriptions, grading rubrics, and + the required input fields can be found on the Vertex AI public + documentation. + + Usage: + 1. To perform bring your own prediction evaluation, provide the model + responses in the response column in the dataset. 
The response column name + is "response" by default, or specify `response_column_name` parameter to + customize. + + ``` + eval_dataset = pd.DataFrame({ + "reference": [...], + "response" : [...], + }) + eval_task = EvalTask( + dataset=eval_dataset, + metrics=["bleu", "rouge_l_sum", "coherence", "fluency"], + experiment="my-experiment", + ) + eval_result = eval_task.evaluate( + experiment_run_name="eval-experiment-run" + ) + ``` + + 2. To perform evaluation with built-in Gemini model inference, specify the + `model` parameter with a GenerativeModel instance. The default query + column name to the model is `content`. + + ``` + eval_dataset = pd.DataFrame({ + "reference": [...], + "content" : [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["exact_match", "bleu", "rouge_1", "rouge_2", + "rouge_l_sum"], + experiment="my-experiment", + ).evaluate( + model=GenerativeModel("gemini-pro"), + experiment_run_name="gemini-pro-eval-run" + ) + ``` + + 3. If a `prompt_template` is specified, the `content` column is not required. + Prompts can be assembled from the evaluation dataset, and all placeholder + names must be present in the dataset columns. + ``` + eval_dataset = pd.DataFrame({ + "context" : [...], + "instruction": [...], + "reference" : [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["summarization_quality"], + ).evaluate( + model=model, + prompt_template="{instruction}. Article: {context}. Summary:", + ) + ``` + + 4. To perform evaluation with custom model inference, specify the `model` + parameter with a custom prediction function. The `content` column in the + dataset is used to generate predictions with the custom model function for + evaluation. + + ``` + def custom_model_fn(input: str) -> str: + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": input} + ] + ) + return response.choices[0].message.content + + eval_dataset = pd.DataFrame({ + "content" : [...], + "reference": [...], + }) + result = EvalTask( + dataset=eval_dataset, + metrics=["text_generation_similarity","text_generation_quality"], + experiment="my-experiment", + ).evaluate( + model=custom_model_fn, + experiment_run_name="gpt-eval-run" + ) + ``` + """ + + _resource_noun = "evalTasks" + + def __init__( + self, + *, + dataset: Union["pd.DataFrame", str, Dict[str, Any]], + metrics: List[ + Union[ + Literal[ + "exact_match", + "bleu", + "rouge_1", + "rouge_2", + "rouge_l", + "rouge_l_sum", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_helpfulness", + "question_answering_correctness", + "text_generation_similarity", + "text_generation_quality", + "text_generation_instruction_following", + "text_generation_safety", + "text_generation_factuality", + "summarization_pointwise_reference_free", + "qa_pointwise_reference_free", + "qa_pointwise_reference_based", + "tool_call_quality", + ], + metrics_base.CustomMetric, + ] + ], + experiment: Optional[str] = None, + content_column_name: str = "content", + reference_column_name: str = "reference", + response_column_name: str = "response", + ): + """Initializes an EvalTask. + + Args: + dataset: The dataset to be evaluated. + Supports the following dataset formats: + * pandas.DataFrame: Used directly for evaluation. + * Dict: Converted to a pandas DataFrame before evaluation. 
+ * str: Interpreted as a file path or URI. Supported formats include: + * Local JSONL or CSV files: Loaded from the local filesystem. + * GCS JSONL or CSV files: Loaded from Google Cloud Storage + (e.g., 'gs://bucket/data.csv'). + * BigQuery table URI: Loaded from Google Cloud BigQuery + (e.g., 'bq://project-id.dataset.table_name'). + metrics: The list of metrics names to be evaluated, or a metrics + bundle for an evaluation task, or custom metric instances. + experiment: The name of the experiment to log the evaluations to. + content_column_name: The column name of content in the dataset to send to + the model. If not set, default to `content`. + reference_column_name: The column name of ground truth in the dataset. If + not set, default to `reference`. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + """ + self.dataset = utils.load_dataset(dataset) + self.metrics = metrics + self.experiment = experiment + self.content_column_name = content_column_name + self.reference_column_name = reference_column_name + self.response_column_name = response_column_name + + def _evaluate_with_experiment( + self, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + experiment_run_name: Optional[str] = None, + response_column_name: str = "response", + ) -> EvalResult: + """Runs an evaluation for the EvalTask with an experiment. + + Args: + model: A GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: The prompt template to use for the evaluation. If not + set, the prompt template that was used to create the EvalTask will be + used. + experiment_run_name: The name of the experiment run to log the evaluation + to if an experiment is set for this EvalTask. If not provided, a random + unique experiment run name is used. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + + Returns: + The evaluation result. + """ + self._validate_experiment_run() + with vertexai.preview.start_run(experiment_run_name): + self._log_eval_experiment_param(model, prompt_template) + eval_result = _evaluation.evaluate( + dataset=self.dataset, + metrics=self.metrics, + model=model, + prompt_template=prompt_template, + content_column_name=self.content_column_name, + reference_column_name=self.reference_column_name, + response_column_name=response_column_name or self.response_column_name, + ) + try: + vertexai.preview.log_metrics(eval_result.summary_metrics) + except (ValueError, TypeError, exceptions.InvalidArgument) as e: + _LOGGER.warning(f"Experiment metrics logging failed: {str(e)}") + return eval_result + + def evaluate( + self, + *, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + experiment_run_name: Optional[str] = None, + response_column_name: str = "response", + ) -> EvalResult: + """Runs an evaluation for the EvalTask. + + Args: + model: A GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: The prompt template to use for the evaluation. If not + set, the prompt template that was used to create the EvalTask will be + used. 
+ experiment_run_name: The name of the experiment run to log the evaluation + to if an experiment is set for this EvalTask. If not provided, a random + unique experiment run name is used. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + + Returns: + The evaluation result. + """ + global_experiment_name = metadata._experiment_tracker.experiment_name + if experiment_run_name and not self.experiment and not global_experiment_name: + raise ValueError( + "Experiment is not set. Please initialize EvalTask with an" + " experiment, or initialize a global experiment with " + "`vertexai.init(experiment='experiment_name')`for logging this" + " evaluation run." + ) + + experiment_run_name = experiment_run_name or f"{uuid.uuid4()}" + + if self.experiment and global_experiment_name: + metadata._experiment_tracker.set_experiment( + experiment=self.experiment, backing_tensorboard=False + ) + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + metadata._experiment_tracker.set_experiment( + experiment=global_experiment_name, backing_tensorboard=False + ) + elif self.experiment and not global_experiment_name: + metadata._experiment_tracker.set_experiment( + experiment=self.experiment, backing_tensorboard=False + ) + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + metadata._experiment_tracker.reset() + elif not self.experiment and global_experiment_name: + eval_result = self._evaluate_with_experiment( + model, prompt_template, experiment_run_name, response_column_name + ) + else: + eval_result = _evaluation.evaluate( + dataset=self.dataset, + metrics=self.metrics, + model=model, + prompt_template=prompt_template, + content_column_name=self.content_column_name, + reference_column_name=self.reference_column_name, + response_column_name=response_column_name or self.response_column_name, + ) + return eval_result + + def _validate_experiment_run(self) -> None: + """Checks if an experiment run already exists.""" + if metadata._experiment_tracker.experiment_run: + raise ValueError( + "Experiment run already exists. Please specify the name of the" + " experiment run to assign current session with in this evaluate" + " method." + ) + + def _log_eval_experiment_param( + self, + model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None, + prompt_template: Optional[str] = None, + ) -> None: + """Logs variable input parameters of an evaluation to an experiment run.""" + model_metadata = {} + + if prompt_template is not None: + model_metadata.update({"prompt_template": prompt_template}) + + if isinstance(model, GenerativeModel): + model_metadata.update( + { + "model_name": model._model_name, + } + ) + + if model._generation_config and isinstance(model._generation_config, dict): + # TODO(b/311221071): support logging GenerationConfig type. + model_metadata.update(**model._generation_config) + + if model._safety_settings and isinstance(model._safety_settings, dict): + # TODO(b/311221071): support logging List[SafetySetting] type. 
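+                # Log safety settings as a {category name: threshold name} mapping of plain strings so they can be recorded as experiment parameters.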
+ safety_settings = model._safety_settings + safety_settings_as_str = { + category.name: threshold.name + for category, threshold in safety_settings.items() + } + model_metadata.update(safety_settings_as_str) + + if model_metadata: + _LOGGER.info(f"Logging Rapid Eval experiment metadata: {model_metadata}") + try: + vertexai.preview.log_params(model_metadata) + except (ValueError, TypeError) as e: + _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}") + + def display_runs(self): + """Displays experiment runs associated with this EvalTask.""" + if not self.experiment: + raise ValueError("Experiment is not set.") + elif IPython_display: + IPython_display.display(vertexai.preview.get_experiment_df(self.experiment)) diff --git a/vertexai/preview/evaluation/_evaluation.py b/vertexai/preview/evaluation/_evaluation.py new file mode 100644 index 0000000000..1c24664060 --- /dev/null +++ b/vertexai/preview/evaluation/_evaluation.py @@ -0,0 +1,567 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import asyncio +import collections +from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING, Tuple, Union, Callable + +from google.cloud.aiplatform import base +from google.cloud.aiplatform_v1beta1.types import ( + content as gapic_content_types, +) +from vertexai import generative_models +from vertexai.preview.evaluation import _base as evaluation_base +from vertexai.preview.evaluation import constants +from vertexai.preview.evaluation import ( + prompt_template as prompt_template_base, +) +from vertexai.preview.evaluation import utils +from vertexai.preview.evaluation.metrics import ( + _base as metrics_base, +) +from vertexai.preview.evaluation.metrics import ( + _instance_evaluation, +) + + +if TYPE_CHECKING: + import pandas as pd + +_LOGGER = base.Logger(__name__) +_METRICS_BUNDLE_TO_METRIC_NAMES = { + constants.MetricBundle.TEXT_GENERATION_SIMILARITY: ( + constants.Metric.EXACT_MATCH, + constants.Metric.BLEU, + constants.Metric.ROUGE_1, + constants.Metric.ROUGE_2, + constants.Metric.ROUGE_L, + constants.Metric.ROUGE_L_SUM, + ), + constants.MetricBundle.TEXT_GENERATION_QUALITY: ( + constants.Metric.COHERENCE, + constants.Metric.FLUENCY, + ), + constants.MetricBundle.TOOL_CALL_QUALITY: ( + constants.Metric.TOOL_CALL_VALID, + constants.Metric.TOOL_NAME_MATCH, + constants.Metric.TOOL_PARAMETER_KEY_MATCH, + constants.Metric.TOOL_PARAMETER_KV_MATCH, + ), + constants.MetricBundle.TEXT_GENERATION_INSTRUCTION_FOLLOWING: ( + constants.Metric.FULFILLMENT, + ), + constants.MetricBundle.TEXT_GENERATION_SAFETY: (constants.Metric.SAFETY,), + constants.MetricBundle.TEXT_GENERATION_FACTUALITY: (constants.Metric.GROUNDEDNESS,), + constants.MetricBundle.SUMMARIZATION_POINTWISE_REFERENCE_FREE: ( + constants.Metric.SUMMARIZATION_QUALITY, + constants.Metric.SUMMARIZATION_HELPFULNESS, + constants.Metric.SUMMARIZATION_VERBOSITY, + ), + constants.MetricBundle.QA_POINTWISE_REFERENCE_FREE: ( + constants.Metric.QUESTION_ANSWERING_QUALITY, + 
constants.Metric.QUESTION_ANSWERING_RELEVANCE, + constants.Metric.QUESTION_ANSWERING_HELPFULNESS, + ), + constants.MetricBundle.QA_POINTWISE_REFERENCE_BASED: ( + constants.Metric.QUESTION_ANSWERING_CORRECTNESS, + ), +} +_SUCCESSFUL_FINISH_REASONS = [ + gapic_content_types.Candidate.FinishReason.STOP, + # Many responses have this finish reason + gapic_content_types.Candidate.FinishReason.FINISH_REASON_UNSPECIFIED, +] + + +def _replace_metric_bundle_with_metrics( + metrics_list: List[Union[str, metrics_base.CustomMetric]], +) -> List[str]: + """Replaces metric bundles with corresponding metrics. + + Args: + metrics_list: The original list containing metrics bundle names. + + Returns: + The modified metrics list containing only metric names. + """ + modified_list = [] + + for item in metrics_list: + if item in _METRICS_BUNDLE_TO_METRIC_NAMES.keys(): + modified_list.extend(_METRICS_BUNDLE_TO_METRIC_NAMES[item]) + else: + modified_list.append(item) + + return modified_list + + +def _compute_custom_metrics( + row_dict: Dict[str, Any], + custom_metrics: List[metrics_base.CustomMetric], +) -> Dict[str, Any]: + """Computes custom metrics for a row. + + Args: + row_dict: A dictionary of an instance in the eval dataset. + custom_metrics: A list of CustomMetrics. + + Returns: + A dictionary of an instance containing custom metric results. + + Raises: + KeyError: If the custom metric function does not return a valid output. + """ + for custom_metric in custom_metrics: + metric_output = custom_metric.metric_function(row_dict) + if custom_metric.name in metric_output: + row_dict[custom_metric.name] = metric_output[custom_metric.name] + else: + raise KeyError( + f"Custom metric score `{custom_metric.name}` not found in the metric" + f" output {metric_output}. Please make sure the custom metric" + " function is valid, and the output dictionary uses" + f" `{custom_metric.name}` as the key for metric value." + ) + # Include additional metric results like explanation. + for key, value in metric_output.items(): + if key != custom_metric.name: + row_dict[f"{custom_metric.name}/{key}"] = value + return row_dict + + +def _separate_custom_metrics( + metrics: List[str], +) -> Tuple[List[str], List[metrics_base.CustomMetric],]: + """Separates the metrics list into API and custom metrics.""" + custom_metrics = [] + api_metrics = [] + for metric in metrics: + if isinstance(metric, metrics_base.CustomMetric): + custom_metrics.append(metric) + else: + api_metrics.append(metric) + return api_metrics, custom_metrics + + +def _compute_summary_metrics( + evaluation_run_config: evaluation_base.EvaluationRunConfig, + metrics_table: "pd.DataFrame", +) -> Dict[str, Any]: + """Computes summary metrics. + + Args: + evaluation_run_config: Evaluation Run Configurations. + metrics_table: A dataframe containing per-instance metrics results. + + Returns: + A dictionary containing summary metrics results and statistics. + """ + summary_metrics = {} + summary_metrics[constants.MetricResult.ROW_COUNT_KEY] = metrics_table.shape[0] + for metric in evaluation_run_config.metrics: + try: + # TODO(b/325078638): implement additional aggregate methods. + summary_metrics[f"{str(metric)}/mean"] = metrics_table.loc[ + :, str(metric) + ].mean() + summary_metrics[f"{str(metric)}/std"] = metrics_table.loc[ + :, str(metric) + ].std() + except (ValueError, KeyError): + _LOGGER.warning( + f"Failed to compute metric statistics for {metric}. This metric" + " output contains error from the Autorater." 
+ ) + continue + return summary_metrics + + +def _generate_response_from_gemini( + model: generative_models.GenerativeModel, prompt: str +) -> str: + """Generates a response from the Gemini model. + + Args: + model: The Gemini model instance. + prompt: The prompt to send to the model. + + Returns: + The response from the model. + + Raises: + RuntimeError: If the prompt or the response for the prompt is blocked for + safety reasons. + """ + response = model.generate_content(prompt) + try: + if not response.candidates: + raise RuntimeError( + f"The model response was blocked due to {response._raw_response.prompt_feedback.block_reason.name}.\n" + f"Block reason message: {response._raw_response.prompt_feedback.block_reason_message}.\n" + "The input prompt may be blocked for safety reasons.", + f"Prompt: {prompt}.", + ) + else: + candidate = response.candidates[0] + if candidate.finish_reason not in _SUCCESSFUL_FINISH_REASONS: + raise RuntimeError( + "The model response did not complete successfully.\n" + f"Finish reason: {candidate.finish_reason}.\n" + f"Finish message: {candidate.finish_message}.\n" + f"Safety ratings: {candidate.safety_ratings}.\n" + "Please adjust the model safety_settings, or try a different prompt." + ) + return response.candidates[0].content.parts[0].text + except Exception: + raise RuntimeError( + "Failed to generate response candidates from the Gemini model.\n" + f"Response: {response}.\n" + f"Prompt: {prompt}." + ) + + +def _generate_response_from_gemini_model( + model: generative_models.GenerativeModel, + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> None: + """Generates responses from the Gemini model. + + Args: + model: The Gemini model instance. + evaluation_run_config: Evaluation Run Configurations. + """ + if ( + constants.Dataset.COMPLETED_PROMPT_COLUMN + in evaluation_run_config.dataset.columns + ): + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ].apply( + lambda x: _generate_response_from_gemini(model, x) + ) + else: + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + evaluation_run_config.column_map[constants.Dataset.CONTENT_COLUMN] + ].apply( + lambda x: _generate_response_from_gemini(model, x) + ) + + +def _generate_response_from_custom_model_fn( + model_fn: Callable[[str], str], + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> None: + """Generates responses from a custom model function. + + Args: + model_fn: The custom model function. + evaluation_run_config: Evaluation Run Configurations. + """ + try: + if ( + constants.Dataset.COMPLETED_PROMPT_COLUMN + in evaluation_run_config.dataset.columns + ): + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ].apply( + model_fn + ) + else: + evaluation_run_config.dataset[ + constants.Dataset.MODEL_RESPONSE_COLUMN + ] = evaluation_run_config.dataset[ + evaluation_run_config.column_map[constants.Dataset.CONTENT_COLUMN] + ].apply( + model_fn + ) + except (ValueError, IndexError) as e: + _LOGGER.warning(f"Failed to generate response from model function: {e}") + + +def _check_placeholder_columns_exist( + dataset: "pd.DataFrame", placeholder_names_set: Set[str] +) -> None: + """Checks if all placeholder names exist in the dataset columns. + + Args: + dataset: The dataset to evaluate.
+ placeholder_names_set: A set of placeholder names. + + Raises: + ValueError: If any placeholder names do not exist in the dataset columns + or the prompt template is invalid. + """ + actual_column_names_set = set(dataset.columns) + if not placeholder_names_set.issubset(actual_column_names_set): + missing_columns = placeholder_names_set - actual_column_names_set + raise ValueError( + "Failed to complete prompt template: The following column(s) are" + f" missing: {', '.join(missing_columns)}" + ) + + +def _complete_prompt_for_dataset( + evaluation_run_config: evaluation_base.EvaluationRunConfig, prompt_template: str +) -> None: + """Adds a column in dataset for completed prompts from placeholder columns. + + Args: + evaluation_run_config: Evaluation Run Configurations. + prompt_template: A prompt template string with placeholders that can be + formatted with dataset columns. + + Returns: + The completed prompt template string to send to the model. + + Raises: + ValueError: If any placeholder names do not exist in the dataset columns + or the prompt template is invalid. + """ + prompt_template = prompt_template_base.PromptTemplate(prompt_template) + _check_placeholder_columns_exist( + evaluation_run_config.dataset, prompt_template.placeholders + ) + + try: + evaluation_run_config.dataset[ + constants.Dataset.COMPLETED_PROMPT_COLUMN + ] = evaluation_run_config.dataset.apply( + lambda row: str( + prompt_template.assemble( + **row[list(prompt_template.placeholders)].astype(str).to_dict(), + ) + ), + axis=1, + ) + except Exception as e: + raise ValueError(f"Failed to complete prompt: {e}") from e + + +def _parse_metric_results_to_dataframe( + instance_df: "pd.DataFrame", results: Dict[str, Any] +) -> Dict[str, Any]: + """Parses metric results to a pandas dataframe. + + Args: + instance_df: A dataframe containing per-instance metrics results. + results: A dictionary containing metric results. + + Returns: + A dataframe containing per-instance metrics results. Each metric result + can contain metric score, explanation, and confidence. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + metrics_table = pd.DataFrame(dict(zip(instance_df.columns, instance_df.values.T))) + + for metric_name, metric_results in results.items(): + scores = [ + result.get(constants.MetricResult.SCORE_KEY) for result in metric_results + ] + if ( + metric_name + in constants.Metric.MODEL_BASED_METRIC_LIST + + constants.Metric.PAIRWISE_METRIC_LIST + ): + explanations = [ + result.get(constants.MetricResult.EXPLANATION_KEY) + for result in metric_results + ] + confidences = [ + result.get(constants.MetricResult.CONFIDENCE_KEY) + for result in metric_results + ] + metrics_table[ + f"{metric_name}/{constants.MetricResult.EXPLANATION_KEY}" + ] = explanations + metrics_table[ + f"{metric_name}/{constants.MetricResult.CONFIDENCE_KEY}" + ] = confidences + + metrics_table[metric_name] = scores + + return metrics_table + + +async def _compute_metrics( + evaluation_run_config: evaluation_base.EvaluationRunConfig, +) -> Tuple[Dict[str, Any], "pd.DataFrame"]: + """Computes the metrics for the dataset. + + Args: + evaluation_run_config: Evaluation Run Configurations. + + Returns: + The evaluation results for the input metrics. + + Raises: + RuntimeError: The number of responses does not match the number of metrics. 
+ """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + + api_metrics, custom_metrics = _separate_custom_metrics( + evaluation_run_config.metrics + ) + instance_list = [] + tasks_by_metric = collections.defaultdict(list) + for _, row in evaluation_run_config.dataset.iterrows(): + row_dict = _compute_custom_metrics(row.to_dict(), custom_metrics) + + instance_list.append(row_dict) + + for metric_name in api_metrics: + task = asyncio.create_task( + _instance_evaluation.evaluate_instances_async( + client=evaluation_run_config.client, + request=_instance_evaluation.build_request( + metric_name=metric_name, + row_dict=row_dict, + evaluation_run_config=evaluation_run_config, + ), + ) + ) + tasks_by_metric[metric_name].append(task) + + results_dict = { + metric_name: await asyncio.gather(*tasks) + for metric_name, tasks in tasks_by_metric.items() + } + + instance_df = pd.DataFrame.from_dict(instance_list) + metrics_table = _parse_metric_results_to_dataframe(instance_df, results_dict) + + summary_metrics = _compute_summary_metrics(evaluation_run_config, metrics_table) + return summary_metrics, metrics_table + + +def evaluate( + dataset: "pd.DataFrame", + metrics: List[Union[str, metrics_base.CustomMetric]], + *, + model: Optional[ + Union[generative_models.GenerativeModel, Callable[[str], str]] + ] = None, + prompt_template: Optional[str] = None, + content_column_name: str = "content", + reference_column_name: str = "reference", + response_column_name: str = "response", + context_column_name: str = "context", + instruction_column_name: str = "instruction", +) -> evaluation_base.EvalResult: + """Runs the evaluation for metrics. + + Args: + dataset: The dataset to evaluate. + metrics: The list of metrics names to evaluate, or a metrics bundle for an + evaluation task, or custom metric instances. + model: The GenerativeModel instance or a custom model function to generate + responses to evaluate. If not provided, the evaluation is computed with + the `response` column in the `dataset`. + prompt_template: A prompt template string compatible with `PromptTemplate` + class with placeholders that can be formatted with dataset columns to + create completed prompts. The placeholders can be represented in curly + braces `{placeholder}`, and must be included in the dataset columns if + specified. The placeholder names cannot contain spaces. + content_column_name: The column name of content in the dataset to send to + the model. If not set, default to `content`. + reference_column_name: The column name of ground truth in the dataset. If + not set, default to `reference`. + response_column_name: The column name of model response in the dataset. If + not set, default to `response`. + context_column_name: The column name of summary context in the dataset. If + not set, default to `context`. + instruction_column_name: The column name of the instruction prompt in the + dataset. If not set, default to `instruction`. + + Returns: + EvalResult with summary metrics and a metrics table for per-instance + metrics. 
+ """ + + if not metrics: + raise ValueError("Metrics cannot be empty.") + + evaluation_run_config = evaluation_base.EvaluationRunConfig( + dataset=dataset, + metrics=_replace_metric_bundle_with_metrics(metrics), + column_map={ + constants.Dataset.CONTENT_COLUMN: content_column_name, + constants.Dataset.REFERENCE_COLUMN: reference_column_name, + constants.Dataset.MODEL_RESPONSE_COLUMN: response_column_name, + constants.Dataset.CONTEXT_COLUMN: context_column_name, + constants.Dataset.INSTRUCTION_COLUMN: instruction_column_name, + }, + client=utils.create_evaluation_service_async_client(), + ) + + if prompt_template: + _complete_prompt_for_dataset(evaluation_run_config, prompt_template) + + if model: + if prompt_template: + evaluation_run_config.validate_dataset_column( + constants.Dataset.COMPLETED_PROMPT_COLUMN + ) + else: + evaluation_run_config.validate_dataset_column( + constants.Dataset.CONTENT_COLUMN + ) + + if isinstance(model, generative_models.GenerativeModel): + _generate_response_from_gemini_model(model, evaluation_run_config) + elif callable(model): + _generate_response_from_custom_model_fn(model, evaluation_run_config) + else: + evaluation_run_config.validate_dataset_column( + constants.Dataset.MODEL_RESPONSE_COLUMN + ) + if set(evaluation_run_config.metrics).intersection( + set(constants.Metric.AUTOMATIC_METRIC_LIST) + ): + evaluation_run_config.validate_dataset_column( + constants.Dataset.REFERENCE_COLUMN + ) + + if asyncio.get_event_loop().is_running(): + asyncio.set_event_loop(asyncio.new_event_loop()) + loop = asyncio.get_event_loop() + + summary_metrics, metrics_table = loop.run_until_complete( + _compute_metrics(evaluation_run_config) + ) + + return evaluation_base.EvalResult( + summary_metrics=summary_metrics, metrics_table=metrics_table + ) diff --git a/vertexai/preview/evaluation/constants.py b/vertexai/preview/evaluation/constants.py new file mode 100644 index 0000000000..d25bdd7655 --- /dev/null +++ b/vertexai/preview/evaluation/constants.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Constants for evaluation.""" +import dataclasses + + +@dataclasses.dataclass(frozen=True) +class Metric: + """Namespace for Metrics.""" + + # Automatic Metrics. + EXACT_MATCH = "exact_match" + BLEU = "bleu" + ROUGE_1 = "rouge_1" + ROUGE_2 = "rouge_2" + ROUGE_L = "rouge_l" + ROUGE_L_SUM = "rouge_l_sum" + TOOL_CALL_VALID = "tool_call_valid" + TOOL_NAME_MATCH = "tool_name_match" + TOOL_PARAMETER_KEY_MATCH = "tool_parameter_key_match" + TOOL_PARAMETER_KV_MATCH = "tool_parameter_kv_match" + # Model-based Pointwise Metrics. 
+ COHERENCE = "coherence" + FLUENCY = "fluency" + SAFETY = "safety" + GROUNDEDNESS = "groundedness" + FULFILLMENT = "fulfillment" + RESPONSE_RECALL = "response_recall" + SUMMARIZATION_QUALITY = "summarization_quality" + SUMMARIZATION_HELPFULNESS = "summarization_helpfulness" + SUMMARIZATION_VERBOSITY = "summarization_verbosity" + QUESTION_ANSWERING_QUALITY = "question_answering_quality" + QUESTION_ANSWERING_RELEVANCE = "question_answering_relevance" + QUESTION_ANSWERING_HELPFULNESS = "question_answering_helpfulness" + QUESTION_ANSWERING_CORRECTNESS = "question_answering_correctness" + RAG_CONTEXT_RECALL = "rag_context_recall" + # Side-by-side(SxS) Pairwise Metrics. + PAIRWISE_SUMMARIZATION_QUALITY = "pairwise_summarization_quality" + PAIRWISE_QUESTION_ANSWERING_QUALITY = "pairwise_question_answering_quality" + + AUTOMATIC_METRIC_LIST = ( + EXACT_MATCH, + BLEU, + ROUGE_1, + ROUGE_2, + ROUGE_L, + ROUGE_L_SUM, + TOOL_CALL_VALID, + TOOL_NAME_MATCH, + TOOL_PARAMETER_KEY_MATCH, + TOOL_PARAMETER_KV_MATCH, + ) + MODEL_BASED_METRIC_LIST = ( + COHERENCE, + FLUENCY, + SAFETY, + GROUNDEDNESS, + FULFILLMENT, + RESPONSE_RECALL, + SUMMARIZATION_QUALITY, + SUMMARIZATION_HELPFULNESS, + SUMMARIZATION_VERBOSITY, + QUESTION_ANSWERING_QUALITY, + QUESTION_ANSWERING_RELEVANCE, + QUESTION_ANSWERING_HELPFULNESS, + QUESTION_ANSWERING_CORRECTNESS, + RAG_CONTEXT_RECALL, + ) + PAIRWISE_METRIC_LIST = ( + PAIRWISE_SUMMARIZATION_QUALITY, + PAIRWISE_QUESTION_ANSWERING_QUALITY, + ) + + +@dataclasses.dataclass(frozen=True) +class MetricResult: + ROW_COUNT_KEY = "row_count" + SCORE_KEY = "score" + EXPLANATION_KEY = "explanation" + CONFIDENCE_KEY = "confidence" + PAIRWISE_CHOICE_KEY = "pairwise_choice" + + # Automatic Metrics. + EXACT_MATCH_RESULTS = "exact_match_results" + BLEU_RESULTS = "bleu_results" + ROUGE_RESULTS = "rouge_results" + TOOL_CALL_VALID_RESULTS = "tool_call_valid_results" + TOOL_NAME_MATCH_RESULTS = "tool_name_match_results" + TOOL_PARAMETER_KEY_MATCH_RESULTS = "tool_parameter_key_match_results" + TOOL_PARAMETER_KV_MATCH_RESULTS = "tool_parameter_kv_match_results" + # Model-based Pointwise Metrics. + COHERENCE_RESULT = "coherence_result" + FLUENCY_RESULT = "fluency_result" + SAFETY_RESULT = "safety_result" + GROUNDEDNESS_RESULT = "groundedness_result" + FULFILLMENT_RESULT = "fulfillment_result" + RESPONSE_RECALL_RESULT = "response_recall_result" + SUMMARIZATION_QUALITY_RESULT = "summarization_quality_result" + SUMMARIZATION_HELPFULNESS_RESULT = "summarization_helpfulness_result" + SUMMARIZATION_VERBOSITY_RESULT = "summarization_verbosity_result" + QUESTION_ANSWERING_QUALITY_RESULT = "question_answering_quality_result" + QUESTION_ANSWERING_RELEVANCE_RESULT = "question_answering_relevance_result" + QUESTION_ANSWERING_HELPFULNESS_RESULT = "question_answering_helpfulness_result" + QUESTION_ANSWERING_CORRECTNESS_RESULT = "question_answering_correctness_result" + RAG_CONTEXT_RECALL_RESULT = "rag_context_recall_result" + # Side-by-side(SxS) Pairwise Metrics. 
+ PAIRWISE_SUMMARIZATION_QUALITY_RESULT = "pairwise_summarization_quality_result" + PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT = ( + "pairwise_question_answering_quality_result" + ) + + AUTOMATIC_METRIC_RESULTS_LIST = ( + EXACT_MATCH_RESULTS, + BLEU_RESULTS, + ROUGE_RESULTS, + TOOL_CALL_VALID_RESULTS, + TOOL_NAME_MATCH_RESULTS, + TOOL_PARAMETER_KEY_MATCH_RESULTS, + TOOL_PARAMETER_KV_MATCH_RESULTS, + ) + MODEL_BASED_METRIC_RESULT_LIST = ( + COHERENCE_RESULT, + FLUENCY_RESULT, + SAFETY_RESULT, + GROUNDEDNESS_RESULT, + FULFILLMENT_RESULT, + RESPONSE_RECALL_RESULT, + SUMMARIZATION_QUALITY_RESULT, + SUMMARIZATION_HELPFULNESS_RESULT, + SUMMARIZATION_VERBOSITY_RESULT, + QUESTION_ANSWERING_QUALITY_RESULT, + QUESTION_ANSWERING_RELEVANCE_RESULT, + QUESTION_ANSWERING_HELPFULNESS_RESULT, + QUESTION_ANSWERING_CORRECTNESS_RESULT, + RAG_CONTEXT_RECALL_RESULT, + ) + PAIRWISE_METRIC_RESULT_LIST = ( + PAIRWISE_SUMMARIZATION_QUALITY_RESULT, + PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT, + ) + + +@dataclasses.dataclass(frozen=True) +class MetricBundle: + """Namespace for MetricBundle.""" + + TEXT_GENERATION_SIMILARITY = "text_generation_similarity" + TEXT_GENERATION_QUALITY = "text_generation_quality" + TOOL_CALL_QUALITY = "tool_call_quality" + TEXT_GENERATION_INSTRUCTION_FOLLOWING = "text_generation_instruction_following" + TEXT_GENERATION_SAFETY = "text_generation_safety" + TEXT_GENERATION_FACTUALITY = "text_generation_factuality" + SUMMARIZATION_POINTWISE_REFERENCE_FREE = "summarization_pointwise_reference_free" + QA_POINTWISE_REFERENCE_FREE = "qa_pointwise_reference_free" + QA_POINTWISE_REFERENCE_BASED = "qa_pointwise_reference_based" + + +@dataclasses.dataclass(frozen=True) +class Dataset: + COMPLETED_PROMPT_COLUMN = "completed_prompt" + MODEL_RESPONSE_COLUMN = "response" + BASELINE_MODEL_RESPONSE_COLUMN = "baseline_model_response" + CONTEXT_COLUMN = "context" + REFERENCE_COLUMN = "reference" + CONTENT_COLUMN = "content" + INSTRUCTION_COLUMN = "instruction" diff --git a/vertexai/preview/evaluation/metrics/__init__.py b/vertexai/preview/evaluation/metrics/__init__.py new file mode 100644 index 0000000000..94d768a030 --- /dev/null +++ b/vertexai/preview/evaluation/metrics/__init__.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Evaluation Metrics Module.""" + +from vertexai.preview.evaluation.metrics import ( + _base, +) + +CustomMetric = _base.CustomMetric +make_metric = _base.make_metric + +__all__ = [ + "CustomMetric", + "make_metric", +] diff --git a/vertexai/preview/evaluation/metrics/_base.py b/vertexai/preview/evaluation/metrics/_base.py new file mode 100644 index 0000000000..35ab69aec5 --- /dev/null +++ b/vertexai/preview/evaluation/metrics/_base.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any, Callable, Dict
+
+
+class CustomMetric:
+    """The custom evaluation metric.
+
+    Attributes:
+      name: The name of the metric.
+      metric_function: The evaluation function. Must use the dataset row/instance
+        as the metric_function input. Returns per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as key.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        metric_function: Callable[
+            [Dict[str, Any]],
+            Dict[str, Any],
+        ],
+    ):
+        """Initializes the evaluation metric."""
+        self.name = name
+        self.metric_function = metric_function
+
+    def __str__(self):
+        return self.name
+
+
+def make_metric(
+    name: str, metric_function: Callable[[Dict[str, Any]], Dict[str, Any]]
+) -> CustomMetric:
+    """Makes a custom metric.
+
+    Args:
+      name: The name of the metric.
+      metric_function: The evaluation function. Must use the dataset row/instance
+        as the metric_function input. Returns per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as key.
+
+    Returns:
+      A CustomMetric instance that can be passed to the evaluate() function.
+    """
+    return CustomMetric(name, metric_function)
diff --git a/vertexai/preview/evaluation/metrics/_instance_evaluation.py b/vertexai/preview/evaluation/metrics/_instance_evaluation.py
new file mode 100644
index 0000000000..dc7a8ddf2b
--- /dev/null
+++ b/vertexai/preview/evaluation/metrics/_instance_evaluation.py
@@ -0,0 +1,603 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Library for Metrics Computation with Evaluation Service Async Client."""
+
+from typing import Any, Dict
+
+from google import api_core
+from google.cloud.aiplatform import base
+from google.cloud.aiplatform import initializer
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services,
+)
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_evaluation_service_types,
+)
+from vertexai.preview.evaluation import (
+    _base as eval_base,
+)
+from vertexai.preview.evaluation import constants
+
+from google.protobuf import json_format
+
+_LOGGER = base.Logger(__name__)
+_METRIC_NAME_TO_METRIC_SPEC = {
+    # Automatic Metrics.
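`make_metric` above wraps a per-instance callable into a `CustomMetric`: the callable receives one dataset row as a dictionary and must return a dictionary keyed by the metric name. A small, self-contained sketch (how the resulting metric is passed into an evaluation run is not shown in this hunk, so only the construction is illustrated):

```python
from typing import Any, Dict

from vertexai.preview.evaluation.metrics import make_metric


def _response_length(instance: Dict[str, Any]) -> Dict[str, Any]:
    # Score each row by the character length of its response; the result is
    # keyed by the metric name, as the CustomMetric docstring requires.
    return {"response_length": len(instance.get("response", ""))}


response_length_metric = make_metric(
    name="response_length",
    metric_function=_response_length,
)

print(response_length_metric)  # "response_length" via CustomMetric.__str__
print(response_length_metric.metric_function({"response": "Hello!"}))
# {'response_length': 6}
```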
+ constants.Metric.EXACT_MATCH: (gapic_evaluation_service_types.ExactMatchSpec()), + constants.Metric.BLEU: gapic_evaluation_service_types.BleuSpec(), + constants.Metric.ROUGE_1: gapic_evaluation_service_types.RougeSpec( + rouge_type="rouge1" + ), + constants.Metric.ROUGE_2: gapic_evaluation_service_types.RougeSpec( + rouge_type="rouge2" + ), + constants.Metric.ROUGE_L: gapic_evaluation_service_types.RougeSpec( + rouge_type="rougeL" + ), + constants.Metric.ROUGE_L_SUM: gapic_evaluation_service_types.RougeSpec( + rouge_type="rougeLsum" + ), + constants.Metric.TOOL_CALL_VALID: ( + gapic_evaluation_service_types.ToolCallValidSpec() + ), + constants.Metric.TOOL_NAME_MATCH: ( + gapic_evaluation_service_types.ToolNameMatchSpec() + ), + constants.Metric.TOOL_PARAMETER_KV_MATCH: ( + gapic_evaluation_service_types.ToolParameterKVMatchSpec() + ), + constants.Metric.TOOL_PARAMETER_KEY_MATCH: ( + gapic_evaluation_service_types.ToolParameterKeyMatchSpec() + ), + # Model-based Pointwise Metrics. + constants.Metric.FLUENCY: gapic_evaluation_service_types.FluencySpec(), + constants.Metric.COHERENCE: gapic_evaluation_service_types.CoherenceSpec(), + constants.Metric.SAFETY: gapic_evaluation_service_types.SafetySpec(), + constants.Metric.GROUNDEDNESS: (gapic_evaluation_service_types.GroundednessSpec()), + constants.Metric.FULFILLMENT: (gapic_evaluation_service_types.FulfillmentSpec()), + constants.Metric.SUMMARIZATION_QUALITY: ( + gapic_evaluation_service_types.SummarizationQualitySpec() + ), + constants.Metric.SUMMARIZATION_HELPFULNESS: ( + gapic_evaluation_service_types.SummarizationHelpfulnessSpec() + ), + constants.Metric.SUMMARIZATION_VERBOSITY: ( + gapic_evaluation_service_types.SummarizationVerbositySpec() + ), + constants.Metric.QUESTION_ANSWERING_QUALITY: ( + gapic_evaluation_service_types.QuestionAnsweringQualitySpec() + ), + constants.Metric.QUESTION_ANSWERING_RELEVANCE: ( + gapic_evaluation_service_types.QuestionAnsweringRelevanceSpec() + ), + constants.Metric.QUESTION_ANSWERING_CORRECTNESS: ( + gapic_evaluation_service_types.QuestionAnsweringCorrectnessSpec( + use_reference=True + ) + ), + constants.Metric.QUESTION_ANSWERING_HELPFULNESS: ( + gapic_evaluation_service_types.QuestionAnsweringHelpfulnessSpec() + ), + # Side-by-side(SxS) Pairwise Metrics. + constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: ( + gapic_evaluation_service_types.PairwiseSummarizationQualitySpec() + ), + constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: ( + gapic_evaluation_service_types.PairwiseQuestionAnsweringQualitySpec() + ), +} + + +def build_request( + metric_name: str, + row_dict: Dict[str, Any], + evaluation_run_config: eval_base.EvaluationRunConfig, +) -> gapic_evaluation_service_types.EvaluateInstancesRequest: + """Builds a metric instance and form the request for the evaluation service. + + Args: + metric_name: The name of the metric to evaluate. + row_dict: An eval dataset instance in a dictionary. + evaluation_run_config: Evaluation Run Configurations. + + Returns: + A single EvaluateInstancesRequest. + """ + project = initializer.global_config.project + location = initializer.global_config.location + if not project or not location: + raise ValueError( + "No project or location specified. Please run `vertexai.init()` to" + " provide these parameters." 
+ ) + location_path = ( + gapic_evaluation_services.EvaluationServiceAsyncClient.common_location_path( + project, location + ) + ) + + if metric_name not in _METRIC_NAME_TO_METRIC_SPEC: + raise ValueError(f"Metric name: {metric_name} not supported.") + metric_spec = _METRIC_NAME_TO_METRIC_SPEC[metric_name] + column_map = evaluation_run_config.column_map + prediction = row_dict.get( + column_map.get(constants.Dataset.MODEL_RESPONSE_COLUMN), "" + ) + baseline_prediction = row_dict.get( + column_map.get(constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN), "" + ) + reference = row_dict.get(column_map.get(constants.Dataset.REFERENCE_COLUMN), "") + context = row_dict.get(column_map.get(constants.Dataset.CONTEXT_COLUMN), "") + instruction = row_dict.get(column_map.get(constants.Dataset.INSTRUCTION_COLUMN), "") + + # Automatic Metrics. + if metric_name == constants.Metric.EXACT_MATCH: + instance = gapic_evaluation_service_types.ExactMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ExactMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + exact_match_input=instance, + ) + if metric_name == constants.Metric.BLEU: + instance = gapic_evaluation_service_types.BleuInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.BleuInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + bleu_input=instance, + ) + if metric_name in ( + constants.Metric.ROUGE_1, + constants.Metric.ROUGE_2, + constants.Metric.ROUGE_L, + constants.Metric.ROUGE_L_SUM, + ): + instance = gapic_evaluation_service_types.RougeInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.RougeInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + rouge_input=instance, + ) + if metric_name == constants.Metric.TOOL_CALL_VALID: + instance = gapic_evaluation_service_types.ToolCallValidInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolCallValidInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_call_valid_input=instance, + ) + if metric_name == constants.Metric.TOOL_NAME_MATCH: + instance = gapic_evaluation_service_types.ToolNameMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolNameMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_name_match_input=instance, + ) + if metric_name == constants.Metric.TOOL_PARAMETER_KEY_MATCH: + instance = gapic_evaluation_service_types.ToolParameterKeyMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolParameterKeyMatchInstance( + prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_parameter_key_match_input=instance, + ) + if metric_name == constants.Metric.TOOL_PARAMETER_KV_MATCH: + instance = gapic_evaluation_service_types.ToolParameterKVMatchInput( + metric_spec=metric_spec, + instances=[ + gapic_evaluation_service_types.ToolParameterKVMatchInstance( 
+ prediction=prediction, + reference=reference, + ) + ], + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + tool_parameter_kv_match_input=instance, + ) + # Model-based Pointwise Metrics. + if metric_name == constants.Metric.COHERENCE: + coherence_input = gapic_evaluation_service_types.CoherenceInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.CoherenceInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + coherence_input=coherence_input, + ) + if metric_name == constants.Metric.FLUENCY: + fluency_input = gapic_evaluation_service_types.FluencyInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.FluencyInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + fluency_input=fluency_input, + ) + if metric_name == constants.Metric.SAFETY: + safety_input = gapic_evaluation_service_types.SafetyInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SafetyInstance( + prediction=prediction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + safety_input=safety_input, + ) + if metric_name == constants.Metric.GROUNDEDNESS: + groundedness_input = gapic_evaluation_service_types.GroundednessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.GroundednessInstance( + prediction=prediction, context=context + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + groundedness_input=groundedness_input, + ) + if metric_name == constants.Metric.FULFILLMENT: + fulfillment_input = gapic_evaluation_service_types.FulfillmentInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.FulfillmentInstance( + prediction=prediction, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + fulfillment_input=fulfillment_input, + ) + if metric_name == constants.Metric.RESPONSE_RECALL: + raise NotImplementedError("Response recall is not implemented.") + if metric_name == constants.Metric.SUMMARIZATION_QUALITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + summarization_quality_input = ( + gapic_evaluation_service_types.SummarizationQualityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationQualityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_quality_input=summarization_quality_input, + ) + if metric_name == constants.Metric.SUMMARIZATION_HELPFULNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + summarization_helpfulness_input = gapic_evaluation_service_types.SummarizationHelpfulnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationHelpfulnessInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_helpfulness_input=summarization_helpfulness_input, + ) + if metric_name == constants.Metric.SUMMARIZATION_VERBOSITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. 
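Each branch above packs a single row into the metric-specific GAPIC input type and wraps it in an `EvaluateInstancesRequest`. For reference, roughly what the "fluency" branch produces for one row, with a placeholder location path (the real code derives the path from `vertexai.init()` and looks the spec up in `_METRIC_NAME_TO_METRIC_SPEC`):

```python
from google.cloud.aiplatform_v1beta1.types import (
    evaluation_service as gapic_evaluation_service_types,
)

request = gapic_evaluation_service_types.EvaluateInstancesRequest(
    location="projects/my-project/locations/us-central1",  # placeholder path
    fluency_input=gapic_evaluation_service_types.FluencyInput(
        metric_spec=gapic_evaluation_service_types.FluencySpec(),
        instance=gapic_evaluation_service_types.FluencyInstance(
            prediction="The quick brown fox jumps over the lazy dog."
        ),
    ),
)
```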
+ summarization_verbosity_input = ( + gapic_evaluation_service_types.SummarizationVerbosityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.SummarizationVerbosityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + summarization_verbosity_input=summarization_verbosity_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_QUALITY: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_quality_input = gapic_evaluation_service_types.QuestionAnsweringQualityInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringQualityInstance( + prediction=prediction, context=context, instruction=instruction + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_quality_input=question_answering_quality_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_HELPFULNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_helpfulness_input = gapic_evaluation_service_types.QuestionAnsweringHelpfulnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringHelpfulnessInstance( + prediction=prediction, + context=context, + instruction=instruction, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_helpfulness_input=question_answering_helpfulness_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_RELEVANCE: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_relevance_input = gapic_evaluation_service_types.QuestionAnsweringRelevanceInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringRelevanceInstance( + prediction=prediction, + context=context, + instruction=instruction, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_relevance_input=question_answering_relevance_input, + ) + if metric_name == constants.Metric.QUESTION_ANSWERING_CORRECTNESS: + # TODO(b/330807319): allow set reference field after setting metric spec is allowed. + question_answering_correctness_input = gapic_evaluation_service_types.QuestionAnsweringCorrectnessInput( + metric_spec=metric_spec, + instance=gapic_evaluation_service_types.QuestionAnsweringCorrectnessInstance( + prediction=prediction, + context=context, + instruction=instruction, + reference=reference, + ), + ) + return gapic_evaluation_service_types.EvaluateInstancesRequest( + location=location_path, + question_answering_correctness_input=question_answering_correctness_input, + ) + if metric_name == constants.Metric.RAG_CONTEXT_RECALL: + raise NotImplementedError("RAG context recall is not implemented.") + # Side-by-side(SxS) Pairwise Metrics. + if metric_name == constants.Metric.PAIRWISE_SUMMARIZATION_QUALITY: + raise NotImplementedError("Pairwise summarization quality is not implemented.") + if metric_name == constants.Metric.PAIRWISE_QUESTION_ANSWERING_QUALITY: + raise NotImplementedError( + "Pairwise question answering quality is not implemented." 
+        )
+
+
+def _parse_autometric_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the automatic metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score of the metric.
+    """
+    for value in metric_result_dict.values():
+        # Only single instance requests are used by SDK.
+        return {
+            constants.MetricResult.SCORE_KEY: value[0].get(
+                constants.MetricResult.SCORE_KEY
+            )
+        }
+
+
+def _parse_pointwise_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the pointwise metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score, explanation, confidence of the
+      metric.
+    """
+    return {
+        constants.MetricResult.SCORE_KEY: metric_result_dict.get(
+            constants.MetricResult.SCORE_KEY
+        ),
+        constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
+            constants.MetricResult.EXPLANATION_KEY
+        ),
+        constants.MetricResult.CONFIDENCE_KEY: metric_result_dict.get(
+            constants.MetricResult.CONFIDENCE_KEY
+        ),
+    }
+
+
+def _parse_pairwise_results(
+    metric_result_dict: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Parses the pairwise metric results from the evaluation results.
+
+    Args:
+      metric_result_dict: The metric results dictionary.
+
+    Returns:
+      A dictionary containing metric score, explanation, confidence of the
+      metric.
+    """
+    return {
+        # TODO(b/330598854): handle pairwise choice.
+        constants.MetricResult.PAIRWISE_CHOICE_KEY: metric_result_dict.get(
+            constants.MetricResult.PAIRWISE_CHOICE_KEY,
+            gapic_evaluation_service_types.PairwiseChoice.PAIRWISE_CHOICE_UNSPECIFIED,
+        ),
+        constants.MetricResult.EXPLANATION_KEY: metric_result_dict.get(
+            constants.MetricResult.EXPLANATION_KEY
+        ),
+        constants.MetricResult.CONFIDENCE_KEY: metric_result_dict.get(
+            constants.MetricResult.CONFIDENCE_KEY
+        ),
+    }
+
+
+def _handle_response(
+    response: gapic_evaluation_service_types.EvaluateInstancesResponse,
+) -> Dict[str, Any]:
+    """Handles the response from the evaluation service.
+
+    Args:
+      response: The response from the evaluation service.
+
+    Returns:
+      The metric score of the evaluation.
+    """
+    metric_type = response._pb.WhichOneof("evaluation_results")
+
+    # Automatic Metrics.
+    if metric_type == constants.MetricResult.EXACT_MATCH_RESULTS:
+        metric_result = response.exact_match_results
+    elif metric_type == constants.MetricResult.BLEU_RESULTS:
+        metric_result = response.bleu_results
+    elif metric_type == constants.MetricResult.ROUGE_RESULTS:
+        metric_result = response.rouge_results
+    elif metric_type == constants.MetricResult.TOOL_CALL_VALID_RESULTS:
+        metric_result = response.tool_call_valid_results
+    elif metric_type == constants.MetricResult.TOOL_NAME_MATCH_RESULTS:
+        metric_result = response.tool_name_match_results
+    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KEY_MATCH_RESULTS:
+        metric_result = response.tool_parameter_key_match_results
+    elif metric_type == constants.MetricResult.TOOL_PARAMETER_KV_MATCH_RESULTS:
+        metric_result = response.tool_parameter_kv_match_results
+    # Model-based Pointwise Metrics.
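The three parsers above flatten the per-metric result protos (already converted to dictionaries with `json_format.MessageToDict`) into plain score dictionaries; automatic metrics keep only the first value because requests are built one instance at a time. A sketch with made-up values to show the resulting shapes (these are private helpers, used here purely for illustration):

```python
from vertexai.preview.evaluation.metrics import _instance_evaluation

pointwise = _instance_evaluation._parse_pointwise_results(
    {"score": 4.0, "explanation": "Mostly fluent.", "confidence": 0.8}
)
# -> {'score': 4.0, 'explanation': 'Mostly fluent.', 'confidence': 0.8}

automatic = _instance_evaluation._parse_autometric_results(
    {"exact_match_metric_values": [{"score": 1.0}]}
)
# -> {'score': 1.0}
```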
+ elif metric_type == constants.MetricResult.COHERENCE_RESULT: + metric_result = response.coherence_result + elif metric_type == constants.MetricResult.FULFILLMENT_RESULT: + metric_result = response.fulfillment_result + elif metric_type == constants.MetricResult.FLUENCY_RESULT: + metric_result = response.fluency_result + elif metric_type == constants.MetricResult.SAFETY_RESULT: + metric_result = response.safety_result + elif metric_type == constants.MetricResult.GROUNDEDNESS_RESULT: + metric_result = response.groundedness_result + elif metric_type == constants.MetricResult.RESPONSE_RECALL_RESULT: + metric_result = response.response_recall_result + elif metric_type == constants.MetricResult.SUMMARIZATION_QUALITY_RESULT: + metric_result = response.summarization_quality_result + elif metric_type == constants.MetricResult.SUMMARIZATION_HELPFULNESS_RESULT: + metric_result = response.summarization_helpfulness_result + elif metric_type == constants.MetricResult.SUMMARIZATION_VERBOSITY_RESULT: + metric_result = response.summarization_verbosity_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_QUALITY_RESULT: + metric_result = response.question_answering_quality_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_RELEVANCE_RESULT: + metric_result = response.question_answering_relevance_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_HELPFULNESS_RESULT: + metric_result = response.question_answering_helpfulness_result + elif metric_type == constants.MetricResult.QUESTION_ANSWERING_CORRECTNESS_RESULT: + metric_result = response.question_answering_correctness_result + elif metric_type == constants.MetricResult.RAG_CONTEXT_RECALL_RESULT: + metric_result = response.rag_context_recall_result + # Side-by-side(SxS) Pairwise Metrics. + elif metric_type == constants.MetricResult.PAIRWISE_SUMMARIZATION_QUALITY_RESULT: + metric_result = response.pairwise_summarization_quality_result + elif ( + metric_type == constants.MetricResult.PAIRWISE_QUESTION_ANSWERING_QUALITY_RESULT + ): + metric_result = response.pairwise_question_answering_quality_result + else: + raise ValueError(f"Unknown metric type: {metric_type}") + + metric_result_dict = json_format.MessageToDict( + metric_result._pb, preserving_proto_field_name=True + ) + + if metric_type in constants.MetricResult.AUTOMATIC_METRIC_RESULTS_LIST: + result = _parse_autometric_results(metric_result_dict) + elif metric_type in constants.MetricResult.MODEL_BASED_METRIC_RESULT_LIST: + result = _parse_pointwise_results(metric_result_dict) + elif metric_type in constants.MetricResult.PAIRWISE_METRIC_RESULT_LIST: + result = _parse_pairwise_results(metric_result_dict) + else: + raise ValueError(f"Unknown metric type: {metric_type}") + return result + + +async def evaluate_instances_async( + client: gapic_evaluation_services.EvaluationServiceAsyncClient, + request: gapic_evaluation_service_types.EvaluateInstancesRequest, +): + """Evaluates an instance asynchronously. + + Args: + client: The client to use for evaluation. + request: An EvaluateInstancesRequest. + + Returns: + The metric score of the evaluation. 
+ """ + + response = await client.evaluate_instances( + request=request, + retry=api_core.retry_async.AsyncRetry( + initial=0.250, + maximum=90.0, + multiplier=1.45, + deadline=600.0, + predicate=api_core.retry.if_exception_type( + api_core.exceptions.Aborted, + api_core.exceptions.DeadlineExceeded, + api_core.exceptions.InternalServerError, + api_core.exceptions.ResourceExhausted, + api_core.exceptions.ServiceUnavailable, + api_core.exceptions.Unknown, + api_core.exceptions.Cancelled, + ), + ), + ) + return _handle_response(response) diff --git a/vertexai/preview/evaluation/prompt_template.py b/vertexai/preview/evaluation/prompt_template.py new file mode 100644 index 0000000000..14b0f6bd6a --- /dev/null +++ b/vertexai/preview/evaluation/prompt_template.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import string +from typing import Set + + +class PromptTemplate: + """A prompt template for creating prompts with placeholders. + + The `PromptTemplate` class allows users to define a template string with + placeholders represented in curly braces `{placeholder}`. The placeholder + names cannot contain spaces. These placeholders can be replaced with specific + values using the `assemble` method, providing flexibility in generating + dynamic prompts. + + Example Usage: + + ``` + template_str = "Hello, {name}! Today is {day}. How are you?" + prompt_template = PromptTemplate(template_str) + completed_prompt = prompt_template.assemble(name="John", day="Monday") + print(completed_prompt) + ``` + + Attributes: + template: The template string containing placeholders for replacement. + placeholders: A set of placeholder names from the template string. + """ + + def __init__(self, template: str): + """Initializes the PromptTemplate with a given template. + + Args: + template: The template string with placeholders. Placeholders should be + represented in curly braces `{placeholder}`. + """ + self.template = str(template) + self.placeholders = self._get_placeholders() + + def _get_placeholders(self) -> Set[str]: + """Extracts and return a set of placeholder names from the template.""" + return set( + field_name + for _, field_name, _, _ in string.Formatter().parse(self.template) + if field_name is not None + ) + + def assemble(self, **kwargs) -> "PromptTemplate": + """Replaces only the provided placeholders in the template with specific values. + + Args: + **kwargs: Keyword arguments where keys are placeholder names and values + are the replacements. + + Returns: + A new PromptTemplate instance with the updated template string. 
+ """ + replaced_values = { + key: kwargs.get(key, "{" + key + "}") for key in self.placeholders + } + new_template = self.template.format(**replaced_values) + return PromptTemplate(new_template) + + def __str__(self) -> str: + """Returns the template string.""" + return self.template + + def __repr__(self) -> str: + """Returns a string representation of the PromptTemplate.""" + return f"PromptTemplate('{self.template}')" diff --git a/vertexai/preview/evaluation/utils.py b/vertexai/preview/evaluation/utils.py new file mode 100644 index 0000000000..c85caba690 --- /dev/null +++ b/vertexai/preview/evaluation/utils.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import io +import os +from typing import Any, Dict, Optional, Union, TYPE_CHECKING + +from google.cloud import bigquery +from google.cloud import storage +from google.cloud.aiplatform import compat +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform import utils +from google.cloud.aiplatform_v1beta1.services import ( + evaluation_service as gapic_evaluation_services, +) + +if TYPE_CHECKING: + import pandas as pd + +_BQ_PREFIX = "bq://" +_GCS_PREFIX = "gs://" + + +class _EvaluationServiceAsyncClientWithOverride(utils.ClientWithOverride): + _is_temporary = False + _default_version = compat.V1BETA1 + _version_map = ( + ( + compat.V1BETA1, + gapic_evaluation_services.EvaluationServiceAsyncClient, + ), + ) + + +def create_evaluation_service_async_client( + api_base_path_override: Optional[str] = None, +) -> _EvaluationServiceAsyncClientWithOverride: + """Creates an aync client for the evaluation service. + + Args: + api_base_path_override: Optional. Override default api base path. + + Returns: + Instantiated Vertex AI EvaluationService async client with optional overrides. + """ + return initializer.global_config.create_client( + client_class=_EvaluationServiceAsyncClientWithOverride, + location_override=initializer.global_config.location, + api_base_path_override=api_base_path_override, + ) + + +def load_dataset(source: Union[str, "pd.DataFrame", Dict[str, Any]]) -> "pd.DataFrame": + """Loads dataset from various sources into a DataFrame. + + Args: + source: The data source. Can be the following formats: + - pd.DataFrame: Used directly for evaluation. + - dict: Converted to a pandas DataFrame before evaluation. + - str: Interpreted as a file path or URI. Supported formats include: + * Local JSONL or CSV files: Loaded from the local filesystem. + * GCS JSONL or CSV files: Loaded from Google Cloud Storage + (e.g., 'gs://bucket/data.csv'). + * BigQuery table URI: Loaded from Google Cloud BigQuery + (e.g., 'bq://project-id.dataset.table_name'). + + Returns: + The dataset in pandas DataFrame format. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. 
Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + + if isinstance(source, pd.DataFrame): + return source.copy() + elif isinstance(source, dict): + return pd.DataFrame(source) + elif isinstance(source, str): + if source.startswith(_BQ_PREFIX): + return _load_bigquery(source[len(_BQ_PREFIX) :]) + + _, extension = os.path.splitext(source) + file_type = extension.lower()[1:] + + if file_type == "jsonl": + return _load_jsonl(source) + elif file_type == "csv": + return _load_csv(source) + else: + raise ValueError(f"Unsupported file type: {file_type}") + else: + raise TypeError( + "Unsupported dataset type. Must be DataFrame, dictionary, or" " filepath." + ) + + +def _load_jsonl(filepath: str) -> "pd.DataFrame": + """Loads data from a JSONL file into a DataFrame.""" + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + if filepath.startswith(_GCS_PREFIX): + file_contents = _read_gcs_file_contents(filepath) + return pd.read_json(file_contents, lines=True) + else: + with open(filepath, "r") as f: + return pd.read_json(f, lines=True) + + +def _load_csv(filepath: str) -> "pd.DataFrame": + """Loads data from a CSV file into a DataFrame.""" + try: + import pandas as pd + except ImportError: + raise ImportError( + 'Pandas is not installed. Please install the SDK using "pip install' + ' google-cloud-aiplatform[rapid_evaluation]"' + ) + if filepath.startswith(_GCS_PREFIX): + file_contents = _read_gcs_file_contents(filepath) + return pd.read_csv(io.StringIO(file_contents), encoding="utf-8") + else: + return pd.read_csv(filepath, encoding="utf-8") + + +def _load_bigquery(table_id: str) -> "pd.DataFrame": + """Loads data from a BigQuery table into a DataFrame.""" + + client = bigquery.Client( + project=initializer.global_config.project, + credentials=initializer.global_config.credentials, + ) + table = client.get_table(table_id) + return client.list_rows(table).to_dataframe() + + +def _read_gcs_file_contents(filepath: str) -> str: + """Reads the contents of a file from Google Cloud Storage. + + Args: + filepath: The GCS file path (e.g., 'gs://bucket_name/file.csv') + + Returns: + The contents of the file. + """ + + client = storage.Client( + project=initializer.global_config.project, + credentials=initializer.global_config.credentials, + ) + bucket_name, blob_path = filepath[len(_GCS_PREFIX) :].split("/", 1) + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(blob_path) + return blob.download_as_string().decode("utf-8")
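Taken together, the utilities in this patch compose into a small end-to-end flow: load a dataset from a local file, GCS, or BigQuery, optionally assemble a prompt template in stages, and hand both to an evaluation task. A hedged sketch with placeholder project and bucket names, assuming the JSONL file already carries the columns the chosen metrics need (instruction, context, response) and that `EvalTask` accepts a dataset, a metric list, and an optional experiment name:

```python
import vertexai
from vertexai.preview import evaluation
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.prompt_template import PromptTemplate

vertexai.init(project="my-project", location="us-central1")  # placeholders

# load_dataset() accepts a DataFrame, a dict, or a local/GCS/BigQuery URI.
eval_df = utils.load_dataset("gs://my-bucket/eval/summarization.jsonl")  # placeholder URI

# assemble() replaces only the placeholders it is given, so a template can be
# filled in stages; the unfilled {context} placeholder survives.
template = PromptTemplate("Instruction: {instruction}\nContext: {context}\nSummary:")
partially_filled = template.assemble(instruction="Summarize the context below.")
print(partially_filled.placeholders)  # {'context'}

eval_task = evaluation.EvalTask(
    dataset=eval_df,
    metrics=["summarization_quality", "rouge_l_sum"],
    experiment="summarization-eval",  # optional experiment name
)
```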