Skip to content

Commit

Permalink
fix: Fix experiments failure when backing tensorboard has been deleted.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 599955406
  • Loading branch information
vertex-sdk-bot authored and Copybara-Service committed Jan 19, 2024
1 parent 3b28d64 commit e7a197e
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 21 deletions.
16 changes: 11 additions & 5 deletions google/cloud/aiplatform/metadata/experiment_resources.py
Expand Up @@ -502,7 +502,7 @@ def _lookup_backing_tensorboard(self) -> Optional[tensorboard_resource.Tensorboa
"""Returns backing tensorboard if one is set.
Returns:
Tensorboard resource if one exists.
Tensorboard resource if one exists, otherwise returns None.
"""
tensorboard_resource_name = self._metadata_context.metadata.get(
constants._BACKING_TENSORBOARD_RESOURCE_KEY
Expand All @@ -516,10 +516,16 @@ def _lookup_backing_tensorboard(self) -> Optional[tensorboard_resource.Tensorboa
)

if tensorboard_resource_name:
return tensorboard_resource.Tensorboard(
tensorboard_resource_name,
credentials=self._metadata_context.credentials,
)
try:
return tensorboard_resource.Tensorboard(
tensorboard_resource_name,
credentials=self._metadata_context.credentials,
)
except exceptions.NotFound:
self._metadata_context.update(
metadata={constants._BACKING_TENSORBOARD_RESOURCE_KEY: None}
)
return None

def get_backing_tensorboard_resource(
self,
Expand Down
28 changes: 25 additions & 3 deletions google/cloud/aiplatform/metadata/metadata.py
Expand Up @@ -21,6 +21,7 @@
from typing import Dict, Union, Optional, Any, List

from google.api_core import exceptions
import google.auth
from google.auth import credentials as auth_credentials
from google.protobuf import timestamp_pb2

Expand Down Expand Up @@ -216,7 +217,7 @@ def _execution_to_column_named_metadata(


class _ExperimentTracker:
"""Tracks Experiments and Experiment Runs wil high level APIs"""
"""Tracks Experiments and Experiment Runs with high level APIs."""

def __init__(self):
self._experiment: Optional[experiment_resources.Experiment] = None
Expand All @@ -229,6 +230,27 @@ def reset(self):
self._experiment = None
self._experiment_run = None

def _get_global_tensorboard(self) -> Optional[tensorboard_resource.Tensorboard]:
    """Helper method to get the global TensorBoard instance.

    The globally configured TensorBoard is re-instantiated from its resource
    name, project and location. If that lookup raises ``NotFound``, the
    cached global TensorBoard is cleared so later calls stop referring to it.

    Returns:
        tensorboard_resource.Tensorboard: the global TensorBoard instance, or
        None if no global TensorBoard is set or the lookup raised ``NotFound``.
    """
    if not self._global_tensorboard:
        return None

    # Prefer the credentials attached to the active experiment. Application
    # default credentials are resolved only as a fallback, so environments
    # that run purely on explicit credentials do not trigger (and possibly
    # fail) an unnecessary ADC lookup.
    experiment = self.experiment
    credentials = experiment._metadata_context.credentials if experiment else None
    if not credentials:
        credentials, _ = google.auth.default()

    try:
        return tensorboard_resource.Tensorboard(
            self._global_tensorboard.resource_name,
            project=self._global_tensorboard.project,
            location=self._global_tensorboard.location,
            credentials=credentials,
        )
    except exceptions.NotFound:
        # Forget the stale reference and report that no global TensorBoard
        # is available.
        self._global_tensorboard = None
        return None

@property
def experiment_name(self) -> Optional[str]:
"""Return the currently set experiment name, if experiment is not set, return None"""
Expand Down Expand Up @@ -284,7 +306,7 @@ def set_experiment(
If omitted, or set to `True` or `None`, the global tensorboard is used.
If no global tensorboard is set, the default tensorboard will be used, and created if it does not exist.
To disable using a backign tensorboard, set `backing_tensorboard` to `False`.
To disable using a backing tensorboard, set `backing_tensorboard` to `False`.
To maintain this behavior, set `experiment_tensorboard` to `False` in subsequent calls to aiplatform.init().
"""
self.reset()
Expand All @@ -299,7 +321,7 @@ def set_experiment(
backing_tb = None
else:
backing_tb = (
self._global_tensorboard or _get_or_create_default_tensorboard()
self._get_global_tensorboard() or _get_or_create_default_tensorboard()
)

current_backing_tb = experiment.backing_tensorboard_resource_name
Expand Down
25 changes: 12 additions & 13 deletions google/cloud/aiplatform/tensorboard/uploader_tracker.py
Expand Up @@ -234,24 +234,23 @@ def _create_uploader(
project, location, tensorboard_id
)
else:
if _experiment_tracker._global_tensorboard:
if _experiment_tracker._get_global_tensorboard():
tensorboard_resource_name = (
_experiment_tracker._global_tensorboard.resource_name
_experiment_tracker._get_global_tensorboard().resource_name
)
else:
if _experiment_tracker._experiment:
if _experiment_tracker._experiment._lookup_backing_tensorboard():
tensorboard_resource_name = (
_experiment_tracker._experiment._lookup_backing_tensorboard().resource_name
)
else:
raise ValueError(
f"No TensorBoard associated with experiment {initializer.global_config.experiment_name}. Please provide tensorboard_id in the argument."
)
elif _experiment_tracker._experiment:
if _experiment_tracker._experiment._lookup_backing_tensorboard():
tensorboard_resource_name = (
_experiment_tracker._experiment._lookup_backing_tensorboard().resource_name
)
else:
raise ValueError(
"No TensorBoard found. Please provide tensorboard_id in the argument."
f"No TensorBoard associated with experiment {initializer.global_config.experiment_name}. Please provide tensorboard_id in the argument."
)
else:
raise ValueError(
"No TensorBoard found. Please provide tensorboard_id in the argument."
)

api_client = initializer.global_config.create_client(
client_class=TensorboardClientWithOverride,
Expand Down
59 changes: 59 additions & 0 deletions tests/system/aiplatform/test_experiments.py
Expand Up @@ -16,6 +16,7 @@
#
import tempfile

import uuid
import pytest

from google.api_core import exceptions
Expand Down Expand Up @@ -618,3 +619,61 @@ def test_init_associates_global_tensorboard_to_experiment(self, shared_state):
)
== tensorboard.resource_name
)

def test_get_backing_tensorboard_resource_returns_tensorboard(self, shared_state):
    """An experiment initialised with a TensorBoard reports it as its backing resource."""
    created_tb = aiplatform.Tensorboard.create(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        display_name=self._make_display_name("")[:64],
    )
    # Register the resource right away so it is recorded in shared_state
    # even if a later step fails.
    shared_state["resources"] = [created_tb]

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=self._experiment_name,
        experiment_tensorboard=created_tb,
    )
    loaded_experiment = aiplatform.Experiment(
        self._experiment_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    backing_tb = loaded_experiment.get_backing_tensorboard_resource()
    assert backing_tb.resource_name == created_tb.resource_name

def test_get_backing_tensorboard_resource_returns_none(self):
    """An experiment created with `experiment_tensorboard=False` has no backing TensorBoard."""
    # A unique name guarantees a brand-new experiment for this check.
    fresh_name = f"example-{uuid.uuid1()}"

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=fresh_name,
        experiment_tensorboard=False,
    )
    fresh_experiment = aiplatform.Experiment(
        fresh_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    backing_tb = fresh_experiment.get_backing_tensorboard_resource()
    assert backing_tb is None

def test_delete_backing_tensorboard_experiment_run_success(self):
    """Runs can still be started and ended after the backing TensorBoard is deleted."""
    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        experiment=self._experiment_name,
    )
    target_experiment = aiplatform.Experiment(
        self._experiment_name,
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
    )

    # Remove the TensorBoard out from under the experiment.
    backing_tb = target_experiment.get_backing_tensorboard_resource()
    backing_tb.delete()

    started_run = aiplatform.start_run(_RUN)
    aiplatform.end_run()

    assert target_experiment.get_backing_tensorboard_resource() is None
    assert started_run.name == _RUN
31 changes: 31 additions & 0 deletions tests/unit/vertexai/test_remote_training.py
Expand Up @@ -23,11 +23,14 @@
from unittest.mock import patch

import cloudpickle
from google import auth
from google.api_core import exceptions
from google.auth import credentials as auth_credentials
from google.cloud import aiplatform
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.compat.services import (
job_service_client_v1beta1 as job_service_client,
tensorboard_service_client,
)
from google.cloud.aiplatform.compat.types import (
custom_job_v1beta1 as gca_custom_job_compat,
Expand Down Expand Up @@ -89,6 +92,7 @@
_TEST_EXPERIMENT = "test-experiment"
_TEST_EXPERIMENT_RUN = "test-experiment-run"
_TEST_SERVICE_ACCOUNT = f"{_TEST_PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
_TEST_CREDENTIALS = mock.Mock(spec=auth_credentials.AnonymousCredentials())

# dataset constants
dataset = load_iris()
Expand Down Expand Up @@ -707,6 +711,25 @@ def aiplatform_autolog_mock():
yield aiplatform_autolog_mock


@pytest.fixture(scope="module")
def google_auth_mock():
    """Patch `google.auth.default` to return anonymous credentials and a test project."""
    fake_default = (auth_credentials.AnonymousCredentials(), "test-project")
    with mock.patch.object(auth, "default") as patched_default:
        patched_default.return_value = fake_default
        yield patched_default


@pytest.fixture
def get_tensorboard_mock():
    """Patch `TensorboardServiceClient.get_tensorboard` to return the default test TensorBoard."""
    client_cls = tensorboard_service_client.TensorboardServiceClient
    with patch.object(client_cls, "get_tensorboard") as patched_get:
        patched_get.return_value = _TEST_DEFAULT_TENSORBOARD_GCA
        yield patched_get


# unittest `assert_any_call` method doesn't work when arguments contain `np.ndarray`
# https://stackoverflow.com/questions/56644729/mock-assert-mock-calls-with-a-numpy-array-as-argument-raises-valueerror-and-np
# tentatively runtime patch `assert_any_call` to solve this issue
Expand Down Expand Up @@ -1636,6 +1659,7 @@ def test_remote_training_keras_distributed_no_cuda_no_worker_pool_specs(
"get_artifact_not_found_mock",
"update_context_mock",
"mock_autolog_disabled",
"get_tensorboard_mock",
)
def test_remote_training_sklearn_with_experiment(
self,
Expand All @@ -1647,6 +1671,7 @@ def test_remote_training_sklearn_with_experiment(
location=_TEST_LOCATION,
staging_bucket=_TEST_BUCKET_NAME,
experiment=_TEST_EXPERIMENT,
credentials=_TEST_CREDENTIALS,
)
vertexai.preview.init(remote=True)

Expand Down Expand Up @@ -1720,6 +1745,7 @@ def test_remote_training_sklearn_with_experiment(
"update_context_mock",
"aiplatform_autolog_mock",
"mock_autolog_enabled",
"get_tensorboard_mock",
)
def test_remote_training_sklearn_with_experiment_autolog_enabled(
self,
Expand All @@ -1731,6 +1757,7 @@ def test_remote_training_sklearn_with_experiment_autolog_enabled(
location=_TEST_LOCATION,
staging_bucket=_TEST_BUCKET_NAME,
experiment=_TEST_EXPERIMENT,
credentials=_TEST_CREDENTIALS,
)
vertexai.preview.init(remote=True, autolog=True)

Expand Down Expand Up @@ -1926,6 +1953,7 @@ def test_initialize_existing_persistent_resource_service_account_mismatch(self):
"aiplatform_autolog_mock",
"mock_autolog_enabled",
"persistent_resource_running_mock",
"get_tensorboard_mock",
)
def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_experiment_error(
self,
Expand All @@ -1935,6 +1963,7 @@ def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_
location=_TEST_LOCATION,
staging_bucket=_TEST_BUCKET_NAME,
experiment=_TEST_EXPERIMENT,
credentials=_TEST_CREDENTIALS,
)
vertexai.preview.init(
remote=True, autolog=True, cluster=_TEST_PERSISTENT_RESOURCE_CONFIG
Expand Down Expand Up @@ -1966,6 +1995,7 @@ def test_remote_training_sklearn_with_persistent_cluster_no_service_account_and_
"persistent_resource_service_account_running_mock",
"mock_timestamped_unique_name",
"mock_get_custom_job",
"get_tensorboard_mock",
)
def test_remote_training_sklearn_with_persistent_cluster_and_experiment_autologging(
self,
Expand All @@ -1977,6 +2007,7 @@ def test_remote_training_sklearn_with_persistent_cluster_and_experiment_autologg
location=_TEST_LOCATION,
staging_bucket=_TEST_BUCKET_NAME,
experiment=_TEST_EXPERIMENT,
credentials=_TEST_CREDENTIALS,
)
vertexai.preview.init(
remote=True,
Expand Down

0 comments on commit e7a197e

Please sign in to comment.