Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

Closed
flaky-bot bot opened this issue Apr 11, 2023 · 2 comments
Labels
api: vertex-ai Issues related to the googleapis/python-aiplatform API. flakybot: issue An issue filed by the Flaky Bot. Should not be added manually. priority: p1 Important issue which blocks shipping the next release. Will be fixed prior to next release. type: bug Error or flaw in code with unintended results or allowing sub-optimal usage patterns.

Comments

@flaky-bot
Copy link

flaky-bot bot commented Apr 11, 2023

This test failed!

To configure my behavior, see the Flaky Bot documentation.

If I'm commenting on this issue too often, add the flakybot: quiet label and
I will stop commenting.


commit: e55a177
buildURL: Build Status, Sponge
status: failed

Test output
self = 
shared_state = {'bucket': , 'resources': [}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn"],
        enable_autolog=True,
    )
    custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:


google/cloud/aiplatform/jobs.py:1680: in run
self._run(
google/cloud/aiplatform/base.py:810: in wrapper
return method(*args, **kwargs)
google/cloud/aiplatform/jobs.py:1774: in _run
self._block_until_complete()


self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7f92bbca9dc0>
resource name: projects/580378083368/locations/us-central1/customJobs/761744267063328768

def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
        raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)

E RuntimeError: Job failed with:
E code: 3
E message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F761744267063328768&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22761744267063328768%22"

google/cloud/aiplatform/jobs.py:1086: RuntimeError

@flaky-bot flaky-bot bot added flakybot: issue An issue filed by the Flaky Bot. Should not be added manually. priority: p1 Important issue which blocks shipping the next release. Will be fixed prior to next release. type: bug Error or flaw in code with unintended results or allowing sub-optimal usage patterns. labels Apr 11, 2023
@product-auto-label product-auto-label bot added the api: vertex-ai Issues related to the googleapis/python-aiplatform API. label Apr 11, 2023
@flaky-bot
Copy link
Author

flaky-bot bot commented Apr 11, 2023

commit: cfb0bbd
buildURL: Build Status, Sponge
status: failed

Test output
self = 
shared_state = {'bucket': , 'resources': [}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn"],
        enable_autolog=True,
    )
    custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:


google/cloud/aiplatform/jobs.py:1680: in run
self._run(
google/cloud/aiplatform/base.py:810: in wrapper
return method(*args, **kwargs)
google/cloud/aiplatform/jobs.py:1774: in _run
self._block_until_complete()


self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7fb2e30d4d60>
resource name: projects/580378083368/locations/us-central1/customJobs/6713427096493883392

def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
        raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)

E RuntimeError: Job failed with:
E code: 3
E message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F6713427096493883392&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%226713427096493883392%22"

google/cloud/aiplatform/jobs.py:1086: RuntimeError

@flaky-bot
Copy link
Author

flaky-bot bot commented Apr 11, 2023

commit: 8bf6477
buildURL: Build Status, Sponge
status: failed

Test output
self = 
shared_state = {'bucket': , 'resources': [}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn", "pandas"],
        enable_autolog=True,
    )
    custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:


google/cloud/aiplatform/jobs.py:1680: in run
self._run(
google/cloud/aiplatform/base.py:810: in wrapper
return method(*args, **kwargs)
google/cloud/aiplatform/jobs.py:1774: in _run
self._block_until_complete()


self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7f82949e0880>
resource name: projects/580378083368/locations/us-central1/customJobs/146387191415701504

def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
        raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)

E RuntimeError: Job failed with:
E code: 3
E message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F146387191415701504&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22146387191415701504%22"

google/cloud/aiplatform/jobs.py:1086: RuntimeError

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
api: vertex-ai Issues related to the googleapis/python-aiplatform API. flakybot: issue An issue filed by the Flaky Bot. Should not be added manually. priority: p1 Important issue which blocks shipping the next release. Will be fixed prior to next release. type: bug Error or flaw in code with unintended results or allowing sub-optimal usage patterns.
Projects
None yet
Development

No branches or pull requests

1 participant