tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

flaky-bot · 2023-04-11T02:06:10Z

This test failed!

To configure my behavior, see the Flaky Bot documentation.

If I'm commenting on this issue too often, add the `flakybot: quiet` label and
I will stop commenting.

commit: e55a177
buildURL: Build Status, Sponge
status: failed

Test output

self = <tests.system.aiplatform.test_custom_job.TestCustomJob object at 0x...>
shared_state = {'bucket': <...>, 'resources': [<...>]}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn"],
        enable_autolog=True,
    )


>   custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:

google/cloud/aiplatform/jobs.py:1680: in run

self._run(

google/cloud/aiplatform/base.py:810: in wrapper

return method(*args, **kwargs)

google/cloud/aiplatform/jobs.py:1774: in _run

self._block_until_complete()

self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7f92bbca9dc0>

resource name: projects/580378083368/locations/us-central1/customJobs/761744267063328768
def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
>       raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)


E           RuntimeError: Job failed with:

E           code: 3

E           message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F761744267063328768&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22761744267063328768%22"
google/cloud/aiplatform/jobs.py:1086: RuntimeError

The text was updated successfully, but these errors were encountered:

flaky-bot · 2023-04-11T19:01:11Z

commit: cfb0bbd
buildURL: Build Status, Sponge
status: failed

Test output

self = <tests.system.aiplatform.test_custom_job.TestCustomJob object at 0x...>
shared_state = {'bucket': <...>, 'resources': [<...>]}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn"],
        enable_autolog=True,
    )


>   custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:

google/cloud/aiplatform/jobs.py:1680: in run

self._run(

google/cloud/aiplatform/base.py:810: in wrapper

return method(*args, **kwargs)

google/cloud/aiplatform/jobs.py:1774: in _run

self._block_until_complete()

self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7fb2e30d4d60>

resource name: projects/580378083368/locations/us-central1/customJobs/6713427096493883392
def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
>       raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)


E           RuntimeError: Job failed with:

E           code: 3

E           message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F6713427096493883392&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%226713427096493883392%22"
google/cloud/aiplatform/jobs.py:1086: RuntimeError

flaky-bot · 2023-04-11T21:18:18Z

commit: 8bf6477
buildURL: Build Status, Sponge
status: failed

Test output

self = <tests.system.aiplatform.test_custom_job.TestCustomJob object at 0x...>
shared_state = {'bucket': <...>, 'resources': [<...>]}
def test_from_local_script_enable_autolog_custom_container(self, shared_state):

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
        experiment=self._experiment_name,
        experiment_tensorboard=self._backing_tensorboard,
    )

    display_name = self._make_display_name("custom-job")

    custom_job = aiplatform.CustomJob.from_local_script(
        display_name=display_name,
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri=_CUSTOM_CONTAINER_IMAGE,
        requirements=["scikit-learn", "pandas"],
        enable_autolog=True,
    )


>   custom_job.run(
        experiment=self._experiment_name,
        service_account=self._service_account,
    )

tests/system/aiplatform/test_custom_job.py:162:

google/cloud/aiplatform/jobs.py:1680: in run

self._run(

google/cloud/aiplatform/base.py:810: in wrapper

return method(*args, **kwargs)

google/cloud/aiplatform/jobs.py:1774: in _run

self._block_until_complete()

self = <google.cloud.aiplatform.jobs.CustomJob object at 0x7f82949e0880>

resource name: projects/580378083368/locations/us-central1/customJobs/146387191415701504
def _block_until_complete(self):
    """Helper method to block and check on runnable job until complete.

    Raises:
        RuntimeError: If job failed or cancelled.
    """

    log_wait = _LOG_WAIT_TIME

    previous_time = time.time()
    while self.state not in _JOB_COMPLETE_STATES:
        current_time = time.time()
        if current_time - previous_time >= _LOG_WAIT_TIME:
            self._log_job_state()
            log_wait = min(log_wait * _WAIT_TIME_MULTIPLIER, _MAX_WAIT_TIME)
            previous_time = current_time
        self._log_web_access_uris()
        time.sleep(_JOB_WAIT_TIME)

    self._log_job_state()

    if isinstance(self, CustomJob) and self._experiment_run:
        # sync resource before end run
        self._experiment_run = aiplatform.ExperimentRun.get(
            self._experiment_run.name,
            experiment=self._experiment,
        )
        self._experiment_run.end_run()

    # Error is only populated when the job state is
    # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
    if self._gca_resource.state in _JOB_ERROR_STATES:
>       raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)


E           RuntimeError: Job failed with:

E           code: 3

E           message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=580378083368&resource=ml_job%2Fjob_id%2F146387191415701504&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22146387191415701504%22"
google/cloud/aiplatform/jobs.py:1086: RuntimeError

product-auto-label bot added the `api: vertex-ai` label (Issues related to the googleapis/python-aiplatform API) on Apr 11, 2023

sararob closed this as completed Apr 12, 2023

flaky-bot bot mentioned this issue Apr 24, 2024

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #3664

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

flaky-bot bot commented Apr 11, 2023

flaky-bot bot commented Apr 11, 2023

flaky-bot bot commented Apr 11, 2023

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

tests.system.aiplatform.test_custom_job.TestCustomJob: test_from_local_script_enable_autolog_custom_container failed #2082

Comments

flaky-bot bot commented Apr 11, 2023

flaky-bot bot commented Apr 11, 2023

flaky-bot bot commented Apr 11, 2023