Skip to content

Commit

Permalink
chore: bumpenvs for ROCm changes (#5026)
Browse files Browse the repository at this point in the history
  • Loading branch information
mpkouznetsov authored Sep 16, 2022
1 parent 62756f5 commit f14e5be
Show file tree
Hide file tree
Showing 53 changed files with 394 additions and 362 deletions.
48 changes: 24 additions & 24 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,11 @@ commands:
- when:
condition: <<parameters.tf1>>
steps:
- run: docker pull determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-9119094
- run: docker pull determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-69f397f
- when:
condition: <<parameters.tf2>>
steps:
- run: docker pull determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-9119094
- run: docker pull determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-69f397f

login-docker:
parameters:
Expand Down Expand Up @@ -781,7 +781,7 @@ commands:
description: The Google project ID to connect with via the gcloud CLI
type: env_var_name
environment-image:
default: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
default: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
type: string
steps:
- set-cluster-id:
Expand Down Expand Up @@ -1988,7 +1988,7 @@ jobs:
type: string
default: ""
environment-image:
default: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
default: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
type: string
docker:
- image: <<pipeline.parameters.docker-image>>
Expand Down Expand Up @@ -2586,7 +2586,7 @@ workflows:
parallelism: [1]
slack-mentions: ["${SLACK_USER_ID}"]
environment-image:
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-parallel-cuda-11
Expand All @@ -2605,7 +2605,7 @@ workflows:
gpus-per-machine: [4]
num-machines: [2]
environment-image:
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-single-gpu-tfonly
Expand All @@ -2621,10 +2621,10 @@ workflows:
parallelism: [1]
slack-mentions: ["${SLACK_USER_ID}"]
environment-image:
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.1
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-parallel-tfonly
Expand All @@ -2643,10 +2643,10 @@ workflows:
gpus-per-machine: [4]
num-machines: [2]
environment-image:
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.1
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.4

# Nightly distributed tests
- request-gpu-distributed-nightly:
Expand Down Expand Up @@ -2879,7 +2879,7 @@ workflows:
parallelism: [1]
slack-mentions: ["${SLACK_USER_ID}"]
environment-image:
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-parallel-cuda-11
Expand All @@ -2894,7 +2894,7 @@ workflows:
gpus-per-machine: [4]
num-machines: [2]
environment-image:
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
- determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-single-gpu-tfonly
Expand All @@ -2906,10 +2906,10 @@ workflows:
parallelism: [1]
slack-mentions: ["${SLACK_USER_ID}"]
environment-image:
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.1
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.4

- test-e2e-gke-cuda-11:
name: test-e2e-gke-parallel-tfonly
Expand All @@ -2924,10 +2924,10 @@ workflows:
gpus-per-machine: [4]
num-machines: [2]
environment-image:
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.1
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.1
- determinedai/environments:cuda-11.1-pytorch-1.9-tf-2.4-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.4
- determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.4

weekly-vuln-scan:
triggers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,5 +406,5 @@ This command line will spin up a cluster of up to 2 A100s in the ``us-central1-c
--compute-agent-instance-type a2-highgpu-1g --gpu-num 1 \
--gpu-type nvidia-tesla-a100 \
--region us-central1 --zone us-central1-c \
--gpu-env-image determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1 \
--cpu-env-image determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1
--gpu-env-image determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4 \
--cpu-env-image determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ by default in this version of Determined are described below.
+-------------+-------------------------------------------------------------------------+
| Environment | File Name |
+=============+=========================================================================+
| CPUs | ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-9119094`` |
| CPUs | ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-69f397f`` |
+-------------+-------------------------------------------------------------------------+
| Nvidia GPUs | ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-9119094`` |
| Nvidia GPUs | ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-69f397f`` |
+-------------+-------------------------------------------------------------------------+
| AMD GPUs | ``determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-9119094`` |
+-------------+-------------------------------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/interfaces/tensorboard.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ to additional data with a bind-mount.
.. code:: yaml
environment:
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
bind_mounts:
- host_path: /my/agent/path
container_path: /my/container/path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,11 @@

- ``cpuImage``: Sets the default docker image for all non-gpu tasks. If a docker image is
specified in the :ref:`experiment config <exp-environment-image>` this default is overridden.
Defaults to: ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1``.
Defaults to: ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4``.

- ``gpuImage``: Sets the default docker image for all gpu tasks. If a docker image is specified
in the :ref:`experiment config <exp-environment-image>` this default is overridden. Defaults
to: ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1``.
to: ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4``.

- ``enterpriseEdition``: Specifies whether to use Determined enterprise edition.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ The master supports the following configuration settings:
``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks
using the ``rocm`` key. Default values:

- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1`` for NVIDIA GPUs.
- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4`` for NVIDIA GPUs.
- ``determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-0.19.1`` for ROCm.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1`` for CPUs.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4`` for CPUs.

- ``environment_variables``: A list of environment variables that will be set in every task
container. Each element of the list should be a string of the form ``NAME=VALUE``. See
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/reference-interface/job-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ The following configuration settings are supported:
different container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6),
CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:

- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1`` for NVIDIA GPUs.
- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4`` for NVIDIA GPUs.
- ``determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-0.19.1`` for ROCm.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1`` for CPUs.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4`` for CPUs.

- ``force_pull_image``: Forcibly pull the image from the Docker registry and bypass the Docker
cache. Defaults to ``false``.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1038,8 +1038,8 @@ workloads for this experiment. For more information on customizing the trial env
images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu``
key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:

- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1`` for NVIDIA GPUs.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1`` for CPUs.
- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4`` for NVIDIA GPUs.
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4`` for CPUs.
- ``determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-0.19.1`` for ROCm.

When the cluster is configured with :ref:`resource_manager.type: slurm
Expand Down
20 changes: 10 additions & 10 deletions docs/training/apis-howto/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,19 @@ experiment is controlled by the container image that has been configured for tha
Determined provides prebuilt Docker images that include TensorFlow 2.8, 1.15, 2.5, 2.6, and 2.7,
respectively:

- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1`` (default)
- ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.1``
- ``determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.1``
- ``determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.1``
- ``determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.1``
- ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4`` (default)
- ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.4``
- ``determinedai/environments:cuda-11.2-tf-2.5-gpu-0.19.4``
- ``determinedai/environments:cuda-11.2-tf-2.6-gpu-0.19.4``
- ``determinedai/environments:cuda-11.2-tf-2.7-gpu-0.19.4``

We also provide lightweight CPU-only counterparts:

- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1``
- ``determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-0.19.1``
- ``determinedai/environments:py-3.8-tf-2.5-cpu-0.19.1``
- ``determinedai/environments:py-3.8-tf-2.6-cpu-0.19.1``
- ``determinedai/environments:py-3.8-tf-2.7-cpu-0.19.1``
- ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4``
- ``determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-0.19.4``
- ``determinedai/environments:py-3.8-tf-2.5-cpu-0.19.4``
- ``determinedai/environments:py-3.8-tf-2.6-cpu-0.19.4``
- ``determinedai/environments:py-3.8-tf-2.7-cpu-0.19.4``

To change the container image used for an experiment, specify :ref:`environment.image
<exp-environment-image>` in the experiment configuration file. Please see :ref:`container-images`
Expand Down
8 changes: 4 additions & 4 deletions docs/training/setup-guide/custom-env.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ Default Images
+-------------+---------------------------------------------------------------------------------------+
| Environment | File Name |
+=============+=======================================================================================+
| CPUs | ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1`` |
| CPUs | ``determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4`` |
+-------------+---------------------------------------------------------------------------------------+
| Nvidia GPUs | ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1`` |
| Nvidia GPUs | ``determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4`` |
+-------------+---------------------------------------------------------------------------------------+
| AMD GPUs | ``determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-0.19.1`` |
+-------------+---------------------------------------------------------------------------------------+
Expand Down Expand Up @@ -132,7 +132,7 @@ Example Dockerfile that installs custom ``conda``-, ``pip``-, and ``apt``-based
.. code:: bash
# Determined Image
FROM determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
FROM determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
# Custom Configuration
RUN apt-get update && \
Expand Down Expand Up @@ -195,7 +195,7 @@ environments using :ref:`custom images <custom-docker-images>`:
.. code:: bash
# Determined Image
FROM determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1
FROM determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4
# Create a virtual environment
RUN conda create -n myenv python=3.8
Expand Down
8 changes: 4 additions & 4 deletions e2e_tests/tests/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
MAX_TRIAL_BUILD_SECS = 90


DEFAULT_TF1_CPU_IMAGE = "determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-9119094"
DEFAULT_TF2_CPU_IMAGE = "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-9119094"
DEFAULT_TF1_GPU_IMAGE = "determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-9119094"
DEFAULT_TF2_GPU_IMAGE = "determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-9119094"
DEFAULT_TF1_CPU_IMAGE = "determinedai/environments:py-3.7-pytorch-1.7-tf-1.15-cpu-69f397f"
DEFAULT_TF2_CPU_IMAGE = "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-69f397f"
DEFAULT_TF1_GPU_IMAGE = "determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-69f397f"
DEFAULT_TF2_GPU_IMAGE = "determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-69f397f"

TF1_CPU_IMAGE = os.environ.get("TF1_CPU_IMAGE") or DEFAULT_TF1_CPU_IMAGE
TF2_CPU_IMAGE = os.environ.get("TF2_CPU_IMAGE") or DEFAULT_TF2_CPU_IMAGE
Expand Down
4 changes: 2 additions & 2 deletions e2e_tests/tests/fixtures/pytorch_lightning_amp/apex_amp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ searcher:
entrypoint: apex_amp_model_def:MNistApexAMPTrial
environment:
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
cpu: determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
cpu: determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4

4 changes: 2 additions & 2 deletions e2e_tests/tests/fixtures/pytorch_lightning_amp/auto_amp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ searcher:
entrypoint: auto_amp_model_def:MNistAutoAMPTrial
environment:
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.1
cpu: determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-0.19.4
cpu: determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4

2 changes: 1 addition & 1 deletion examples/computer_vision/efficientdet_pytorch/const.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
description: efficientdet_const
environment:
image: determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.1
image: determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.4
hyperparameters:
global_batch_size: 16
min_loss_scale: 16.0
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
description: efficientdet_const
environment:
image: determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.1
image: determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.19.4
hyperparameters:
global_batch_size: 16
min_loss_scale: 16.0
Expand Down
2 changes: 1 addition & 1 deletion examples/computer_vision/unets_tf_keras/const.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ min_validation_period:
entrypoint: model_def:UNetsTrial
scheduling_unit: 57
environment:
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-9119094
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-69f397f
2 changes: 1 addition & 1 deletion examples/computer_vision/unets_tf_keras/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ min_validation_period:
scheduling_unit: 8
entrypoint: model_def:UNetsTrial
environment:
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-9119094
image: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-69f397f
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ searcher:
entrypoint: model_def:BoostedTreesTrial
scheduling_unit: 1
environment:
image: "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1"
image: "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4"
2 changes: 1 addition & 1 deletion examples/decision_trees/gbt_titanic_estimator/const.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ searcher:
entrypoint: model_def:BoostedTreesTrial
scheduling_unit: 1
environment:
image: "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.1"
image: "determinedai/environments:py-3.8-pytorch-1.10-tf-2.8-cpu-0.19.4"
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ environment:
# You may need to modify this to match your network configuration.
- NCCL_SOCKET_IFNAME=ens,eth,ib
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-9119094
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-69f397f
bind_mounts:
- host_path: /tmp
container_path: /data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ environment:
# You may need to modify this to match your network configuration.
- NCCL_SOCKET_IFNAME=ens,eth,ib
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-9119094
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-69f397f
bind_mounts:
- host_path: /tmp
container_path: /data
Expand Down
2 changes: 1 addition & 1 deletion examples/deepspeed/cifar10_moe/moe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ environment:
# - NCCL_BLOCKING_WAIT=1
# - NCCL_IB_DISABLE=1
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-9119094
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-69f397f
bind_mounts:
- host_path: /tmp
container_path: /data
Expand Down
2 changes: 1 addition & 1 deletion examples/deepspeed/cifar10_moe/zero_stages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ environment:
# - NCCL_BLOCKING_WAIT=1
# - NCCL_IB_DISABLE=1
image:
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-9119094
gpu: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-69f397f
bind_mounts:
- host_path: /tmp
container_path: /data
Expand Down
Loading

0 comments on commit f14e5be

Please sign in to comment.