diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md
index c7d10d1f4..cd40565cf 100644
--- a/docs/docs/concepts/tasks.md
+++ b/docs/docs/concepts/tasks.md
@@ -136,7 +136,7 @@ resources:
 </div>
 
 Nodes can communicate using their private IP addresses.
-Use `DSTACK_MASTER_NODE_IP`, `$DSTACK_NODE_RANK`, and other
+Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other
 [System environment variables](#system-environment-variables)
 to discover IP addresses and other details.
 
@@ -159,6 +159,11 @@ to discover IP addresses and other details.
       # ... The rest of the commands
     ```
 
+??? info "SSH"
+    You can log in to any node from any node via SSH on port 10022 using the `~/.ssh/dstack_job` private key.
+    For convenience, `~/.ssh/config` is preconfigured with these options, so a simple `ssh <node_ip>` is enough.
+    For a list of nodes IPs check the `DSTACK_NODES_IPS` environment variable.
+
 !!! info "Fleets"
     Distributed tasks can only run on fleets with
     [cluster placement](fleets.md#cloud-placement).
diff --git a/docs/examples.md b/docs/examples.md
index c039fda4d..fcf8c0bcf 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -155,4 +155,14 @@ hide:
             Use Docker and Docker Compose inside runs
         </p>
     </a>
+    <a href="/examples/misc/nccl-tests"
+       class="feature-cell sky">
+        <h3>
+            NCCL Tests
+        </h3>
+
+        <p>
+            Run multi-node NCCL Tests with MPI
+        </p>
+    </a>
 </div>
diff --git a/docs/examples/misc/nccl-tests/index.md b/docs/examples/misc/nccl-tests/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/misc/nccl-tests/.dstack.yml b/examples/misc/nccl-tests/.dstack.yml
new file mode 100644
index 000000000..2e3376f5c
--- /dev/null
+++ b/examples/misc/nccl-tests/.dstack.yml
@@ -0,0 +1,42 @@
+type: task
+name: nccl-tests
+
+image: un1def/aws-efa-test
+nodes: 2
+
+env:
+  - NCCL_DEBUG=INFO
+
+commands:
+  - |
+    # We use FIFO for inter-node communication
+    FIFO=/tmp/dstack_job
+    if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
+      cd /root/nccl-tests/build
+      echo "${DSTACK_NODES_IPS}" > hostfile
+      MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
+      # Wait for other nodes
+      while true; do
+        if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then
+          break
+        fi
+        echo 'Waiting for nodes...'
+        sleep 5
+      done
+      # Run NCCL Tests
+      ${MPIRUN} \
+        -n $((DSTACK_NODES_NUM * DSTACK_GPUS_PER_NODE)) -N ${DSTACK_GPUS_PER_NODE} \
+        --mca btl_tcp_if_exclude lo,docker0 \
+        --bind-to none \
+        ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
+      # Notify nodes the job is done
+      ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}"
+    else
+      mkfifo ${FIFO}
+      # Wait for a message from the first node
+      cat ${FIFO}
+    fi
+
+resources:
+  gpu: nvidia:4:16GB
+  shm_size: 16GB
diff --git a/examples/misc/nccl-tests/README.md b/examples/misc/nccl-tests/README.md
new file mode 100644
index 000000000..318312573
--- /dev/null
+++ b/examples/misc/nccl-tests/README.md
@@ -0,0 +1,79 @@
+# NCCL Tests
+
+This example shows how to run distributed [NCCL Tests :material-arrow-top-right-thin:{ .external }](https://github.com/NVIDIA/nccl-tests){:target="_blank"} with MPI using `dstack`.
+
+??? info "AWS EFA"
+    The used image is optimized for AWS [EFA :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"} but works with regular TCP/IP network adapters as well.
+
+## Configuration
+
+This configuration runs AllReduce test on 2 nodes with 4 GPUs each (8 processes total), but you can adjust both `nodes` and `resources.gpu` without modifying the script.
+
+<div editor-title="examples/misc/nccl-tests/.dstack.yml">
+
+```yaml
+type: task
+name: nccl-tests
+
+image: un1def/aws-efa-test
+nodes: 2
+
+env:
+  - NCCL_DEBUG=INFO
+
+commands:
+  - |
+    # We use FIFO for inter-node communication
+    FIFO=/tmp/dstack_job
+    if [ ${DSTACK_NODE_RANK} -eq 0 ]; then
+      cd /root/nccl-tests/build
+      echo "${DSTACK_NODES_IPS}" > hostfile
+      MPIRUN='mpirun --allow-run-as-root --hostfile hostfile'
+      # Wait for other nodes
+      while true; do
+        if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then
+          break
+        fi
+        echo 'Waiting for nodes...'
+        sleep 5
+      done
+      # Run NCCL Tests
+      ${MPIRUN} \
+        -n $((DSTACK_NODES_NUM * DSTACK_GPUS_PER_NODE)) -N ${DSTACK_GPUS_PER_NODE} \
+        --mca btl_tcp_if_exclude lo,docker0 \
+        --bind-to none \
+        ./all_reduce_perf -b 8 -e 8G -f 2 -g 1
+      # Notify nodes the job is done
+      ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}"
+    else
+      mkfifo ${FIFO}
+      # Wait for a message from the first node
+      cat ${FIFO}
+    fi
+
+resources:
+  gpu: nvidia:4:16GB
+  shm_size: 16GB
+
+```
+
+</div>
+
+### Running a configuration
+
+To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply/) command.
+
+<div class="termy">
+
+```shell
+$ dstack apply -f examples/misc/nccl-tests/.dstack.yml
+
+ #  BACKEND  REGION     INSTANCE       RESOURCES                                   SPOT  PRICE
+ 1  aws      us-east-1  g4dn.12xlarge  48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk)  no    $3.912
+ 2  aws      us-west-2  g4dn.12xlarge  48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk)  no    $3.912
+ 3  aws      us-east-2  g4dn.12xlarge  48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk)  no    $3.912
+
+Submit the run nccl-tests? [y/n]: y
+```
+
+</div>
diff --git a/mkdocs.yml b/mkdocs.yml
index 961097f02..b146eec38 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -273,6 +273,7 @@ nav:
         - Llama 3.2: examples/llms/llama32/index.md
     - Misc:
         - Docker Compose: examples/misc/docker-compose/index.md
+        - NCCL Tests: examples/misc/nccl-tests/index.md
 #  - Community: community.md
   - Partners: partners.md
   - Blog:
diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go
index 6b4b6374d..3b6ee1cef 100644
--- a/runner/internal/executor/executor.go
+++ b/runner/internal/executor/executor.go
@@ -1,6 +1,7 @@
 package executor
 
 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -30,6 +31,7 @@ type RunExecutor struct {
 	tempDir    string
 	homeDir    string
 	workingDir string
+	sshPort    int
 	uid        uint32
 
 	run             schemas.RunSpec
@@ -74,6 +76,7 @@ func NewRunExecutor(tempDir string, homeDir string, workingDir string, sshPort i
 		tempDir:    tempDir,
 		homeDir:    homeDir,
 		workingDir: workingDir,
+		sshPort:    sshPort,
 		uid:        uid,
 
 		mu:              mu,
@@ -322,15 +325,18 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
 			log.Warning(ctx, "failed to write SSH environment", "path", ex.homeDir, "err", err)
 		}
 	}
+	userSSHDir := ""
+	uid := -1
+	gid := -1
 	if user != nil && *user.Uid != 0 {
 		// non-root user
-		uid := int(*user.Uid)
-		gid := int(*user.Gid)
+		uid = int(*user.Uid)
+		gid = int(*user.Gid)
 		homeDir, isHomeDirAccessible := prepareHomeDir(ctx, uid, gid, user.HomeDir)
 		envMap["HOME"] = homeDir
 		if isHomeDirAccessible {
 			log.Trace(ctx, "provisioning homeDir", "path", homeDir)
-			userSSHDir, err := prepareSSHDir(uid, gid, homeDir)
+			userSSHDir, err = prepareSSHDir(uid, gid, homeDir)
 			if err != nil {
 				log.Warning(ctx, "failed to prepare ssh dir", "home", homeDir, "err", err)
 			} else {
@@ -354,6 +360,17 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
 	} else {
 		// root user
 		envMap["HOME"] = ex.homeDir
+		userSSHDir = filepath.Join(ex.homeDir, ".ssh")
+	}
+
+	if ex.jobSpec.SSHKey != nil && userSSHDir != "" {
+		err := configureSSH(
+			ex.jobSpec.SSHKey.Private, ex.jobSpec.SSHKey.Public, ex.clusterInfo.JobIPs, ex.sshPort,
+			uid, gid, userSSHDir,
+		)
+		if err != nil {
+			log.Warning(ctx, "failed to configure SSH", "err", err)
+		}
 	}
 
 	cmd.Env = envMap.Render()
@@ -712,6 +729,56 @@ func writeSSHEnvironment(env map[string]string, uid int, gid int, envPath string
 	return nil
 }
 
+func configureSSH(private string, public string, ips []string, port int, uid int, gid int, sshDir string) error {
+	privatePath := filepath.Join(sshDir, "dstack_job")
+	privateFile, err := os.OpenFile(privatePath, os.O_TRUNC|os.O_WRONLY|os.O_CREATE, 0o600)
+	if err != nil {
+		return err
+	}
+	defer privateFile.Close()
+	if err := os.Chown(privatePath, uid, gid); err != nil {
+		return err
+	}
+	if _, err := privateFile.WriteString(private); err != nil {
+		return err
+	}
+
+	akPath := filepath.Join(sshDir, "authorized_keys")
+	akFile, err := os.OpenFile(akPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o600)
+	if err != nil {
+		return err
+	}
+	defer akFile.Close()
+	if err := os.Chown(akPath, uid, gid); err != nil {
+		return err
+	}
+	if _, err := akFile.WriteString(public); err != nil {
+		return err
+	}
+
+	configPath := filepath.Join(sshDir, "config")
+	configFile, err := os.OpenFile(configPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o600)
+	if err != nil {
+		return err
+	}
+	defer configFile.Close()
+	if err := os.Chown(configPath, uid, gid); err != nil {
+		return err
+	}
+	var configBuffer bytes.Buffer
+	for _, ip := range ips {
+		configBuffer.WriteString(fmt.Sprintf("\nHost %s\n", ip))
+		configBuffer.WriteString(fmt.Sprintf("    Port %d\n", port))
+		configBuffer.WriteString("    StrictHostKeyChecking no\n")
+		configBuffer.WriteString("    UserKnownHostsFile /dev/null\n")
+		configBuffer.WriteString(fmt.Sprintf("    IdentityFile %s\n", privatePath))
+	}
+	if _, err := configFile.Write(configBuffer.Bytes()); err != nil {
+		return err
+	}
+	return nil
+}
+
 // A makeshift solution to deliver authorized_keys to a non-root user
 // without modifying the existing API/bootstrap process
 // TODO: implement key delivery properly, i.e. sumbit keys to and write by the runner,
diff --git a/runner/internal/schemas/schemas.go b/runner/internal/schemas/schemas.go
index 8d370e253..86318b0ec 100644
--- a/runner/internal/schemas/schemas.go
+++ b/runner/internal/schemas/schemas.go
@@ -54,6 +54,7 @@ type JobSpec struct {
 	Env            map[string]string `json:"env"`
 	SingleBranch   bool              `json:"single_branch"`
 	MaxDuration    int               `json:"max_duration"`
+	SSHKey         *SSHKey           `json:"ssh_key"`
 	WorkingDir     *string           `json:"working_dir"`
 }
 
@@ -63,6 +64,11 @@ type ClusterInfo struct {
 	GPUSPerJob  int      `json:"gpus_per_job"`
 }
 
+type SSHKey struct {
+	Private string `json:"private"`
+	Public  string `json:"public"`
+}
+
 type RepoCredentials struct {
 	CloneURL   string  `json:"clone_url"`
 	PrivateKey *string `json:"private_key"`
diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py
index c12c81918..13fef3fc7 100644
--- a/src/dstack/_internal/core/models/runs.py
+++ b/src/dstack/_internal/core/models/runs.py
@@ -178,6 +178,11 @@ class Gateway(CoreModel):
     options: dict = {}
 
 
+class JobSSHKey(CoreModel):
+    private: str
+    public: str
+
+
 class JobSpec(CoreModel):
     replica_num: int = 0  # default value for backward compatibility
     job_num: int
@@ -198,6 +203,7 @@ class JobSpec(CoreModel):
     requirements: Requirements
     retry: Optional[Retry]
     volumes: Optional[List[MountPoint]] = None
+    ssh_key: Optional[JobSSHKey] = None
     # For backward compatibility with 0.18.x when retry_policy was required.
     # TODO: remove in 0.19
     retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)
diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py
index 9b5052964..0ab4387b7 100644
--- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py
+++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -127,7 +127,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     run_model = res.unique().scalar_one()
     repo_model = run_model.repo
     project = run_model.project
-    run = run_model_to_run(run_model)
+    run = run_model_to_run(run_model, include_sensitive=True)
     job_submission = job_model_to_job_submission(job_model)
     job_provisioning_data = job_submission.job_provisioning_data
     if job_provisioning_data is None:
diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py
index 847fa0624..e0c69c750 100644
--- a/src/dstack/_internal/server/schemas/runner.py
+++ b/src/dstack/_internal/server/schemas/runner.py
@@ -64,6 +64,7 @@ class SubmitBody(CoreModel):
                 "gateway",
                 "single_branch",
                 "max_duration",
+                "ssh_key",
                 "working_dir",
             }
         ),
diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py
index d144063fa..4bb00b289 100644
--- a/src/dstack/_internal/server/services/jobs/configurators/base.py
+++ b/src/dstack/_internal/server/services/jobs/configurators/base.py
@@ -21,6 +21,7 @@
 from dstack._internal.core.models.runs import (
     AppSpec,
     JobSpec,
+    JobSSHKey,
     Requirements,
     Retry,
     RunSpec,
@@ -30,6 +31,7 @@
 from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.core.services.ssh.ports import filter_reserved_ports
 from dstack._internal.server.services.docker import ImageConfig, get_image_config
+from dstack._internal.utils import crypto
 from dstack._internal.utils.common import run_async
 from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
 
@@ -57,6 +59,8 @@ class JobConfigurator(ABC):
     TYPE: RunConfigurationType
 
     _image_config: Optional[ImageConfig] = None
+    # JobSSHKey should be shared for all jobs in a replica for inter-node communitation.
+    _job_ssh_key: Optional[JobSSHKey] = None
 
     def __init__(self, run_spec: RunSpec):
         self.run_spec = run_spec
@@ -123,6 +127,7 @@ async def _get_job_spec(
             retry=self._retry(),
             working_dir=self._working_dir(),
             volumes=self._volumes(job_num),
+            ssh_key=self._ssh_key(jobs_per_replica),
         )
         return job_spec
 
@@ -238,6 +243,17 @@ def _python(self) -> str:
     def _volumes(self, job_num: int) -> List[MountPoint]:
         return interpolate_job_volumes(self.run_spec.configuration.volumes, job_num)
 
+    def _ssh_key(self, jobs_per_replica: int) -> Optional[JobSSHKey]:
+        if jobs_per_replica < 2:
+            return None
+        if self._job_ssh_key is None:
+            private, public = crypto.generate_rsa_key_pair_bytes(comment="dstack_job")
+            self._job_ssh_key = JobSSHKey(
+                private=private.decode(),
+                public=public.decode(),
+            )
+        return self._job_ssh_key
+
 
 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],
diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py
index 44f5520a6..d4f3afce5 100644
--- a/src/dstack/_internal/server/services/runs.py
+++ b/src/dstack/_internal/server/services/runs.py
@@ -342,8 +342,11 @@ async def get_plan(
         job_offers.extend(offer for _, offer in offers)
         job_offers.sort(key=lambda offer: not offer.availability.is_available())
 
+        job_spec = job.job_spec
+        _remove_job_spec_sensitive_info(job_spec)
+
         job_plan = JobPlan(
-            job_spec=job.job_spec,
+            job_spec=job_spec,
             offers=job_offers[:50],
             total_offers=len(job_offers),
             max_price=max((offer.price for offer in job_offers), default=None),
@@ -619,7 +622,10 @@ async def delete_runs(
 
 
 def run_model_to_run(
-    run_model: RunModel, include_job_submissions: bool = True, return_in_api: bool = False
+    run_model: RunModel,
+    include_job_submissions: bool = True,
+    return_in_api: bool = False,
+    include_sensitive: bool = False,
 ) -> Run:
     jobs: List[Job] = []
     run_jobs = sorted(run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num))
@@ -634,6 +640,8 @@ def run_model_to_run(
             for job_model in job_submissions:
                 if job_spec is None:
                     job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
+                    if not include_sensitive:
+                        _remove_job_spec_sensitive_info(job_spec)
                 if include_job_submissions:
                     job_submission = job_model_to_job_submission(job_model)
                     if return_in_api:
@@ -1046,3 +1054,7 @@ async def retry_run_replica_jobs(
         # dirty hack to avoid passing all job submissions
         new_job_model.submission_num = job_model.submission_num + 1
         session.add(new_job_model)
+
+
+def _remove_job_spec_sensitive_info(spec: JobSpec):
+    spec.ssh_key = None
diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py
index 5795eaddc..a8d142934 100644
--- a/src/tests/_internal/server/routers/test_runs.py
+++ b/src/tests/_internal/server/routers/test_runs.py
@@ -214,6 +214,7 @@ def get_dev_env_run_plan_dict(
                     },
                     "retry": None,
                     "volumes": volumes,
+                    "ssh_key": None,
                     "retry_policy": {"retry": False, "duration": None},
                     "working_dir": ".",
                 },
@@ -374,6 +375,7 @@ def get_dev_env_run_dict(
                     },
                     "retry": None,
                     "volumes": [],
+                    "ssh_key": None,
                     "retry_policy": {"retry": False, "duration": None},
                     "working_dir": ".",
                 },
diff --git a/src/tests/_internal/server/services/jobs/configurators/test_task.py b/src/tests/_internal/server/services/jobs/configurators/test_task.py
new file mode 100644
index 000000000..4515209e6
--- /dev/null
+++ b/src/tests/_internal/server/services/jobs/configurators/test_task.py
@@ -0,0 +1,35 @@
+from unittest.mock import patch
+
+import pytest
+
+from dstack._internal.core.models.configurations import TaskConfiguration
+from dstack._internal.core.models.runs import JobSSHKey
+from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
+from dstack._internal.server.testing.common import get_run_spec
+
+
+@pytest.mark.asyncio
+@pytest.mark.usefixtures("image_config_mock")
+class TestTaskJobConfigurator:
+    async def test_ssh_key_single_node(self):
+        configuration = TaskConfiguration(nodes=1, image="debian")
+        run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
+        configurator = TaskJobConfigurator(run_spec)
+
+        job_specs = await configurator.get_job_specs(replica_num=0)
+
+        assert len(job_specs) == 1
+        assert job_specs[0].ssh_key is None
+
+    async def test_ssh_key_multi_node(self):
+        configuration = TaskConfiguration(nodes=2, image="debian")
+        run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration)
+        configurator = TaskJobConfigurator(run_spec)
+
+        with patch("dstack._internal.utils.crypto.generate_rsa_key_pair_bytes") as gen_mock:
+            gen_mock.side_effect = [(b"private1", b"public1"), (b"private2", b"public2")]
+            job_specs = await configurator.get_job_specs(replica_num=0)
+
+        assert len(job_specs) == 2
+        assert job_specs[0].ssh_key == JobSSHKey(private="private1", public="public1")
+        assert job_specs[1].ssh_key == JobSSHKey(private="private1", public="public1")