33 changes: 33 additions & 0 deletions skypilot/README.md
@@ -0,0 +1,33 @@
# SkyPilot Configuration Examples

The `config-examples/` directory contains example SkyPilot configuration files demonstrating different use cases for running workloads on CoreWeave infrastructure.

## Configuration Examples

### 1. mydevpod.yaml

A development environment configuration that sets up a containerized workspace for interactive development and testing.

**Use Case:** Interactive development, experimentation, and testing with GPU acceleration.
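
A typical workflow might look like the following (a sketch; the cluster name `mydevpod` is just an illustration):

```bash
# Launch the dev pod as a named cluster
sky launch -c mydevpod mydevpod.yaml

# SSH into the running environment for interactive work
ssh mydevpod

# Tear the cluster down when finished to release the GPU
sky down mydevpod
```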

### 2. vllm.yaml

A production-ready configuration for deploying vLLM inference servers with OpenAI-compatible API endpoints.

**Use Case:** Production inference serving with OpenAI-compatible API for language models.
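
Once the server is up, any OpenAI-compatible client can query it. A minimal sketch, assuming a recent SkyPilot CLI for the endpoint lookup and the `facebook/opt-125m` model served by this example:

```bash
# Launch the server and look up its endpoint on port 8000
sky launch -c vllm vllm.yaml
ENDPOINT=$(sky status --endpoint 8000 vllm)

# Query the OpenAI-compatible completions API
curl http://$ENDPOINT/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "facebook/opt-125m", "prompt": "Hello, my name is", "max_tokens": 32}'
```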

### 3. distributed_training.yaml

A multi-node distributed training configuration using PyTorch's Distributed Data Parallel (DDP) framework.

**Use Case:** Large-scale distributed training across multiple nodes for computationally intensive models.
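
The node count comes from `num_nodes` in the YAML but can be overridden at launch time; a sketch using standard SkyPilot flags:

```bash
# Launch the 2-node job as written in the YAML
sky launch -c mingpt distributed_training.yaml

# Or scale out without editing the file (overrides num_nodes)
sky launch -c mingpt --num-nodes 4 distributed_training.yaml

# Stream the training output from the head node
sky logs mingpt
```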

## Getting Started

To use any of these configurations:

1. Ensure you have SkyPilot installed and configured for CoreWeave
2. Modify the configuration parameters as needed for your specific requirements
3. Launch the configuration using: `sky launch <config-file.yaml>` (common follow-up commands are sketched below)
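
After launching, the usual SkyPilot lifecycle commands apply:

```bash
sky status           # List clusters and their state
sky logs <cluster>   # Stream the setup/run output
sky down <cluster>   # Tear the cluster down when finished
```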

For more information on SkyPilot and CoreWeave integration, refer to the main documentation.
40 changes: 40 additions & 0 deletions skypilot/config-examples/distributed_training.yaml
@@ -0,0 +1,40 @@
name: minGPT-ddp

resources:
  cpus: 8+
  accelerators: H100_NVLINK_80GB:8 # Modify to match your resources
  image_id: docker:ghcr.io/coreweave/nccl-tests:12.8.1-devel-ubuntu22.04-nccl2.26.2-1-0708d2e
  network_tier: best # Automatically requests rdma/ib: 1 resource and sets env vars

num_nodes: 2

setup: |
  git clone --depth 1 https://github.com/pytorch/examples || true
  cd examples
  git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp
  uv venv --python 3.10
  source .venv/bin/activate
  uv pip install -r requirements.txt "numpy<2" "torch==2.7.1+cu118" --extra-index-url https://download.pytorch.org/whl/cu118

run: |
  cd examples
  source .venv/bin/activate
  cd mingpt
  export LOGLEVEL=INFO

  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
  echo "Starting distributed training, head node: $MASTER_ADDR"

  # Explicit check for torchrun
  if ! command -v torchrun >/dev/null 2>&1; then
    echo "ERROR: torchrun command not found" >&2
    exit 1
  fi

  torchrun \
    --nnodes=${SKYPILOT_NUM_NODES} \
    --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \
    --master_addr=${MASTER_ADDR} \
    --master_port=8008 \
    --node_rank=${SKYPILOT_NODE_RANK} \
    main.py
41 changes: 41 additions & 0 deletions skypilot/config-examples/mydevpod.yaml
@@ -0,0 +1,41 @@
name: mydevpod

resources:
  # Modify the values below to request different resources
  accelerators: H100_NVLINK_80GB:1 # Use 1 H100
  image_id: docker:ghcr.io/coreweave/ml-containers/nightly-torch-extras:8b6c417-base-25110205-cuda12.9.1-ubuntu22.04-torch2.10.0a0-vision0.25.0a0-audio2.10.0a0
  memory: 32+ # Request at least 32 GB of RAM

file_mounts:
  /my_data: # Mount a storage bucket at /my_data in the container
    source: cw://skypilot # Change this to your bucket name
    mode: MOUNT # MOUNT, COPY, or MOUNT_CACHED. Defaults to MOUNT. Optional.

# Sync my-code/ on the local machine to ~/sky_workdir in the container
workdir: ./my-code

# Environment variables to set in the container.
# These are needed to access CoreWeave Object Storage using the AWS CLI.
envs:
  AWS_SHARED_CREDENTIALS_FILE: "~/.coreweave/cw.credentials"
  AWS_CONFIG_FILE: "~/.coreweave/cw.config"
  AWS_PROFILE: "cw"

# Setup commands to run in the container before 'run'.
# Here we install the AWS CLI to access storage.
setup: |
  echo "Setting up test storage environment..."
  # Install the AWS CLI
  apt install -y python3.10-venv
  curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
  unzip awscli-bundle.zip
  sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws
  echo "export AWS_CONFIG_FILE=$AWS_CONFIG_FILE" >> ~/.bashrc
  echo "export AWS_SHARED_CREDENTIALS_FILE=$AWS_SHARED_CREDENTIALS_FILE" >> ~/.bashrc
  echo "export AWS_PROFILE=$AWS_PROFILE" >> ~/.bashrc

run: |
  echo "Starting container..."
  echo "Node info:"
  nvidia-smi
  echo "Available GPUs: $(nvidia-smi --list-gpus | wc -l)"
  echo "Container ready for testing..."
30 changes: 30 additions & 0 deletions skypilot/config-examples/vllm.yaml
@@ -0,0 +1,30 @@

resources:
  # Define the resources needed for the vLLM server; here we use 1 H100 GPU
  accelerators: H100_NVLINK_80GB:1
  # Ensure sufficient memory for the OPT-125M model and vLLM overhead
  memory: 16+
  # Use the latest vLLM Docker image
  image_id: docker:vllm/vllm-openai:latest
  # Expose the port for the OpenAI-compatible API
  ports: 8000

service:
  readiness_probe: /health
  replicas: 2

# Invoked under the workdir (i.e., can use its files).
setup: |
  echo "Setting up vLLM environment..."
  # The container image already has vLLM installed

# Invoked under the workdir; here it starts the inference server.
run: |
  echo "Starting vLLM OpenAI-compatible server..."
  python3 -m vllm.entrypoints.openai.api_server \
    --model facebook/opt-125m \
    --host 0.0.0.0 \
    --port 8000