diff --git a/skypilot/README.md b/skypilot/README.md new file mode 100644 index 0000000..93d7db3 --- /dev/null +++ b/skypilot/README.md @@ -0,0 +1,33 @@ +# SkyPilot Configuration Examples + +The `config-examples/` directory contains example SkyPilot configuration files demonstrating different use cases for running workloads on CoreWeave infrastructure. + +## Configuration Examples + +### 1. mydevpod.yaml + +A development environment configuration that sets up a containerized workspace for interactive development and testing. + +**Use Case:** Interactive development, experimentation, and testing with GPU acceleration. + +### 2. vllm.yaml + +A production-ready configuration for deploying vLLM inference servers with OpenAI-compatible API endpoints. + +**Use Case:** Production inference serving with OpenAI-compatible API for language models. + +### 3. distributed_training.yaml + +A multi-node distributed training configuration using PyTorch's Distributed Data Parallel (DDP) framework. + +**Use Case:** Large-scale distributed training across multiple nodes for computationally intensive models. + +## Getting Started + +To use any of these configurations: + +1. Ensure you have SkyPilot installed and configured for CoreWeave +2. Modify the configuration parameters as needed for your specific requirements +3. Launch the configuration using: `sky launch <config-file.yaml>` + +For more information on SkyPilot and CoreWeave integration, refer to the main documentation. 
\ No newline at end of file diff --git a/skypilot/config-examples/distributed_training.yaml b/skypilot/config-examples/distributed_training.yaml new file mode 100644 index 0000000..f0f4cf2 --- /dev/null +++ b/skypilot/config-examples/distributed_training.yaml @@ -0,0 +1,40 @@ +name: minGPT-ddp + +resources: + cpus: 8+ + accelerators: H100_NVLINK_80GB:8 # Modify to match your resources + image_id: docker:ghcr.io/coreweave/nccl-tests:12.8.1-devel-ubuntu22.04-nccl2.26.2-1-0708d2e + network_tier: best # Automatically requests rdma/ib: 1 resource and sets env vars + +num_nodes: 2 + +setup: | + git clone --depth 1 https://github.com/pytorch/examples || true + cd examples + git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp + uv venv --python 3.10 + source .venv/bin/activate + uv pip install -r requirements.txt "numpy<2" "torch==2.7.1+cu118" --extra-index-url https://download.pytorch.org/whl/cu118 + +run: | + cd examples + source .venv/bin/activate + cd mingpt + export LOGLEVEL=INFO + + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + # Explicit check for torchrun + if ! 
command -v torchrun >/dev/null 2>&1; then + echo "ERROR: torchrun command not found" >&2 + exit 1 + fi + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=8008 \ + --node_rank=${SKYPILOT_NODE_RANK} \ + main.py diff --git a/skypilot/config-examples/mydevpod.yaml b/skypilot/config-examples/mydevpod.yaml new file mode 100644 index 0000000..79450ec --- /dev/null +++ b/skypilot/config-examples/mydevpod.yaml @@ -0,0 +1,41 @@ +name: mydevpod + +resources: + # Modify the settings below to request different resources + accelerators: H100_NVLINK_80GB:1 # Use 1 H100 + image_id: docker:ghcr.io/coreweave/ml-containers/nightly-torch-extras:8b6c417-base-25110205-cuda12.9.1-ubuntu22.04-torch2.10.0a0-vision0.25.0a0-audio2.10.0a0 + memory: 32+ # Request at least 32GB of RAM + +file_mounts: + /my_data: # Mount storage bucket to /my_data in the container + source: cw://skypilot # Change this to be your bucket name + mode: MOUNT # MOUNT or COPY or MOUNT_CACHED. Defaults to MOUNT. Optional. +# Sync data in my-code/ on local machine to ~/sky_workdir in the container +workdir: ./my-code + +# Environment variables to set in the container +# These are needed to access CoreWeave Object Storage using the AWS CLI +envs: + AWS_SHARED_CREDENTIALS_FILE: "~/.coreweave/cw.credentials" + AWS_CONFIG_FILE: "~/.coreweave/cw.config" + AWS_PROFILE: "cw" + +# Any setup commands to run in the container before 'run' +# Here we install the AWS CLI to access storage +setup: | + echo "Setting up test storage environment..." 
+ # Install AWS CLI + apt install python3.10-venv -y + curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip" + unzip awscli-bundle.zip + sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws + echo export AWS_CONFIG_FILE=$AWS_CONFIG_FILE >> ~/.bashrc + echo export AWS_SHARED_CREDENTIALS_FILE=$AWS_SHARED_CREDENTIALS_FILE >> ~/.bashrc + echo export AWS_PROFILE=$AWS_PROFILE >> ~/.bashrc + +run: | + echo "Starting container..." + echo "Node info:" + nvidia-smi + echo "Available GPUs: $(nvidia-smi --list-gpus | wc -l)" + echo "Container ready for testing..." \ No newline at end of file diff --git a/skypilot/config-examples/vllm.yaml b/skypilot/config-examples/vllm.yaml new file mode 100644 index 0000000..983f5ef --- /dev/null +++ b/skypilot/config-examples/vllm.yaml @@ -0,0 +1,30 @@ + +resources: + ports: 8000 + # Define the resources needed for the vLLM server, here we use 1 H100 GPU + accelerators: H100_NVLINK_80GB:1 + # Ensure sufficient memory for OPT-125M model and vLLM overhead + memory: 16+ + # Use latest vLLM Docker image + image_id: docker:vllm/vllm-openai:latest + # Expose port for OpenAI-compatible API + +service: + readiness_probe: /health + replicas: 2 + + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "Setting up vLLM environment..." + # Container already has vLLM installed + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "Starting vLLM OpenAI-compatible server..." + python3 -m vllm.entrypoints.openai.api_server \ + --model facebook/opt-125m \ + --host 0.0.0.0 \ + --port 8000