Merged

44 commits
826ddde
Import gateway code, add README
Egor-S Dec 20, 2023
c676c5d
Test Store.register agains failures
Egor-S Dec 20, 2023
acecc6f
Refactor rollbacks with AsyncExitStack
Egor-S Dec 20, 2023
b3cc3f8
Register entrypoint
Egor-S Dec 20, 2023
e48a2b9
Disable SSH tunnel from the runner
Egor-S Dec 21, 2023
952dc31
Pick the latest runner build from the same branch
Egor-S Dec 21, 2023
c33a5d8
Fix get_latest_runner_build
Egor-S Dec 21, 2023
bc3bdd5
Add gateway TODOs
Egor-S Dec 22, 2023
15f1882
Add gateway-build job to build.yml
Egor-S Dec 22, 2023
8e512d9
Fix python version
Egor-S Dec 22, 2023
0ef4ac5
Fix dependencies
Egor-S Dec 22, 2023
f95fa88
Handle in get_latest_runner_build if commit is not fetched
Egor-S Dec 25, 2023
b33808b
Implement GatewayClient
Egor-S Dec 25, 2023
b187aff
Implement gateway tunnel
Egor-S Dec 25, 2023
430641e
Register a service on RUNNING status
Egor-S Dec 26, 2023
7a9cab7
Implement gateway preflight: check conflicts, issue SSL certificate
Egor-S Dec 26, 2023
1dfe5ff
Enable ServerAliveInterval
Egor-S Dec 26, 2023
54caec6
Add openai to service configuration
Egor-S Dec 27, 2023
d7d9d67
Pull tokenizer_config from HF repo
Egor-S Dec 27, 2023
0e193a1
Deploy gateway as systemd service
Egor-S Dec 28, 2023
e51ae1c
Remove www-data from gateway cloud-config
Egor-S Dec 28, 2023
8d669b2
Install dstack.gateway on gateway provisioning
Egor-S Dec 28, 2023
8f277ec
Fix gateway provisioning after testing
Egor-S Jan 8, 2024
719c0c9
Fix GCP gateway deletion
Egor-S Jan 8, 2024
9e92a01
Update server_names_hash_bucket_size on gateway creation
Egor-S Jan 8, 2024
49d7e7d
Fix GatewayComputeModel.ssh_public_key
Egor-S Jan 8, 2024
2e908c8
Update ARCHITECTURE.md
Egor-S Jan 8, 2024
4e7332b
Unregister service on instance termination
Egor-S Jan 8, 2024
11b27a3
Merge branch 'master' into issue-799-gateway-app
Egor-S Jan 9, 2024
f117098
Use gateway.<domain> for OpenAI entrypoint
Egor-S Jan 9, 2024
ee136ec
Add GatewayComputeModel.deleted flag
Egor-S Jan 9, 2024
b16fd54
Drop unused code
Egor-S Jan 9, 2024
ea7f36d
Update gateways on server start
Egor-S Jan 9, 2024
a7ecdb8
Build gateway in release
Egor-S Jan 9, 2024
7c443e4
Preserve gateway state on restart
Egor-S Jan 10, 2024
6e9dd20
Set ConnectTimeout on gateway update
Egor-S Jan 10, 2024
60b7fca
Attach only if provisioning was successful
Egor-S Jan 11, 2024
e4591f2
Merge branch 'master' into issue-799-gateway-app
Egor-S Jan 11, 2024
9ca201d
Add service_url to Run
Egor-S Jan 11, 2024
7232efc
Address review comments
Egor-S Jan 11, 2024
142393f
Bump gpuhunt version
Egor-S Jan 11, 2024
d3d164d
Use python 3.10 for dstack-gateway
Egor-S Jan 15, 2024
bc79fd9
Small gateway fixes
Egor-S Jan 15, 2024
cb1fce3
Add openai interface docs
Egor-S Jan 15, 2024
29 changes: 29 additions & 0 deletions .github/workflows/build.yml
@@ -175,3 +175,32 @@ jobs:
aws s3 cp configuration.json "s3://dstack-runner-downloads-stgn/latest/schemas/configuration.json" --acl public-read
aws s3 cp profiles.json "s3://dstack-runner-downloads-stgn/$VERSION/schemas/profiles.json" --acl public-read
aws s3 cp profiles.json "s3://dstack-runner-downloads-stgn/latest/schemas/profiles.json" --acl public-read

gateway-build:
runs-on: ubuntu-latest
defaults:
run:
working-directory: gateway
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: 3.11
- name: Install AWS
run: pip install awscli
- name: Install dependencies
run: pip install wheel build
- name: Compute version
run: echo VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }})) > $GITHUB_ENV
- name: Build package
run: |
echo "__version__ = \"${{ env.VERSION }}\"" > src/dstack/gateway/version.py
python -m build .
- name: Upload to S3
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
WHEEL=dstack_gateway-${{ env.VERSION }}-py3-none-any.whl
aws s3 cp dist/$WHEEL "s3://dstack-gateway-downloads/stgn/$WHEEL"
29 changes: 29 additions & 0 deletions .github/workflows/release.yml
@@ -216,3 +216,32 @@ jobs:
aws s3 cp configuration.json "s3://dstack-runner-downloads/latest/schemas/configuration.json" --acl public-read
aws s3 cp profiles.json "s3://dstack-runner-downloads/$VERSION/schemas/profiles.json" --acl public-read
aws s3 cp profiles.json "s3://dstack-runner-downloads/latest/schemas/profiles.json" --acl public-read

gateway-build:
runs-on: ubuntu-latest
defaults:
run:
working-directory: gateway
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: 3.11
- name: Install AWS
run: pip install awscli
- name: Install dependencies
run: pip install wheel build
- name: Store version
run: echo VERSION=${GITHUB_REF#refs/tags/} > $GITHUB_ENV
- name: Build package
run: |
echo "__version__ = \"${{ env.VERSION }}\"" > src/dstack/gateway/version.py
python -m build .
- name: Upload to S3
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
WHEEL=dstack_gateway-${{ env.VERSION }}-py3-none-any.whl
aws s3 cp dist/$WHEEL "s3://dstack-gateway-downloads/release/$WHEEL"
11 changes: 9 additions & 2 deletions ARCHITECTURE.md
@@ -3,13 +3,14 @@

## Overview

The `dstack` platform consists of five major components:
The `dstack` platform consists of six major components:

* the server
* the Python API
* the CLI
* the runner
* the shim
* the gateway (optional)

The server provides an HTTP API for submitting runs and managing all of the `dstack` functionality including users, projects, backends, repos, secrets, and gateways.

@@ -19,6 +20,8 @@ When the server provisions a cloud instance for a run, it launches a Docker imag

The shim may be or may not be present depending on which type of cloud is used. If it's a GPU cloud that provides an API for running Docker images, then no shim is required. If it's a traditional cloud that provisions VMs, then the shim is started on the VM launch. It pulls and runs the Docker image, controls its execution, and implements any cloud-specific functionality such as terminating the instance.

The gateway makes jobs available via a public URL. It works like a reverse proxy that forwards requests to the job instance via an SSH tunnel.
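
The reverse-proxy arrangement can be pictured with a minimal nginx server block (a hypothetical sketch — the hostname and ports are illustrative, not taken from this PR): the service's public hostname is proxied to a gateway-local port that an SSH tunnel binds to the job's service port.

```nginx
server {
    listen 443 ssl;
    server_name myservice.example.com;  # illustrative service hostname

    location / {
        # 3000 is the gateway-local end of an SSH tunnel
        # that forwards to the service port on the job instance
        proxy_pass http://localhost:3000;
    }
}
```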

## Implementation of dstack run

When a user invokes `dstack run`, the CLI first sends the run configuration and other profile parameters to the server to get the run plan. The server iterates over configured backends to get all instance offers matching the requirements and their availability. If the user is willing to proceed with the offers suggested, the CLI uploads the code from the user's machine to the server and submits the run configuration.
@@ -77,4 +80,8 @@ The server is a FastAPI app backed by SQLite. The runner and shim are written i
* `_public` – the implementation of the high-level Python API
* `server` – the low-level Python API (a Python wrapper around server's HTTP API)
* `core/` – core Python API modules (e.g. `dstack` errors)
* `tests/`
* `gateway/src/dstack/gateway` – source code for the gateway application
* `openai/` – OpenAI API proxy
* `registry/` – gateway services registry
* `systemd/` – systemd service files
68 changes: 68 additions & 0 deletions docs/docs/guides/services.md
@@ -104,6 +104,74 @@ Once the service is deployed, its endpoint will be available at

[//]: # (TODO: Example)

### Enable OpenAI interface

To use your model via the OpenAI-compatible interface, extend the service configuration with a `model` section.

<div editor-title="mistral_openai.dstack.yml">

```yaml
type: service

image: ghcr.io/huggingface/text-generation-inference:1.3

env:
- MODEL_ID=TheBloke/Mistral-7B-OpenOrca-AWQ

port: 8000

commands:
- text-generation-launcher --hostname 0.0.0.0 --port 8000 --quantize awq --max-input-length 3696 --max-total-tokens 4096 --max-batch-prefill-tokens 4096

model:
type: chat
name: TheBloke/Mistral-7B-OpenOrca-AWQ
format: tgi
```

</div>

!!! info "Experimental feature"
The OpenAI interface is an experimental feature.
Only TGI chat models are supported at the moment.
Streaming is not supported yet.

Run the configuration. Text Generation Inference requires a GPU with compute capability 8.0 or higher, e.g., L4 or A100.

<div class="termy">

```shell
$ dstack run . -f mistral_openai.dstack.yml --gpu L4
```

</div>

Once the service is deployed,
the OpenAI-compatible interface will be available at `https://gateway.<domain-name>` for all models deployed in the project.

The example below shows how to use the model with the `openai` Python library:

<div editor-title="mistral_complete.py">

```python
from openai import OpenAI

client = OpenAI(
base_url="https://gateway.<domain-name>",
api_key="none",
)
r = client.chat.completions.create(
model="TheBloke/Mistral-7B-OpenOrca-AWQ",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Three main ingredients of a burger in one sentence?"}
],
)
print(r)
```

</div>

What's next?

1. Check the [Text Generation Inference](../../learn/tgi.md) and [vLLM](../../learn/vllm.md) examples
29 changes: 29 additions & 0 deletions gateway/README.md
@@ -0,0 +1,29 @@
# dstack gateway

## Purpose

* Make dstack services available to the outside world
* Manage SSL certificates
* Manage nginx configs
* Establish SSH tunnels from gateway to dstack runner
* Proxy OpenAI API requests to different formats (e.g. TGI)

## Development

1. Build the wheel:
```shell
python -m build .
```
2. Upload the wheel:
```shell
scp dist/dstack_gateway-0.0.0-py3-none-any.whl ubuntu@${GATEWAY}:/tmp/
```
3. Install the wheel:
```shell
ssh ubuntu@${GATEWAY} "pip install --force-reinstall /tmp/dstack_gateway-0.0.0-py3-none-any.whl"
```
4. Run the tunnel and the gateway:
```shell
ssh -L 9001:localhost:8000 -t ubuntu@${GATEWAY} "uvicorn dstack.gateway.main:app"
```
5. Visit the gateway docs page at http://localhost:9001/docs
24 changes: 24 additions & 0 deletions gateway/pyproject.toml
@@ -0,0 +1,24 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "dstack-gateway"
authors = [
{ name = "dstack GmbH" },
]
requires-python = ">=3.10"
dynamic = ["version"]
dependencies = [
"fastapi",
"pydantic >=2.0.0",
"httpx",
"jinja2",
"uvicorn",
]

[tool.setuptools.package-data]
"dstack.gateway" = ["systemd/resources/*"]

[tool.setuptools.dynamic]
version = {attr = "dstack.gateway.version.__version__"}
Empty file added gateway/src/dstack/__init__.py
Empty file.
Empty file.
11 changes: 11 additions & 0 deletions gateway/src/dstack/gateway/common.py
@@ -0,0 +1,11 @@
import asyncio
import functools
from typing import Callable, ParamSpec, TypeVar

R = TypeVar("R")
P = ParamSpec("P")


async def run_async(func: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R:
func_with_args = functools.partial(func, *args, **kwargs)
return await asyncio.get_running_loop().run_in_executor(None, func_with_args)
28 changes: 28 additions & 0 deletions gateway/src/dstack/gateway/errors.py
@@ -0,0 +1,28 @@
from fastapi import HTTPException


class GatewayError(Exception):
def http(self, code: int = 400, **kwargs) -> HTTPException:
return HTTPException(
code,
{
"error": self.__class__.__name__,
"message": str(self),
**kwargs,
},
)


class SSHError(GatewayError):
pass


class NotFoundError(HTTPException):
def __init__(self, message: str = "Not found", **kwargs):
super().__init__(
404,
{
"message": message,
**kwargs,
},
)
11 changes: 11 additions & 0 deletions gateway/src/dstack/gateway/logging.py
@@ -0,0 +1,11 @@
import logging


def configure_logging(level: int = logging.INFO):
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = logging.StreamHandler()
handler.setFormatter(formatter)

logger = logging.getLogger("dstack.gateway")
logger.setLevel(level)
logger.addHandler(handler)
43 changes: 43 additions & 0 deletions gateway/src/dstack/gateway/main.py
@@ -0,0 +1,43 @@
import logging
from contextlib import asynccontextmanager

import pydantic_core
from fastapi import FastAPI

import dstack.gateway.openai.store as openai_store
import dstack.gateway.version
from dstack.gateway.logging import configure_logging
from dstack.gateway.openai.routes import router as openai_router
from dstack.gateway.registry.routes import router as registry_router
from dstack.gateway.services.persistent import save_persistent_state
from dstack.gateway.services.store import get_store


@asynccontextmanager
async def lifespan(app: FastAPI):
store = get_store()
openai = openai_store.get_store()
await store.subscribe(openai)
yield

async with store._lock, store.nginx._lock, openai._lock:
# Store the state between restarts
save_persistent_state(
pydantic_core.to_json(
{
"store": store,
"openai": openai,
}
)
)


configure_logging(logging.DEBUG)
app = FastAPI(lifespan=lifespan)
app.include_router(registry_router, prefix="/api/registry")
app.include_router(openai_router, prefix="/api/openai")


@app.get("/")
def get_info():
return {"version": dstack.gateway.version.__version__}
Empty file.
18 changes: 18 additions & 0 deletions gateway/src/dstack/gateway/openai/clients/__init__.py
@@ -0,0 +1,18 @@
from abc import ABC, abstractmethod
from typing import AsyncIterator

from dstack.gateway.openai.schemas import (
ChatCompletionsChunk,
ChatCompletionsRequest,
ChatCompletionsResponse,
)


class ChatCompletionsClient(ABC):
@abstractmethod
async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse:
pass

@abstractmethod
async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]:
pass
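
As a hypothetical sketch of how a concrete client could satisfy this interface — the request/response types below are simplified stand-ins, not the real pydantic models from `dstack.gateway.openai.schemas`, and `EchoClient` is a toy implementation for illustration only:

```python
import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import AsyncIterator


# Simplified stand-ins for the real schemas
@dataclass
class ChatCompletionsRequest:
    messages: list


@dataclass
class ChatCompletionsResponse:
    content: str


@dataclass
class ChatCompletionsChunk:
    delta: str


class ChatCompletionsClient(ABC):
    @abstractmethod
    async def generate(self, request: ChatCompletionsRequest) -> ChatCompletionsResponse: ...

    @abstractmethod
    async def stream(self, request: ChatCompletionsRequest) -> AsyncIterator[ChatCompletionsChunk]: ...


class EchoClient(ChatCompletionsClient):
    """Toy client that echoes the last message back."""

    async def generate(self, request):
        return ChatCompletionsResponse(content=request.messages[-1])

    async def stream(self, request):
        # An async generator satisfies the AsyncIterator contract.
        for word in request.messages[-1].split():
            yield ChatCompletionsChunk(delta=word)


async def main():
    client = EchoClient()
    resp = await client.generate(ChatCompletionsRequest(messages=["hello world"]))
    chunks = [c.delta async for c in client.stream(ChatCompletionsRequest(messages=["hello world"]))]
    return resp.content, chunks


result, chunks = asyncio.run(main())
print(result)  # hello world
print(chunks)  # ['hello', 'world']
```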