Added ROCm builds and documentation
cromefire committed Dec 10, 2023
1 parent 7361e6c commit 5b0816c
Showing 13 changed files with 280 additions and 27 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -1,2 +1,8 @@
.idea
ci
clients
.github
python
**/target
**/node_modules
website
17 changes: 11 additions & 6 deletions .github/workflows/docker.yml → .github/workflows/docker-cuda.yml
@@ -1,4 +1,4 @@
name: Create and publish docker image
name: Create and publish CUDA docker image

on:
workflow_dispatch:
@@ -50,7 +50,10 @@ jobs:

# Workaround: https://github.com/docker/build-push-action/issues/461
- name: Setup Docker buildx
uses: docker/setup-buildx-action@v2.0.0
uses: docker/setup-buildx-action@v3.0.0
with:
# Needed to support OCI annotations
version: v0.12.0

# Login against a Docker registry except on PR
# https://github.com/docker/login-action
@@ -78,12 +81,14 @@ jobs:

- name: Docker meta
id: meta
uses: docker/metadata-action@v4
uses: docker/metadata-action@v5.0.0
with:
# list of Docker images to use as base name for tags
images: |
ghcr.io/${{ env.IMAGE_NAME }}
ghcr.io/${{ env.IMAGE_NAME }}/cuda
${{ env.IMAGE_NAME }}
${{ env.IMAGE_NAME }}-cuda
# generate Docker tags based on the following events/attributes
tags: |
type=raw,value={{branch}}-{{sha}},enable=${{ startsWith(github.ref, 'refs/heads') }}
@@ -95,13 +100,14 @@ jobs:
# https://github.com/docker/build-push-action
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v3.1.1
uses: docker/build-push-action@v5.1.0
with:
file: Dockerfile
file: cuda.Dockerfile
push: true
context: .
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
cache-from: ${{ steps.cache.outputs.cache-from }}
cache-to: ${{ steps.cache.outputs.cache-to }}
build-args: RUST_TOOLCHAIN=${{ env.RUST_TOOLCHAIN }}
@@ -112,4 +118,3 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
repository: tabbyml/tabby

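With this change the CUDA image is published both under a `/cuda` path on ghcr.io and a `-cuda` suffix on Docker Hub. As a quick sanity check of the tagging scheme above, the nightly schedule should yield pulls like the following (a sketch; the tag assumes the `type=schedule,pattern=nightly` rule has run):

```bash
# Pull the CUDA image from Docker Hub (suffix naming)...
docker pull tabbyml/tabby-cuda:nightly
# ...or from the GitHub Container Registry (path naming)
docker pull ghcr.io/tabbyml/tabby/cuda:nightly
```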
119 changes: 119 additions & 0 deletions .github/workflows/docker-rocm.yml
@@ -0,0 +1,119 @@
name: Create and publish ROCm docker image

on:
workflow_dispatch:
schedule:
- cron: '0 20 */1 * *'
push:
tags:
- 'v*'

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}

# If enabled, this cancels any in-progress run and starts the latest one
cancel-in-progress: true

env:
RUST_TOOLCHAIN: 1.73.0

jobs:
release-docker:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
# This is used to complete the identity challenge
# with sigstore/fulcio when running outside of PRs.
id-token: write

steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# This might remove tools that are actually needed
# if set to "true", but it frees about 6 GB
tool-cache: true

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: false
swap-storage: true

- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: recursive

# Workaround: https://github.com/docker/build-push-action/issues/461
- name: Setup Docker buildx
uses: docker/setup-buildx-action@v3.0.0
with:
# Needed to support OCI annotations
version: v0.12.0

# Login against a Docker registry except on PR
# https://github.com/docker/login-action
- name: Log into GitHub Container registry
uses: docker/login-action@v2.0.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Log into Docker Hub
uses: docker/login-action@v2.0.0
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Generate image name
run: |
echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}" >>${GITHUB_ENV}
- uses: int128/docker-build-cache-config-action@v1
id: cache
with:
image: ghcr.io/${{ env.IMAGE_NAME }}/cache

- name: Docker meta
id: meta
uses: docker/metadata-action@v5.0.0
with:
# list of Docker images to use as base name for tags
images: |
ghcr.io/${{ env.IMAGE_NAME }}/rocm
${{ env.IMAGE_NAME }}-rocm
variant: rocm
# generate Docker tags based on the following events/attributes
tags: |
type=raw,value={{branch}}-{{sha}},enable=${{ startsWith(github.ref, 'refs/heads') }}
type=schedule,pattern=nightly
type=schedule,pattern={{date 'YYYYMMDD'}}
type=semver,pattern={{version}}
# Build and push Docker image with Buildx (don't push on PR)
# https://github.com/docker/build-push-action
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v5.1.0
with:
file: rocm.Dockerfile
push: true
context: .
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
cache-from: ${{ steps.cache.outputs.cache-from }}
cache-to: ${{ steps.cache.outputs.cache-to }}
build-args: RUST_TOOLCHAIN=${{ env.RUST_TOOLCHAIN }}

- name: Docker Hub Description
uses: peter-evans/dockerhub-description@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
repository: tabbyml/tabby
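The `Generate image name` step above leans on bash's `${VAR,,}` parameter expansion (bash 4+) to lower-case the repository slug, since Docker image names must be lowercase while `GITHUB_REPOSITORY` keeps the repository's original casing. A standalone sketch of what that expansion does:

```bash
#!/usr/bin/env bash
# ",," lower-cases the entire expansion; Docker rejects
# image names containing uppercase characters.
GITHUB_REPOSITORY="TabbyML/Tabby"          # example value, as GitHub would set it
echo "IMAGE_NAME=${GITHUB_REPOSITORY,,}"   # prints: IMAGE_NAME=tabbyml/tabby
```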
7 changes: 6 additions & 1 deletion .github/workflows/release.yml
@@ -26,7 +26,7 @@ jobs:
container: ${{ matrix.container }}
strategy:
matrix:
binary: [aarch64-apple-darwin, x86_64-manylinux2014, x86_64-manylinux2014-cuda117]
binary: [aarch64-apple-darwin, x86_64-manylinux2014, x86_64-manylinux2014-cuda117, x86_64-manylinux2014-rocm5.7]
include:
- os: macos-latest
target: aarch64-apple-darwin
@@ -40,6 +40,11 @@
binary: x86_64-manylinux2014-cuda117
container: sameli/manylinux2014_x86_64_cuda_11.7
build_args: --features cuda
- os: ubuntu-latest
target: x86_64-unknown-linux-gnu
binary: x86_64-manylinux2014-rocm5.7
container: rocm/dev-ubuntu-22.04:rocm5.7
build_args: --features rocm

env:
SCCACHE_GHA_ENABLED: true
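The new matrix entry compiles the ROCm binary inside the `rocm/dev-ubuntu-22.04:rocm5.7` container with `--features rocm`. A rough local equivalent of that job, assuming the repository is checked out with submodules and a Rust toolchain has been installed inside the container (a sketch, not the exact release invocation):

```bash
# Run the same cargo build the release matrix performs, inside the ROCm dev container.
# Assumes rustup/cargo are already available in the container.
docker run --rm -v "$PWD":/workspace -w /workspace \
    rocm/dev-ubuntu-22.04:rocm5.7 \
    bash -c 'cargo build --features rocm --release --package tabby'
```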
15 changes: 13 additions & 2 deletions README.md
@@ -50,14 +50,25 @@ You can find our documentation [here](https://tabby.tabbyml.com/docs/getting-sta
- ⚙️ [Configuration](https://tabby.tabbyml.com/docs/configuration)

### Run Tabby in 1 Minute
The easiest way to start a Tabby server is by using the following Docker command:
The easiest way to start a Tabby server is by using the following Docker command...

...with CUDA:
```bash
docker run -it \
--gpus all -p 8080:8080 -v $HOME/.tabby:/data \
tabbyml/tabby \
tabbyml/tabby-cuda \
serve --model TabbyML/StarCoder-1B --device cuda
```

...with ROCm (Linux only):
```bash
docker run -it \
--device /dev/dri --device /dev/kfd \
-p 8080:8080 -v $HOME/.tabby:/data \
tabbyml/tabby-rocm \
serve --model TabbyML/StarCoder-1B --device rocm
```
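Before starting the ROCm container, it can be worth verifying that the kernel driver actually exposes the two device nodes the command maps in (`rocm-smi` assumes the host ROCm tooling is installed):

```bash
# Both device nodes must exist for the container to see the GPU
ls -l /dev/kfd /dev/dri
# Optionally list detected GPUs with ROCm's monitoring tool
rocm-smi
```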

For additional options (e.g., inference type, parallelism), please refer to the [documentation page](https://tabbyml.github.io/tabby).

## 🤝 Contributing
3 changes: 2 additions & 1 deletion Dockerfile → cuda.Dockerfile
@@ -29,12 +29,13 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- --default-toolchain ${RUST_TOOLC
ENV PATH="/root/.cargo/bin:${PATH}"

WORKDIR /root/workspace
COPY . .

RUN mkdir -p /opt/tabby/bin
RUN mkdir -p /opt/tabby/lib
RUN mkdir -p target

COPY . .

RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/root/workspace/target \
cargo build --features cuda --release --package tabby && \
61 changes: 61 additions & 0 deletions rocm.Dockerfile
@@ -0,0 +1,61 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.7
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER="rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete"
# Target the ROCm runtime image
ARG BASE_ROCM_RUN_CONTAINER="rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete"

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Rust toolchain version
ARG RUST_TOOLCHAIN=stable

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
pkg-config \
libssl-dev \
protobuf-compiler \
git \
cmake \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Setup Rust.
RUN curl https://sh.rustup.rs -sSf | bash -s -- --default-toolchain ${RUST_TOOLCHAIN} -y
ENV PATH="/root/.cargo/bin:${PATH}"

WORKDIR /root/workspace

RUN mkdir -p /opt/tabby/bin
RUN mkdir -p /opt/tabby/lib
RUN mkdir -p target

COPY . .

RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/root/workspace/target \
cargo build --features rocm --release --package tabby && \
cp target/release/tabby /opt/tabby/bin/

FROM ${BASE_ROCM_RUN_CONTAINER} as runtime

RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Disable safe directory in docker
# Context: https://github.com/git/git/commit/8959555cee7ec045958f9b6dd62e541affb7e7d9
RUN git config --system --add safe.directory "*"

COPY --from=build /opt/tabby /opt/tabby

ENV TABBY_ROOT=/data

ENTRYPOINT ["/opt/tabby/bin/tabby"]
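A local build-and-run sketch for this Dockerfile, pinning the same Rust toolchain the CI workflow passes in and reusing the run flags from the README (the `tabby-rocm:dev` tag is illustrative):

```bash
# Build the ROCm image locally with the toolchain version used in CI
docker build -f rocm.Dockerfile --build-arg RUST_TOOLCHAIN=1.73.0 -t tabby-rocm:dev .

# Run it with the GPU device nodes mapped in, as in the README example
docker run -it --device /dev/dri --device /dev/kfd \
    -p 8080:8080 -v "$HOME/.tabby":/data \
    tabby-rocm:dev serve --model TabbyML/StarCoder-1B --device rocm
```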
6 changes: 3 additions & 3 deletions website/docs/extensions/troubleshooting.md
@@ -112,9 +112,9 @@ for the current code context.
If your completion requests are timing out, Tabby may display a warning message.
This could be due to network issues or poor server performance, especially when
running a large model on a CPU. To improve performance, consider running the model
on a GPU with CUDA support or on Apple M1/M2 with Metal support. When running
the server, make sure to specify the device in the arguments using `--device cuda`
or `--device metal`. You can also try using a smaller model from the available [models](https://tabby.tabbyml.com/docs/models/).
on a GPU with CUDA or ROCm support, or on Apple M1/M2 with Metal support. When running
the server, make sure to specify the device in the arguments using `--device cuda`, `--device rocm`, or
`--device metal`. You can also try using a smaller model from the available [models](https://tabby.tabbyml.com/docs/models/).

By default, the timeout for automatically triggered completion requests is set to 4 seconds.
You can adjust this timeout value in the `~/.tabby-client/agent/config.toml` configuration file.
22 changes: 18 additions & 4 deletions website/docs/faq.mdx
@@ -1,10 +1,11 @@
import CodeBlock from '@theme/CodeBlock';

# ⁉️ Frequently Asked Questions

<details>
<summary>How much VRAM does an LLM consume?</summary>
<div>By default, Tabby operates in int8 mode with CUDA, requiring approximately 8GB of VRAM for CodeLlama-7B.</div>
<div>
<p>By default, Tabby operates in int8 mode with CUDA, requiring approximately 8GB of VRAM for CodeLlama-7B.</p>
<p>For ROCm the actual limits are currently largely untested, but CodeLlama-7B also appears to use about 8GB of VRAM on an AMD Radeon™ RX 7900 XTX according to the ROCm monitoring tools.</p>
</div>
</details>

<details>
@@ -18,13 +19,26 @@ import CodeBlock from '@theme/CodeBlock';
<p>
To determine the mapping between the GPU card type and its compute capability, please visit <a href="https://developer.nvidia.com/cuda-gpus">this page</a>
</p>
<p>
This also seems to be available on AMD Radeon™ GPUs, but it's unclear which cards besides RDNA3 support this.
</p>
</div>
</details>

<details>
<summary>How to utilize multiple NVIDIA GPUs?</summary>
<div>
<p>Tabby only supports the use of a single GPU. To utilize multiple GPUs, you can initiate multiple Tabby instances and set CUDA_VISIBLE_DEVICES accordingly.</p>
<p>Tabby only supports the use of a single GPU. To utilize multiple GPUs, you can initiate multiple Tabby instances and set CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES accordingly.</p>
</div>
</details>
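For example, two instances pinned to different ROCm GPUs might look like the sketch below (the `--port` flag is an assumption here; check `tabby serve --help` for the actual option):

```bash
# One Tabby instance per GPU; use CUDA_VISIBLE_DEVICES instead on NVIDIA.
# The --port flag is assumed for illustration.
HIP_VISIBLE_DEVICES=0 tabby serve --model TabbyML/StarCoder-1B --device rocm --port 8080 &
HIP_VISIBLE_DEVICES=1 tabby serve --model TabbyML/StarCoder-1B --device rocm --port 8081 &
```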

<details>
<summary>My AMD GPU isn't supported by ROCm</summary>
<div>
<p>
If a similar GPU is supported by ROCm, you can set the HSA_OVERRIDE_GFX_VERSION environment variable to make ROCm treat your card as that GPU.
For example, set it to 10.3.0 for RDNA2 cards and to 11.0.0 for RDNA3 cards.
</p>
</div>
</details>
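Combined with the Docker command from the README, the override can be passed into the container as an environment variable; a sketch for an RDNA2 card:

```bash
# Present the card to ROCm as a supported RDNA2 target (gfx1030 -> "10.3.0")
docker run -it \
  --device /dev/dri --device /dev/kfd \
  -e HSA_OVERRIDE_GFX_VERSION=10.3.0 \
  -p 8080:8080 -v $HOME/.tabby:/data \
  tabbyml/tabby-rocm \
  serve --model TabbyML/StarCoder-1B --device rocm
```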

2 changes: 1 addition & 1 deletion website/docs/installation/apple.md
@@ -14,4 +14,4 @@ brew install tabbyml/tabby/tabby
tabby serve --device metal --model TabbyML/StarCoder-1B
```

The compute power of M1/M2 is limited and is likely to be sufficient only for individual usage. If you require a shared instance for a team, we recommend considering Docker hosting with CUDA. You can find more information about Docker [here](./docker).
The compute power of M1/M2 is limited and is likely to be sufficient only for individual usage. If you require a shared instance for a team, we recommend considering Docker hosting with CUDA or ROCm. You can find more information about Docker [here](./docker).