Skip to content

Commit

Permalink
cuda 12.4 (#50)
Browse files (browse the repository at this point in the history)
* update to cuda12

* upgrade to cuda12.1

* use self-hosted runner

* upgrade to torch2.3

---------

Co-authored-by: nobody <nobody>
Co-authored-by: Your Name <you@example.com>
  • Loading branch information
robotcator and Your Name committed Jun 7, 2024
1 parent a0f432d commit bcf247c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 22 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/docker_rdma_latest.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,10 +4,11 @@ on:
push:
branches:
- main
- cuda12

jobs:
docker:
runs-on: ubuntu-latest
runs-on: self-hosted
steps:
-
name: Checkout
Expand All @@ -25,9 +26,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Build and push cu117 with rdma
name: Build and push cu12.4 with rdma
uses: docker/build-push-action@v3
with:
context: ./docker/rdma/
push: true
tags: dptechnology/unicore:latest-pytorch2.0.1-cuda11.7-rdma
tags: dptechnology/unicore:latest-pytorch2.3.0-cuda12.4-rdma
25 changes: 6 additions & 19 deletions docker/rdma/Dockerfile
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
FROM nvcr.io/nvidia/pytorch:22.05-py3
FROM nvcr.io/nvidia/pytorch:24.03-py3

RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
rm -rf /var/lib/apt/lists/* \
Expand Down Expand Up @@ -50,38 +50,25 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
nfs-common \
bc

RUN pip uninstall -y torch torchvision torchtext && \
pip uninstall -y torch torchvision torchtext && \
rm -rf ~/.cache/pip && \
conda clean -ya

RUN conda install -y pyyaml tensorboardX && \
conda clean -ya

# RUN ldconfig

# # ==================================================================
# # pytorch
# # ------------------------------------------------------------------
ENV TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0"

RUN conda install -y ninja typing && \
conda clean -ya
ENV TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;9.0"

RUN pip3 install --upgrade sentry-sdk requests

RUN pip3 install torch==2.0.1+cu117 --index-url https://download.pytorch.org/whl/cu117 && rm -rf ~/.cache/pip

RUN cd /tmp && \
git clone https://github.com/dptech-corp/Uni-Core && \
cd Uni-Core && \
python setup.py install && \
rm -rf /tmp/* && rm -rf ~/.cache/pip
rm -rf /tmp/* && rm -rf ~/.cache/pip

RUN pip3 install --no-cache-dir tokenizers lmdb biopython ml-collections timeout-decorator urllib3 tree dm-tree && rm -rf ~/.cache/pip

RUN MAX_JOBS=4 pip3 install -U 'flash-attn<2.5.0' --no-build-isolation --no-cache-dir

RUN ldconfig && \
apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/* /tmp/* && \
conda clean -ya
rm -rf /var/lib/apt/lists/* /tmp/*

0 comments on commit bcf247c

Please sign in to comment.