diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8acf10d..0650d02 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -43,7 +43,7 @@ jobs:
       matrix:
         # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
         # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
-        os: [ubuntu-22.04]
+        os: [ubuntu-22.04, ubuntu-22.04-arm]  # GitHub-hosted arm64 label is "-arm"
         python-version: ["3.9", "3.10", "3.11", "3.12"]
         torch-version: ["2.5.1", "2.6.0", "2.7.1", "2.8.0"]
         cuda-version: ["12.9.1"]
@@ -52,7 +52,7 @@ jobs:
         # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
         # when building without C++11 ABI and using it on nvcr images.
         cxx11_abi: ["FALSE", "TRUE"]
-        arch: ["80", "90"]
+        arch: ["80", "90", "100", "120"]  # same list as the cuda_archs() default in setup.py
         include:
             - torch-version: "2.9.0.dev20250904"
               cuda-version: "13.0"
diff --git a/setup.py b/setup.py
index 99b875e..95015cd 100644
--- a/setup.py
+++ b/setup.py
@@ -79,7 +79,7 @@ def should_skip_cuda_build():
 
 @functools.lru_cache(maxsize=None)
 def cuda_archs():
-    return os.getenv("FLASH_DMATTN_CUDA_ARCHS", "80;90").split(";")
+    return os.getenv("FLASH_DMATTN_CUDA_ARCHS", "80;90;100;120").split(";")  # env override, ";"-separated
 
 
 def detect_preferred_sm_arch() -> Optional[str]: