From 3065ed1ad996855b28b66eb2a1ee2e301dc2525d Mon Sep 17 00:00:00 2001
From: LoserCheems <3314685395@qq.com>
Date: Sat, 20 Sep 2025 20:36:17 +0800
Subject: [PATCH 1/2] Expands build matrix to include ARM64 OS and additional
 architectures for improved compatibility

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8acf10d..0650d02 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -43,7 +43,7 @@ jobs:
       matrix:
         # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the
         # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
-        os: [ubuntu-22.04]
+        os: [ubuntu-22.04, ubuntu-22.04-arm]  # GitHub-hosted arm64 runner label is "ubuntu-22.04-arm" (no "64" suffix)
         python-version: ["3.9", "3.10", "3.11", "3.12"]
         torch-version: ["2.5.1", "2.6.0", "2.7.1", "2.8.0"]
         cuda-version: ["12.9.1"]
@@ -52,7 +52,7 @@ jobs:
         # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
         # when building without C++11 ABI and using it on nvcr images.
         cxx11_abi: ["FALSE", "TRUE"]
-        arch: ["80", "90"]
+        arch: ["80", "90", "100", "120"]
         include:
             - torch-version: "2.9.0.dev20250904"
               cuda-version: "13.0"

From e78456e409ab8e49d8165951328bb4344879a2f3 Mon Sep 17 00:00:00 2001
From: LoserCheems <3314685395@qq.com>
Date: Sat, 20 Sep 2025 20:36:29 +0800
Subject: [PATCH 2/2] Updates default CUDA architectures to include additional
 versions for improved compatibility

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 99b875e..95015cd 100644
--- a/setup.py
+++ b/setup.py
@@ -79,7 +79,7 @@ def should_skip_cuda_build():
 
 @functools.lru_cache(maxsize=None)
 def cuda_archs():
-    return os.getenv("FLASH_DMATTN_CUDA_ARCHS", "80;90").split(";")
+    return os.getenv("FLASH_DMATTN_CUDA_ARCHS", "80;90;100;120").split(";")
 
 
 def detect_preferred_sm_arch() -> Optional[str]: