From 96dcddf7689bb6a4d71b6f5ff660609496b24c5d Mon Sep 17 00:00:00 2001 From: zhzhcookie Date: Tue, 8 Apr 2025 11:29:07 +0800 Subject: [PATCH 01/12] [CI/CD] Disable code scan temporarily (#16) --- .github/workflows/code_scan.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/code_scan.yml b/.github/workflows/code_scan.yml index 0d0cef3cd..b7632fedc 100644 --- a/.github/workflows/code_scan.yml +++ b/.github/workflows/code_scan.yml @@ -1,10 +1,7 @@ name: code-scan on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] + workflow_call: jobs: scan-code-and-report: From 57613d46967e6f7268d1423d57fef43e4702aeb8 Mon Sep 17 00:00:00 2001 From: zhzhcookie Date: Wed, 9 Apr 2025 15:09:30 +0800 Subject: [PATCH 02/12] Revert "[CI/CD] Disable code scan temporarily" (#17) Reverts FlagTree/flagtree#16 --- .github/workflows/code_scan.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/code_scan.yml b/.github/workflows/code_scan.yml index b7632fedc..0d0cef3cd 100644 --- a/.github/workflows/code_scan.yml +++ b/.github/workflows/code_scan.yml @@ -1,7 +1,10 @@ name: code-scan on: - workflow_call: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] jobs: scan-code-and-report: From c59d3894bebde25f2094fec52aa461f13fe1dbd0 Mon Sep 17 00:00:00 2001 From: zhzhcookie Date: Wed, 9 Apr 2025 17:18:35 +0800 Subject: [PATCH 03/12] [CI/CD] Limit workflow repo to FlagTree/flagtree (#18) --- .github/workflows/cambricon-build-and-test.yml | 1 + .github/workflows/code_scan.yml | 1 + .github/workflows/iluvatar-build-and-test.yml | 1 + .github/workflows/kunlun-build-and-test.yml | 1 + .github/workflows/metax-build-and-test.yml | 1 + .github/workflows/mthreads-build-and-test.yml | 1 + .github/workflows/nv-build-and-test.yml | 1 + 7 files changed, 7 insertions(+) diff --git a/.github/workflows/cambricon-build-and-test.yml b/.github/workflows/cambricon-build-and-test.yml index 7536a3959..9700446cb 100644 --- a/.github/workflows/cambricon-build-and-test.yml +++ b/.github/workflows/cambricon-build-and-test.yml @@ -13,6 +13,7 @@ concurrency: jobs: cambricon-build-and-test: runs-on: cambricon + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/code_scan.yml b/.github/workflows/code_scan.yml index 0d0cef3cd..e32fe78f9 100644 --- a/.github/workflows/code_scan.yml +++ b/.github/workflows/code_scan.yml @@ -9,6 +9,7 @@ on: jobs: scan-code-and-report: runs-on: scan + if: ${{ github.repository == 'FlagTree/flagtree' }} concurrency: group: scan-code-and-report-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true diff --git a/.github/workflows/iluvatar-build-and-test.yml b/.github/workflows/iluvatar-build-and-test.yml index f54cb575b..e9afd670d 100644 --- a/.github/workflows/iluvatar-build-and-test.yml +++ b/.github/workflows/iluvatar-build-and-test.yml @@ -13,6 +13,7 @@ concurrency: jobs: iluvatar-build-and-test: runs-on: iluvatar + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code (attempt 1) id: checkout1 diff --git a/.github/workflows/kunlun-build-and-test.yml b/.github/workflows/kunlun-build-and-test.yml index 5c5b5887b..94578dd36 100644 --- a/.github/workflows/kunlun-build-and-test.yml +++ b/.github/workflows/kunlun-build-and-test.yml @@ -14,6 +14,7 @@ concurrency: jobs: kunlun-build-and-test: runs-on: kunlun + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/metax-build-and-test.yml b/.github/workflows/metax-build-and-test.yml index c760d19b4..3dc7622f5 100644 --- a/.github/workflows/metax-build-and-test.yml +++ b/.github/workflows/metax-build-and-test.yml @@ -10,6 +10,7 @@ concurrency: jobs: metax-build-and-test: runs-on: metax + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/mthreads-build-and-test.yml b/.github/workflows/mthreads-build-and-test.yml index b3474802e..7097fffbc 100644 --- a/.github/workflows/mthreads-build-and-test.yml +++ b/.github/workflows/mthreads-build-and-test.yml @@ -10,6 +10,7 @@ concurrency: jobs: mthreads-build-and-test: runs-on: mthreads + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/nv-build-and-test.yml b/.github/workflows/nv-build-and-test.yml index 392c728d5..85e76773f 100644 --- a/.github/workflows/nv-build-and-test.yml +++ b/.github/workflows/nv-build-and-test.yml @@ -15,6 +15,7 @@ concurrency: jobs: nv-build-and-test: runs-on: nv-jiuding + if: ${{ github.repository == 'FlagTree/flagtree' }} steps: - name: Checkout code uses: actions/checkout@v4 From 734a1972c36ea3b8fb1f95e6b5db0027020ec6d5 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Thu, 24 Apr 2025 19:11:01 +0800 Subject: [PATCH 04/12] [BACKEND] Add Initial Version of Arm China NPU "Zhouyi" Backend Support (#7) --------- Co-authored-by: Yuchou Gan Co-authored-by: arozha01 Co-authored-by: chunying Co-authored-by: Qiang Zhang Co-authored-by: stayua01 --- CMakeLists.txt | 14 +- python/setup.py | 8 +- python/setup_helper.py | 8 +- python/src/main.cc | 5 +- third_party/aipu/CMakeLists.txt | 15 + third_party/aipu/backend/__init__.py | 0 third_party/aipu/backend/aipu_torch_dev.cpp | 363 ++++++++ third_party/aipu/backend/analysis/__init__.py | 3 + .../backend/analysis/determine_vfactor.py | 37 + third_party/aipu/backend/codegen.py | 442 ++++++++++ third_party/aipu/backend/compiler.py | 116 +++ third_party/aipu/backend/driver.py | 130 +++ third_party/aipu/backend/name.conf | 1 + third_party/aipu/language/aipu/__init__.py | 3 + third_party/aipu/language/aipu/libdevice.py | 795 ++++++++++++++++++ .../aipu/python/test/test_01_vector_add.py | 131 +++ .../aipu/python/test/test_02_fused_softmax.py | 175 ++++ .../test/test_analysis_determine_vfactor.py | 85 ++ third_party/aipu/triton_aipu.cc | 78 ++ 19 files changed, 2398 insertions(+), 11 deletions(-) create mode 100644 third_party/aipu/CMakeLists.txt create mode 100644 third_party/aipu/backend/__init__.py create mode 100644 third_party/aipu/backend/aipu_torch_dev.cpp create mode 100644 third_party/aipu/backend/analysis/__init__.py create mode 100644 third_party/aipu/backend/analysis/determine_vfactor.py create mode 100644 third_party/aipu/backend/codegen.py create mode 100644 third_party/aipu/backend/compiler.py create mode 100644 third_party/aipu/backend/driver.py create mode 100644 third_party/aipu/backend/name.conf create mode 100644 third_party/aipu/language/aipu/__init__.py create mode 100644 third_party/aipu/language/aipu/libdevice.py create mode 100644 third_party/aipu/python/test/test_01_vector_add.py create mode 100644 third_party/aipu/python/test/test_02_fused_softmax.py create mode 100644 third_party/aipu/python/test/test_analysis_determine_vfactor.py create mode 100644 third_party/aipu/triton_aipu.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 698352b1d..487208be5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,9 @@ elseif(FLAGTREE_BACKEND STREQUAL "mthreads") set(CMAKE_C_COMPILER clang) set(CMAKE_CXX_COMPILER clang++) set(ENV{FLAGTREE_PLUGIN} $ENV{FLAGTREE_BACKEND}) +elseif(FLAGTREE_BACKEND STREQUAL "aipu") + add_definitions(-D__NVIDIA__) + add_definitions(-D__AMD__) endif() set(FLAGTREE_PLUGIN "$ENV{FLAGTREE_PLUGIN}") if(FLAGTREE_PLUGIN) @@ -206,6 +209,9 @@ if (FLAGTREE_BACKEND STREQUAL "cambricon") include_directories(${PROJECT_BINARY_DIR}/include) # Tablegen'd files add_subdirectory(include) add_subdirectory(lib) +elseif (FLAGTREE_BACKEND STREQUAL "aipu") + add_subdirectory(include) + add_subdirectory(lib) elseif(NOT FLAGTREE_BACKEND) add_subdirectory(include) add_subdirectory(lib) @@ -263,10 +269,10 @@ if(TRITON_BUILD_PYTHON_MODULE) if (TRITON_BUILD_PROTON) add_definitions(-D__PROTON__) add_subdirectory(third_party/proton) - # We always build proton dialect - list(APPEND TRITON_PLUGIN_NAMES "proton") - add_subdirectory(third_party/proton/dialect) endif() + # We always build proton dialect + list(APPEND TRITON_PLUGIN_NAMES "proton") + add_subdirectory(third_party/proton/dialect) get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS) get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS) @@ -443,7 +449,7 @@ find_package(Threads REQUIRED) add_subdirectory(third_party/f2reduce) -if(NOT FLAGTREE_BACKEND) +if(NOT FLAGTREE_BACKEND OR FLAGTREE_BACKEND STREQUAL "aipu") add_subdirectory(bin) add_subdirectory(test) endif() diff --git a/python/setup.py b/python/setup.py index d8fd3bc79..c9e623f9b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -597,7 +597,13 @@ def build_extension(self, ext): ) if helper.flagtree_backend: - backends = [*BackendInstaller.copy(helper.extend_backends), *BackendInstaller.copy_externals()] + if helper.flagtree_backend == "aipu": + backends = [ + *BackendInstaller.copy(helper.default_backends + helper.extend_backends), + *BackendInstaller.copy_externals(), + ] + else: + backends = [*BackendInstaller.copy(helper.extend_backends), *BackendInstaller.copy_externals()] else: backends = [*BackendInstaller.copy(helper.default_backends), *BackendInstaller.copy_externals()] diff --git a/python/setup_helper.py b/python/setup_helper.py index 58b718364..b84c301c7 100644 --- a/python/setup_helper.py +++ b/python/setup_helper.py @@ -236,7 +236,7 @@ def skip_package_dir(package): @staticmethod def get_package_dir(packages): package_dict = {} - if flagtree_backend and flagtree_backend != 'cambricon': + if flagtree_backend and flagtree_backend not in ("cambricon", "aipu"): connection = [] backend_triton_path = f"../third_party/{flagtree_backend}/python/" for package in packages: @@ -281,7 +281,7 @@ def git_clone(lib, lib_path): "so we couldn't compile triton_shared\n") third_partys = [] - if os.environ.get("USE_TRITON_SHARED", "ON") == "ON" and not flagtree_backend: + if os.environ.get("USE_TRITON_SHARED", "ON") == "ON": third_partys.append(flagtree_backend_info["triton_shared"]) else: use_triton_shared = False @@ -301,9 +301,9 @@ def handle_flagtree_backend(): if flagtree_backend: print(f"flagtree_backend is {flagtree_backend}") extend_backends.append(flagtree_backend) - if "editable_wheel" in sys.argv: + if "editable_wheel" in sys.argv and flagtree_backend != "aipu": ext_sourcedir = os.path.abspath(f"../third_party/{flagtree_backend}/python/{ext_sourcedir}") + "/" - if use_triton_shared and not flagtree_backend: + if use_triton_shared: default_backends.append("triton_shared") diff --git a/python/src/main.cc b/python/src/main.cc index 82289edc0..ab7b727f9 100644 --- a/python/src/main.cc +++ b/python/src/main.cc @@ -8,11 +8,12 @@ namespace py = pybind11; #define FOR_EACH_2(MACRO, X, ...) MACRO(X) FOR_EACH_1(MACRO, __VA_ARGS__) #define FOR_EACH_3(MACRO, X, ...) MACRO(X) FOR_EACH_2(MACRO, __VA_ARGS__) #define FOR_EACH_4(MACRO, X, ...) MACRO(X) FOR_EACH_3(MACRO, __VA_ARGS__) +#define FOR_EACH_5(MACRO, X, ...) MACRO(X) FOR_EACH_4(MACRO, __VA_ARGS__) #define FOR_EACH_NARG(...) FOR_EACH_NARG_(__VA_ARGS__, FOR_EACH_RSEQ_N()) #define FOR_EACH_NARG_(...) FOR_EACH_ARG_N(__VA_ARGS__) -#define FOR_EACH_ARG_N(_1, _2, _3, _4, N, ...) N -#define FOR_EACH_RSEQ_N() 4, 3, 2, 1, 0 +#define FOR_EACH_ARG_N(_1, _2, _3, _4, _5, N, ...) N +#define FOR_EACH_RSEQ_N() 5, 4, 3, 2, 1, 0 #define CONCATENATE(x, y) CONCATENATE1(x, y) #define CONCATENATE1(x, y) x##y diff --git a/third_party/aipu/CMakeLists.txt b/third_party/aipu/CMakeLists.txt new file mode 100644 index 000000000..3b864c116 --- /dev/null +++ b/third_party/aipu/CMakeLists.txt @@ -0,0 +1,15 @@ +add_triton_plugin(TritonAIPU ${CMAKE_CURRENT_SOURCE_DIR}/triton_aipu.cc) +target_include_directories(TritonAIPU PRIVATE ${CMAKE_SOURCE_DIR}/third_party/triton_shared/include) +target_link_libraries(TritonAIPU PRIVATE + Python3::Module + pybind11::headers + MLIRLinalgUtils + MLIRLinalgToStandard + MLIRBufferizationTransforms + MLIRBufferizationToMemRef + MLIRArithTransforms + MLIRFuncAllExtensions + MLIRAffineToStandard + MLIRSCFTransforms + MLIRAffineTransforms +) diff --git a/third_party/aipu/backend/__init__.py b/third_party/aipu/backend/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/aipu/backend/aipu_torch_dev.cpp b/third_party/aipu/backend/aipu_torch_dev.cpp new file mode 100644 index 000000000..0ae1f5adf --- /dev/null +++ b/third_party/aipu/backend/aipu_torch_dev.cpp @@ -0,0 +1,363 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static c10::DeviceIndex aipu_device_index = 0; + +namespace c10 { +namespace impl { + +struct C10_API AIPUGuardImpl final : public DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::PrivateUse1; + inline static int8_t current_device = 0; + inline static int64_t current_stream = 0; + + DeviceType type() const override { return static_type; } + + void setDevice(Device d) const override { + TORCH_CHECK(d.is_privateuseone(), "Device must be PrivateUse1 type"); + current_device = d.index(); + } + + void uncheckedSetDevice(Device d) const noexcept override { + current_device = d.index(); + } + + Device getDevice() const override { + return Device(DeviceType::PrivateUse1, current_device); + } + + Device exchangeDevice(Device d) const override { + Device old_device = getDevice(); + setDevice(d); + return old_device; + } + + Stream getStream(Device d) const noexcept override { + int64_t stream_id = d.index(); + return Stream(Stream::UNSAFE, d, stream_id); + } + + Stream exchangeStream(Stream s) const noexcept override { + auto old_stream = getStream(s.device()); + current_stream = s.id(); + return old_stream; + } + + DeviceIndex deviceCount() const noexcept override { return 1; } +}; + +} // namespace impl +} // namespace c10 + +namespace at { +namespace detail { + +C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::impl::AIPUGuardImpl); +} +} // namespace at + +#define AIPU_DRIVER_HANDLE_ERROR(status) \ + do { \ + if (status != AIPU_STATUS_SUCCESS) { \ + const char *error_message = nullptr; \ + aipu_get_error_message(aipu_ctx_, status, &error_message); \ + std::cout << error_message; \ + } \ + } while (false) + +/*! \brief Return whether a string starts with the given prefix. */ +inline bool StrStartsWith(const std::string &str, const std::string &prefix) { + if (prefix.size() > str.size()) + return false; + return std::equal(str.c_str(), str.c_str() + prefix.size(), prefix.c_str()); +} + +class Context final { +public: + aipu_ctx_handle_t *process_ctx = nullptr; + std::mutex inst_lock; + Context() { + if (process_ctx == nullptr) { + std::lock_guard lock(inst_lock); + if (process_ctx == nullptr) { + aipu_status_t status = aipu_init_context(&process_ctx); + if (status != AIPU_STATUS_SUCCESS) { + // + } + } + } + }; + ~Context() { + if (process_ctx != nullptr) { + std::lock_guard lock(inst_lock); + if (process_ctx != nullptr) { + aipu_status_t status = aipu_deinit_context(process_ctx); + if (status != AIPU_STATUS_SUCCESS) { + // + } + process_ctx = nullptr; + } + } + }; +}; + +Context *context() { + static const std::unique_ptr context([]() -> Context * { + try { + return new Context(); + } catch (...) { + } + return nullptr; + }()); + + return context.get(); +} + +using namespace at; + +struct AIPUAllocator final : Allocator { + AIPUAllocator() = default; + + DataPtr allocate(size_t nbytes) override { + void *data = nullptr; + status_ = aipu_malloc(aipu_ctx_, nbytes, 32, 0, &data); + AIPU_DRIVER_HANDLE_ERROR(status_); + + std::cerr << "alloc with aipu allocator for " << nbytes + << " bytes with ptr " << (uint64_t)data << std::endl; + return {data, data, &ReportAndDelete, + Device(DeviceType::PrivateUse1, aipu_device_index)}; + } + + static void ReportAndDelete(void *ptr) { + if (!ptr) { + return; + } + status_ = aipu_free(aipu_ctx_, &ptr); + AIPU_DRIVER_HANDLE_ERROR(status_); + } + + DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } + + void copy_data(void *dest, const void *src, std::size_t count) const final { + default_copy_data(dest, src, count); + } + + static aipu_ctx_handle_t *aipu_ctx_; + static aipu_status_t status_; +}; + +// Register our dummy allocator +aipu_ctx_handle_t *AIPUAllocator::aipu_ctx_ = context()->process_ctx; +aipu_status_t AIPUAllocator::status_ = AIPU_STATUS_SUCCESS; +static AIPUAllocator global_custom_alloc; +REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc); + +Tensor custom_empty_symint(c10::IntArrayRef size, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { + constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); + return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, + c10::dtype_or_default(dtype), memory_format); +} + +Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { + constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); + auto dtype = c10::dtype_or_default(dtype_opt); + return at::detail::empty_strided_generic(size, stride, &global_custom_alloc, + private_use_ks, dtype); +} + +Tensor aipu_view(const Tensor &self, c10::IntArrayRef size) { + IntArrayRef self_sizes = self.sizes(); + IntArrayRef self_strides = self.strides(); + DimVector inferred_size = infer_size_dv(self_sizes, self.numel()); + std::optional stride = + at::detail::computeStride(self_sizes, self_strides, inferred_size); + TORCH_CHECK( + stride.has_value(), + "view size is " + "not compatible with input tensor's size and stride (at least one " + "dimension" + " spans across two contiguous subspaces). Use .reshape(...) instead."); + + Tensor self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, c10::Storage(self.storage()), self.key_set(), + self.dtype()); + self_.unsafeGetTensorImpl()->set_sizes_and_strides(inferred_size, *stride); + self_.unsafeGetTensorImpl()->set_storage_offset(self.storage_offset()); + return self_; +} + +Tensor aipu_copy_from(const Tensor &self, const Tensor &dst, + bool non_blocking = false) { + auto kind = AIPU_MEMCPY_HOST_TO_DEVICE; + if (StrStartsWith(self.device().str(), "aipu")) { + kind = AIPU_MEMCPY_DEVICE_TO_HOST; + if (StrStartsWith(dst.device().str(), "aipu")) { + kind = AIPU_MEMCPY_DEVICE_TO_DEVICE; + } + } + + auto aipu_ctx_ = AIPUAllocator::aipu_ctx_; + auto status = aipu_memcpy(aipu_ctx_, dst.data_ptr(), self.data_ptr(), + self.nbytes(), kind); + AIPU_DRIVER_HANDLE_ERROR(status); + return self; +} + +template