From c612e36c4f088d188233920bfa8b44c6ff1c337b Mon Sep 17 00:00:00 2001 From: "Frank Lin (Engrg-Hardware 1)" Date: Thu, 21 Sep 2023 03:44:00 +0000 Subject: [PATCH 1/4] Fix Device Event Creation --- .../fluid/distributed/collective/process_group_nccl.cc | 10 +++++++--- paddle/fluid/platform/device_event_base.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc index 7ffe00b8cd824..79a6901dde6aa 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -44,7 +44,7 @@ ProcessGroupNCCL::NCCLTask::NCCLTask(const Place& place, bool sync_op, bool use_calc_stream) : TaskStream(rank, comm_type, sync_op, use_calc_stream), - comm_event_(place), + comm_event_(place, platform::GenerateDeviceEventFlag()), task_place_(place) {} ProcessGroupNCCL::NCCLTask::~NCCLTask() = default; @@ -506,7 +506,9 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, auto nccl_comm_ctx = this->GetCommContext(); comm_ctx->set_nccl_comm(nccl_comm_ctx->GetNcclComm()); - place_to_calc_event_.emplace(place_key, place); + place_to_calc_event_.emplace( + place_key, + platform::DeviceEvent(place, platform::GenerateDeviceEventFlag())); place_to_calc_ctx_.emplace(place_key, calc_ctx); place_to_comm_ctx_.emplace(place_key, std::move(comm_ctx)); @@ -636,7 +638,9 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( GroupEnd(); // TODO(sunyilun): for compatibility, will be removed later - place_to_calc_event_.emplace(places_key, places[0]); + place_to_calc_event_.emplace( + places_key, + platform::DeviceEvent(places[0], platform::GenerateDeviceEventFlag())); place_to_calc_ctx_.emplace( places_key, static_cast( diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index e2de1e5a9abe3..03fd7d4bb13f0 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -55,7 +55,7 @@ enum EventStatus { class DeviceEvent { public: - explicit DeviceEvent(const platform::Place& place, unsigned int flag = 0) + explicit DeviceEvent(const platform::Place& place, unsigned int flag) : event_(), place_(place), flag_(flag) { type_id_ = DeviceTypeToId(platform::Place2DeviceType(place)); PADDLE_ENFORCE_LT(type_id_, From a397713af7de1b5e1068090c51fff05739bd6a95 Mon Sep 17 00:00:00 2001 From: "Frank Lin (Engrg-Hardware 1)" Date: Thu, 21 Sep 2023 09:45:20 +0000 Subject: [PATCH 2/4] Fix Device Event Test --- paddle/fluid/distributed/collective/process_group_nccl.cc | 2 +- paddle/fluid/platform/device_event_test.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc index 79a6901dde6aa..89f5dcb222e63 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -594,7 +594,7 @@ ProcessGroupNCCL::NCCLTask::NCCLTask( CommType CommType, const std::vector& inputs) : TaskStream(rank, inputs, CommType), - comm_event_(places[0]), + comm_event_(places[0], platform::GenerateDeviceEventFlag()), task_place_(places[0]) {} // create NCCLManager cache for places_key diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index 7dfacc66437ae..6b0466d28c147 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -37,7 +37,7 @@ TEST(DeviceEvent, CUDA) { ASSERT_NE(context, nullptr); // case 1. test for event_creator - DeviceEvent event(place); + DeviceEvent event(place, paddle::platform::GenerateDeviceEventFlag()); ASSERT_NE(event.GetEvent().get(), nullptr); bool status = event.Query(); ASSERT_EQ(status, true); @@ -127,7 +127,7 @@ TEST(DeviceEvent, CUDA) { TEST(DeviceEvent, CPU) { using paddle::platform::CPUPlace; auto place = CPUPlace(); - DeviceEvent event(place); + DeviceEvent event(place, paddle::platform::GenerateDeviceEventFlag()); auto& pool = DeviceContextPool::Instance(); auto* context = pool.Get(place); From 7b5628e234c59860269eeba03aa903475c1d741a Mon Sep 17 00:00:00 2001 From: gongweibao <> Date: Fri, 22 Sep 2023 16:40:45 +0800 Subject: [PATCH 3/4] fix compilation error of device_event_test.cc under ROCM --- paddle/fluid/platform/device_event_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index 6b0466d28c147..3c963d72dd854 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -86,7 +86,7 @@ TEST(DeviceEvent, CUDA) { ASSERT_NE(context, nullptr); // case 1. test for event_creator - DeviceEvent event(place); + DeviceEvent event(place, paddle:platform::GenerateDeviceEventFlag()); ASSERT_NE(event.GetEvent().get(), nullptr); bool status = event.Query(); ASSERT_EQ(status, true); From 9758c33fc8c3eda124fe945cff5d1593d2e8298f Mon Sep 17 00:00:00 2001 From: gongweibao <> Date: Fri, 22 Sep 2023 18:06:44 +0800 Subject: [PATCH 4/4] fix style --- paddle/fluid/platform/device_event_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index 3c963d72dd854..75c0e65352f52 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -85,8 +85,9 @@ TEST(DeviceEvent, CUDA) { auto* context = static_cast(pool.Get(place)); ASSERT_NE(context, nullptr); + // case 1. test for event_creator - DeviceEvent event(place, paddle:platform::GenerateDeviceEventFlag()); + DeviceEvent event(place, paddle::platform::GenerateDeviceEventFlag()); ASSERT_NE(event.GetEvent().get(), nullptr); bool status = event.Query(); ASSERT_EQ(status, true);