From 1849571e57dfb4242688e5b6cdff4209ae804e00 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Thu, 23 Feb 2023 22:11:30 +0800 Subject: [PATCH] remove mpi communications in DNN solving procedure when use pure GPU --- src/dfChemistryModel/dfChemistryModel.C | 26 +- src/dfChemistryModel/libtorchFunctions.H | 340 +++++++++++++---------- 2 files changed, 207 insertions(+), 159 deletions(-) diff --git a/src/dfChemistryModel/dfChemistryModel.C b/src/dfChemistryModel/dfChemistryModel.C index 43780323..99016118 100644 --- a/src/dfChemistryModel/dfChemistryModel.C +++ b/src/dfChemistryModel/dfChemistryModel.C @@ -146,21 +146,27 @@ Foam::dfChemistryModel::dfChemistryModel // initialization the Inferencer (if use multi GPU) if(torchSwitch_) { - if(!(Pstream::myProcNo() % cores_)) // Now is a master + if (gpu_) { - torch::jit::script::Module torchModel1_ = torch::jit::load(torchModelName1_); - torch::jit::script::Module torchModel2_ = torch::jit::load(torchModelName2_); - torch::jit::script::Module torchModel3_ = torch::jit::load(torchModelName3_); - std::string device_; - if (gpu_) + if(!(Pstream::myProcNo() % cores_)) // Now is a master { + torch::jit::script::Module torchModel1_ = torch::jit::load(torchModelName1_); + torch::jit::script::Module torchModel2_ = torch::jit::load(torchModelName2_); + torch::jit::script::Module torchModel3_ = torch::jit::load(torchModelName3_); + std::string device_; int CUDANo = (Pstream::myProcNo() / cores_) % GPUsPerNode_; device_ = "cuda:" + std::to_string(CUDANo); + DNNInferencer DNNInferencer(torchModel1_, torchModel2_, torchModel3_, device_); + DNNInferencer_ = DNNInferencer; } - else - { - device_ = "cpu"; - } + } + else + { + torch::jit::script::Module torchModel1_ = torch::jit::load(torchModelName1_); + torch::jit::script::Module torchModel2_ = torch::jit::load(torchModelName2_); + torch::jit::script::Module torchModel3_ = torch::jit::load(torchModelName3_); + std::string device_; + device_ = "cpu"; DNNInferencer DNNInferencer(torchModel1_, torchModel2_, torchModel3_, device_); DNNInferencer_ = DNNInferencer; } diff --git a/src/dfChemistryModel/libtorchFunctions.H b/src/dfChemistryModel/libtorchFunctions.H index 2eb9ea2a..26f64c31 100644 --- a/src/dfChemistryModel/libtorchFunctions.H +++ b/src/dfChemistryModel/libtorchFunctions.H @@ -33,195 +33,237 @@ Foam::scalar Foam::dfChemistryModel::solve_DNN( std::chrono::duration processingTime10 = std::chrono::duration_cast>(stop10 - start10); time_getProblems_ += processingTime10.count(); - /*==============================send problems==============================*/ - std::chrono::steady_clock::time_point start2 = std::chrono::steady_clock::now(); - - PstreamBuffers pBufs(Pstream::commsTypes::nonBlocking); - if (Pstream::myProcNo() % cores_) //for slave + if (gpu_) { - UOPstream send((Pstream::myProcNo()/cores_)*cores_, pBufs);// sending problem to master - send << GPUproblemList; - } - pBufs.finishedSends(); + /*==============================send problems==============================*/ + std::chrono::steady_clock::time_point start2 = std::chrono::steady_clock::now(); - /*==============================send CVODE problems from submaster to neighbour==============================*/ - PstreamBuffers pBufs1(Pstream::commsTypes::nonBlocking); - if (!(Pstream::myProcNo() % cores_)) // submaster - { - UOPstream send((Pstream::myProcNo() + 1), pBufs1);// sending CPUproblems to neighbour - send << CPUproblemList; - } - pBufs1.finishedSends(); - if ((Pstream::myProcNo() % cores_) == 1) // neighbour of submaster - { - DynamicList CPUproblemList_submaster; - UIPstream recv((Pstream::myProcNo() - 1), pBufs1); - recv >> CPUproblemList_submaster; - CPUproblemList.append(CPUproblemList_submaster); - } + PstreamBuffers pBufs(Pstream::commsTypes::nonBlocking); + if (Pstream::myProcNo() % cores_) //for slave + { + UOPstream send((Pstream::myProcNo()/cores_)*cores_, pBufs);// sending problem to master + send << GPUproblemList; + } + pBufs.finishedSends(); - /*========================================================================================================*/ + /*==============================send CVODE problems from submaster to neighbour==============================*/ + PstreamBuffers pBufs1(Pstream::commsTypes::nonBlocking); + if (!(Pstream::myProcNo() % cores_)) // submaster + { + UOPstream send((Pstream::myProcNo() + 1), pBufs1);// sending CPUproblems to neighbour + send << CPUproblemList; + } + pBufs1.finishedSends(); + if ((Pstream::myProcNo() % cores_) == 1) // neighbour of submaster + { + DynamicList CPUproblemList_submaster; + UIPstream recv((Pstream::myProcNo() - 1), pBufs1); + recv >> CPUproblemList_submaster; + CPUproblemList.append(CPUproblemList_submaster); + } - DynamicBuffer solutionBuffer; + /*========================================================================================================*/ - std::chrono::steady_clock::time_point stop2 = std::chrono::steady_clock::now(); - std::chrono::duration processingTime2 = std::chrono::duration_cast>(stop2 - start2); - // std::cout << "sendProblemTime = " << processingTime2.count() << std::endl; - time_sendProblem_ += processingTime2.count(); + DynamicBuffer solutionBuffer; - /*=============================submaster work start=============================*/ - if (!(Pstream::myProcNo() % cores_)) - { - std::chrono::steady_clock::time_point start1 = std::chrono::steady_clock::now(); - std::chrono::steady_clock::time_point start3 = std::chrono::steady_clock::now(); + std::chrono::steady_clock::time_point stop2 = std::chrono::steady_clock::now(); + std::chrono::duration processingTime2 = std::chrono::duration_cast>(stop2 - start2); + // std::cout << "sendProblemTime = " << processingTime2.count() << std::endl; + time_sendProblem_ += processingTime2.count(); - label problemSize = 0; // problemSize is defined to debug - DynamicBuffer problemBuffer(cores_);//each submaster init a local problemBuffer TODO:rename it + /*=============================submaster work start=============================*/ + if (!(Pstream::myProcNo() % cores_)) + { + std::chrono::steady_clock::time_point start1 = std::chrono::steady_clock::now(); + std::chrono::steady_clock::time_point start3 = std::chrono::steady_clock::now(); - /*==============================gather problems==============================*/ - problemBuffer[0] = GPUproblemList; //problemList of submaster get index 0 - problemSize += problemBuffer[0].size(); + label problemSize = 0; // problemSize is defined to debug + DynamicBuffer problemBuffer(cores_);//each submaster init a local problemBuffer TODO:rename it - for (label i = 1; i < cores_; i++) - { - UIPstream recv(i + Pstream::myProcNo(), pBufs); - recv >> problemBuffer[i]; //recv previous send problem and append to problemList - problemSize += problemBuffer[i].size(); - } - if (gpulog_) - { - Info << "problemSize = " << problemSize << endl; - } + /*==============================gather problems==============================*/ + problemBuffer[0] = GPUproblemList; //problemList of submaster get index 0 + problemSize += problemBuffer[0].size(); - std::chrono::steady_clock::time_point stop3 = std::chrono::steady_clock::now(); - std::chrono::duration processingTime3 = std::chrono::duration_cast>(stop3 - start3); - // std::cout << "RecvProblemTime = " << processingTime3.count() << std::endl; - time_RecvProblem_ += processingTime3.count(); + for (label i = 1; i < cores_; i++) + { + UIPstream recv(i + Pstream::myProcNo(), pBufs); + recv >> problemBuffer[i]; //recv previous send problem and append to problemList + problemSize += problemBuffer[i].size(); + } + if (gpulog_) + { + Info << "problemSize = " << problemSize << endl; + } - /*==============================construct DNN inputs==============================*/ - std::vector