From 7333973f7fc816275c75b0ee3770de19daf9e784 Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Thu, 20 Oct 2022 19:15:08 +0800
Subject: [PATCH] multi-gpu implementation via pybind11 & pytorch

---
 applications/solvers/df0DFoam/Make/options    |  15 +-
 applications/solvers/df0DFoam/df0DFoam.C      |  11 +-
 applications/solvers/df0DFoam/setRootCase2.H  |   5 +
 .../solvers/dfHighSpeedFoam/Make/options      |  12 +-
 .../solvers/dfHighSpeedFoam/dfHighSpeedFoam.C |   8 +-
 .../solvers/dfHighSpeedFoam/setRootCase2.H    |   5 +
 .../solvers/dfLowMachFoam/Make/options        |  12 +-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |   9 +-
 .../solvers/dfLowMachFoam/setRootCase2.H      |   5 +
 applications/solvers/dfSprayFoam/Make/options |  14 +-
 .../solvers/dfSprayFoam/dfSprayFoam.C         |   9 +-
 .../solvers/dfSprayFoam/setRootCase2.H        |   5 +
 install.sh                                    |  23 +-
 .../GpuInference/DynamicBuffer.C              |  28 +
 .../GpuInference/DynamicBuffer.H              |  48 ++
 .../GpuInference/GpuProblem.C                 |  29 +
 .../GpuInference/GpuProblem.H                 |  99 ++++
 .../GpuInference/GpuSolution.C                |  36 ++
 .../GpuInference/GpuSolution.H                |  86 +++
 src/dfChemistryModel/Make/options             |   6 +-
 src/dfChemistryModel/dfChemistryModel.C       | 548 +++++++++++++++++-
 src/dfChemistryModel/dfChemistryModel.H       |  66 +++
 src/dfCombustionModels/Make/options           |   6 +-
 23 files changed, 1060 insertions(+), 25 deletions(-)
 create mode 100644 applications/solvers/df0DFoam/setRootCase2.H
 create mode 100644 applications/solvers/dfHighSpeedFoam/setRootCase2.H
 create mode 100644 applications/solvers/dfLowMachFoam/setRootCase2.H
 create mode 100644 applications/solvers/dfSprayFoam/setRootCase2.H
 create mode 100644 src/dfChemistryModel/GpuInference/DynamicBuffer.C
 create mode 100644 src/dfChemistryModel/GpuInference/DynamicBuffer.H
 create mode 100644 src/dfChemistryModel/GpuInference/GpuProblem.C
 create mode 100644 src/dfChemistryModel/GpuInference/GpuProblem.H
 create mode 100644 src/dfChemistryModel/GpuInference/GpuSolution.C
 create mode 100644 src/dfChemistryModel/GpuInference/GpuSolution.H

diff --git a/applications/solvers/df0DFoam/Make/options b/applications/solvers/df0DFoam/Make/options
index ca01d10c..adef6cbf 100644
--- a/applications/solvers/df0DFoam/Make/options
+++ b/applications/solvers/df0DFoam/Make/options
@@ -1,4 +1,12 @@
+-include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+PYTHON_LIB_DIR := $(shell python3 -c "import sysconfig; \
+    import os.path as op; v = sysconfig.get_config_vars(); \
+    fpaths = [op.join(v[pv], v['LDLIBRARY']) for pv in ('LIBDIR', 'LIBPL')]; \
+    print(list(filter(op.exists, fpaths))[0])" | xargs dirname)
+
 EXE_INC = -std=c++14 \
+    $(PFLAGS) $(PINC) \
     -I$(LIB_SRC)/transportModels/compressible/lnInclude \
     -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
     -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \
@@ -14,7 +22,8 @@ EXE_INC = -std=c++14 \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
     $(if $(BOOST_ARCH_PATH),-I$(BOOST_ARCH_PATH),) \
-    $(if $(BOOST_ARCH_PATH),-DBOOST_ARCH_PATH_FOUNDD,)
+    $(if $(BOOST_ARCH_PATH),-DBOOST_ARCH_PATH_FOUNDD,) \
+    $(PYTHON_INC_DIR)
 
 EXE_LIBS = \
     -lcompressibleTransportModels \
@@ -33,4 +42,6 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
     $(if $(LIBTORCH_ROOT),-rdynamic,) \
-    $(if $(LIBTORCH_ROOT),-lpthread,)
+    $(if $(LIBTORCH_ROOT),-lpthread,) \
+    -L$(PYTHON_LIB_DIR) \
+    -lpython$(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('LDVERSION'))")
diff --git a/applications/solvers/df0DFoam/df0DFoam.C b/applications/solvers/df0DFoam/df0DFoam.C
index ac3df5c8..b27bf873 100644
--- a/applications/solvers/df0DFoam/df0DFoam.C
+++ b/applications/solvers/df0DFoam/df0DFoam.C
@@ -28,6 +28,9 @@ Description
 #include "dfChemistryModel.H"
 #include "CanteraMixture.H"
 #include "hePsiThermo.H"
+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h> //used to convert
 
 #include "fvCFD.H"
 #include "dynamicFvMesh.H"
@@ -44,9 +47,15 @@ Description
 
 int main(int argc, char *argv[])
 {
+    pybind11::scoped_interpreter guard{}; //start python interpreter
+
     #include "postProcess.H"
 
-    #include "setRootCaseLists.H"
+     // #include "setRootCaseLists.H"
+    #include "listOptions.H"
+    #include "setRootCase2.H"
+    #include "listOutput.H"
+    
     #include "createTime.H"
     #include "createDynamicFvMesh.H"
     #include "createDyMControls.H"
diff --git a/applications/solvers/df0DFoam/setRootCase2.H b/applications/solvers/df0DFoam/setRootCase2.H
new file mode 100644
index 00000000..45d966e6
--- /dev/null
+++ b/applications/solvers/df0DFoam/setRootCase2.H
@@ -0,0 +1,5 @@
+Foam::argList args(argc,argv,true,true,/*initialise=*/false); // NOTE(review): initialise=false presumably postpones part of argList start-up -- confirm against argList.H
+if (!args.checkRootCase()) // stop the run when checkRootCase() returns false
+{
+    Foam::FatalError.exit();
+}
\ No newline at end of file
diff --git a/applications/solvers/dfHighSpeedFoam/Make/options b/applications/solvers/dfHighSpeedFoam/Make/options
index 9c464844..77fe5462 100644
--- a/applications/solvers/dfHighSpeedFoam/Make/options
+++ b/applications/solvers/dfHighSpeedFoam/Make/options
@@ -1,4 +1,9 @@
 -include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+PYTHON_LIB_DIR := $(shell python3 -c "import sysconfig; \
+    import os.path as op; v = sysconfig.get_config_vars(); \
+    fpaths = [op.join(v[pv], v['LDLIBRARY']) for pv in ('LIBDIR', 'LIBPL')]; \
+    print(list(filter(op.exists, fpaths))[0])" | xargs dirname)
 
 EXE_INC = -std=c++14 \
     $(PFLAGS) $(PINC) \
@@ -17,7 +22,8 @@ EXE_INC = -std=c++14 \
     -I$(DF_SRC)/dfChemistryModel/lnInclude \
     -I$(CANTERA_ROOT)/include \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
-    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,)
+    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
+    $(PYTHON_INC_DIR)
 
 EXE_LIBS = \
     -lfiniteVolume \
@@ -35,4 +41,6 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
     $(if $(LIBTORCH_ROOT),-rdynamic,) \
-    $(if $(LIBTORCH_ROOT),-lpthread,)
+    $(if $(LIBTORCH_ROOT),-lpthread,) \
+    -L$(PYTHON_LIB_DIR) \
+    -lpython$(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('LDVERSION'))")
diff --git a/applications/solvers/dfHighSpeedFoam/dfHighSpeedFoam.C b/applications/solvers/dfHighSpeedFoam/dfHighSpeedFoam.C
index 9a4fdb67..24a75827 100644
--- a/applications/solvers/dfHighSpeedFoam/dfHighSpeedFoam.C
+++ b/applications/solvers/dfHighSpeedFoam/dfHighSpeedFoam.C
@@ -33,6 +33,9 @@ Description
 #include "dfChemistryModel.H"
 #include "CanteraMixture.H"
 #include "hePsiThermo.H"
+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h> //used to convert
 
 #include "fvCFD.H"
 #include "dynamicFvMesh.H"
@@ -51,7 +54,10 @@ int main(int argc, char *argv[])
     #define NO_CONTROL
     #include "postProcess.H"
 
-    #include "setRootCaseLists.H"
+    // #include "setRootCaseLists.H"
+    #include "listOptions.H"
+    #include "setRootCase2.H"
+    #include "listOutput.H"
     #include "createTime.H"
     #include "createDynamicFvMesh.H"
     #include "createFields.H"
diff --git a/applications/solvers/dfHighSpeedFoam/setRootCase2.H b/applications/solvers/dfHighSpeedFoam/setRootCase2.H
new file mode 100644
index 00000000..45d966e6
--- /dev/null
+++ b/applications/solvers/dfHighSpeedFoam/setRootCase2.H
@@ -0,0 +1,5 @@
+Foam::argList args(argc,argv,true,true,/*initialise=*/false); // NOTE(review): initialise=false presumably postpones part of argList start-up -- confirm against argList.H
+if (!args.checkRootCase()) // stop the run when checkRootCase() returns false
+{
+    Foam::FatalError.exit();
+}
\ No newline at end of file
diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options
index 6b8c6742..84468590 100644
--- a/applications/solvers/dfLowMachFoam/Make/options
+++ b/applications/solvers/dfLowMachFoam/Make/options
@@ -1,4 +1,9 @@
 -include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+PYTHON_LIB_DIR := $(shell python3 -c "import sysconfig; \
+    import os.path as op; v = sysconfig.get_config_vars(); \
+    fpaths = [op.join(v[pv], v['LDLIBRARY']) for pv in ('LIBDIR', 'LIBPL')]; \
+    print(list(filter(op.exists, fpaths))[0])" | xargs dirname)
 
 EXE_INC = -std=c++14 \
     $(PFLAGS) $(PINC) \
@@ -17,7 +22,8 @@ EXE_INC = -std=c++14 \
     -I$(DF_SRC)/dfCombustionModels/lnInclude \
     -I$(CANTERA_ROOT)/include \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
-    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,)
+    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
+    $(PYTHON_INC_DIR)
 
 EXE_LIBS = \
     -lcompressibleTransportModels \
@@ -35,4 +41,6 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
     $(if $(LIBTORCH_ROOT),-rdynamic,) \
-    $(if $(LIBTORCH_ROOT),-lpthread,)
+    $(if $(LIBTORCH_ROOT),-lpthread,) \
+    -L$(PYTHON_LIB_DIR) \
+    -lpython$(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('LDVERSION'))")
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index e41d671e..a32f4ad6 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -36,6 +36,9 @@ Description
 #include "dfChemistryModel.H"
 #include "CanteraMixture.H"
 #include "hePsiThermo.H"
+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h> //used to convert
 
 #include "fvCFD.H"
 #include "fluidThermo.H"
@@ -54,7 +57,11 @@ int main(int argc, char *argv[])
 {
     #include "postProcess.H"
 
-    #include "setRootCaseLists.H"
+    // #include "setRootCaseLists.H"
+    #include "listOptions.H"
+    #include "setRootCase2.H"
+    #include "listOutput.H"
+
     #include "createTime.H"
     #include "createMesh.H"
     #include "createDyMControls.H"
diff --git a/applications/solvers/dfLowMachFoam/setRootCase2.H b/applications/solvers/dfLowMachFoam/setRootCase2.H
new file mode 100644
index 00000000..45d966e6
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/setRootCase2.H
@@ -0,0 +1,5 @@
+Foam::argList args(argc,argv,true,true,/*initialise=*/false); // NOTE(review): initialise=false presumably postpones part of argList start-up -- confirm against argList.H
+if (!args.checkRootCase()) // stop the run when checkRootCase() returns false
+{
+    Foam::FatalError.exit();
+}
\ No newline at end of file
diff --git a/applications/solvers/dfSprayFoam/Make/options b/applications/solvers/dfSprayFoam/Make/options
index f8d4c48e..1f284da0 100644
--- a/applications/solvers/dfSprayFoam/Make/options
+++ b/applications/solvers/dfSprayFoam/Make/options
@@ -1,3 +1,10 @@
+-include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+PYTHON_LIB_DIR := $(shell python3 -c "import sysconfig; \
+    import os.path as op; v = sysconfig.get_config_vars(); \
+    fpaths = [op.join(v[pv], v['LDLIBRARY']) for pv in ('LIBDIR', 'LIBPL')]; \
+    print(list(filter(op.exists, fpaths))[0])" | xargs dirname)
+
 EXE_INC = -std=c++14 \
     -I. \
     -I$(FOAM_APP)/solvers/lagrangian/reactingParcelFoam \
@@ -27,7 +34,8 @@ EXE_INC = -std=c++14 \
     -I$(DF_SRC)/dfCombustionModels/lnInclude \
     -I$(CANTERA_ROOT)/include \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
-    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,)
+    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
+    $(PYTHON_INC_DIR)
 
 EXE_LIBS = \
     -lturbulenceModels \
@@ -55,4 +63,6 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
     $(if $(LIBTORCH_ROOT),-rdynamic,) \
-    $(if $(LIBTORCH_ROOT),-lpthread,)
+    $(if $(LIBTORCH_ROOT),-lpthread,) \
+    -L$(PYTHON_LIB_DIR) \
+    -lpython$(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('LDVERSION'))")
diff --git a/applications/solvers/dfSprayFoam/dfSprayFoam.C b/applications/solvers/dfSprayFoam/dfSprayFoam.C
index e3b6206a..8bb1c257 100644
--- a/applications/solvers/dfSprayFoam/dfSprayFoam.C
+++ b/applications/solvers/dfSprayFoam/dfSprayFoam.C
@@ -33,6 +33,9 @@ Description
 #include "CanteraMixture.H"
 #include "hePsiThermo.H"
 #include "turbulentFluidThermoModel.H"
+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h> //used to convert
 
 #include "fvCFD.H"
 #include "dynamicFvMesh.H"
@@ -52,7 +55,11 @@ int main(int argc, char *argv[])
 {
     #include "postProcess.H"
 
-    #include "setRootCaseLists.H"
+    // #include "setRootCaseLists.H"
+    #include "listOptions.H"
+    #include "setRootCase2.H"
+    #include "listOutput.H"
+    
     #include "createTime.H"
     #include "createDynamicFvMesh.H"
     #include "createDyMControls.H"
diff --git a/applications/solvers/dfSprayFoam/setRootCase2.H b/applications/solvers/dfSprayFoam/setRootCase2.H
new file mode 100644
index 00000000..45d966e6
--- /dev/null
+++ b/applications/solvers/dfSprayFoam/setRootCase2.H
@@ -0,0 +1,5 @@
+Foam::argList args(argc,argv,true,true,/*initialise=*/false); // NOTE(review): initialise=false presumably postpones part of argList start-up -- confirm against argList.H
+if (!args.checkRootCase()) // stop the run when checkRootCase() returns false
+{
+    Foam::FatalError.exit();
+}
\ No newline at end of file
diff --git a/install.sh b/install.sh
index a9a5928b..720980d5 100755
--- a/install.sh
+++ b/install.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
 
-if [ -z "$CONDA_PREFIX" ]; then
-    echo "You should run this script only when the conda enviorment including libcantera-devel activated."
-    return
-fi
+# if [ -z "$CONDA_PREFIX" ]; then
+#     echo "You should run this script only when the conda environment including libcantera-devel is activated."
+#     return
+# fi
 
 print_usage() {
     #printf "Usage: ...\n"
@@ -42,6 +42,17 @@ while test $# -gt 0; do
         --libtorch_no)
             shift
             ;;
+        --libcantera_dir)
+            shift
+            if test $# -gt 0; then
+                LIBCANTERA_DIR=$1
+                echo LIBCANTERA_DIR = $LIBCANTERA_DIR
+            else
+                print_usage
+            return
+            fi
+            shift
+            ;;            
         *)
             echo "$1 is not a recognized flag!"
             print_usage
@@ -73,8 +84,8 @@ fi
 
 cp bashrc.in bashrc
 sed -i s#pwd#$PWD#g ./bashrc
-echo "CONDA_PREFIX is set to $CONDA_PREFIX"
-sed -i s#CONDA_PREFIX#$CONDA_PREFIX#g ./bashrc
+echo "LIBCANTERA_DIR is set to ${LIBCANTERA_DIR:=$CONDA_PREFIX}"  # := falls back to the active conda env when --libcantera_dir was not given
+sed -i "s#CONDA_PREFIX#$LIBCANTERA_DIR#g" ./bashrc
 sed -i s#LIBTORCH_DIR#$LIBTORCH_DIR#g ./bashrc
 
 
diff --git a/src/dfChemistryModel/GpuInference/DynamicBuffer.C b/src/dfChemistryModel/GpuInference/DynamicBuffer.C
new file mode 100644
index 00000000..67aa4764
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/DynamicBuffer.C
@@ -0,0 +1,28 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | DLBFoam: Dynamic Load Balancing 
+   \\    /   O peration     | for fast reactive simulations
+    \\  /    A nd           | 
+     \\/     M anipulation  | 2020, Aalto University, Finland
+-------------------------------------------------------------------------------
+License
+    This file is part of DLBFoam library, derived from OpenFOAM.
+
+    https://github.com/blttkgl/DLBFoam
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+    
+\*---------------------------------------------------------------------------*/
+#include "DynamicBuffer.H"
+namespace Foam{
+
+}
\ No newline at end of file
diff --git a/src/dfChemistryModel/GpuInference/DynamicBuffer.H b/src/dfChemistryModel/GpuInference/DynamicBuffer.H
new file mode 100644
index 00000000..9c2d9cd2
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/DynamicBuffer.H
@@ -0,0 +1,48 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | DLBFoam: Dynamic Load Balancing 
+   \\    /   O peration     | for fast reactive simulations
+    \\  /    A nd           | 
+     \\/     M anipulation  | 2020, Aalto University, Finland
+-------------------------------------------------------------------------------
+License
+    This file is part of DLBFoam library, derived from OpenFOAM.
+
+    https://github.com/blttkgl/DLBFoam
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+Class
+    Foam::DynamicBuffer
+
+Description
+    Currently just a typedef to DynamicList<DynamicList>. Could possibly be made
+    constant size at some point to avoid allocations during runtime.
+
+\*---------------------------------------------------------------------------*/
+
+#ifndef DynamicBuffer_H
+#define DynamicBuffer_H
+
+#include "DynamicList.H"
+
+namespace Foam
+{
+
+//- Alias for a DynamicList whose elements are themselves DynamicList<Type>
+template <class Type> using DynamicBuffer = DynamicList<DynamicList<Type>>;
+
+} // namespace Foam
+
+#endif
\ No newline at end of file
diff --git a/src/dfChemistryModel/GpuInference/GpuProblem.C b/src/dfChemistryModel/GpuInference/GpuProblem.C
new file mode 100644
index 00000000..49dd9ccd
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/GpuProblem.C
@@ -0,0 +1,29 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     |
+    \\  /    A nd           | Copyright (C) 2011-2018 OpenFOAM Foundation
+     \\/     M anipulation  |
+-------------------------------------------------------------------------------
+License
+    This file is part of OpenFOAM.
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+\*---------------------------------------------------------------------------*/
+
+#include "GpuProblem.H"
+namespace Foam{
+
+}
\ No newline at end of file
diff --git a/src/dfChemistryModel/GpuInference/GpuProblem.H b/src/dfChemistryModel/GpuInference/GpuProblem.H
new file mode 100644
index 00000000..7fcbe2d5
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/GpuProblem.H
@@ -0,0 +1,99 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     |
+    \\  /    A nd           | Copyright (C) 2011-2018 OpenFOAM Foundation
+     \\/     M anipulation  |
+-------------------------------------------------------------------------------
+License
+    This file is part of OpenFOAM.
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+Class
+    Foam::GpuProblem
+
+Description
+    A small object holding one cell's state (Y, Ti, pi, rhoi) together with its
+    DNN and cell ids. These are passed around in the load balancer.
+
+\*---------------------------------------------------------------------------*/
+
+#ifndef GpuProblem_H
+#define GpuProblem_H
+
+#include "volFields.H"
+
+namespace Foam
+{
+
+struct GpuProblem
+{
+    //- Default construction: empty Y, scalars and ids zeroed by the member initialisers
+    GpuProblem() = default;
+    //- Size Y to nSpecie entries and zero everything else
+    GpuProblem(label nSpecie)
+        : Y(nSpecie), Ti(0), pi(0), rhoi(0), DNNid(0), cellid(0)
+    {}
+
+    scalarList Y;
+    scalar Ti = 0;
+    scalar pi = 0;
+    scalar rhoi = 0;
+    label DNNid = 0;
+    label cellid = 0;
+
+    //- Member-wise equality (exact comparison, no tolerance); ids are checked first
+    bool operator==(const GpuProblem& rhs) const
+    {
+        return cellid == rhs.cellid && DNNid == rhs.DNNid && Ti == rhs.Ti
+            && pi == rhs.pi && rhoi == rhs.rhoi && Y == rhs.Y;
+    }
+
+    bool operator!=(const GpuProblem& rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+//- Write every field of p to os; operator>> reads them back in the same order
+static inline Ostream& operator<<(Ostream& os, const GpuProblem& p)
+{
+    // Single chained insertion, fields in declaration order.
+    os  << p.Y
+        << p.Ti
+        << p.pi
+        << p.rhoi
+        << p.DNNid
+        << p.cellid;
+
+    return os;
+}
+
+//- Fill p from is, reading the fields in the order operator<< wrote them
+static inline Istream& operator>>(Istream& is, GpuProblem& p)
+{
+    // Single chained extraction, fields in declaration order.
+    is  >> p.Y
+        >> p.Ti
+        >> p.pi
+        >> p.rhoi
+        >> p.DNNid
+        >> p.cellid;
+
+    return is;
+}
+
+} // namespace Foam
+
+#endif
\ No newline at end of file
diff --git a/src/dfChemistryModel/GpuInference/GpuSolution.C b/src/dfChemistryModel/GpuInference/GpuSolution.C
new file mode 100644
index 00000000..a5ef62c6
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/GpuSolution.C
@@ -0,0 +1,36 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     |
+    \\  /    A nd           | Copyright (C) 2011-2018 OpenFOAM Foundation
+     \\/     M anipulation  |
+-------------------------------------------------------------------------------
+License
+    This file is part of OpenFOAM.
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+Class
+    Foam::GpuSolution
+
+Description
+    A small object containing everything required for updating the reaction rate
+    and the chemistry time step. These are passed around in the load balancer.
+
+\*---------------------------------------------------------------------------*/
+
+#include "GpuSolution.H"
+namespace Foam{
+
+}
\ No newline at end of file
diff --git a/src/dfChemistryModel/GpuInference/GpuSolution.H b/src/dfChemistryModel/GpuInference/GpuSolution.H
new file mode 100644
index 00000000..fa88a674
--- /dev/null
+++ b/src/dfChemistryModel/GpuInference/GpuSolution.H
@@ -0,0 +1,86 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | DLBFoam: Dynamic Load Balancing 
+   \\    /   O peration     | for fast reactive simulations
+    \\  /    A nd           | 
+     \\/     M anipulation  | 2020, Aalto University, Finland
+-------------------------------------------------------------------------------
+License
+    This file is part of DLBFoam library, derived from OpenFOAM.
+
+    https://github.com/blttkgl/DLBFoam
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+Class
+    Foam::GpuSolution
+    
+Description
+    A small object containing everything required for updating the reaction rate
+    and the chemistry time step. These are passed around in the load balancer.
+
+\*---------------------------------------------------------------------------*/
+
+#ifndef GpuSolution_H
+#define GpuSolution_H
+
+#include "volFields.H"
+
+namespace Foam
+{
+
+struct GpuSolution
+{
+    //- Default construction: empty RRi, Ti and cellid zeroed by the member initialisers
+    GpuSolution() = default;
+
+    GpuSolution(label nspecie)
+        : Ti(0), RRi(nspecie, 0.0), cellid(0)
+    {
+    }
+    //- Member-wise equality (exact comparison, no tolerance); the id is checked first
+    bool operator==(const GpuSolution& rhs) const
+    {
+        return cellid == rhs.cellid && Ti == rhs.Ti && RRi == rhs.RRi;
+    }
+
+    bool operator!=(const GpuSolution& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    scalar Ti = 0;
+    scalarList RRi;
+    label cellid = 0;
+};
+
+//- Write Ti, RRi and cellid to os, in that order
+static inline Ostream& operator<<(Ostream& os, const GpuSolution& s)
+{
+    os  << s.Ti
+        << s.RRi
+        << s.cellid;
+    return os;
+}
+
+//- Fill s from is, reading Ti, RRi and cellid in that order
+static inline Istream& operator>>(Istream& is, GpuSolution& s)
+{
+    is  >> s.Ti
+        >> s.RRi
+        >> s.cellid;
+    return is;
+}
+
+} // namespace Foam
+
+#endif
\ No newline at end of file
diff --git a/src/dfChemistryModel/Make/options b/src/dfChemistryModel/Make/options
index f7c4b5ed..f21d42f6 100644
--- a/src/dfChemistryModel/Make/options
+++ b/src/dfChemistryModel/Make/options
@@ -1,3 +1,6 @@
+-include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+
 EXE_INC = -std=c++14 \
     $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \
     -I$(LIB_SRC)/transportModels/compressible/lnInclude \
@@ -8,7 +11,8 @@ EXE_INC = -std=c++14 \
     -I$(DF_SRC)/CanteraMixture/lnInclude \
     -I$(CANTERA_ROOT)/include \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
-    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,)
+    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
+    $(PYTHON_INC_DIR)
 
 EXE_LIBS = \
     -lcompressibleTransportModels \
diff --git a/src/dfChemistryModel/dfChemistryModel.C b/src/dfChemistryModel/dfChemistryModel.C
index 1b32a4bb..99086262 100644
--- a/src/dfChemistryModel/dfChemistryModel.C
+++ b/src/dfChemistryModel/dfChemistryModel.C
@@ -107,6 +107,27 @@ Foam::dfChemistryModel<ThermoType>::dfChemistryModel
     Tact_ = this->subDict("torchParameters").lookupOrDefault("Tact", 700);
     Qdotact_ = this->subDict("torchParameters").lookupOrDefault("Qdotact", 1e9);
 #endif
+    // time_allsolve_ = 0;
+    // time_submaster_(0),
+    // time_sendProblem_(0),
+    // time_RecvProblem_(0),
+    // time_sendRecvSolution_(0),
+    // time_getDNNinputs_(0),
+    // time_DNNinference_(0),
+    // time_updateSolutionBuffer_(0),
+    // time_vec2ndarray_(0),
+    // time_python_(0),
+
+    Tact1_ = this->subDict("torchParameters1").lookupOrDefault("Tact", 700);
+    Qdotact1_ = this->subDict("torchParameters1").lookupOrDefault("Qdotact", 1e9);
+
+    Tact2_ = this->subDict("torchParameters2").lookupOrDefault("Tact", 700);
+    Qdotact2_ = this->subDict("torchParameters2").lookupOrDefault("Qdotact", 1e9);
+
+    Tact3_ = this->subDict("torchParameters3").lookupOrDefault("Tact", 700);
+    Qdotact3_ = this->subDict("torchParameters3").lookupOrDefault("Qdotact", 1e9);
+    
+    coresPerGPU = this->subDict("torchParameters1").lookupOrDefault("coresPerGPU", 8);
 
     for(const auto& name : CanteraGas_->speciesNames())
     {
@@ -216,11 +237,12 @@ Foam::scalar Foam::dfChemistryModel<ThermoType>::solve
 )
 {
     scalar result = 0;
-#ifdef USE_LIBTORCH
-    result = torchSolve(deltaT);
-#else
-    result = solve_loadBalance(deltaT);
-#endif
+// #ifdef USE_LIBTORCH
+//     result = torchSolve(deltaT);
+// #else
+//     result = solve_loadBalance(deltaT);
+// #endif
+    result = torchDCUSolve(deltaT);
     return result;
 }
 
@@ -916,4 +938,520 @@ Foam::scalar Foam::dfChemistryModel<ThermoType>::solve_loadBalance
     return updateReactionRates(incomingSolutions);
 }
 
+//- Single-rank hybrid chemistry step: cells with T >= Tact1_ are batched to the
+//  Python module "inference_H2", every other cell is integrated with Cantera.
+//  deltaTMin is never reduced here, so the return value is always 'great'.
+template <class ThermoType>
+template <class DeltaTType>
+Foam::scalar Foam::dfChemistryModel<ThermoType>::torchCUDAoneCoreSolve(
+    const DeltaTType &deltaT)
+{
+    scalar deltaTMin = great;
+
+    if (!this->chemistry_)
+    {
+        return deltaTMin;
+    }
+    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
+    Info << "=== begin torchCUDAoneCore-solve === " << endl;
+
+    // set variables
+    const size_t nSpecies = CanteraGas_->nSpecies(); // row width of the DNN output
+    scalarList yPre_(mixture_.nSpecies());
+    Cantera::Reactor react;
+    std::vector<size_t> torch_cell; // mesh indices of the DNN cells, in packing order
+
+    // obtain the number of DNN cells
+    std::chrono::steady_clock::time_point start_0 = std::chrono::steady_clock::now();
+    forAll(T_, cellI)
+    {
+        if (T_[cellI] >= Tact1_)
+        {
+            torch_cell.push_back(cellI);
+        }
+    }
+
+    // generate GPU inputs and solve CVODE cells
+    std::vector<double> inputs_;
+    inputs_.reserve(torch_cell.size() * (nSpecies + 3));
+
+    forAll(T_, cellI)
+    {
+        scalar Ti = T_[cellI];
+        scalar pi = p_[cellI];
+        scalar rhoi = rho_[cellI];
+
+        if (Ti >= Tact1_)
+        {
+            Qdot_[cellI] = 0.0;
+            // set inputs: rho, T, p in atm, then the mass fractions
+            inputs_.push_back(rhoi);
+            inputs_.push_back(Ti);
+            inputs_.push_back(pi/101325);
+            for (size_t i = 0; i < nSpecies; i++)
+            {
+                inputs_.push_back(Y_[i][cellI]);
+            }
+        }
+        else
+        {
+            Qdot_[cellI] = 0.0;
+            for (size_t i = 0; i < nSpecies; i++)
+            {
+                yPre_[i] = Y_[i][cellI];
+            }
+
+            CanteraGas_->setState_TPY(Ti, pi, yPre_.begin());
+            react.insert(mixture_.CanteraSolution());
+            react.setEnergy(0);
+
+            Cantera::ReactorNet sim;
+            sim.addReactor(react);
+            setNumerics(sim);
+            sim.advance(deltaT);
+            CanteraGas_->getMassFractions(yTemp_.begin());
+            for (size_t i = 0; i < nSpecies; i++)
+            {
+                RR_[i][cellI] = (yTemp_[i] - yPre_[i]) * rhoi / deltaT;
+                Qdot_[cellI] -= hc_[i] * RR_[i][cellI];
+            }
+        }
+    }
+    std::chrono::steady_clock::time_point stop_0 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime_0 = std::chrono::duration_cast<std::chrono::duration<double>>(stop_0 - start_0);
+    std::cout << "beforeCUDATime = " << processingTime_0.count() << std::endl;
+    time_getDNNinputs_ += processingTime_0.count();
+
+    // DNN
+    std::chrono::steady_clock::time_point start_3 = std::chrono::steady_clock::now();
+    std::chrono::steady_clock::time_point start_4 = std::chrono::steady_clock::now();
+    pybind11::array_t<double> vec = pybind11::cast(inputs_);
+
+    std::chrono::steady_clock::time_point stop_4 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime_4 = std::chrono::duration_cast<std::chrono::duration<double>>(stop_4 - start_4);
+    std::cout << "vec2ndarrayTime = " << processingTime_4.count() << std::endl;
+    time_vec2ndarray_ += processingTime_4.count();
+
+    pybind11::module_ call_torch = pybind11::module_::import("inference_H2"); // import python file
+    std::chrono::steady_clock::time_point start_5 = std::chrono::steady_clock::now();
+    pybind11::object result = call_torch.attr("inference")(vec); // call function
+    // Own the converted array: the cast may allocate a fresh ndarray (dtype or
+    // layout conversion), and a pointer into a temporary would dangle.
+    typedef pybind11::array_t<double, pybind11::array::c_style | pybind11::array::forcecast> denseArray;
+    const denseArray outputs = result.cast<denseArray>();
+    if (static_cast<size_t>(outputs.size()) < torch_cell.size() * nSpecies)
+    {
+        FatalErrorInFunction
+            << "inference returned " << label(outputs.size()) << " values, expected "
+            << label(torch_cell.size() * nSpecies) << exit(FatalError);
+    }
+    const double* star = outputs.data();
+
+    std::chrono::steady_clock::time_point stop_5 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime_5 = std::chrono::duration_cast<std::chrono::duration<double>>(stop_5 - start_5);
+    std::cout << "pythonTime = " << processingTime_5.count() << std::endl;
+    time_python_ += processingTime_5.count();
+
+    std::chrono::steady_clock::time_point stop_3 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime_3 = std::chrono::duration_cast<std::chrono::duration<double>>(stop_3 - start_3);
+    std::cout << "DNNinferenceTime = " << processingTime_3.count() << std::endl;
+    time_DNNinference_ += processingTime_3.count();
+
+    std::chrono::steady_clock::time_point start_2 = std::chrono::steady_clock::now();
+    for (size_t k = 0; k < torch_cell.size(); k++)
+    {
+        // Scatter through the mesh index, not the batch position k.
+        const size_t cellI = torch_cell[k];
+        for (size_t i = 0; i < nSpecies; i++)
+        {
+            RR_[i][cellI] = star[k * nSpecies + i];
+            Qdot_[cellI] -= hc_[i] * RR_[i][cellI];
+        }
+    }
+    std::chrono::steady_clock::time_point stop_2 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime_2 = std::chrono::duration_cast<std::chrono::duration<double>>(stop_2 - start_2);
+    std::cout << "afterCUDATime = " << processingTime_2.count() << std::endl;
+    time_updateSolutionBuffer_ += processingTime_2.count();
+
+    Info << "=== end torch&ode-solve === " << endl;
+    std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime = std::chrono::duration_cast<std::chrono::duration<double>>(stop - start);
+    std::cout << "allSolveTime = " << processingTime.count() << std::endl;
+    time_allsolve_ += processingTime.count();
+    return deltaTMin;
+}
+
+//- Package every local cell as a GpuProblem tagged with the DNN that should
+//  handle it, and reset Qdot_ of that cell to zero afterwards.
+//  deltaT is accepted but not used here.
+template <class ThermoType>
+template<class DeltaTType>
+Foam::DynamicList<Foam::GpuProblem>
+Foam::dfChemistryModel<ThermoType>::getGPUProblems
+(
+    const DeltaTType& deltaT
+)
+{
+    const size_t nSpecies = CanteraGas_->nSpecies();
+    DynamicList<GpuProblem> localProblems;
+
+    forAll(T_, celli)
+    {
+        // Snapshot the heat release rate: it is reset at the end of the pass.
+        const scalar Tcell = T_[celli];
+        const scalar Qcell = Qdot_[celli];
+
+        GpuProblem entry(nSpecies);
+        entry.cellid = celli;
+        entry.Ti = Tcell;
+        entry.pi = p_[celli]/101325;
+        entry.rhoi = rho_[celli];
+        for (size_t specI = 0; specI < nSpecies; specI++)
+        {
+            entry.Y[specI] = Y_[specI][celli];
+        }
+
+        // The three tests are applied one after another, so a later match
+        // overrides an earlier one; a cell matching none keeps its initial DNNid.
+        const bool inBand1 = (Qcell < Qdotact2_) && (Tcell <= Tact2_) && (Tcell >= Tact1_);
+        const bool inBand2 =
+            ((Qcell <= Qdotact3_) && (Qcell >= Qdotact2_) && (Tact3_ > Tcell) && (Tcell > Tact2_))
+         || (Qcell > Qdotact3_);
+        const bool inBand3 = (Qcell <= Qdotact3_) && (Tcell >= Tact3_);
+        if (inBand1) { entry.DNNid = 0; }
+        if (inBand2) { entry.DNNid = 1; }
+        if (inBand3) { entry.DNNid = 2; }
+
+        localProblems.append(entry);
+        Qdot_[celli] = 0.0;
+    }
+
+    return localProblems;
+}
+
+//- Regroup the gathered problems by target DNN and flatten them for inference.
+//
+//  problemBuffer   in:  one problem list per rank of the block; the first
+//                       coresPerGPU lists are read
+//  outputLength    out: running cell totals {n0, n0 + n1, n0 + n1 + n2}
+//  DNNinputs       out: per DNN, the values rho, T, p, Y[0..nSpecies-1] of each
+//                       cell appended back to back
+//  cellIDBuffer    out: per DNN and rank, the cell ids in packing order
+//  problemCounter  out: per DNN and rank, the number of cells packed
+//
+//  A problem whose DNNid is not 0, 1 or 2 is reported with "invalid input"
+//  and is left out of every output, so no solution is produced for it.
+//  All four outputs are overwritten, not appended to.
+template <class ThermoType>
+void Foam::dfChemistryModel<ThermoType>::getDNNinputs
+(
+    const Foam::DynamicBuffer<GpuProblem>& problemBuffer,
+    std::vector<Foam::label>& outputLength,
+    std::vector<std::vector<double>>& DNNinputs,
+    std::vector<Foam::DynamicBuffer<label>>& cellIDBuffer,
+    std::vector<std::vector<label>>& problemCounter
+)
+{
+    // Number of networks one inference call serves; DNNid selects among them.
+    const label nDNN = 3;
+    const size_t nSpecies = CanteraGas_->nSpecies();
+
+    // Slot k of each container belongs to DNN k.
+    std::vector<std::vector<label>> countsPerDNN(nDNN);  // one entry per rank
+    std::vector<std::vector<double>> inputsPerDNN(nDNN); // flattened cell rows
+    std::vector<DynamicBuffer<label>> idsPerDNN(nDNN);   // one id list per rank
+
+    // for all local core
+    for (label rankI = 0; rankI < coresPerGPU; rankI++)
+    {
+        const auto& rankProblems = problemBuffer[rankI];
+
+        // Tallies and cell ids of this rank only.
+        label tally[nDNN] = {0, 0, 0};
+        DynamicList<label> rankIds[nDNN];
+
+        //TODO: parallel the loop
+        for (label probI = 0; probI < rankProblems.size(); probI++)
+        {
+            const auto& prob = rankProblems[probI];
+            const auto target = prob.DNNid;
+
+            // Unknown ids are only reported; the problem is not packed anywhere.
+            if (target != 0 && target != 1 && target != 2)
+            {
+                Info<<"invalid input"<<endl;
+                continue;
+            }
+
+            // One row per cell: rho, T, p, then the mass fractions.
+            std::vector<double>& packed = inputsPerDNN[target];
+            packed.push_back(prob.rhoi);
+            packed.push_back(prob.Ti);
+            packed.push_back(prob.pi);
+            for (size_t specI = 0; specI < nSpecies; specI++)
+            {
+                packed.push_back(prob.Y[specI]);
+            }
+
+            tally[target]++;
+            rankIds[target].append(prob.cellid); // remembered to route results back
+        }
+
+        // Close this rank: one count and one id list per DNN, even when empty.
+        for (label dnnI = 0; dnnI < nDNN; dnnI++)
+        {
+            countsPerDNN[dnnI].push_back(tally[dnnI]);
+            idsPerDNN[dnnI].append(rankIds[dnnI]);
+        }
+    }
+
+    // get cellNumbers for each model, accumulated over the DNNs
+    label runningTotal = 0;
+    std::vector<label> totals;
+    for (label dnnI = 0; dnnI < nDNN; dnnI++)
+    {
+        runningTotal += std::accumulate
+        (
+            countsPerDNN[dnnI].begin(),
+            countsPerDNN[dnnI].end(),
+            0
+        );
+        totals.push_back(runningTotal);
+    }
+
+    // set output
+    outputLength = totals;
+    DNNinputs = inputsPerDNN;
+    cellIDBuffer = idsPerDNN;
+    problemCounter = countsPerDNN;
+
+    Info<<"get inputs successfully"<<endl;
+
+    return;
+}
+
+//- Split the flat inference output back into one solution list per rank.
+//
+//  solutionBuffer  out: gains coresPerGPU lists, one per rank of the block
+//  star            in:  inference output read as rows of nSpecies values, all
+//                       DNN0 cells first, then the DNN1 cells, then DNN2
+//  outputLength    in:  running cell totals {n0, n0 + n1, n0 + n1 + n2}
+//  cellIDBuffer    in:  per DNN and rank, the cell ids in packing order
+//  problemCounter  in:  per DNN and rank, the number of cells packed
+//
+//  Within a rank the solutions are appended DNN0 first, then DNN1, then DNN2.
+template <class ThermoType>
+void Foam::dfChemistryModel<ThermoType>::updateSolutionBuffer
+(
+    Foam::DynamicBuffer<Foam::GpuSolution>& solutionBuffer,
+    const double* star,
+    const std::vector<Foam::label>& outputLength,
+    const std::vector<Foam::DynamicBuffer<Foam::label>>& cellIDBuffer,
+    std::vector<std::vector<Foam::label>>& problemCounter
+)
+{
+    // The row width follows the mechanism. A fixed width of 21 is only right
+    // for 21-species mechanisms; with any other species count the DNN slices
+    // are misaligned and can run past the end of star.
+    const label nSpecies = CanteraGas_->nSpecies();
+    const label nDNN = 3;
+
+    // Next unread row of each DNN: DNN k starts where DNN k-1 ends.
+    label nextRow[nDNN] = {0, outputLength[0], outputLength[1]};
+
+    GpuSolution solution(CanteraGas_->nSpecies());
+    DynamicList<GpuSolution> solutionList; //TODO: rename
+
+    for (label rankI = 0; rankI < coresPerGPU; rankI++)
+    {
+        for (label dnnI = 0; dnnI < nDNN; dnnI++)
+        {
+            const label nCells = problemCounter[dnnI][rankI];
+            for (label cellI = 0; cellI < nCells; cellI++)
+            {
+                // Read straight from star; no intermediate copy is needed.
+                const double* row = star + nextRow[dnnI] * nSpecies;
+                for (label speciID = 0; speciID < nSpecies; speciID++)
+                {
+                    solution.RRi[speciID] = row[speciID];
+                }
+                solution.cellid = cellIDBuffer[dnnI][rankI][cellI];
+                solutionList.append(solution);
+                nextRow[dnnI]++;
+            }
+        }
+
+        // Hand the rank's list over and start the next one empty.
+        solutionBuffer.append(solutionList);
+        solutionList.clear();
+    }
+
+    return;
+}
+
+//- Group-wise DNN chemistry step. Ranks form blocks of coresPerGPU; the first
+//  rank of a block (the submaster) gathers the block's problems, runs the Python
+//  module "inference2" and scatters the reaction rates back to their owners.
+//  deltaTMin is never reduced here, so the return value is always 'great'.
+template <class ThermoType>
+template <class DeltaTType>
+Foam::scalar Foam::dfChemistryModel<ThermoType>::torchDCUSolve(
+    const DeltaTType &deltaT)
+{
+    scalar deltaTMin = great;
+    if (!this->chemistry_)
+    {
+        return deltaTMin;
+    }
+
+    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
+    Info << "=== begin torchDCUsolve === " << endl;
+
+    // Only talk to ranks that exist: the last block is shorter when nProcs is
+    // not a multiple of coresPerGPU, and a serial run has a single rank.
+    const label groupMaster = (Pstream::myProcNo()/coresPerGPU)*coresPerGPU;
+    const label groupSize = min(label(coresPerGPU), label(Pstream::nProcs() - groupMaster));
+
+    /*=============================gather problems=============================*/
+    DynamicList<GpuProblem> problemList = getGPUProblems(deltaT);
+
+    /*==============================send problems==============================*/
+    std::chrono::steady_clock::time_point start2 = std::chrono::steady_clock::now();
+    PstreamBuffers pBufs(Pstream::commsTypes::nonBlocking);
+    if (Pstream::myProcNo() % coresPerGPU) //for slave
+    {
+        UOPstream send(groupMaster, pBufs);// sending problem to master
+        send << problemList;
+    }
+    pBufs.finishedSends();
+
+    DynamicBuffer<GpuSolution> solutionBuffer;
+
+    std::chrono::steady_clock::time_point stop2 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime2 = std::chrono::duration_cast<std::chrono::duration<double>>(stop2 - start2);
+    std::cout << "sendProblemTime = " << processingTime2.count() << std::endl;
+    time_sendProblem_ += processingTime2.count();
+
+    /*=============================submaster work start=============================*/
+    if (!(Pstream::myProcNo() % coresPerGPU))
+    {
+        std::chrono::steady_clock::time_point start1 = std::chrono::steady_clock::now();
+        std::chrono::steady_clock::time_point start3 = std::chrono::steady_clock::now();
+        label problemSize = 0; // problemSize is defined to debug
+        DynamicBuffer<GpuProblem> problemBuffer(coresPerGPU);//each submaster init a local problemBuffer TODO:rename it
+        /*==============================gather problems==============================*/
+        problemBuffer[0] = problemList; //problemList of submaster get index 0
+        problemSize += problemBuffer[0].size();
+
+        for (label i = 1; i < groupSize; i++)
+        {
+            UIPstream recv(i + Pstream::myProcNo(), pBufs);
+            recv >> problemBuffer[i];  //recv previous send problem and append to problemList
+            problemSize += problemBuffer[i].size();
+        }
+        Info << "problemSize = " << problemSize << endl;
+
+        std::chrono::steady_clock::time_point stop3 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime3 = std::chrono::duration_cast<std::chrono::duration<double>>(stop3 - start3);
+        std::cout << "RecvProblemTime = " << processingTime3.count() << std::endl;
+        time_RecvProblem_ += processingTime3.count();
+
+        /*==============================construct DNN inputs==============================*/
+        std::vector<label> outputLength;
+        std::vector<std::vector<double>> DNNinputs;     // vectors for the inference of DNN
+        std::vector<DynamicBuffer<label>> cellIDBuffer; // Buffer contains the cell numbers
+        std::vector<std::vector<label>> problemCounter; // evaluate the number of the problems of each subslave
+        std::chrono::steady_clock::time_point start5 = std::chrono::steady_clock::now();
+        getDNNinputs(problemBuffer, outputLength, DNNinputs, cellIDBuffer, problemCounter);
+        std::chrono::steady_clock::time_point stop5 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime5 = std::chrono::duration_cast<std::chrono::duration<double>>(stop5 - start5);
+        std::cout << "getDNNinputsTime = " << processingTime5.count() << std::endl;
+        time_getDNNinputs_ += processingTime5.count();
+
+        /*=============================inference via pybind11=============================*/
+        std::chrono::steady_clock::time_point start7 = std::chrono::steady_clock::now();
+        std::chrono::steady_clock::time_point start8 = std::chrono::steady_clock::now();
+        pybind11::array_t<double> vec0 = pybind11::cast(DNNinputs[0]); // cast vector to np.array
+        pybind11::array_t<double> vec1 = pybind11::cast(DNNinputs[1]);
+        pybind11::array_t<double> vec2 = pybind11::cast(DNNinputs[2]);
+        std::chrono::steady_clock::time_point stop8 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime8 = std::chrono::duration_cast<std::chrono::duration<double>>(stop8 - start8);
+        std::cout << "vec2ndarrayTime = " << processingTime8.count() << std::endl;
+        time_vec2ndarray_ += processingTime8.count();
+
+        pybind11::module_ call_torch = pybind11::module_::import("inference2"); // import python file
+        std::chrono::steady_clock::time_point start9 = std::chrono::steady_clock::now();
+        pybind11::object result = call_torch.attr("inference")(vec0, vec1, vec2); // call python function
+        // Own the converted array: the cast may allocate a fresh ndarray (dtype or
+        // layout conversion), and a pointer into a temporary would dangle.
+        typedef pybind11::array_t<double, pybind11::array::c_style | pybind11::array::forcecast> denseArray;
+        const denseArray outputs = result.cast<denseArray>();
+        const double* star = outputs.data();
+        std::chrono::steady_clock::time_point stop9 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime9 = std::chrono::duration_cast<std::chrono::duration<double>>(stop9 - start9);
+        std::cout << "pythonTime = " << processingTime9.count() << std::endl;
+        time_python_ += processingTime9.count();
+
+        std::chrono::steady_clock::time_point stop7 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime7 = std::chrono::duration_cast<std::chrono::duration<double>>(stop7 - start7);
+        std::cout << "DNNinferenceTime = " << processingTime7.count() << std::endl;
+        time_DNNinference_ += processingTime7.count();
+
+        /*=============================construct solutions=============================*/
+        std::chrono::steady_clock::time_point start6 = std::chrono::steady_clock::now();
+        updateSolutionBuffer(solutionBuffer, star, outputLength, cellIDBuffer, problemCounter);
+        std::chrono::steady_clock::time_point stop6 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime6 = std::chrono::duration_cast<std::chrono::duration<double>>(stop6 - start6);
+        std::cout << "updateSolutionBufferTime = " << processingTime6.count() << std::endl;
+        time_updateSolutionBuffer_ += processingTime6.count();
+
+        std::chrono::steady_clock::time_point stop1 = std::chrono::steady_clock::now();
+        std::chrono::duration<double> processingTime1 = std::chrono::duration_cast<std::chrono::duration<double>>(stop1 - start1);
+        std::cout << "submasterTime = " << processingTime1.count() << std::endl;
+        time_submaster_ += processingTime1.count();
+    }
+
+    /*=============================send and recv solutions=============================*/
+    std::chrono::steady_clock::time_point start4 = std::chrono::steady_clock::now();
+    DynamicList<GpuSolution> finalList;
+    PstreamBuffers pBufs2(Pstream::commsTypes::nonBlocking);
+    if (!(Pstream::myProcNo() % coresPerGPU))
+    {
+        finalList = solutionBuffer[0];
+        for (label i = 1; i < groupSize; i++)
+        {
+            UOPstream send(i + Pstream::myProcNo(), pBufs2);
+            send << solutionBuffer[i];
+        }
+    }
+    pBufs2.finishedSends();
+    if (Pstream::myProcNo() % coresPerGPU)
+    {
+        UIPstream recv(groupMaster, pBufs2);
+        recv >> finalList;
+    }
+
+    std::chrono::steady_clock::time_point stop4 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime4 = std::chrono::duration_cast<std::chrono::duration<double>>(stop4 - start4);
+    std::cout << "SendRecvSolutionTime = " << processingTime4.count() << std::endl;
+    time_sendRecvSolution_ += processingTime4.count();
+
+    /*=============================update RR fields=============================*/
+    for (size_t cellI = 0; cellI < finalList.size(); cellI++)
+    {
+        for (size_t speciID = 0; speciID < CanteraGas_->nSpecies(); speciID++)
+        {
+            RR_[speciID][finalList[cellI].cellid] = finalList[cellI].RRi[speciID];
+            Qdot_[finalList[cellI].cellid] -= hc_[speciID] * RR_[speciID][finalList[cellI].cellid];
+        }
+    }
+
+    Info << "=== end torch&ode-CUDAsolve === " << endl;
+    std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();
+    std::chrono::duration<double> processingTime = std::chrono::duration_cast<std::chrono::duration<double>>(stop - start);
+    std::cout << "allSolveTime = " << processingTime.count() << std::endl;
+    time_allsolve_ += processingTime.count();
+    return deltaTMin;
+}
+
 // ************************************************************************* //
diff --git a/src/dfChemistryModel/dfChemistryModel.H b/src/dfChemistryModel/dfChemistryModel.H
index 60b24f27..e7cd5ab7 100644
--- a/src/dfChemistryModel/dfChemistryModel.H
+++ b/src/dfChemistryModel/dfChemistryModel.H
@@ -42,6 +42,13 @@ SourceFiles
 #include <torch/script.h>
 #endif
 
+#include <pybind11/embed.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h> //used to convert
+#include "GpuProblem.H"
+#include "GpuSolution.H"
+#include "DynamicBuffer.H"
+
 #include "CanteraMixture.H"
 #include "IOdictionary.H"
 #include "Switch.H"
@@ -122,6 +129,44 @@ public IOdictionary
     scalar Qdotact_;
 #endif
 
+    // profiling: seconds accumulated with += by the solve routines, so start at 0
+    double time_allsolve_ = 0;
+    double time_submaster_ = 0;
+    double time_sendProblem_ = 0;
+    double time_RecvProblem_ = 0;
+    double time_sendRecvSolution_ = 0;
+    double time_getDNNinputs_ = 0;
+    double time_DNNinference_ = 0;
+    double time_updateSolutionBuffer_ = 0;
+    double time_vec2ndarray_ = 0;
+    double time_python_ = 0;
+    // set 1: Tact1_/Qdotact1_ come from subDict torchParameters1 (defaults 700, 1e9)
+    word torchModelName1_;
+    scalarList Xmu1_;  // presumably DNN normalisation data, like Xstd/Ymu/Ystd - TODO confirm
+    scalarList Xstd1_;
+    scalarList Ymu1_;
+    scalarList Ystd1_;
+    scalar Tact1_;
+    scalar Qdotact1_;
+    // set 2: Tact2_/Qdotact2_ come from subDict torchParameters2 (defaults 700, 1e9)
+    word torchModelName2_;
+    scalarList Xmu2_;  // presumably DNN normalisation data, like Xstd/Ymu/Ystd - TODO confirm
+    scalarList Xstd2_;
+    scalarList Ymu2_;
+    scalarList Ystd2_;
+    scalar Tact2_;
+    scalar Qdotact2_;
+    // set 3: Tact3_/Qdotact3_ come from subDict torchParameters3 (defaults 700, 1e9)
+    word torchModelName3_;
+    scalarList Xmu3_;  // presumably DNN normalisation data, like Xstd/Ymu/Ystd - TODO confirm
+    scalarList Xstd3_;
+    scalarList Ymu3_;
+    scalarList Ystd3_;
+    scalar Tact3_;
+    scalar Qdotact3_;
+
+    int coresPerGPU; // ranks per inference block; torchParameters1 entry "coresPerGPU", default 8
+
     // Load balancing object
     LoadBalancer balancer_;
     // Field containing chemistry CPU time information
@@ -152,6 +197,27 @@ public IOdictionary
         scalar torchSolve(const DeltaTType& deltaT);
 #endif
 
+        //- Single-rank solve: DNN for cells with T >= Tact1_, Cantera otherwise
+        template<class DeltaTType>
+        scalar torchCUDAoneCoreSolve(const DeltaTType& deltaT);
+        //- Multi-rank solve: one submaster per coresPerGPU ranks runs the DNNs
+        template<class DeltaTType>
+        scalar torchDCUSolve(const DeltaTType& deltaT);
+
+        /*=======================private methods for DCUSolve=======================*/
+        //- Build one GpuProblem per local cell, tagged with its DNNid
+        template<class DeltaTType>
+        DynamicList<GpuProblem> getGPUProblems(const DeltaTType& deltaT);
+
+        //- Flatten the gathered problems into one input vector per DNN
+        void getDNNinputs(const DynamicBuffer<GpuProblem>& problemBuffer, std::vector<label>& outputlength,
+        std::vector<std::vector<double>>& DNNinputs, std::vector<DynamicBuffer<label>>& cellIDBuffer, 
+        std::vector<std::vector<label>>& problemCounter);
+        //- Turn the flat inference output into one solution list per rank
+        void updateSolutionBuffer(DynamicBuffer<GpuSolution>& solutionBuffer, const double* star,
+        const std::vector<label>& outputLength, const std::vector<DynamicBuffer<label>>& cellIDBuffer,
+        std::vector<std::vector<label>>& problemCounter);
+
         //- Solve a single ChemistryProblem and put the solution to ChemistrySolution
         void solveSingle(ChemistryProblem& problem, ChemistrySolution& solution);
 
diff --git a/src/dfCombustionModels/Make/options b/src/dfCombustionModels/Make/options
index f7ae3d66..a44f8f89 100644
--- a/src/dfCombustionModels/Make/options
+++ b/src/dfCombustionModels/Make/options
@@ -1,3 +1,6 @@
+-include $(GENERAL_RULES)/mplibType
+PYTHON_INC_DIR := $(shell python3 -m pybind11 --includes)
+
 EXE_INC = -std=c++14 \
     -I$(LIB_SRC)/transportModels/compressible/lnInclude \
     -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
@@ -9,7 +12,8 @@ EXE_INC = -std=c++14 \
     -I$(DF_SRC)/dfChemistryModel/lnInclude \
     -I$(CANTERA_ROOT)/include \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
-    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,)
+    $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
+    $(PYTHON_INC_DIR)
 
 LIB_LIBS = \
     -lcompressibleTransportModels \