Skip to content
Browse files

README section for kernel generation

  • Loading branch information...
1 parent a553926 commit 60f88e9251188aff1388f17097c81ab3783a620a @ddemidov committed Oct 2, 2012
Showing with 250 additions and 3 deletions.
  1. +90 −0 README.md
  2. +66 −0 examples/utests.cpp
  3. +3 −3 vexcl/generator.hpp
  4. +91 −0 vexcl/vexcl.hpp
View
90 README.md
@@ -277,6 +277,96 @@ be more effective. Multi-expressions like these may also be used with ordinary
vex::tie(dx,dy) = std::make_tuple(x + y, x - y);
```
+Converting existing algorithms to kernels
+-----------------------------------------------
+
+The VexCL kernel generator lets you transparently convert an existing CPU
+algorithm to an OpenCL kernel. To do this, you record the sequence of
+arithmetic expressions made by the algorithm and convert the recorded sequence
+to a kernel. The recording part is done with the help of the
+`vex::generator::symbolic<T>` class. The class supports arithmetic expression
+templates and simply outputs to the provided stream any expressions it is
+subjected to.
+
+To illustrate this, imagine that you have a generic algorithm for a 4th order
+Runge-Kutta ODE stepper:
+
+```C++
+template <class state_type, class SysFunction>
+void runge_kutta_4(SysFunction sys, state_type &x, double dt) {
+ state_type xtmp, k1, k2, k3, k4;
+
+ sys(x, k1, dt);
+
+ xtmp = x + 0.5 * k1;
+ sys(xtmp, k2, dt);
+
+ xtmp = x + 0.5 * k2;
+ sys(xtmp, k3, dt);
+
+ xtmp = x + k3;
+ sys(xtmp, k4, dt);
+
+ x += (k1 + 2 * k2 + 2 * k3 + k4) / 6;
+}
+```
+To model the equation `dx/dt = sin(x)` we also provide the following system function:
+```C++
+template <class state_type>
+void sys_func(const state_type &x, state_type &dx, double dt) {
+ dx = dt * sin(x);
+}
+```
+Now, to make a hundred RK4 iterations for a `double` value on the CPU, all
+that we need to do is
+```C++
+double x = 1;
+double dt = 0.01;
+for(int i = 0; i < 100; i++)
+ runge_kutta_4(sys_func<double>, x, dt);
+```
+Let us now generate the kernel for a single RK4 step and apply the kernel to a
+`vex::vector<double>` (by doing this we essentially solve a large number of
+identical ODEs with different initial conditions simultaneously).
+```C++
+// Set recorder for expression sequence.
+std::ostringstream body;
+vex::generator::set_recorder(body);
+
+// Create symbolic variable.
+typedef vex::generator::symbolic<double> sym_state;
+sym_state sym_x(sym_state::VectorParameter);
+
+// Record expression sequence.
+double dt = 0.01;
+runge_kutta_4(sys_func<sym_state>, sym_x, dt);
+
+// Build kernel.
+auto kernel = vex::generator::build_kernel(ctx.queue(),
+ "rk4_stepper", body.str(), sym_x);
+
+// Create and initialize vector of states.
+std::vector<double> xinit(n);
+std::generate(xinit.begin(), xinit.end(), [](){
+ return (double)rand() / RAND_MAX;
+ });
+vex::vector<double> x(ctx.queue(), n);
+vex::copy(xinit, x);
+
+// Make 100 rk4 steps.
+for(int i = 0; i < 100; i++) kernel(x);
+```
+This is much more efficient than the following (for this to work correctly we
+would need to slightly change sys_func):
+```C++
+for(int i = 0; i < 100; i++)
+ runge_kutta_4(sys_func<vex::vector<double>>, x, dt);
+```
+The generated kernel is more efficient because temporary values used in
+sys_func are now represented not as full-blown vex::vectors, but as fast
+register variables inside the kernel body. We have seen up to a tenfold
+performance improvement with this technique.
+
Using custom kernels
--------------------
View
66 examples/utests.cpp
@@ -10,6 +10,10 @@
//#define VEXCL_SHOW_KERNELS
#include <vexcl/vexcl.hpp>
+#ifdef VEXCL_VARIADIC_TEMPLATES
+# include <vexcl/generator.hpp>
+#endif
+
using namespace vex;
static bool all_passed = true;
@@ -31,6 +35,29 @@ UserFunction<greater_body, size_t(double, double)> greater;
extern const char pow3_body[] = "return pow(prm1, 3.0);";
UserFunction<pow3_body, double(double)> pow3;
+
+template <class state_type>
+void sys_func(const state_type &x, state_type &dx, double dt) {
+ dx = dt * sin(x);
+}
+
+template <class state_type, class SysFunction>
+void runge_kutta_4(SysFunction sys, state_type &x, double dt) {
+ state_type xtmp, k1, k2, k3, k4;
+
+ sys(x, k1, dt);
+
+ xtmp = x + 0.5 * k1;
+ sys(xtmp, k2, dt);
+
+ xtmp = x + 0.5 * k2;
+ sys(xtmp, k3, dt);
+
+ xtmp = x + k3;
+ sys(xtmp, k4, dt);
+
+ x += (k1 + 2 * k2 + 2 * k3 + k4) / 6;
+}
#endif
extern const char pow3_oper_body[] = "return X[0] + pow(X[-1] + X[1], 3.0);";
@@ -1388,6 +1415,45 @@ int main(int argc, char *argv[]) {
return rc;
});
+#ifdef VEXCL_VARIADIC_TEMPLATES
+ run_test("Kernel auto-generation", [&]() -> bool {
+ bool rc = true;
+ const int n = 1 << 20;
+
+ std::ostringstream body;
+ generator::set_recorder(body);
+
+ typedef generator::symbolic<double> sym_state;
+
+ double dt = 0.01;
+ sym_state sym_x(sym_state::VectorParameter);
+
+ // Record expression sequence.
+ runge_kutta_4(sys_func<sym_state>, sym_x, dt);
+
+ // Build kernel.
+ auto kernel = generator::build_kernel(ctx.queue(),
+ "rk4_stepper", body.str(), sym_x);
+
+ std::vector<double> x(n);
+ std::generate(x.begin(), x.end(), [](){ return (double)rand() / RAND_MAX; });
+
+ vex::vector<double> X(ctx.queue(), x);
+
+ // Make 100 iterations on CPU with x[0].
+ for(int i = 0; i < 100; i++)
+ runge_kutta_4(sys_func<double>, x[0], dt);
+
+ // Make 100 iterations on GPU with full X vector.
+ for(int i = 0; i < 100; i++)
+ kernel(X);
+
+ // Compare results.
+ rc = rc && fabs(x[0] - X[0]) < 1e-8;
+ return rc;
+ });
+#endif
+
} catch (const cl::Error &err) {
std::cerr << "OpenCL error: " << err << std::endl;
return 1;
View
6 vexcl/generator.hpp
@@ -474,7 +474,7 @@ class symbolic_builtin {
public:
static const bool is_symbolic = true;
- symbolic_builtin(const Expr&... expr) : expr(expr...) {}
+ symbolic_builtin(const Expr&... expr) : expr(std::ref(expr)...) {}
std::string get_string() const {
std::ostringstream s;
@@ -485,7 +485,7 @@ class symbolic_builtin {
return s.str();
}
private:
- const std::tuple<Expr...> expr;
+ const std::tuple<std::reference_wrapper<const Expr>...> expr;
template <uint pos = 0, class Function>
typename std::enable_if<(pos == sizeof...(Expr)), void>::type
@@ -496,7 +496,7 @@ class symbolic_builtin {
typename std::enable_if<(pos < sizeof...(Expr)), void>::type
for_each(Function &f) const
{
- f( std::get<pos>(expr) );
+ f( std::get<pos>(expr).get() );
for_each<pos+1, Function>(f);
}
View
91 vexcl/vexcl.hpp
@@ -293,6 +293,97 @@ vex::vectors with help of vex::tie() function:
vex::tie(dx,dy) = std::make_tuple(x + y, x - y);
\endcode
+\section kernel_generator Converting existing algorithms to kernels
+
+The VexCL kernel generator lets you transparently convert an existing CPU
+algorithm to an OpenCL kernel. To do this, you record the sequence of
+arithmetic expressions made by the algorithm and convert the recorded sequence
+to a kernel. The recording part is done with the help of the
+vex::generator::symbolic<T> class. The class supports arithmetic expression
+templates and simply outputs to the provided stream any expressions it is
+subjected to.
+
+To illustrate this, imagine that you have a generic algorithm for a 4th order
+Runge-Kutta ODE stepper:
+
+\code
+template <class state_type, class SysFunction>
+void runge_kutta_4(SysFunction sys, state_type &x, double dt) {
+ state_type xtmp, k1, k2, k3, k4;
+
+ sys(x, k1, dt);
+
+ xtmp = x + 0.5 * k1;
+ sys(xtmp, k2, dt);
+
+ xtmp = x + 0.5 * k2;
+ sys(xtmp, k3, dt);
+
+ xtmp = x + k3;
+ sys(xtmp, k4, dt);
+
+ x += (k1 + 2 * k2 + 2 * k3 + k4) / 6;
+}
+\endcode
+To model the equation \f$\frac{dx}{dt} = \sin(x)\f$ we also provide the
+following system function:
+\code
+template <class state_type>
+void sys_func(const state_type &x, state_type &dx, double dt) {
+ dx = dt * sin(x);
+}
+\endcode
+Now, to make a hundred RK4 iterations for a double value on the CPU, all that
+we need to do is
+\code
+double x = 1;
+double dt = 0.01;
+for(int i = 0; i < 100; i++)
+ runge_kutta_4(sys_func<double>, x, dt);
+\endcode
+Let us now generate the kernel for a single RK4 step and apply the kernel to a
+vex::vector<double> (by doing this we essentially solve a large number of
+identical ODEs with different initial conditions simultaneously).
+\code
+// Set recorder for expression sequence.
+std::ostringstream body;
+vex::generator::set_recorder(body);
+
+// Create symbolic variable.
+typedef vex::generator::symbolic<double> sym_state;
+sym_state sym_x(sym_state::VectorParameter);
+
+// Record expression sequence.
+double dt = 0.01;
+runge_kutta_4(sys_func<sym_state>, sym_x, dt);
+
+// Build kernel.
+auto kernel = vex::generator::build_kernel(ctx.queue(),
+ "rk4_stepper", body.str(), sym_x);
+
+// Create and initialize vector of states.
+std::vector<double> xinit(n);
+std::generate(xinit.begin(), xinit.end(), [](){
+ return (double)rand() / RAND_MAX;
+ });
+vex::vector<double> x(ctx.queue(), n);
+vex::copy(xinit, x);
+
+// Make 100 rk4 steps.
+for(int i = 0; i < 100; i++) kernel(x);
+\endcode
+
+This is much more efficient than the following (for this to work correctly we
+would need to slightly change sys_func):
+\code
+for(int i = 0; i < 100; i++)
+ runge_kutta_4(sys_func<vex::vector<double>>, x, dt);
+\endcode
+The generated kernel is more efficient because temporary values used in
+sys_func are now represented not as full-blown vex::vectors, but as fast
+register variables inside the kernel body. We have seen up to a tenfold
+performance improvement with this technique.
+
\section custkern Using custom kernels
Custom kernels are of course possible as well. vector::operator(uint)

0 comments on commit 60f88e9

Please sign in to comment.
Something went wrong with that request. Please try again.