# CUDA Vector Add Demo

Adapted from https://github.com/NVIDIA/cuda-samples.git

## Main helpers

In [None]:
#include <cuda_runtime.h>
#include <stdio.h>

///

//#include "cuda_runtime.h"
#include "curand_kernel.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <cstdio>
#include <cstdlib>

In [None]:
  // Pi in double precision. The hemisphere sampling helpers in this file
  // use it to turn a random number u2 into the azimuth angle 2 * g_pi * u2.
  constexpr double g_pi = 3.14159265358979323846;

  // Restricts v to the closed interval [low, high]; the default interval is
  // [0, 1]. Usable from host and device code.
  __host__ __device__ inline double Clamp(double v, 
                      double low = 0.0, double high = 1.0) noexcept
  {
    // First lift values below the lower bound, then cap at the upper bound.
    const double raised = fmax(v, low);
    return fmin(raised, high);
  }

  // Converts a color channel to an 8-bit value: raises it to the power
  // 1/gamma (gamma defaults to 2.2), scales by 255, clamps to [0, 255] and
  // truncates to an integer.
  inline std::uint8_t ToByte(double color, double gamma = 2.2) noexcept
  {
    const double exponent = 1.0 / gamma;
    const double scaled = 255.0 * std::pow(color, exponent);
    return static_cast<std::uint8_t>(Clamp(scaled, 0.0, 255.0));
  }

  // Terminates the program with EXIT_FAILURE when err is not cudaSuccess,
  // after printing the CUDA error string together with the given file name
  // and line number. Does nothing when err is cudaSuccess.
  inline void HandleError(cudaError_t err, const char* file, int line)
  {
    if (err == cudaSuccess) {
      return;
    }

    const char* const reason = cudaGetErrorString(err);
    std::printf("%s in %s at line %d\n", reason, file, line);
    std::exit(EXIT_FAILURE);
  }
//}

// Evaluates a CUDA runtime call once and, when it does not return
// cudaSuccess, reports the file, line and CUDA error string on stderr.
// Execution continues after the report (the macro does not abort).
//
// The body is wrapped in do { } while (0) so that `CUDA_SAFE_CALL(x);` is a
// single statement and stays correct inside an unbraced if/else. The local
// uses a macro-specific name so that `call` can mention a caller variable
// named `err` without it being shadowed.
#define CUDA_SAFE_CALL(call) do {                                        \
    const cudaError_t cuda_safe_call_status_ = (call);                   \
    if (cudaSuccess != cuda_safe_call_status_) {                         \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                __FILE__, __LINE__,                                      \
                cudaGetErrorString(cuda_safe_call_status_));             \
    }                                                                    \
} while (0)


In [None]:
//namespace smallpt {

  // Three-component double-precision vector. This file uses it for
  // positions, directions and color triples.
  //
  // Every member function is callable from both host and device code, so the
  // same arithmetic is available inside kernels and in CPU-side code.
  struct Vector3 {

    // Builds a vector whose three components all equal xyz (0.0 by default).
    __host__ __device__ explicit Vector3(double xyz = 0.0) noexcept
      : Vector3(xyz, xyz, xyz) {}
    // Builds a vector from its individual components.
    __host__ __device__ Vector3(double x, double y, double z) noexcept
      : m_x(x), m_y(y), m_z(z) {}
    Vector3(const Vector3& v) noexcept = default;
    Vector3(Vector3&& v) noexcept = default;
    ~Vector3() = default;

    Vector3& operator=(const Vector3& v) = default;
    Vector3& operator=(Vector3&& v) = default;

    // True when at least one component is NaN.
    __host__ __device__ bool HasNaNs() const {
      return std::isnan(m_x) || std::isnan(m_y) || std::isnan(m_z);
    }

    // Component-wise negation.
    __host__ __device__ const Vector3 operator-() const {
      return { -m_x, -m_y, -m_z };
    }

    // Component-wise arithmetic with another vector.
    __host__ __device__ const Vector3 operator+(const Vector3& v) const {
      return { m_x + v.m_x, m_y + v.m_y, m_z + v.m_z };
    }
    __host__ __device__ const Vector3 operator-(const Vector3& v) const {
      return { m_x - v.m_x, m_y - v.m_y, m_z - v.m_z };
    }
    __host__ __device__ const Vector3 operator*(const Vector3& v) const {
      return { m_x * v.m_x, m_y * v.m_y, m_z * v.m_z };
    }
    __host__ __device__ const Vector3 operator/(const Vector3& v) const {
      return { m_x / v.m_x, m_y / v.m_y, m_z / v.m_z };
    }

    // Arithmetic with a scalar applied to every component.
    __host__ __device__ const Vector3 operator+(double a) const {
      return { m_x + a, m_y + a, m_z + a };
    }
    __host__ __device__ const Vector3 operator-(double a) const {
      return { m_x - a, m_y - a, m_z - a };
    }
    __host__ __device__ const Vector3 operator*(double a) const {
      return { m_x * a, m_y * a, m_z * a };
    }
    // Division multiplies by the reciprocal, computed once.
    __host__ __device__ const Vector3 operator/(double a) const {
      const double inv_a = 1.0 / a;
      return { m_x * inv_a, m_y * inv_a, m_z * inv_a };
    }

    // In-place component-wise arithmetic with another vector.
    __host__ __device__ Vector3& operator+=(const Vector3& v) {
      m_x += v.m_x;
      m_y += v.m_y;
      m_z += v.m_z;
      return *this;
    }
    __host__ __device__ Vector3& operator-=(const Vector3& v) {
      m_x -= v.m_x;
      m_y -= v.m_y;
      m_z -= v.m_z;
      return *this;
    }
    __host__ __device__ Vector3& operator*=(const Vector3& v) {
      m_x *= v.m_x;
      m_y *= v.m_y;
      m_z *= v.m_z;
      return *this;
    }
    __host__ __device__ Vector3& operator/=(const Vector3& v) {
      m_x /= v.m_x;
      m_y /= v.m_y;
      m_z /= v.m_z;
      return *this;
    }

    // In-place arithmetic with a scalar applied to every component.
    __host__ __device__ Vector3& operator+=(double a) {
      m_x += a;
      m_y += a;
      m_z += a;
      return *this;
    }
    __host__ __device__ Vector3& operator-=(double a) {
      m_x -= a;
      m_y -= a;
      m_z -= a;
      return *this;
    }
    __host__ __device__ Vector3& operator*=(double a) {
      m_x *= a;
      m_y *= a;
      m_z *= a;
      return *this;
    }
    // Division multiplies by the reciprocal, computed once.
    __host__ __device__ Vector3& operator/=(double a) {
      const double inv_a = 1.0 / a;
      m_x *= inv_a;
      m_y *= inv_a;
      m_z *= inv_a;
      return *this;
    }

    // Dot product with v.
    __host__ __device__ double Dot(const Vector3& v) const {
      return m_x * v.m_x + m_y * v.m_y + m_z * v.m_z;
    }
    // Cross product (*this) x v.
    __host__ __device__ const Vector3 Cross(const Vector3& v) const {
      return {
        m_y * v.m_z - m_z * v.m_y,
        m_z * v.m_x - m_x * v.m_z,
        m_x * v.m_y - m_y * v.m_x
      };
    }

    // Exact (bitwise-value) comparison of all three components.
    __host__ __device__ bool operator==(const Vector3& rhs) const {
      return m_x == rhs.m_x && m_y == rhs.m_y && m_z == rhs.m_z;
    }
    __host__ __device__ bool operator!=(const Vector3& rhs) const {
      return !(*this == rhs);
    }

    // Component access by index (0 -> m_x, 1 -> m_y, 2 -> m_z). No bounds
    // check is performed. The access indexes from the address of m_x, so it
    // relies on the three members being stored back to back.
    __host__ __device__ double& operator[](size_t i) {
      return (&m_x)[i];
    }
    __host__ __device__ double operator[](size_t i) const {
      return (&m_x)[i];
    }

    // Index of a smallest / largest component. m_x is chosen only when it is
    // strictly smaller / larger than both other components.
    __host__ __device__ size_t MinDimension() const {
      return (m_x < m_y && m_x < m_z) ? 0u : ((m_y < m_z) ? 1u : 2u);
    }
    __host__ __device__ size_t MaxDimension() const {
      return (m_x > m_y && m_x > m_z) ? 0u : ((m_y > m_z) ? 1u : 2u);
    }
    // Smallest / largest component value.
    __host__ __device__ double Min() const {
      return fmin(m_x, fmin(m_y, m_z));
    }
    __host__ __device__ double Max() const {
      return fmax(m_x, fmax(m_y, m_z));
    }

    // Squared Euclidean length.
    __host__ __device__ double Norm2_squared() const {
      return m_x * m_x + m_y * m_y + m_z * m_z;
    }

    // Euclidean length.
    __host__ __device__ double Norm2() const {
      return std::sqrt(Norm2_squared());
    }

    // Scales this vector in place by 1 / Norm2(). A zero-length vector is
    // not special-cased.
    __host__ __device__ void Normalize() {
      const double a = 1.0 / Norm2();
      m_x *= a;
      m_y *= a;
      m_z *= a;
    }

    double m_x, m_y, m_z;
  };

  // Scalar-on-the-left addition: adds a to every component of v.
  __device__ inline const Vector3 operator+(double a, const Vector3& v) {
    const double x = a + v.m_x;
    const double y = a + v.m_y;
    const double z = a + v.m_z;
    return Vector3(x, y, z);
  }

  // Scalar-on-the-left subtraction: each result component is a minus the
  // matching component of v.
  __device__ inline const Vector3 operator-(double a, const Vector3& v) {
    const double x = a - v.m_x;
    const double y = a - v.m_y;
    const double z = a - v.m_z;
    return Vector3(x, y, z);
  }

  // Scalar-on-the-left multiplication: scales every component of v by a.
  __device__ inline const Vector3 operator*(double a, const Vector3& v) {
    const double x = a * v.m_x;
    const double y = a * v.m_y;
    const double z = a * v.m_z;
    return Vector3(x, y, z);
  }

  // Scalar-on-the-left division: each result component is a divided by the
  // matching component of v.
  __device__ inline const Vector3 operator/(double a, const Vector3& v) {
    const double x = a / v.m_x;
    const double y = a / v.m_y;
    const double z = a / v.m_z;
    return Vector3(x, y, z);
  }

  // Component-wise square root of v.
  __device__ inline const Vector3 Sqrt(const Vector3& v) {
    const double x = std::sqrt(v.m_x);
    const double y = std::sqrt(v.m_y);
    const double z = std::sqrt(v.m_z);
    return Vector3(x, y, z);
  }

  // Raises every component of v to the power a.
  __device__ inline const Vector3 Pow(const Vector3& v, double a) {
    const double x = std::pow(v.m_x, a);
    const double y = std::pow(v.m_y, a);
    const double z = std::pow(v.m_z, a);
    return Vector3(x, y, z);
  }

  // Component-wise absolute value of v.
  __device__ inline const Vector3 Abs(const Vector3& v) {
    const double x = std::abs(v.m_x);
    const double y = std::abs(v.m_y);
    const double z = std::abs(v.m_z);
    return Vector3(x, y, z);
  }

  // Component-wise minimum of v1 and v2.
  __device__ inline const Vector3 Min(const Vector3& v1, const Vector3& v2) {
    const double x = fmin(v1.m_x, v2.m_x);
    const double y = fmin(v1.m_y, v2.m_y);
    const double z = fmin(v1.m_z, v2.m_z);
    return Vector3(x, y, z);
  }

  // Component-wise maximum of v1 and v2.
  __device__ inline const Vector3 Max(const Vector3& v1, const Vector3& v2) {
    const double x = fmax(v1.m_x, v2.m_x);
    const double y = fmax(v1.m_y, v2.m_y);
    const double z = fmax(v1.m_z, v2.m_z);
    return Vector3(x, y, z);
  }

  // Applies std::round to every component of v.
  __device__ inline const Vector3 Round(const Vector3& v) {
    const double x = std::round(v.m_x);
    const double y = std::round(v.m_y);
    const double z = std::round(v.m_z);
    return Vector3(x, y, z);
  }

  // Applies std::floor to every component of v.
  __device__ inline const Vector3 Floor(const Vector3& v) {
    const double x = std::floor(v.m_x);
    const double y = std::floor(v.m_y);
    const double z = std::floor(v.m_z);
    return Vector3(x, y, z);
  }

  // Applies std::ceil to every component of v.
  __device__ inline const Vector3 Ceil(const Vector3& v) {
    const double x = std::ceil(v.m_x);
    const double y = std::ceil(v.m_y);
    const double z = std::ceil(v.m_z);
    return Vector3(x, y, z);
  }

  // Applies std::trunc to every component of v.
  __device__ inline const Vector3 Trunc(const Vector3& v) {
    const double x = std::trunc(v.m_x);
    const double y = std::trunc(v.m_y);
    const double z = std::trunc(v.m_z);
    return Vector3(x, y, z);
  }

  // Clamps every component of v to [low, high] (default [0, 1]) using the
  // scalar Clamp overload.
  __device__ inline const Vector3 Clamp(const Vector3& v, 
                      double low = 0.0, double high = 1.0) {
    const double x = Clamp(v.m_x, low, high);
    const double y = Clamp(v.m_y, low, high);
    const double z = Clamp(v.m_z, low, high);
    return Vector3(x, y, z);
  }

  // Linear interpolation v1 + a * (v2 - v1); a is not restricted to [0, 1].
  __device__ inline const Vector3 Lerp(double a, 
                     const Vector3& v1, const Vector3& v2) {
    const Vector3 delta = v2 - v1;
    return v1 + a * delta;
  }

  // Returns a vector built from components X, Y and Z of v, in that order
  // (indices as accepted by Vector3::operator[]).
  template<size_t X, size_t Y, size_t Z>
  __device__ inline const Vector3 Permute(const Vector3& v) {
    const double first = v[X];
    const double second = v[Y];
    const double third = v[Z];
    return Vector3(first, second, third);
  }

  // Returns v scaled by the reciprocal of its Euclidean length; v itself is
  // left untouched. A zero-length input is not special-cased.
  __device__ inline const Vector3 Normalize(const Vector3& v) {
    const double inv_length = 1.0 / v.Norm2();
    return v * inv_length;
  }
//}

In [None]:
// Shim that provides fopen_s(pFile, filename, mode) in terms of fopen when
// __unix is defined. It stores the stream in *pFile and evaluates to true
// (non-zero) when the open failed, false (zero) otherwise.
// The whole expansion is parenthesized so the macro can be combined with
// other operators without the surrounding expression re-associating.
#ifdef __unix
#define fopen_s(pFile,filename,mode) ((((*(pFile))=fopen((filename),(mode))))==NULL)
#endif

//namespace smallpt {

  // Writes an image as a plain-text PPM ("P3") file.
  //
  //   w, h   image width and height in pixels
  //   Ls     w * h pixel values, read in index order; each component is
  //          converted with ToByte
  //   fname  output path (default "cu-image.ppm")
  //
  // When the file cannot be opened, a message is printed to stderr and
  // nothing is written.
  inline void WritePPM(uint32_t w, uint32_t h, 
             const Vector3* Ls, 
             const char* fname = "cu-image.ppm") noexcept {

    FILE* fp = nullptr;
    fopen_s(&fp, fname, "w");
    if (fp == nullptr) {
      // Without this check the fprintf calls below would dereference NULL.
      fprintf(stderr, "WritePPM: cannot open '%s' for writing\n", fname);
      return;
    }

    fprintf(fp, "P3\n%u %u\n%u\n", w, h, 255u);

    // Widen before multiplying so the pixel count cannot wrap in 32 bits.
    const size_t nb_pixels = static_cast<size_t>(w) * h;
    for (size_t i = 0; i < nb_pixels; ++i) {
      // Explicit conversion so the arguments match the %u conversions.
      fprintf(fp, "%u %u %u ", 
             static_cast<unsigned>(ToByte(Ls[i].m_x)), 
             static_cast<unsigned>(ToByte(Ls[i].m_y)), 
             static_cast<unsigned>(ToByte(Ls[i].m_z)));
    }

    fclose(fp);
  }
//}

In [None]:
// Refractive indices handed to IdealSpecularTransmit as n_out and n_in
// when Radiance handles a Refractive sphere.
#define REFRACTIVE_INDEX_OUT 1.0
#define REFRACTIVE_INDEX_IN  1.5
// Lower parameter bound (tmin) given to the rays created in Radiance and
// kernel; intersections at or below this distance are rejected by
// Sphere::Intersect.
#define EPSILON_SPHERE 1e-4

//namespace smallpt {

  // A ray o + t * d together with the parameter interval (tmin, tmax) in
  // which intersections are accepted and a depth counter.
  // m_tmin and m_tmax are mutable: Sphere::Intersect shrinks m_tmax through
  // a const reference when it finds a hit.
  struct __align__(16) Ray {
    // o: origin, d: direction, [tmin, tmax]: accepted parameter range
    // (defaults 0 and +infinity), depth: depth counter (default 0).
    __device__ explicit Ray(Vector3 o, 
                Vector3 d, 
                double tmin = 0.0, 
                double tmax = std::numeric_limits< double >::infinity(), 
                std::uint32_t depth = 0u) noexcept
      : m_o(o)
      , m_d(d)
      , m_tmin(tmin)
      , m_tmax(tmax)
      , m_depth(depth) {}
    Ray(const Ray& ray) noexcept = default;
    Ray(Ray&& ray) noexcept = default;
    ~Ray() = default;

    Ray& operator=(const Ray& ray) = default;
    Ray& operator=(Ray&& ray) = default;

    // Point on the ray at parameter t.
    __device__ const Vector3 operator()(double t) const {
      const Vector3 along = m_d * t;
      return m_o + along;
    }

    Vector3 m_o, m_d;               // origin, direction
    mutable double m_tmin, m_tmax;  // accepted parameter interval
    std::uint32_t m_depth;          // depth counter
  };

  // Surface type of a sphere; Radiance switches on it to pick the next
  // path direction.
  enum class Reflection_t {
    Diffuse,     // cosine-weighted hemisphere sample (the default branch)
    Specular,    // mirror direction from IdealSpecularReflect
    Refractive   // direction chosen by IdealSpecularTransmit
  };

  // A sphere with its material: radius, center position, emission,
  // reflection factor and surface type.
  struct __align__(16) Sphere {
    // r: radius, p: center, e: emission, f: reflection factor,
    // reflection_t: surface type.
    __host__ __device__ explicit Sphere(double r,
                      Vector3 p,
                      Vector3 e,
                      Vector3 f,
                      Reflection_t reflection_t) noexcept
      : m_r(r)
      , m_p(p)
      , m_e(e)
      , m_f(f)
      , m_reflection_t(reflection_t) {}
    Sphere(const Sphere& sphere) noexcept = default;
    Sphere(Sphere&& sphere) noexcept = default;
    ~Sphere() = default;

    Sphere& operator=(const Sphere& sphere) = default;
    Sphere& operator=(Sphere&& sphere) = default;

    // Tests the ray against this sphere. On a hit whose parameter lies
    // strictly inside (ray.m_tmin, ray.m_tmax), stores that parameter in
    // ray.m_tmax and returns true; otherwise returns false and leaves the
    // ray unchanged. The nearer root is tried before the farther one.
    __device__ bool Intersect(const Ray& ray) const {
      const Vector3 to_center = m_p - ray.m_o;
      const double proj = ray.m_d.Dot(to_center);
      const double discriminant =
        proj * proj - to_center.Dot(to_center) + m_r * m_r;

      // Negative discriminant: no real root, the ray misses the sphere.
      if (discriminant < 0.0) {
        return false;
      }

      const double root = sqrt(discriminant);

      const double t_near = proj - root;
      const bool near_in_range = (t_near > ray.m_tmin) && (t_near < ray.m_tmax);
      if (near_in_range) {
        ray.m_tmax = t_near;
        return true;
      }

      const double t_far = proj + root;
      const bool far_in_range = (t_far > ray.m_tmin) && (t_far < ray.m_tmax);
      if (far_in_range) {
        ray.m_tmax = t_far;
        return true;
      }

      return false;
    }

    double m_r;                   // radius
    Vector3 m_p;                  // position
    Vector3 m_e;                  // emission
    Vector3 m_f;                  // reflection
    Reflection_t m_reflection_t;  // surface type
  };

    
  // Host-side scene description: nine spheres, one per row. The columns are
  // radius, center position, emission, reflection factor and surface type;
  // the trailing comment names each sphere. Render copies this array to
  // device memory before launching the kernel.
  const Sphere g_spheres[] = {
    Sphere(1e5,  Vector3(1e5 + 1, 40.8, 81.6),   Vector3(),   Vector3(0.75,0.25,0.25), Reflection_t::Diffuse),    //Left
    Sphere(1e5,  Vector3(-1e5 + 99, 40.8, 81.6), Vector3(),   Vector3(0.25,0.25,0.75), Reflection_t::Diffuse),    //Right
    Sphere(1e5,  Vector3(50, 40.8, 1e5),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Back
    Sphere(1e5,  Vector3(50, 40.8, -1e5 + 170),  Vector3(),   Vector3(),               Reflection_t::Diffuse),    //Front
    Sphere(1e5,  Vector3(50, 1e5, 81.6),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Bottom
    Sphere(1e5,  Vector3(50, -1e5 + 81.6, 81.6), Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Top
    Sphere(16.5, Vector3(27, 16.5, 47),          Vector3(),   Vector3(0.999),          Reflection_t::Specular),   //Mirror
    Sphere(16.5, Vector3(73, 16.5, 78),          Vector3(),   Vector3(0.999),          Reflection_t::Refractive), //Glass
    Sphere(600,  Vector3(50, 681.6 - .27, 81.6), Vector3(12), Vector3(),               Reflection_t::Diffuse)     //Light
  };


  // Maps two numbers (u1, u2) to a direction whose z component is u1
  // (u1 is used directly as cos(theta)) and whose azimuth is 2 * pi * u2.
  __device__ inline Vector3 UniformSampleOnHemisphere(double u1,  double u2) {
    const double azimuth = 2.0 * g_pi * u2;
    // fmax guards the square root against a slightly negative argument.
    const double sin_theta = std::sqrt(fmax(0.0, 1.0 - u1 * u1));
    const double x = std::cos(azimuth) * sin_theta;
    const double y = std::sin(azimuth) * sin_theta;
    return Vector3(x, y, u1);
  }

  // Maps two numbers (u1, u2) to a direction with sin(theta) = sqrt(u1),
  // cos(theta) = sqrt(1 - u1) (returned as z) and azimuth 2 * pi * u2.
  __device__ inline Vector3 CosineWeightedSampleOnHemisphere(double u1, double u2) {
    const double azimuth = 2.0 * g_pi * u2;
    const double sin_theta = sqrt(u1);
    const double cos_theta = sqrt(1.0 - u1);
    const double x = std::cos(azimuth) * sin_theta;
    const double y = std::sin(azimuth) * sin_theta;
    return Vector3(x, y, cos_theta);
  }
    
  // Returns ((n1 - n2) / (n1 + n2))^2, the R0 term consumed by
  // SchlickReflectance.
  __device__ inline double Reflectance0(double n1, double n2) {
    const double difference = n1 - n2;
    const double sum = n1 + n2;
    const double ratio = difference / sum;
    return ratio * ratio;
  }

  // Schlick-style reflectance: R0 + (1 - R0) * c^5 with R0 taken from
  // Reflectance0(n1, n2). The five multiplications by c are applied one
  // after another, left to right.
  __device__ inline double SchlickReflectance(double n1, 
                        double n2, 
                        double c) {
    const double base = Reflectance0(n1, n2);
    const double complement = 1.0 - base;
    return base + complement * c * c * c * c * c;
  }

  // Mirror direction of d about the normal n: d - 2 * (n . d) * n.
  __device__ inline const Vector3 IdealSpecularReflect(const Vector3& d, const Vector3& n) {
    const double twice_projection = 2.0 * n.Dot(d);
    return d - twice_projection * n;
  }

  // Chooses between the reflected and the refracted direction at a
  // refractive surface.
  //
  //   d             incoming direction
  //   n             surface normal
  //   n_out, n_in   refractive indices on the two sides of the surface
  //   pr            out: weight for the returned direction (1 for total
  //                 internal reflection, otherwise Re / p_Re or Tr / p_Tr)
  //   state         cuRAND state; one uniform number is drawn unless total
  //                 internal reflection occurs
  //
  // Returns the chosen direction.
  __device__ inline const Vector3 IdealSpecularTransmit(const Vector3& d, 
                              const Vector3& n, 
                              double n_out, 
                              double n_in, 
                              double& pr, 
                              curandState* state) {

    const Vector3 reflected = IdealSpecularReflect(d, n);

    // The ray goes from the n_out side to the n_in side when it runs
    // against the normal.
    const bool entering = (n.Dot(d) < 0.0);
    const Vector3 nl = entering ? n : -n;
    const double eta = entering ? n_out / n_in : n_in / n_out;
    const double cos_theta = d.Dot(nl);
    const double cos2_phi = 1.0 - eta * eta * (1.0 - cos_theta * cos_theta);

    // Total Internal Reflection
    if (cos2_phi < 0.0) {
      pr = 1.0;
      return reflected;
    }

    const Vector3 transmitted = Normalize(eta * d - nl * (eta * cos_theta + sqrt(cos2_phi)));
    const double c = 1.0 - (entering ? -cos_theta : transmitted.Dot(n));

    const double Re = SchlickReflectance(n_out, n_in, c);
    const double p_reflect = 0.25 + 0.5 * Re;

    // Pick reflection with probability p_reflect, refraction otherwise.
    const bool choose_reflection = curand_uniform_double(state) < p_reflect;
    if (choose_reflection) {
      pr = (Re / p_reflect);
      return reflected;
    }

    const double Tr = 1.0 - Re;
    const double p_transmit = 1.0 - p_reflect;
    pr = (Tr / p_transmit);
    return transmitted;
  }

  // Tests the ray against all nb_spheres spheres. Every sphere is visited;
  // each successful test shrinks ray.m_tmax, so the last sphere that reports
  // a hit is the closest one and its index is stored in id. Returns whether
  // any sphere was hit; id is left untouched when none was.
  __device__ inline bool Intersect(const Sphere* dev_spheres, 
                   size_t nb_spheres, 
                   const Ray& ray, 
                   size_t& id)
  {
    bool any_hit = false;
    const Sphere* const last = dev_spheres + nb_spheres;
    for (const Sphere* sphere = dev_spheres; sphere != last; ++sphere) {
      if (!sphere->Intersect(ray)) {
        continue;
      }
      any_hit = true;
      id = static_cast<size_t>(sphere - dev_spheres);
    }

    return any_hit;
  }

//__device__ static Vector3 Radiance(const Sphere* dev_spheres, 
  // Follows one light path starting with `ray` through the scene and returns
  // the accumulated radiance.
  //
  //   dev_spheres, nb_spheres   scene (device pointer and element count)
  //   ray                       first path segment
  //   state                     cuRAND state used for all random decisions
  //
  // The loop ends when a segment hits nothing or when Russian roulette
  // (applied once the depth exceeds 4) terminates the path.
  __device__ Vector3 Radiance(const Sphere* dev_spheres, 
                     size_t nb_spheres,
                     const Ray& ray, 
                     curandState* state)
  {  
    Ray r = ray;
    Vector3 L;        // radiance gathered so far
    Vector3 F(1.0);   // product of the reflection factors along the path

    while (true) {
      size_t id = 0u;
      if (!Intersect(dev_spheres, nb_spheres, r, id)) {
        return L;
      }

      const Sphere& shape = dev_spheres[id];
      // Intersect stored the hit parameter in r.m_tmax.
      const Vector3 p = r(r.m_tmax);
      const Vector3 n = Normalize(p - shape.m_p);

      L += F * shape.m_e;
      F *= shape.m_f;

      // Russian roulette
      if (4 < r.m_depth) {
        const double continue_probability = shape.m_f.Max();
        if (curand_uniform_double(state) >= continue_probability) {
          return L;
        }
        F /= continue_probability;
      }

      // Next path segment
      switch (shape.m_reflection_t) {
      
      case Reflection_t::Specular: {
        const Vector3 d = IdealSpecularReflect(r.m_d, n);
        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);
        break;
      }
      
      case Reflection_t::Refractive: {
        double pr;
        const Vector3 d = IdealSpecularTransmit(r.m_d, n, REFRACTIVE_INDEX_OUT, REFRACTIVE_INDEX_IN, pr, state);
        F *= pr;
        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);
        break;
      }
      
      default: {
        // Orthonormal frame (u, v, w) with w on the side of the surface the
        // ray arrived from.
        const Vector3 w = (0.0 > n.Dot(r.m_d)) ? n : -n;
        // fabs (not unqualified abs) so the double overload is always used.
        const Vector3 u = Normalize((fabs(w.m_x) > 0.1 ? Vector3(0.0, 1.0, 0.0) : Vector3(1.0, 0.0, 0.0)).Cross(w));
        const Vector3 v = w.Cross(u);

        // Draw the two random numbers in separate statements: as function
        // arguments their evaluation order would be unspecified.
        const double u1 = curand_uniform_double(state);
        const double u2 = curand_uniform_double(state);
        const Vector3 sample_d = CosineWeightedSampleOnHemisphere(u1, u2);
        const Vector3 d = Normalize(sample_d.m_x * u + sample_d.m_y * v + sample_d.m_z * w);
        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);
      }
      }
    }
  }

//  __global__ static void kernel(const Sphere* dev_spheres, 
  // Renders one pixel per thread.
  //
  // Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y)
  // and threads outside the w x h image return immediately.
  //
  //   dev_spheres, nb_spheres   scene in device memory
  //   w, h                      image size in pixels
  //   Ls                        w * h output pixels in device memory; results
  //                             are added to the existing contents, so the
  //                             buffer must be zeroed before the launch
  //   nb_samples                samples taken for each of the 2 x 2 subpixels
  __global__ void kernel(const Sphere* dev_spheres, 
                  size_t nb_spheres,
                  std::uint32_t w, 
                  std::uint32_t h, 
                  Vector3* Ls, 
                  std::uint32_t nb_samples)
  {
    const std::uint32_t x = threadIdx.x + blockIdx.x * blockDim.x;
    const std::uint32_t y = threadIdx.y + blockIdx.y * blockDim.y;
    const std::uint32_t offset = x + y * blockDim.x * gridDim.x;

    // Debug marker, printed by a single thread: device printf is serialized
    // and buffered, so printing from every thread floods the output.
    if (x == 0u && y == 0u) {
      printf("!!!\n");
    }

    if (x >= w || y >= h) {
      return;
    }

    // RNG: each thread seeds its own generator with its launch-wide offset.
    curandState state;
    curand_init(offset, 0u, 0u, &state);

    // Camera setup
    const Vector3 eye = { 50.0, 52.0, 295.6 };
    const Vector3 gaze = Normalize(Vector3(0.0, -0.042612, -1.0));
    const double fov = 0.5135;
    const Vector3 cx = { w * fov / h, 0.0, 0.0 };
    const Vector3 cy = Normalize(cx.Cross(gaze)) * fov;

    // Weight of a single sample; constant for the whole kernel.
    const double sample_weight = 1.0 / nb_samples;

    // Rows are stored bottom-up: image row y goes to buffer row h - 1 - y.
    const size_t i = (h - 1u - y) * w + x;

    for (size_t sy = 0u; sy < 2u; ++sy) { // 2 subpixel row

      for (size_t sx = 0u; sx < 2u; ++sx) { // 2 subpixel column

        Vector3 L;

        for (size_t s = 0u; s < nb_samples; ++s) { // samples per subpixel
          // Map two uniform numbers to offsets dx, dy in [-1, 1].
          const double u1 = 2.0 * curand_uniform_double(&state);
          const double u2 = 2.0 * curand_uniform_double(&state);
          const double dx = (u1 < 1.0) ? sqrt(u1) - 1.0 : 1.0 - sqrt(2.0 - u1);
          const double dy = (u2 < 1.0) ? sqrt(u2) - 1.0 : 1.0 - sqrt(2.0 - u2);
          const Vector3 d = cx * (((sx + 0.5 + dx) * 0.5 + x) / w - 0.5) +
                          cy * (((sy + 0.5 + dy) * 0.5 + y) / h - 0.5) + gaze;
          
          L += Radiance(dev_spheres, nb_spheres, 
            Ray(eye + d * 130, Normalize(d), EPSILON_SPHERE), &state) 
            * sample_weight;
        }
        
        // Each of the four subpixels contributes a quarter of the pixel.
        Ls[i] += 0.25 * Clamp(L);
      }
    }
  }

//  static void Render(std::uint32_t nb_samples) noexcept
  // Renders the g_spheres scene at 256 x 192 pixels on the GPU and writes
  // the result with WritePPM (default file name).
  //
  //   nb_samples   forwarded to the kernel (samples per subpixel)
  //
  // CUDA errors are reported by CUDA_SAFE_CALL. If the host image buffer
  // cannot be allocated, an error is printed, the device buffers are
  // released and no image is written.
  void Render(std::uint32_t nb_samples) noexcept
  {
    const std::uint32_t w = 256u; //1024u;
    const std::uint32_t h = 192u; //768u;
    const std::uint32_t nb_pixels = w * h;
    const size_t nb_spheres = sizeof(g_spheres) / sizeof(g_spheres[0]);
    const size_t image_bytes = nb_pixels * sizeof(Vector3);

    // Set up device memory. The pointers start out null so that a failed
    // allocation does not leave them holding an indeterminate value.
    Sphere* dev_spheres = nullptr;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_spheres, sizeof(g_spheres)));
    CUDA_SAFE_CALL(cudaMemcpy(dev_spheres, g_spheres, sizeof(g_spheres), cudaMemcpyHostToDevice));
    Vector3* dev_Ls = nullptr;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_Ls, image_bytes));
    // The kernel accumulates into the buffer, so it has to start at zero.
    CUDA_SAFE_CALL(cudaMemset(dev_Ls, 0, image_bytes));

    // Kernel execution. The grid size is rounded up so the whole image is
    // covered even when w or h is not a multiple of the block size; the
    // kernel discards threads that fall outside the image.
    const std::uint32_t block_dim = 16u;
    const dim3 nthreads(block_dim, block_dim);
    const dim3 nblocks((w + block_dim - 1u) / block_dim,
                       (h + block_dim - 1u) / block_dim);

printf("@@@render kernel@@@\n");
    kernel<<<nblocks, nthreads>>>(dev_spheres, nb_spheres, w, h, dev_Ls, nb_samples);
    CUDA_SAFE_CALL(cudaPeekAtLastError());      // launch errors
    CUDA_SAFE_CALL(cudaDeviceSynchronize());    // errors raised while running

    // Set up host memory
    Vector3* Ls = (Vector3*)malloc(image_bytes);
    if (Ls == nullptr) {
      fprintf(stderr, "Render: failed to allocate %zu bytes of host memory\n", image_bytes);
      CUDA_SAFE_CALL(cudaFree(dev_Ls));
      CUDA_SAFE_CALL(cudaFree(dev_spheres));
      return;
    }

    // Transfer device -> host
    CUDA_SAFE_CALL(cudaMemcpy(Ls, dev_Ls, image_bytes, cudaMemcpyDeviceToHost));

    // Clean up device memory
    CUDA_SAFE_CALL(cudaFree(dev_Ls));
    CUDA_SAFE_CALL(cudaFree(dev_spheres));

    WritePPM(w, h, Ls);

    // Clean up host memory
    free(Ls);
  }

//const std::uint32_t nb_samples = 100;
//smallpt::
//Render(100);
//printf("Done Render\n");

In [None]:
// Prints selected properties of CUDA device 0 to stdout. When the
// properties cannot be queried, the CUDA error string is printed to stderr
// instead of failing silently.
void devicePropertyPrint()
{
  const int dev = 0;
  cudaDeviceProp devProp;

  const cudaError_t status = cudaGetDeviceProperties(&devProp, dev);
  if (status != cudaSuccess)
  {
    fprintf(stderr, "Failed to query properties of device %i: %s\n", dev, cudaGetErrorString(status));
    return;
  }

  printf("Device %i, named: %s\n", dev, devProp.name);
  printf("Device compute capability: %i.%i\n", devProp.major, devProp.minor);
  printf("Device maxThreadDim: [%i, %i, %i]\n", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);
  printf("Device maxGridSize: [%i, %i, %i]\n", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);
  printf("Multi Processor Count: %i\n", devProp.multiProcessorCount);
  // sharedMemPerBlock is divided by 1024.0 to print it in KB.
  printf("Size of SharedMem Per-Block: %f KB\n", devProp.sharedMemPerBlock / 1024.0);
  printf("Max Threads Per-Block: %i\n", devProp.maxThreadsPerBlock);
  printf("Max Threads Per-MultiProcessor: %i\n", devProp.maxThreadsPerMultiProcessor);
  printf("\n");
}

## CUDA kernel

In [None]:
// Element-wise vector addition: C[i] = A[i] + B[i] for i < numElements.
//
// Launch layout: 1D grid of 1D blocks, one element per thread; threads with
// an index at or beyond numElements do nothing, so the grid may be larger
// than the data. A, B and C are device pointers to at least numElements
// floats each.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{  
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  // Debug marker, printed by a single thread: device printf is serialized
  // and buffered, so printing from every thread floods the output.
  if (i == 0) {
    printf("###\n");
  }

  if (i < numElements) {
    C[i] = A[i] + B[i] + 0.0f;
  }
}

## Host code

In [None]:
// Vector-addition demo: fills two host vectors with random values, adds
// them on the GPU with vectorAdd and verifies the result on the host.
//
// Returns 0 on success and 1 when verification fails. Host allocation
// failures and CUDA errors terminate the process with EXIT_FAILURE.
// All host and device buffers are released on both return paths, so the
// function can be called repeatedly.
int main(void) {

  // Prints "<what> (error code <CUDA error string>)!" and exits when a CUDA
  // runtime call did not succeed.
  const auto check = [](cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
      printf("%s (error code %s)!\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  // Print the vector length to be used, and compute its size
  int numElements = 50000;
  size_t size = numElements * sizeof(float);
  printf("[Vector addition of %d elements]\n", numElements);

  // Allocate the host input vectors A and B and the host output vector C
  float *h_A = (float *)malloc(size);
  float *h_B = (float *)malloc(size);
  float *h_C = (float *)malloc(size);

  // Verify that allocations succeeded
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }

  // Initialize the host input vectors
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  // Allocate the device vectors A, B (inputs) and C (output)
  float *d_A = NULL;
  check(cudaMalloc((void **)&d_A, size), "Failed to allocate device vector A");
  float *d_B = NULL;
  check(cudaMalloc((void **)&d_B, size), "Failed to allocate device vector B");
  float *d_C = NULL;
  check(cudaMalloc((void **)&d_C, size), "Failed to allocate device vector C");

  // Copy the host input vectors A and B to the device input vectors
  printf("Copy input data from the host memory to the CUDA device\n");
  check(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
        "Failed to copy vector A from host to device");
  check(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
        "Failed to copy vector B from host to device");

  // Launch the Vector Add CUDA Kernel; the block count is rounded up so
  // that every element is covered.
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         threadsPerBlock);
printf("@@@add kernel@@@\n");
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
  check(cudaGetLastError(), "Failed to launch vectorAdd kernel");

  // Copy the device result vector to the host result vector. This copy is
  // blocking, so the kernel has finished once it returns.
  printf("Copy output data from the CUDA device to the host memory\n");
  check(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
        "Failed to copy vector C from device to host");

  // Verify that the result vector is correct; stop at the first mismatch.
  bool verified = true;
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      printf("Result verification failed at element %d! A=%f B=%f C=%f\n", i, h_A[i], h_B[i], h_C[i]);
      verified = false;
      break;
    }
  }

  if (verified) {
    printf("Test PASSED\n");
  }

  // Free device global memory
  check(cudaFree(d_A), "Failed to free device vector A");
  check(cudaFree(d_B), "Failed to free device vector B");
  check(cudaFree(d_C), "Failed to free device vector C");

  // Free host memory
  free(h_A);
  free(h_B);
  free(h_C);

  if (!verified) {
    return 1;
  }

  printf("Done Add\n");
  return 0;
}

///
// Default-constructed vector (all components 0.0); not used by the
// statements below.
Vector3 v;
///

// Print the properties of CUDA device 0.
devicePropertyPrint();
// Run the vector-addition demo twice.
main();
main();

// Render the sphere scene with nb_samples = 100; Render writes the image
// through WritePPM.
Render(100);
printf("Done Render\n");