# SmallPT CUDA Demo

A simple CUDA version of [smallpt](http://www.kevinbeason.com/smallpt/) with some memory optimizations.

Adapted from https://github.com/BentleyBlanks/smallptCuda.git

## Helpers

### Math

In [1]:
//#include "helper_math.h"
#include <math.h>
#include <cstdint>
#include "cuda_runtime.h"

// Limits f to the range [a, b]: the value is first capped at b, then raised
// to at least a. Callable from both host and device code.
inline __device__ __host__ float clamp(float f, float a, float b)
{
    const float cappedAbove = fminf(f, b);
    return fmaxf(a, cappedAbove);
}

### Image helper

In [2]:
#include <cstdio>

// fopen_s is provided by the Microsoft C runtime. On every other platform
// (not only those that define __unix) emulate it with fopen: the stream is
// stored in *pFile and the expression is non-zero when the open failed.
// The whole expansion is parenthesised so it can be used inside larger
// expressions.
#if !defined(_WIN32) && !defined(fopen_s)
#define fopen_s(pFile,filename,mode) (((*(pFile))=fopen((filename),(mode)))==NULL)
#endif

// Converts a linear colour channel to a gamma-encoded byte.
//
// Parameters:
//   color - linear channel value; values at or below zero (and NaN) map to 0,
//           values whose encoded result exceeds 255 map to 255.
//   gamma - display gamma; the channel is raised to the power 1 / gamma.
// Returns 255 * color^(1/gamma), truncated and limited to [0, 255].
inline uint8_t ToByte(float color, float gamma = 2.2f) noexcept
{
  // pow() of a negative base with a fractional exponent is NaN, which the
  // fminf/fmaxf based clamp would turn into 255 (white). Treat it as black.
  const float nonNegative = color > 0.0f ? color : 0.0f;
  const float gcolor = std::pow(nonNegative, 1.0 / gamma);
  // Convert straight to the unsigned return type: a signed 8-bit target cannot
  // represent 128..255, and an out-of-range float-to-integer cast is undefined.
  return static_cast<uint8_t>(clamp(255.0f * gcolor, 0.0f, 255.0f));
}

// Writes an image as a plain-text PPM ("P3") file with a maximum value of 255.
//
// Parameters:
//   w, h  - image width and height in pixels.
//   Ls    - w * h pixels; each channel is converted with ToByte().
//   fname - path of the file to create or overwrite.
// If the file cannot be opened, a message is printed to stderr and nothing is
// written.
inline void WritePPM(uint32_t w, uint32_t h, 
             const float3* Ls, 
             const char* fname = "smallptCuda.ppm") noexcept
{
    FILE* fp = nullptr;

    fopen_s(&fp, fname, "w");
    if (fp == nullptr) {
      // Without this check a failed open would hand a null stream to fprintf/fclose.
      std::fprintf(stderr, "WritePPM: cannot open '%s' for writing\n", fname);
      return;
    }

    std::fprintf(fp, "P3\n%u %u\n%u\n", w, h, 255u);

    // Compute the pixel count in size_t so that w * h cannot wrap at 32 bits.
    const std::size_t pixelCount = static_cast<std::size_t>(w) * h;
    for (std::size_t i = 0; i < pixelCount; ++i) {
      // Cast so that the arguments really are unsigned, as "%u" expects.
      std::fprintf(fp, "%u %u %u ", 
             static_cast<unsigned>(ToByte(Ls[i].x)), 
             static_cast<unsigned>(ToByte(Ls[i].y)), 
             static_cast<unsigned>(ToByte(Ls[i].z)));
    }

    std::fclose(fp);
}

# SmallPT CUDA

In [None]:
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <iostream>

///#include <core/sTimer.h>
///#include <core/sRandom.h>
///#include <core/helper_math.h>
///#include <image/svpng.inc>

// Single-precision value of pi; used to scale a random number to an angle in
// the diffuse bounce of radiance().
#define PI 3.14159265359f

// Evaluates a CUDA runtime call once and, when it does not return
// cudaSuccess, prints the source file, line and cudaGetErrorString() text to
// stderr. The macro only reports: execution continues after a failure.
//
// The body is wrapped in do { } while (0) so that `CUDA_SAFE_CALL(x);` is a
// single statement and stays correct inside an unbraced if/else. The local is
// given a name that cannot capture an `err` variable used in the argument.
#define CUDA_SAFE_CALL(call) do {                                             \
    const cudaError_t cudaSafeCallStatus_ = (call);                           \
    if (cudaSuccess != cudaSafeCallStatus_) {                                 \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",         \
                __FILE__, __LINE__,                                           \
                cudaGetErrorString(cudaSafeCallStatus_));                     \
    }                                                                         \
} while (0)

// -----------------------------------GPU Func-----------------------------------
// From [smallpt](http://www.kevinbeason.com/smallpt/)
// Surface types distinguished by radiance() when it picks the next bounce.
enum materialType
{ 
    DIFFUSE = 0, // next direction is drawn at random around the surface normal
    MIRROR,      // next direction is the mirror reflection of the incoming ray
    GLASS        // ray is randomly reflected or refracted (refractive indices 1 and 1.5)
};

// A ray: a start point plus a direction vector. The type is aligned to 16 bytes.
struct __align__(16) Ray
{
    float3 origin;     // start point of the ray
    float3 direction;  // direction of travel, stored exactly as supplied

    // Leaves both members uninitialised.
    __device__ Ray() {}

    // Builds a ray from a start point and a direction; nothing is normalised here.
    __device__ Ray(float3 start, float3 dir) 
        : origin(start), direction(dir) {}
};

// A sphere with its surface properties. The member order is relied on by the
// brace initialisers of the scene table.
struct __align__(16) sphere
{
    float radius;
    float3 center, emission, reflectance;
    materialType type;

    // Distance along r to the nearest intersection that lies more than a small
    // epsilon in front of the ray origin, or 0 when there is none.
    // NOTE(review): the simplified quadratic assumes r.direction has unit
    // length - confirm at the call sites.
    __device__ double intersect(const Ray &r) const
    {
        // Hits closer than this are ignored to avoid floating point artefacts.
        const float epsilon = 0.0001f;

        const float3 op = center - r.origin;
        const float b = dot(op, r.direction);                       // b of the quadratic
        const float discSq = b*b - dot(op, op) + radius*radius;     // discriminant

        // Negative discriminant: no real roots, the ray misses the sphere.
        if(discSq < 0)
            return 0;

        const float root = sqrtf(discSq);

        // Try the nearer root first, then the farther one.
        const float tNear = b - root;
        if(tNear > epsilon)
            return tNear;

        const float tFar = b + root;
        if(tFar > epsilon)
            return tFar;

        return 0;
    }
};

// Scene description, stored in constant memory.
// Each entry is {radius, center, emission, reflectance, type}, in the member
// order of `sphere`. Six entries have radius 1e5, two have radius 16.5, and
// only the last entry has a non-zero emission.
__constant__ sphere spheres[] = {
    {1e5f,{1e5f + 1.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{0.75f, 0.25f, 0.25f}, DIFFUSE}, //Left
    {1e5f,{-1e5f + 99.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{.25f, .25f, .75f}, DIFFUSE}, //Right
    {1e5f,{50.0f, 40.8f, 1e5f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Back
    {1e5f,{50.0f, 40.8f, -1e5f + 170.0f},{0.0f, 0.0f, 0.0f},{0.0f, 0.0f, 0.0f}, DIFFUSE}, //Front (zero reflectance)
    {1e5f,{50.0f, 1e5f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Bottom
    {1e5f,{50.0f, -1e5f + 81.6f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Top
    {16.5f,{27.0f, 16.5f, 47.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, MIRROR},//Mirror sphere
    {16.5f,{73.0f, 16.5f, 78.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, GLASS},//Glass sphere
    {600.0f,{50.0f, 681.6f-.27f, 81.6f},{12, 12, 12},{0.0f, 0.0f, 0.0f}, DIFFUSE}  // Light (the only emitter)
};

// Number of entries in `spheres`, derived from the size of the array.
__constant__ const int nsphere = sizeof(spheres) / sizeof(sphere);

// Weighted sum of the three channels of rgb, used as a scalar brightness.
__device__ float rgbToLuminance(const float3& rgb)
{
    // Per-channel weights for x (red), y (green) and z (blue).
    const float weightR = 0.212671f;
    const float weightG = 0.715160f;
    const float weightB = 0.072169f;

    return weightR * rgb.x + weightG * rgb.y + weightB * rgb.z;
}

// Finds the sphere hit first by ray r.
//
// Parameters:
//   r      - ray to test.
//   t      - out: distance to the closest hit, or 1e20 when nothing is hit.
//   id     - out: index of the closest sphere; left untouched when nothing is hit.
//   pshere - array of spheres to test.
//   nsp    - number of entries in pshere.
// Returns true when at least one sphere is hit.
__device__ bool intersectScene(const Ray &r, float &t, int &id, sphere* pshere, int &nsp)
{
    // Start from a distance far beyond anything in the scene.
    t = 1e20;
    const float inf = t;

    // Walk the array from the last sphere down to the first.
    int i = nsp;
    while(i--)
    {
        const float d = pshere[i].intersect(r);

        // intersect() reports a miss as 0; keep only strictly closer hits.
        if(d != 0 && d < t)
        {
            t = d;
            id = i;
        }
    }

    // t is still at its starting value when no sphere was hit.
    return t < inf;
}

// Limits x to the range [0, 1].
__device__ float clamp(float x)
{
    if(x < 0)
        return 0;
    if(x > 1)
        return 1;
    return x;
}

// Limits x to [0, 1] and raises it to the power 1 / 2.2.
__device__ float gammaCorrection(float x)
{
    const float exponent = 1 / 2.2f;
    const float limited = clamp(x);
    return pow(limited, exponent);
}

// Estimates the light arriving along ray r by following one random path
// through the scene.
//
// Parameters:
//   r      - ray to trace; its origin and direction are overwritten at every bounce.
//   rs     - random number state of the calling thread; advanced by every draw.
//   pshere - array of scene spheres.
//   nsp    - number of entries in pshere.
// Returns the sum of the emission of every surface hit, each term weighted by
// the product of the reflectances (and sampling weights) met before it.
__device__ float3 radiance(Ray &r, curandState* rs, sphere* pshere, int &nsp)
{
    float3 L = make_float3(0.0f, 0.0f, 0.0f); // light gathered so far
    float3 throughput = make_float3(1.0f, 1.0f, 1.0f); // weight of the next contribution
    int depth = 0;

    // ray bounce loop
    while(1)
    {
        float t;
        int id = 0;

        // find the closest intersection and the index of the object hit
        if(!intersectScene(r, t, id, pshere, nsp))
            break;

        const sphere &obj = pshere[id];
        float3 hitpoint = r.origin + r.direction * t;
        float3 normal = normalize(hitpoint - obj.center);
        float3 nl = dot(normal, r.direction) < 0 ? normal : normal * -1; // normal facing the incoming ray

        // start the next ray slightly off the surface to prevent self-intersection
        r.origin = hitpoint + nl * 0.05f;

        // add emission
        L += throughput * obj.emission;

        if(obj.type == DIFFUSE)
        {
            // cosine-weighted hemisphere sampling around nl
            float r1 = 2 * PI * curand_uniform(rs);
            float r2 = curand_uniform(rs);
            float r2s = sqrtf(r2);

            // orthonormal basis (u, v, w) at the hit point with w along nl
            float3 w = nl;
            float3 u = normalize(cross((fabsf(w.x) > 0.1f ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
            float3 v = cross(w, u);

            // local to world
            r.direction = normalize(u*cosf(r1)*r2s + v*sinf(r1)*r2s + w*sqrtf(1 - r2));

            // the cosine term is already accounted for by the sampling
            // distribution, so only the reflectance is applied
            throughput *= obj.reflectance;
        }
        else if(obj.type == MIRROR)
        {
            // perfect mirror reflection
            r.direction = r.direction - normal * 2 * dot(normal, r.direction);
            throughput *= obj.reflectance;
        }
        else
        {
            // GLASS: the next ray may leave on either side of the surface, so
            // drop the one-sided offset applied above
            r.origin = hitpoint;

            float3 reflectDir = r.direction - normal * 2 * dot(normal, r.direction);
            // true when the ray goes from outside the sphere to inside
            bool into = dot(normal, nl) > 0;
            float nc = 1.0f, nt = 1.5f, nnt = into ? nc / nt : nt / nc, ddn = dot(r.direction, nl), cos2t;

            // total internal reflection
            if((cos2t = 1 - nnt*nnt*(1 - ddn*ddn)) < 0)
            {
                r.direction = reflectDir;
                throughput *= obj.reflectance;
            }
            else
            {
                // refracted direction
                float3 tdir = normalize(r.direction*nnt - normal*((into ? 1 : -1)*(ddn*nnt + sqrtf(cos2t))));

                // Schlick approximation of the reflected fraction Re
                float a = nt - nc, b = nt + nc, R0 = a*a / (b*b), c = 1 - (into ? -ddn : dot(tdir, normal));

                // P is the probability of choosing reflection; RP and TP are
                // the weights that compensate for that choice
                float Re = R0 + (1 - R0)*c*c*c*c*c, Tr = 1 - Re, P = 0.25f + 0.5f*Re, RP = Re / P, TP = Tr / (1 - P);

                if(curand_uniform(rs) < P)
                {
                    // reflect
                    r.direction = reflectDir;
                    throughput *= obj.reflectance * RP;
                }
                else
                {
                    // refract
                    r.direction = tdir;
                    throughput *= obj.reflectance * TP;
                }
            }
        }

        // A path whose weight has dropped to zero in every channel (for
        // example after a surface with zero reflectance) cannot contribute
        // any more light, so stop instead of tracing further bounces.
        if(throughput.x <= 0.0f && throughput.y <= 0.0f && throughput.z <= 0.0f)
            break;

        // Russian roulette: after a few bounces, stop with probability 1 - q
        // and scale the survivors by 1 / q
        if(depth++ >= 5)
        {
            float q = fminf(0.95f, rgbToLuminance(throughput));
            if(curand_uniform(rs) >= q)
                break;
            throughput /= q;
        }
    }

    return L;
}

// Debug kernel: every launched thread prints "!1!". Neither parameter is used.
__global__ void render1(int width, int height)
{
    printf("!1!\n");
}

// Element-wise sum C[i] = A[i] + B[i] for i < numElements.
// The element index is taken from the x components of the block and thread
// indices only. Every launched thread also prints "!!!".
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
  printf("!!!\n");

  const int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads past the end of the arrays have nothing to do.
  if (idx >= numElements) {
    return;
  }

  C[idx] = A[idx] + B[idx] + 0.0f;
}

// Path-traces the image, one thread per pixel.
//
// Launch layout: a 2D grid of 2D blocks; the thread with global index (x, y)
// renders pixel (x, y). Threads outside width x height exit without writing.
// Uses nsphere * sizeof(sphere) bytes of static shared memory for a per-block
// copy of the scene.
//
// Parameters:
//   spp    - samples taken at each of the 2 x 2 sub-pixel positions.
//   width  - image width in pixels.
//   height - image height in pixels.
//   output - device buffer of width * height pixels. Pixel (x, y) is stored at
//            index (height - y - 1) * width + x, each channel limited to [0, 1].
__global__ void render(int spp, int width, int height, float3* output)
{
    // Debug marker, printed once per launch. Device printf is serialised, so
    // printing from every thread would dominate the run time.
    if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0)
        printf("!!!\n");

    // Copy the spheres to shared memory. The copy is strided over all threads
    // of the block so it is complete for any block shape.
    __shared__ sphere sspheres[nsphere];
    int nsp = nsphere;

    const int threadInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    const int threadsPerBlock = blockDim.x * blockDim.y;
    for(int k = threadInBlock; k < nsphere; k += threadsPerBlock)
        sspheres[k] = spheres[k];

    // Every thread must reach the barrier, so the bounds check comes after it.
    __syncthreads();

    // position of current pixel
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x >= width || y >= height)
        return;

    // index of current pixel (rows are stored in reverse y order)
    const int i = (height - y - 1) * width + x;

    // Per-pixel random number state, seeded with the pixel index.
    curandState rs;
    curand_init(i, 0, 0, &rs);

    Ray cam(make_float3(50.0f, 52.0f, 295.6f), normalize(make_float3(0.0f, -0.042612f, -1.0f))); // cam pos, dir
    // .5135 is the field of view factor
    float3 cx = make_float3(width * 0.5135f / height, 0.0f, 0.0f);
    float3 cy = normalize(cross(cx, cam.direction)) * 0.5135f;
    float3 color = make_float3(0.0f, 0.0f, 0.0f);

    for (int sy = 0; sy < 2; sy++)
    {
        for (int sx = 0; sx < 2; sx++)
        {
            for(int s = 0; s < spp; s++)
            {
                // Tent filter: r in (0, 2] is mapped to an offset in (-1, 1].
                // The random number has to span (0, 2], otherwise only the
                // negative half of the filter is ever used.
                float r1 = 2.0f * curand_uniform(&rs);
                float dx = r1 < 1.0f ? sqrtf(r1) - 1.0f : 1.0f - sqrtf(2.0f - r1);
                float r2 = 2.0f * curand_uniform(&rs);
                float dy = r2 < 1.0f ? sqrtf(r2) - 1.0f : 1.0f - sqrtf(2.0f - r2);

                // direction through the jittered sub-pixel position
                float3 d = cam.direction + cx*((((sx + dx + 0.5f) / 2.0f) + x) / width - 0.5f) +
                                           cy*((((sy + dy + 0.5f) / 2.0f) + y) / height - 0.5f);

                // The ray is private to this thread: radiance() modifies it
                // at every bounce, so it must not be shared by the block.
                Ray tRay(cam.origin + d * 140.0f, normalize(d));
                color += radiance(tRay, &rs, sspheres, nsp) * (0.25f / spp);
            }
        }
    }

    // Write this thread's own pixel directly to the output buffer.
    output[i] = make_float3(clamp(color.x, 0.0f, 1.0f),
                            clamp(color.y, 0.0f, 1.0f),
                            clamp(color.z, 0.0f, 1.0f));
}

// -----------------------------------CPU Func-----------------------------------

// Prints selected properties of CUDA device 0 to stdout. If the properties
// cannot be queried, the CUDA error text is printed to stderr instead.
void devicePropertyPrint()
{
  const int dev = 0;
  cudaDeviceProp devProp;

  const cudaError_t status = cudaGetDeviceProperties(&devProp, dev);
  if(status != cudaSuccess)
  {
    // Report the failure rather than silently printing nothing.
    fprintf(stderr, "Failed to query properties of device %i: %s\n", dev, cudaGetErrorString(status));
    return;
  }

  printf("Device %i, named: %s\n", dev, devProp.name);
  printf("Device compute capability: %i.%i\n", devProp.major, devProp.minor);
  printf("Device maxThreadDim: [%i, %i, %i]\n", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);
  printf("Device maxGridSize: [%i, %i, %i]\n", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);
  printf("Multi Processor Count: %i\n", devProp.multiProcessorCount);
  printf("Size of SharedMem Per-Block: %f KB\n", devProp.sharedMemPerBlock / 1024.0);
  printf("Max Threads Per-Block: %i\n", devProp.maxThreadsPerBlock);
  printf("Max Threads Per-MultiProcessor: %i\n", devProp.maxThreadsPerMultiProcessor);
  printf("\n");
}

// Renders the scene on the GPU and writes the result with WritePPM().
//
// Parameters:
//   nb_samples - total samples per pixel. The kernel samples four sub-pixel
//                positions, so each receives nb_samples / 4 (at least 1).
// On any CUDA failure the error text is printed to stderr and no image is
// written. All host and device memory is released before returning.
void Render(const std::uint32_t nb_samples) {

    // Image Size
    //int width = 1024, height = 768;
    constexpr int width = 256, height = 192;

    // 16 x 16 = 256 threads per block. A 1024-thread block leaves very few
    // registers per thread and can make a register-heavy kernel fail to launch.
    constexpr int blockEdge = 16;
    static_assert(width % blockEdge == 0 && height % blockEdge == 0,
                  "image size must be a multiple of the block edge so the grid covers every pixel");

    // Never pass 0 samples: the image would stay black.
    const int spp = nb_samples < 4 ? 1 : static_cast<int>(nb_samples / 4);

    printf("\nRendering Size: [%d, %d], spp: %d\n", width, height, spp);
    printf("------------------Rendering Started------------------\n");

    const size_t pixelCount = static_cast<size_t>(width) * height;
    const size_t outputBytes = pixelCount * sizeof(float3);

    // Host and device copies of the image
    float3* outputCPU = new float3[pixelCount];
    float3* outputGPU = nullptr;

    // Each step runs only if every earlier step succeeded, so the first
    // error is the one reported.
    cudaError_t err = cudaMalloc((void **)&outputGPU, outputBytes);

    if (err == cudaSuccess) {
        const dim3 blockSize(blockEdge, blockEdge, 1);
        const dim3 gridSize(width / blockEdge, height / blockEdge, 1);

        // Render on GPU
        render<<<gridSize, blockSize>>>(spp, width, height, outputGPU);

        // Launch configuration errors are only visible through the last-error state.
        err = cudaGetLastError();
    }

    // Errors raised while the kernel runs surface at the next synchronisation.
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }

    // Copy Mem from GPU to CPU
    if (err == cudaSuccess) {
        err = cudaMemcpy(outputCPU, outputGPU, outputBytes, cudaMemcpyDeviceToHost);
    }

    // free CUDA memory (a null pointer is accepted)
    cudaFree(outputGPU);

    printf("------------------Rendering Ended------------------\n");

    if (err == cudaSuccess) {
        WritePPM(width, height, outputCPU);
    } else {
        fprintf(stderr, "Rendering failed: %s\n", cudaGetErrorString(err));
    }

    delete[] outputCPU;
}

// Total number of samples per pixel passed to Render().
const std::uint32_t nb_samples = 40;
// Print the properties of device 0, then run Render(), which writes the output image.
devicePropertyPrint();
Render(nb_samples);