# SmallPT CUDA Demo

A simple cuda version of [smallpt](http://www.kevinbeason.com/smallpt/) with some memory optimization.

Adapted from https://github.com/BentleyBlanks/smallptCuda.git

## Helper math

In [1]:
#include "helper_math.h"

### Random helper

In [2]:
#include <curand_kernel.h>

//__device__ curandState randState;
//
//__device__ void initRandom(unsigned int seed)
//{
//    curand_init(seed, 0, 0, &randState);
//}
//
//// Generate a uniform distributed random number[0, 1]
//__device__ float getRandom()
//{
//    curand_uniform(&randState);
//}

### Timer helper

In [3]:
///#include <core/sTimer.h>
//#include <Common/t3Test.h>

///#ifndef T3_TIMER_H
///#define T3_TIMER_H

class sTimer
{
public:
    sTimer():startTime(0.0f){}
    double start();
    double end();
    double difference();
    double startTime, endTime;
};

///#endif

///

// http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking
///#if defined(_WIN32)
///#include <Windows.h>
///
///#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
#include <unistd.h>	/* POSIX flags */
#include <time.h>	/* clock_gettime(), time() */
#include <sys/time.h>	/* gethrtime(), gettimeofday() */
///
///#if defined(__MACH__) && defined(__APPLE__)
///#include <mach/mach.h>
///#include <mach/mach_time.h>
///#endif
///
///#else
///#error "Unable to define getRealTime( ) for an unknown OS."
///#endif

/**
 * Returns the real time, in seconds, or -1.0 if an error occurred.
 *
 * Time is measured since an arbitrary and OS-dependent start time.
 * The returned real time is only useful for computing an elapsed time
 * between two calls to this function.
 */
double t3GetRealTime( )
{
#if defined(_WIN32)
    FILETIME tm;
    ULONGLONG t;
#if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
    /* Windows 8, Windows Server 2012 and later. ---------------- */
    GetSystemTimeAsFileTime(&tm);
    //GetSystemTimePreciseAsFileTime( &tm );
#else
    /* Windows 2000 and later. ---------------------------------- */
    GetSystemTimeAsFileTime( &tm );
#endif
    t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime;
    return (double)t / 10000000.0;
    
#elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
    /* HP-UX, Solaris. ------------------------------------------ */
    return (double)gethrtime( ) / 1000000000.0;
    
#elif defined(__MACH__) && defined(__APPLE__)
    /* OSX. ----------------------------------------------------- */
    static double timeConvert = 0.0;
    if ( timeConvert == 0.0 )
    {
        mach_timebase_info_data_t timeBase;
        (void)mach_timebase_info( &timeBase );
        timeConvert = (double)timeBase.numer /
        (double)timeBase.denom / 1000000000.0;
    }
    return (double)mach_absolute_time( ) * timeConvert;
    
#elif defined(_POSIX_VERSION)
    /* POSIX. --------------------------------------------------- */
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
    {
        struct timespec ts;
#if defined(CLOCK_MONOTONIC_PRECISE)
        /* BSD. --------------------------------------------- */
        const clockid_t id = CLOCK_MONOTONIC_PRECISE;
#elif defined(CLOCK_MONOTONIC_RAW)
        /* Linux. ------------------------------------------- */
        const clockid_t id = CLOCK_MONOTONIC_RAW;
#elif defined(CLOCK_HIGHRES)
        /* Solaris. ----------------------------------------- */
        const clockid_t id = CLOCK_HIGHRES;
#elif defined(CLOCK_MONOTONIC)
        /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
        const clockid_t id = CLOCK_MONOTONIC;
#elif defined(CLOCK_REALTIME)
        /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
        const clockid_t id = CLOCK_REALTIME;
#else
        const clockid_t id = (clockid_t)-1;	/* Unknown. */
#endif /* CLOCK_* */
        if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
            return (double)ts.tv_sec +
            (double)ts.tv_nsec / 1000000000.0;
        /* Fall thru. */
    }
#endif /* _POSIX_TIMERS */
    
    /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
    struct timeval tm;
    gettimeofday( &tm, NULL );
    return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
#else
    return -1.0;		/* Failed. */
#endif
}

double sTimer::start()
{
    startTime = t3GetRealTime();
    
    return startTime;
}

double sTimer::end()
{
    endTime = t3GetRealTime();
    
    return endTime;
}

double sTimer::difference()
{
    return endTime - startTime;
}

### SV PNG

In [4]:
/*
Copyright (C) 2017 Milo Yip. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of pngout nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*! \file
    \brief      svpng() is a minimalistic C function for saving RGB/RGBA image into uncompressed PNG.
    \author     Milo Yip
    \version    0.1.1
    \copyright  MIT license
    \sa         http://github.com/miloyip/svpng
*/

///#ifndef SVPNG_INC_
///#define SVPNG_INC_

/*! \def SVPNG_LINKAGE
    \brief User customizable linkage for svpng() function.
    By default this macro is empty.
    User may define this macro as static for static linkage, 
    and/or inline in C99/C++, etc.
*/
///#ifndef SVPNG_LINKAGE
#define SVPNG_LINKAGE
///#endif

/*! \def SVPNG_OUTPUT
    \brief User customizable output stream.
    By default, it uses C file descriptor and fputc() to output bytes.
    In C++, for example, user may use std::ostream or std::vector instead.
*/
///#ifndef SVPNG_OUTPUT
#include <stdio.h>
#define SVPNG_OUTPUT FILE* fp
///#endif

/*! \def SVPNG_PUT
    \brief Write a byte
*/
///#ifndef SVPNG_PUT
#define SVPNG_PUT(u) fputc(u, fp)
///#endif


/*!
    \brief Save a RGB/RGBA image in PNG format.
    \param SVPNG_OUTPUT Output stream (by default using file descriptor).
    \param w Width of the image. (<16383)
    \param h Height of the image.
    \param img Image pixel data in 24-bit RGB or 32-bit RGBA format.
    \param alpha Whether the image contains alpha channel.
*/
SVPNG_LINKAGE void svpng(SVPNG_OUTPUT, unsigned w, unsigned h, const unsigned char* img, int alpha) {
    static const unsigned t[] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 
    /* CRC32 Table */    0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };
    unsigned a = 1, b = 0, c, p = w * (alpha ? 4 : 3) + 1, x, y, i;   /* ADLER-a, ADLER-b, CRC, pitch */
#define SVPNG_U8A(ua, l) for (i = 0; i < l; i++) SVPNG_PUT((ua)[i]);
#define SVPNG_U32(u) do { SVPNG_PUT((u) >> 24); SVPNG_PUT(((u) >> 16) & 255); SVPNG_PUT(((u) >> 8) & 255); SVPNG_PUT((u) & 255); } while(0)
#define SVPNG_U8C(u) do { SVPNG_PUT(u); c ^= (u); c = (c >> 4) ^ t[c & 15]; c = (c >> 4) ^ t[c & 15]; } while(0)
#define SVPNG_U8AC(ua, l) for (i = 0; i < l; i++) SVPNG_U8C((ua)[i])
#define SVPNG_U16LC(u) do { SVPNG_U8C((u) & 255); SVPNG_U8C(((u) >> 8) & 255); } while(0)
#define SVPNG_U32C(u) do { SVPNG_U8C((u) >> 24); SVPNG_U8C(((u) >> 16) & 255); SVPNG_U8C(((u) >> 8) & 255); SVPNG_U8C((u) & 255); } while(0)
#define SVPNG_U8ADLER(u) do { SVPNG_U8C(u); a = (a + (u)) % 65521; b = (b + a) % 65521; } while(0)
#define SVPNG_BEGIN(s, l) do { SVPNG_U32(l); c = ~0U; SVPNG_U8AC(s, 4); } while(0)
#define SVPNG_END() SVPNG_U32(~c)
    SVPNG_U8A("\x89PNG\r\n\32\n", 8);           /* Magic */
    SVPNG_BEGIN("IHDR", 13);                    /* IHDR chunk { */
    SVPNG_U32C(w); SVPNG_U32C(h);               /*   Width & Height (8 bytes) */
    SVPNG_U8C(8); SVPNG_U8C(alpha ? 6 : 2);     /*   Depth=8, Color=True color with/without alpha (2 bytes) */
    SVPNG_U8AC("\0\0\0", 3);                    /*   Compression=Deflate, Filter=No, Interlace=No (3 bytes) */
    SVPNG_END();                                /* } */
    SVPNG_BEGIN("IDAT", 2 + h * (5 + p) + 4);   /* IDAT chunk { */
    SVPNG_U8AC("\x78\1", 2);                    /*   Deflate block begin (2 bytes) */
    for (y = 0; y < h; y++) {                   /*   Each horizontal line makes a block for simplicity */
        SVPNG_U8C(y == h - 1);                  /*   1 for the last block, 0 for others (1 byte) */
        SVPNG_U16LC(p); SVPNG_U16LC(~p);        /*   Size of block in little endian and its 1's complement (4 bytes) */
        SVPNG_U8ADLER(0);                       /*   No filter prefix (1 byte) */
        for (x = 0; x < p - 1; x++, img++)
            SVPNG_U8ADLER(*img);                /*   Image pixel data */
    }
    SVPNG_U32C((b << 16) | a);                  /*   Deflate block end with adler (4 bytes) */
    SVPNG_END();                                /* } */
    SVPNG_BEGIN("IEND", 0); SVPNG_END();        /* IEND chunk {} */
}

///#endif /* SVPNG_INC_ */

### Image helper

In [4]:
// help functions for additional notebook functions
#include "xtl/xbase64.hpp"
#include "xeus/xjson.hpp"

// display image in the notebook
void display_image(std::vector< unsigned char> & image, bool clear_ouput){
    // memory objects for output in the web browser
    std::stringstream buffer;
    xeus::xjson mine;

    if(clear_ouput)
        xeus::get_interpreter().clear_output(true);

    buffer.str("");
    for(auto c : image){
        buffer << c;
    }

    mine["image/png"] = xtl::base64encode(buffer.str());
    xeus::get_interpreter().display_data(
        std::move(mine),
        xeus::xjson::object(),
        xeus::xjson::object());
}

# SmallPT CUDA

In [None]:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <math.h>

///#include <core/sTimer.h>
///#include <core/sRandom.h>
///#include <core/helper_math.h>
///#include <image/svpng.inc>

#define PI 3.14159265359f

#define CUDA_SAFE_CALL(call) {										 \
cudaError err = call;                                                \
if(cudaSuccess != err) {                                             \
    fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
            __FILE__, __LINE__, cudaGetErrorString( err) );          \
} }

// -----------------------------------GPU Func-----------------------------------
// From [smallpt](http://www.kevinbeason.com/smallpt/)
enum materialType
{ 
    DIFFUSE = 0, 
    MIRROR, 
    GLASS
};

struct __align__(16) Ray
{
    __device__ Ray() {}

    __device__ Ray(float3 origin, float3 direction) 
        : origin(origin), direction(direction) {}

    float3 origin;
    float3 direction;
};

struct __align__(16) sphere
{
    float radius;
    float3 center, emission, reflectance;
    materialType type;

    __device__ double intersect(const Ray &r) const
    {

        float3 op = center - r.origin;
        float t, epsilon = 0.0001f;  // epsilon required to prevent floating point precision artefacts
        float b = dot(op, r.direction);    // b in quadratic equation
        float disc = b*b - dot(op, op) + radius*radius;  // discriminant quadratic equation
        if(disc < 0) return 0;       // if disc < 0, no real solution (we're not interested in complex roots) 
        else disc = sqrtf(disc);    // if disc >= 0, check for solutions using negative and positive discriminant
        return (t = b - disc) > epsilon ? t : ((t = b + disc) > epsilon ? t : 0); // pick closest point in front of ray origin
    }
};

__constant__ sphere spheres[] = {
    {1e5f,{1e5f + 1.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{0.75f, 0.25f, 0.25f}, DIFFUSE}, //Left 
    {1e5f,{-1e5f + 99.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{.25f, .25f, .75f}, DIFFUSE}, //Rght 
    {1e5f,{50.0f, 40.8f, 1e5f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Back 
    {1e5f,{50.0f, 40.8f, -1e5f + 170.0f},{0.0f, 0.0f, 0.0f},{0.0f, 0.0f, 0.0f}, DIFFUSE}, //Frnt 
    {1e5f,{50.0f, 1e5f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Botm 
    {1e5f,{50.0f, -1e5f + 81.6f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Top 
    {16.5f,{27.0f, 16.5f, 47.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, MIRROR},//Mirr
    {16.5f,{73.0f, 16.5f, 78.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, GLASS},//Glas
    {600.0f,{50.0f, 681.6f-.27f, 81.6f},{12, 12, 12},{0.0f, 0.0f, 0.0f}, DIFFUSE}  // Light
};

__constant__ const int nsphere = sizeof(spheres) / sizeof(sphere);

__device__ float rgbToLuminance(const float3& rgb)
{
    const float YWeight[3] = {0.212671f, 0.715160f, 0.072169f};
    return YWeight[0] * rgb.x + YWeight[1] * rgb.y + YWeight[2] * rgb.z;
}

__device__ bool intersectScene(const Ray &r, float &t, int &id, sphere* pshere, int &nsp)
{
    float d, inf = t = 1e20;  // t is distance to closest intersection, initialise t to a huge number outside scene
    for(int i = nsp; i--;)
    {
        // find closest hit object and point
        if((d = pshere[i].intersect(r)) && d < t)
        {
            t = d;
            id = i;
        }
    }
        
    return t < inf; // returns true if an intersection with the scene occurred, false when no hit
}

__device__ float clamp(float x) { return x < 0 ? 0 : x>1 ? 1 : x; }

__device__ float gammaCorrection(float x)
{
    return pow(clamp(x), 1 / 2.2f);
}

__device__ float3 radiance(Ray &r, curandState* rs, sphere* pshere, int &nsp)
{
    float3 L = make_float3(0.0f, 0.0f, 0.0f); // accumulates ray colour with each iteration through bounce loop
    float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
    int depth = 0;

    // ray bounce loop
    while(1)
    {
        float t;    
        int id = 0;         

        // find closest intersection with object's index
        if(!intersectScene(r, t, id, pshere, nsp))
            break;

        const sphere &obj = pshere[id];
        float3 hitpoint = r.origin + r.direction * t; 
        float3 normal = normalize(hitpoint - obj.center);
        float3 nl = dot(normal, r.direction) < 0 ? normal : normal * -1; // front facing normal

        // prevent self-intersection
        r.origin = hitpoint + nl * 0.05f;

        //float pdf = 1.0f;

        // add emission
        L += throughput * obj.emission;

        // different material
        if(obj.type == DIFFUSE)
        {        
            // uniform sampling hemisphere
            float r1 = 2 * PI * curand_uniform(rs);
            float r2 = curand_uniform(rs);
            float r2s = sqrtf(r2);

            // compute local coordinate on the hit point
            float3 w = nl;
            float3 u = normalize(cross((fabs(w.x) > .1 ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
            float3 v = cross(w, u);

            // local to world convert
            r.direction = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));
            //pdf = 1.0f / PI;

            // importance sampling no need costheta
            //throughput *= obj.reflectance * dot(r.direction, nl);
            throughput *= obj.reflectance;
        }
        else if(obj.type == MIRROR)
        {
            r.direction = r.direction - normal * 2 * dot(normal, r.direction);
            throughput *= obj.reflectance;
            //pdf = 1.0f;
        }
        else
        {
            r.origin = hitpoint;

            // Ideal dielectric REFRACTION
            float3 reflectDir = r.direction - normal * 2 * dot(normal, r.direction);
            // Ray from outside going in?
            bool into = dot(normal, nl) > 0;
            float nc = 1, nt = 1.5, nnt = into ? nc / nt : nt / nc, ddn = dot(r.direction, nl), cos2t;
            
            // total internal reflection
            if((cos2t = 1 - nnt*nnt*(1 - ddn*ddn)) < 0)
            {
                r.direction = reflectDir;
                throughput *= obj.reflectance;
            }
            else
            {
                // refract or reflect
                float3 tdir = normalize(r.direction*nnt - normal*((into ? 1 : -1)*(ddn*nnt + sqrt(cos2t))));

                float a = nt - nc, b = nt + nc, R0 = a*a / (b*b), c = 1 - (into ? -ddn : dot(tdir, normal));

                float Re = R0 + (1 - R0)*c*c*c*c*c, Tr = 1 - Re, P = .25 + .5*Re, RP = Re / P, TP = Tr / (1 - P);
                
                if(curand_uniform(rs) < P)
                {
                    // reflect
                    r.direction = reflectDir;
                    throughput *= obj.reflectance * RP;
                }
                else
                {
                    //refract
                    r.direction = tdir;
                    throughput *= obj.reflectance * TP;
                    //throughput *= make_float3(1, 0, 0);
                }
            }
        }

        // Russian roulette Stop with at least some probability to avoid getting stuck
        if(depth++ >= 5)
        {
            float q = min(0.95f, rgbToLuminance(throughput));
            if(curand_uniform(rs) >= q)
                break;
            throughput /= q;
        }
    }

    return L;
}

__global__ void render(int spp, int width, int height, float3* output)
{
    //copy spheres to shared memory
    __shared__ int nsp;
    __shared__ sphere sspheres[nsphere];
    __shared__ Ray tRay;
    nsp = nsphere;

    sspheres[threadIdx.x % nsp] = spheres[threadIdx.x % nsp];

    __syncthreads();

    // position of current pixel
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // index of current pixel
    //int i = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    int i = (height - y - 1) * width + x;

    curandState rs;
    curand_init(i, 0, 0, &rs);

    Ray cam(make_float3(50, 52, 295.6), normalize(make_float3(0, -0.042612, -1))); // cam pos, dir 
    float3 cx = make_float3(width * 0.5135f / height, 0.0f, 0.0f);
    // .5135 is field of view angle
    float3 cy = normalize(cross(cx, cam.direction)) * 0.5135f;
    float3 color = make_float3(0.0f);

    for (int sy = 0; sy < 2; sy++)
    {
        for (int sx = 0; sx < 2; sx++)
        { 
            for(int s = 0; s < spp; s++)
            {
                float r1 = curand_uniform(&rs);
                float dx = r1 < 1 ? sqrtf(r1) - 1 : 1-sqrtf(2 - r1);
                float r2 = curand_uniform(&rs);
                float dy = r2 < 1 ? sqrtf(r2) - 1 : 1-sqrtf(2 - r2);
                //--! super sampling
                float3 d = cam.direction + cx*((((sx + dx + .5) / 2) + x) / width - .5) + 
                                           cy*((((sy + dy + .5) / 2) + y) / height - .5);

                //Ray tRay = Ray(cam.origin + d * 140, normalize(d));
                tRay.direction = normalize(d);
                tRay.origin = cam.origin + d * 140;
                color += radiance(tRay, &rs, sspheres, nsp) *(.25f / spp);
            }
        }
    }

    // output to the cache
    __shared__ float3 temp;
    temp = make_float3(clamp(color.x, 0.0f, 1.0f), clamp(color.y, 0.0f, 1.0f), clamp(color.z, 0.0f, 1.0f));
    output[i] = temp;
}

// -----------------------------------CPU Func-----------------------------------
int toInt(float x)
{
    return (int) (pow(clamp(x, 0.0f, 1.0f), 1.0f / 2.2f) * 255 + 0.5f);
}

void save(const char* fileName, int width, int height, float3* data)
{
    FILE *fp = fopen(fileName, "wb");

    // Convert from float3 array to uchar array
    unsigned char* output = new unsigned char[width * height * 3];

    for(int i = 0; i < width * height; i++)
    {
        //printf_s("%f %f %f \n", data[i].x, data[i].y, data[i].z);
        output[i * 3 + 0] = toInt(data[i].x);
        output[i * 3 + 1] = toInt(data[i].y);
        output[i * 3 + 2] = toInt(data[i].z);
    }

    svpng(fp, width, height, output, 0);
    fclose(fp);
    delete[] output;
}

void devicePropertyPrint()
{
    // Device
    int dev = 0;
    cudaDeviceProp devProp;
    if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)
    {
        std::cout << "Device " << dev << ", named: " << devProp.name << std::endl;
        std::cout << "Multi Processor Count: " << devProp.multiProcessorCount << std::endl;
        std::cout << "Size of SharedMem Per-Block: " << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
        std::cout << "Max Threads Per-Block: " << devProp.maxThreadsPerBlock << std::endl;
        std::cout << "Max Threads Per-MultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
    }
}

int main(int argc, char *argv[]) {
    
    devicePropertyPrint();

    // Image Size
    //int width = 1024, height = 768;
    int width = 256, height = 192;
    int spp = argc==2 ? atoi(argv[1])/4 : 2048/4;

    printf("\nRendering Size: [%d, %d], spp: %d\n", width, height, spp);
    printf("------------------Rendering Started------------------\n");

    sTimer t;
    
    // Memory on CPU
    float3* outputCPU = new float3[width * height];
    float3* outputGPU;
    CUDA_SAFE_CALL(cudaMalloc(&outputGPU, width * height * sizeof(float3)));

    // Ray Pool
    dim3 blockSize(32, 32, 1);
    dim3 gridSize(width / blockSize.x, height / blockSize.y, 1);

    t.start();

    // Render on GPU
    render<<<gridSize, blockSize>>>(spp, width, height, outputGPU);

    cudaDeviceSynchronize();
    t.end();

    // Copy Mem from GPU to CPU
    CUDA_SAFE_CALL(cudaMemcpy(outputCPU, outputGPU, width * height * sizeof(float3), cudaMemcpyDeviceToHost));

    // free CUDA memory
    cudaFree(outputGPU);

    printf("------------------Rendering Ended------------------\n");
    printf("Cost time: %f\n", t.difference());

    save("test.png", width, height, outputCPU);
    
    return 0;
}

int argc = 2;
char* argv[] = {"demo", "4000", ""};
int r = main(argc, argv);

Device 0, named: NVIDIA GeForce GTX 1050 Ti
Multi Processor Count: 6
Size of SharedMem Per-Block: 48 KB
Max Threads Per-Block: 1024
Max Threads Per-MultiProcessor: 2048

Rendering Size: [256, 192], spp: 1000
------------------Rendering Started------------------
------------------Rendering Ended------------------
Cost time: 0.000009
