Megacommit with various improvements: Coarse-grained refactoring, massive simplification of foveated sample generation, 3D culling planes for samples, full 3D samples throughout the pipeline, improved clustering for foveated samples, fast bicubic interpolation.
facebookresearch · Oct 8, 2018 · 3f5c7b2 · 3f5c7b2
1 parent 41d8d03
commit 3f5c7b2
Show file tree

Hide file tree

Showing 62 changed files with 1,789 additions and 1,821 deletions.
diff --git a/libraries/hvvr/cuda.props b/libraries/hvvr/cuda.props
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ImportGroup Label="PropertySheets" />
   <PropertyGroup>
@@ -12,9 +12,9 @@
   </ItemDefinitionGroup>
   <ItemGroup />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.props" />
+    <Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 9.1.props" />
   </ImportGroup>
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 8.0.targets" />
+    <Import Project="$(VCTargetsPath14)\BuildCustomizations\CUDA 9.1.targets" />
   </ImportGroup>
 </Project>
diff --git a/libraries/hvvr/raycaster/camera.cpp b/libraries/hvvr/raycaster/camera.cpp
@@ -22,17 +22,16 @@ namespace hvvr {
 SampleData::SampleData(const Sample* rawSamples,
                        uint32_t rawSampleCount,
                        uint32_t splitColorSamples,
-                       const matrix3x3& sampleToCamera,
-                       ThinLens lens,
+                       Sample2Dto3DMappingSettings settings2DTo3D,
                        uint32_t rtWidth,
                        uint32_t rtHeight)
-    : splitColorSamples(splitColorSamples), lens(lens) {
+    : splitColorSamples(splitColorSamples) {
     DynamicArray<SortedSample> sortedSamples(rawSampleCount);
     for (size_t n = 0; n < rawSampleCount; n++) {
         sortedSamples[n] = SortedSample(rawSamples[n], n % splitColorSamples);
     }
     uint32_t blockCount = uint32_t((sortedSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE);
-    // TODO(whunt): allow different clustering methods
+    // TODO: allow different clustering methods
     naiveXYCluster(ArrayView<SortedSample>(sortedSamples), blockCount);
 
     sampleBounds = {vector2(1.0f, 1.0f), vector2(0.0f, 0.0f)};
@@ -48,15 +47,13 @@ SampleData::SampleData(const Sample* rawSamples,
     cullRect.upper.x = INFINITY;
     cullRect.upper.y = INFINITY;
     validSampleCount = uint32_t(rawSampleCount);
-    samples.blockFrusta2D = DynamicArray<RayPacketFrustum2D>(blockCount);
-    samples.tileFrusta2D = DynamicArray<RayPacketFrustum2D>(blockCount * TILES_PER_BLOCK);
     samples.blockFrusta3D = DynamicArray<RayPacketFrustum3D>(blockCount);
     samples.tileFrusta3D = DynamicArray<RayPacketFrustum3D>(blockCount * TILES_PER_BLOCK);
-    blockedSamplePositions = DynamicArray<float>(blockCount * BLOCK_SIZE * 2);
-    blockedSampleExtents = DynamicArray<Sample::Extents>(blockCount * BLOCK_SIZE);
-    samples.generate(sortedSamples, blockCount, validSampleCount, cullRect, blockedSamplePositions,
-                     blockedSampleExtents, lens, sampleToCamera);
-    sampleCount = uint32_t(blockCount * BLOCK_SIZE);
+    samples.directionalSamples = DynamicArray<DirectionalBeam>(blockCount * BLOCK_SIZE);
+
+    samples2D = SampleHierarchy2D(sortedSamples, blockCount, validSampleCount, cullRect, settings2DTo3D.thinLens,
+                                  settings2DTo3D.sampleToCamera);
+    samples.generateFrom2D(samples2D, settings2DTo3D);
 
     imageLocationToSampleIndex = DynamicArray<int32_t>(rtWidth * rtHeight * splitColorSamples);
     memset(imageLocationToSampleIndex.data(), 0xff, sizeof(int32_t) * imageLocationToSampleIndex.size()); // clear to -1
@@ -67,6 +64,10 @@ SampleData::SampleData(const Sample* rawSamples,
     }
 }
 
+void SampleData::generate3Dfrom2D(Sample2Dto3DMappingSettings settings) {
+    samples.generateFrom2D(samples2D, settings);
+}
+
 Camera::Camera(const FloatRect& viewport, float apertureRadius, GPUContext& gpuContext)
     : _gpuCamera(nullptr), _lens({apertureRadius, 1.0f}), _eyeDir(0.0f, 0.0f, -1.0f) {
     setViewport(viewport);
@@ -133,46 +134,43 @@ void Camera::setRenderTarget(const ImageResourceDescriptor& newRenderTarget) {
 }
 
 void Camera::setSamples(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples) {
-    setSampleData(SampleData(rawSamples, rawSampleCount, splitColorSamples, getSampleToCamera(), _lens,
+    setSampleData(SampleData(rawSamples, rawSampleCount, splitColorSamples, get2DSampleMappingSettings(),
                              _renderTarget.width, _renderTarget.height));
 }
 
 void Camera::setSampleData(const SampleData& sampleData) {
     _sampleData = sampleData;
 
-    uint32_t blockCount = uint32_t(_sampleData.samples.blockFrusta3D.size());
-    uint32_t tileCount = uint32_t(_sampleData.samples.tileFrusta3D.size());
+    uint32_t blockCount = uint32_t(sampleData.samples.blockFrusta3D.size());
+    uint32_t tileCount = uint32_t(sampleData.samples.tileFrusta3D.size());
 
-    if (blockCount != _blockFrustaTransformed.size()) {
-        _blockFrustaTransformed = DynamicArray<RayPacketFrustum3D>(blockCount);
+    if (blockCount != _cpuHierarchy._blockFrusta.size()) {
+        _cpuHierarchy._blockFrusta = DynamicArray<RayPacketFrustum3D>(blockCount);
     }
-    if (tileCount != _tileFrustaTransformed.size()) {
-        _tileFrustaTransformed = DynamicArray<RayPacketFrustum3D>(tileCount);
+    if (tileCount != _cpuHierarchy._tileFrusta.size()) {
+        _cpuHierarchy._tileFrusta = DynamicArray<RayPacketFrustum3D>(tileCount);
     }
-
-    _gpuCamera->updateConfig(_outputMode, sampleData.imageLocationToSampleIndex.data(),
-                             sampleData.blockedSamplePositions.data(), sampleData.blockedSampleExtents.data(), _lens,
-                             sampleData.sampleCount, _renderTarget.width, _renderTarget.height,
+    const DynamicArray<DirectionalBeam>& samples = sampleData.samples.directionalSamples;
+    _gpuCamera->updateConfig(_outputFormat, sampleData.imageLocationToSampleIndex.data(), samples.data(), _lens,
+                             uint32_t(samples.size()), _renderTarget.width, _renderTarget.height,
                              uint32_t(_renderTarget.stride), sampleData.splitColorSamples);
 }
 
 const SampleData& Camera::getSampleData() const {
     return _sampleData;
 }
 
+const uint32_t Camera::getSampleCount() const {
+    return _gpuCamera != nullptr ? _gpuCamera->validSampleCount : 0;
+}
+
+
 matrix3x3 Camera::getSampleToCamera() const {
     return matrix3x3(vector3(_viewport.upper.x - _viewport.lower.x, 0, 0),
                      vector3(0, _viewport.lower.y - _viewport.upper.y, 0),
                      vector3(_viewport.lower.x, _viewport.upper.y, -1));
 }
 
-matrix4x4 Camera::getSampleToWorld() const {
-    return matrix4x4(_cameraToWorld) * matrix4x4(getSampleToCamera());
-}
-
-matrix4x4 Camera::getWorldToSample() const {
-    return invert(getSampleToWorld());
-}
 
 void Camera::setCameraToWorld(const transform& cameraToWorld) {
     _cameraToWorld = cameraToWorld;
@@ -186,8 +184,37 @@ const vector3& Camera::getTranslation() const {
     return _cameraToWorld.translation;
 }
 
-vector3 Camera::getForward() const {
-    return vector3(-normalize(getCameraToWorld().m2));
+void Camera::setupRenderTarget(GPUContext& context) {
+    if (!getEnabled())
+        return;
+    GPUCamera* gpuCamera = _gpuCamera;
+    if (_renderTarget.isHardwareRenderTarget() && _newHardwareTarget) {
+        gpuCamera->bindTexture(context, _renderTarget);
+        _newHardwareTarget = false;
+    }
+}
+
+void Camera::extractImage() {
+    GPUCamera* gpuCamera = _gpuCamera;
+    if (_renderTarget.isHardwareRenderTarget()) {
+        gpuCamera->copyImageToBoundTexture();
+    } else {
+        gpuCamera->copyImageToCPU((uint32_t*)_renderTarget.data, _renderTarget.width, _renderTarget.height,
+                                  uint32_t(_renderTarget.stride));
+    }
+}
+
+Sample2Dto3DMappingSettings Camera::get2DSampleMappingSettings() const {
+    if (_fovXDegrees > 0.0f) {
+        return Sample2Dto3DMappingSettings::sphericalSection(getSampleToCamera(), _lens, _fovXDegrees, _fovYDegrees);
+    } else {
+        return Sample2Dto3DMappingSettings(getSampleToCamera(), _lens);
+    }
+}
+
+void Camera::setSphericalWarpSettings(float fovXDegrees, float fovYDegrees) {
+    _fovXDegrees = fovXDegrees;
+    _fovYDegrees = fovYDegrees;
 }
 
 } // namespace hvvr
diff --git a/libraries/hvvr/raycaster/camera.h b/libraries/hvvr/raycaster/camera.h
@@ -10,8 +10,9 @@
  */
 
 #include "dynamic_array.h"
-#include "graphics_types.h"
 #include "foveated.h"
+#include "gpu_samples.h"
+#include "graphics_types.h"
 #include "sample_hierarchy.h"
 #include "samples.h"
 
@@ -23,36 +24,36 @@ namespace hvvr {
 class GPUCamera;
 class GPUContext;
 
+
 // preprocessed samples, ready for rendering
 struct SampleData {
+    SampleHierarchy2D samples2D;
+    Sample2Dto3DMappingSettings settings2DTo3D;
     SampleHierarchy samples;
     uint32_t splitColorSamples = 1;
-    uint32_t sampleCount;
 
     DynamicArray<int32_t> imageLocationToSampleIndex;
-    // Flat array of sample positions (in vector2 format) without fancy swizzling for CPU vectorization
-    DynamicArray<float> blockedSamplePositions;
-    DynamicArray<Sample::Extents> blockedSampleExtents;
 
     FloatRect sampleBounds = {{0.0f, 0.0f}, {0.0f, 0.0f}};
     uint32_t validSampleCount = 0;
-    ThinLens lens = {0.0f, 5.0f};
 
     SampleData(){};
     SampleData(const Sample* rawSamples,
                uint32_t rawSampleCount,
                uint32_t splitColorSamples,
-               const matrix3x3& sampleToCamera,
-               ThinLens lens,
+               Sample2Dto3DMappingSettings settings2DTo3D,
                uint32_t rtWidth,
                uint32_t rtHeight);
+    void generate3Dfrom2D(Sample2Dto3DMappingSettings settings);
 };
 
+
 // TODO(anankervis): merge with GPU version of this class
 class Camera {
     friend class Raycaster;
     // TODO(anankervis): remove
     friend void polarSpaceFoveatedSetup(Raycaster* raycaster);
+
 public:
     Camera(const FloatRect& viewport, float apertureRadius, GPUContext& gpuContext);
     ~Camera();
@@ -84,27 +85,35 @@ class Camera {
     void setRenderTarget(const ImageResourceDescriptor& newRenderTarget);
     void setSamples(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples);
 
+    // If fovXDegrees is greater than zero, this camera uses a spherical section for ray generation;
+    // otherwise (the default of 0) it uses the standard sample-to-camera mapping.
+    void setSphericalWarpSettings(float fovXDegrees, float fovYDegrees);
+
     void setSampleData(const SampleData& sampleData);
     const SampleData& getSampleData() const;
+    const uint32_t getSampleCount() const;
 
     matrix3x3 getSampleToCamera() const;
-    // Beware - this isn't actually suitable for taking a 2D sample coordinate + Z and converting to world space.
-    // Samples can be in any arbitrary space, packing, or function we choose. What's important is that when we
-    // unpack them, they turn into camera-relative 3D rays (origin offset + direction). From there, we can convert
-    // into world space using cameraToWorld.
-    matrix4x4 getSampleToWorld() const;
-    matrix4x4 getWorldToSample() const;
+
     void setCameraToWorld(const transform& cameraToWorld);
     matrix4x4 getCameraToWorld() const;
     const vector3& getTranslation() const;
-    vector3 getForward() const;
+
+    void setupRenderTarget(GPUContext& context);
+    void extractImage();
 
 protected:
+    Sample2Dto3DMappingSettings get2DSampleMappingSettings() const;
+
+    float _fovXDegrees = 0.0f;
+    float _fovYDegrees = 0.0f;
+
     // TODO(anankervis): clean up direct access of protected members by Raycaster
 
     GPUCamera* _gpuCamera;
 
-    matrix4x4 _worldToEyePrevious = matrix4x4::identity();
+    // Initialize to an invalid transform since there is no previous frame on the initial frame
+    matrix4x4 _worldToEyePrevious = matrix4x4::zero();
     matrix3x3 _eyePreviousToSamplePrevious = matrix3x3::identity();
 
     // Incremeted on every render
@@ -116,13 +125,16 @@ class Camera {
     ThinLens _lens = {0.0f, 1.0f};
     bool _enabled = true;
     ImageResourceDescriptor _renderTarget;
-    RaycasterOutputMode _outputMode = RaycasterOutputMode::COLOR_RGBA8;
+    RaycasterOutputFormat _outputFormat = RaycasterOutputFormat::COLOR_RGBA8;
     FoveatedSampleData _foveatedSampleData;
+
     // Only for polar foveated sampling
     std::vector<vector2ui> _polarRemapToPixel;
 
-    DynamicArray<RayPacketFrustum3D> _blockFrustaTransformed;
-    DynamicArray<RayPacketFrustum3D> _tileFrustaTransformed;
+    struct CPUHierarchy {
+        DynamicArray<RayPacketFrustum3D> _blockFrusta;
+        DynamicArray<RayPacketFrustum3D> _tileFrusta;
+    } _cpuHierarchy;
 
     transform _cameraToWorld = transform::identity();
 

diff --git a/libraries/hvvr/raycaster/cuda_util.h b/libraries/hvvr/raycaster/cuda_util.h
@@ -19,8 +19,9 @@
 #define cutilSafeCall(error) __cudaSafeCall(error, __FILE__, __LINE__)
 inline void __cudaSafeCall(cudaError_t error, const char* file, const int line) {
     if (error != cudaSuccess) {
-        fprintf(stderr, "error: CudaSafeCall() failed at %s:%d with %s\n", file, line, cudaGetErrorString(error));
-#ifdef _WIN32
+        fprintf(stderr, "error %d: CudaSafeCall() failed at %s:%d with %s\n", error, file, line,
+                cudaGetErrorString(error));
+#if defined(_WIN32)
         __debugbreak();
 #else
         exit(error);
@@ -56,4 +57,75 @@ struct KernelDim {
     }
 };
 
-#define CUDA_INF __int_as_float(0x7f800000)
+#define CUDA_INF __int_as_float(0x7f800000)
+
+
+// Based on https://stackoverflow.com/questions/52286202/dynamic-dispatch-to-template-function-c
+// Use to generate all template function permutations and dispatch properly at runtime for a prefix of template booleans
+// Makes calling cuda kernels with many permutations concise.
+// Example:
+// Change
+// if (b0) {
+// 	if (b1) {
+// 		if (b2) {
+// 			myFunc<true, true, true, otherArgs>(args);
+// 		}
+// 		else {
+// 			myFunc<true, true, false, otherArgs>(args);
+// 		}
+// 	} else {
+// 		if (b2) {
+// 			myFunc<true, false, true, otherArgs>(args);
+// 		}
+// 		else {
+// 			myFunc<true, false, false, otherArgs>(args);
+// 		}
+// 	}
+// } else {
+// 	if (b1) {
+// 		if (b2) {
+// 			myFunc<false, true, true, otherArgs>(args);
+// 		}
+// 		else {
+// 			myFunc<false, true, false, otherArgs>(args);
+// 		}
+// 	} else {
+// 		if (b2) {
+// 			myFunc<false, false, true, otherArgs>(args);
+// 		}
+// 		else {
+// 			myFunc<false, false, false, otherArgs>(args);
+// 		}
+// 	}
+// }
+// into:
+// std::array<bool, 3> bargs = { { b0, b1, b2 } };
+// dispatch_bools<3>{}(bargs, [&](auto...Bargs) {
+//     myFunc<decltype(Bargs)::value..., otherArgs>(args);
+// });
+//
+// You may want to #pragma warning( disable : 4100) around the call, since there will be unreferenced Bargs in the call
+// chain
+template <bool b>
+using kbool = std::integral_constant<bool, b>;
+
+#pragma warning(push)
+#pragma warning(disable : 4100)
+template <std::size_t max>
+struct dispatch_bools {
+    template <std::size_t N, class F, class... Bools>
+    void operator()(std::array<bool, N> const& input, F&& continuation, Bools...) {
+        if (input[max - 1])
+            dispatch_bools<max - 1>{}(input, continuation, kbool<true>{}, Bools{}...);
+        else
+            dispatch_bools<max - 1>{}(input, continuation, kbool<false>{}, Bools{}...);
+    }
+};
+template <>
+struct dispatch_bools<0> {
+    template <std::size_t N, class F, class... Bools>
+    void operator()(std::array<bool, N> const& input, F&& continuation, Bools...) {
+        continuation(Bools{}...);
+    }
+};
+#pragma warning(pop)