From 3f5c7b2c158373145d3551e8249bf10b3ff8e6a1 Mon Sep 17 00:00:00 2001 From: Michael Mara Date: Sun, 7 Oct 2018 17:48:24 -0700 Subject: [PATCH] Megacommit with various improvements: Coarse-grained refactoring, massive simplification of foveated sample generation, 3D culling planes for samples, full 3D samples throughout the pipeline, improved clustering for foveated samples, fast bicubic interpolation. --- libraries/hvvr/cuda.props | 6 +- libraries/hvvr/raycaster/camera.cpp | 89 +-- libraries/hvvr/raycaster/camera.h | 50 +- libraries/hvvr/raycaster/cuda_util.h | 78 ++- libraries/hvvr/raycaster/foveated.cpp | 96 +--- libraries/hvvr/raycaster/foveated.h | 114 +++- libraries/hvvr/raycaster/frusta.cu | 224 +++----- libraries/hvvr/raycaster/frusta.h | 22 +- libraries/hvvr/raycaster/gbuffer.h | 29 - libraries/hvvr/raycaster/gpu_camera.cu | 102 ++-- libraries/hvvr/raycaster/gpu_camera.h | 85 ++- libraries/hvvr/raycaster/gpu_context.cu | 29 +- libraries/hvvr/raycaster/gpu_context.h | 3 +- libraries/hvvr/raycaster/gpu_foveated.cu | 421 ++++++-------- libraries/hvvr/raycaster/gpu_foveated.h | 10 - libraries/hvvr/raycaster/gpu_image.h | 11 +- libraries/hvvr/raycaster/gpu_samples.cu | 5 +- libraries/hvvr/raycaster/gpu_samples.h | 113 +--- libraries/hvvr/raycaster/intersect.cu | 118 ++-- libraries/hvvr/raycaster/kernel_constants.h | 11 +- libraries/hvvr/raycaster/material.h | 17 +- libraries/hvvr/raycaster/prim_tests.h | 77 +-- libraries/hvvr/raycaster/raycaster.cpp | 23 +- libraries/hvvr/raycaster/raycaster.h | 23 +- libraries/hvvr/raycaster/raycaster.props | 2 +- libraries/hvvr/raycaster/raycaster.vcxproj | 6 +- .../hvvr/raycaster/raycaster.vcxproj.filters | 6 - libraries/hvvr/raycaster/raycaster_common.h | 12 - libraries/hvvr/raycaster/raycaster_spec.h | 37 +- libraries/hvvr/raycaster/remap.cu | 8 +- libraries/hvvr/raycaster/render.cpp | 342 ++++++------ libraries/hvvr/raycaster/resolve.cu | 150 +++-- libraries/hvvr/raycaster/sample_hierarchy.cpp | 137 +++-- libraries/hvvr/raycaster/sample_hierarchy.h | 56 +- libraries/hvvr/raycaster/samples.cpp | 137 +++-- libraries/hvvr/raycaster/samples.h | 68 ++- libraries/hvvr/raycaster/scene.cpp | 1 + libraries/hvvr/raycaster/scene_update.cpp | 1 - libraries/hvvr/raycaster/shading_helpers.h | 8 + libraries/hvvr/raycaster/sort.h | 15 +- libraries/hvvr/raycaster/texture.cu | 20 +- libraries/hvvr/raycaster/texture.h | 5 +- libraries/hvvr/raycaster/texture_internal.h | 2 + libraries/hvvr/raycaster/tile_data.h | 44 +- libraries/hvvr/raycaster/traversal.cpp | 65 ++- libraries/hvvr/raycaster/traversal.h | 16 +- libraries/hvvr/raycaster/warp_ops.h | 6 +- .../hvvr/samples_shared/model_import_bin.cpp | 6 +- .../hvvr/samples_shared/samples_shared.props | 2 +- .../samples_shared/samples_shared.vcxproj | 4 +- .../hvvr/samples_shared/window_d3d11.cpp | 2 +- libraries/hvvr/samples_shared/window_d3d11.h | 21 +- libraries/hvvr/shared/cuda_decl.h | 9 +- libraries/hvvr/shared/graphics_types.h | 7 +- libraries/hvvr/shared/shared.props | 2 +- libraries/hvvr/shared/shared.vcxproj | 6 +- libraries/hvvr/shared/vector_math.h | 121 +--- .../modelconvert/modelconvert.vcxproj | 4 +- .../modelconvert/modelconvert.vcxproj.filters | 4 +- .../hvvr_samples/modelviewer/modelviewer.cpp | 516 ++++++++++-------- .../modelviewer/modelviewer.vcxproj | 4 +- vs2015/hvvr.sln | 2 +- 62 files changed, 1789 insertions(+), 1821 deletions(-) delete mode 100644 libraries/hvvr/raycaster/gbuffer.h delete mode 100644 libraries/hvvr/raycaster/gpu_foveated.h diff --git a/libraries/hvvr/cuda.props 
b/libraries/hvvr/cuda.props index bae6e36..9712a0c 100644 --- a/libraries/hvvr/cuda.props +++ b/libraries/hvvr/cuda.props @@ -1,4 +1,4 @@ - + @@ -12,9 +12,9 @@ - + - + \ No newline at end of file diff --git a/libraries/hvvr/raycaster/camera.cpp b/libraries/hvvr/raycaster/camera.cpp index 988591b..0e0001e 100644 --- a/libraries/hvvr/raycaster/camera.cpp +++ b/libraries/hvvr/raycaster/camera.cpp @@ -22,17 +22,16 @@ namespace hvvr { SampleData::SampleData(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples, - const matrix3x3& sampleToCamera, - ThinLens lens, + Sample2Dto3DMappingSettings settings2DTo3D, uint32_t rtWidth, uint32_t rtHeight) - : splitColorSamples(splitColorSamples), lens(lens) { + : splitColorSamples(splitColorSamples) { DynamicArray sortedSamples(rawSampleCount); for (size_t n = 0; n < rawSampleCount; n++) { sortedSamples[n] = SortedSample(rawSamples[n], n % splitColorSamples); } uint32_t blockCount = uint32_t((sortedSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE); - // TODO(whunt): allow different clustering methods + // TODO: allow different clustering methods naiveXYCluster(ArrayView(sortedSamples), blockCount); sampleBounds = {vector2(1.0f, 1.0f), vector2(0.0f, 0.0f)}; @@ -48,15 +47,13 @@ SampleData::SampleData(const Sample* rawSamples, cullRect.upper.x = INFINITY; cullRect.upper.y = INFINITY; validSampleCount = uint32_t(rawSampleCount); - samples.blockFrusta2D = DynamicArray(blockCount); - samples.tileFrusta2D = DynamicArray(blockCount * TILES_PER_BLOCK); samples.blockFrusta3D = DynamicArray(blockCount); samples.tileFrusta3D = DynamicArray(blockCount * TILES_PER_BLOCK); - blockedSamplePositions = DynamicArray(blockCount * BLOCK_SIZE * 2); - blockedSampleExtents = DynamicArray(blockCount * BLOCK_SIZE); - samples.generate(sortedSamples, blockCount, validSampleCount, cullRect, blockedSamplePositions, - blockedSampleExtents, lens, sampleToCamera); - sampleCount = uint32_t(blockCount * BLOCK_SIZE); + samples.directionalSamples = DynamicArray(blockCount * BLOCK_SIZE); + + samples2D = SampleHierarchy2D(sortedSamples, blockCount, validSampleCount, cullRect, settings2DTo3D.thinLens, + settings2DTo3D.sampleToCamera); + samples.generateFrom2D(samples2D, settings2DTo3D); imageLocationToSampleIndex = DynamicArray(rtWidth * rtHeight * splitColorSamples); memset(imageLocationToSampleIndex.data(), 0xff, sizeof(int32_t) * imageLocationToSampleIndex.size()); // clear to -1 @@ -67,6 +64,10 @@ SampleData::SampleData(const Sample* rawSamples, } } +void SampleData::generate3Dfrom2D(Sample2Dto3DMappingSettings settings) { + samples.generateFrom2D(samples2D, settings); +} + Camera::Camera(const FloatRect& viewport, float apertureRadius, GPUContext& gpuContext) : _gpuCamera(nullptr), _lens({apertureRadius, 1.0f}), _eyeDir(0.0f, 0.0f, -1.0f) { setViewport(viewport); @@ -133,26 +134,25 @@ void Camera::setRenderTarget(const ImageResourceDescriptor& newRenderTarget) { } void Camera::setSamples(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples) { - setSampleData(SampleData(rawSamples, rawSampleCount, splitColorSamples, getSampleToCamera(), _lens, + setSampleData(SampleData(rawSamples, rawSampleCount, splitColorSamples, get2DSampleMappingSettings(), _renderTarget.width, _renderTarget.height)); } void Camera::setSampleData(const SampleData& sampleData) { _sampleData = sampleData; - uint32_t blockCount = uint32_t(_sampleData.samples.blockFrusta3D.size()); - uint32_t tileCount = uint32_t(_sampleData.samples.tileFrusta3D.size()); + uint32_t blockCount = 
uint32_t(sampleData.samples.blockFrusta3D.size()); + uint32_t tileCount = uint32_t(sampleData.samples.tileFrusta3D.size()); - if (blockCount != _blockFrustaTransformed.size()) { - _blockFrustaTransformed = DynamicArray(blockCount); + if (blockCount != _cpuHierarchy._blockFrusta.size()) { + _cpuHierarchy._blockFrusta = DynamicArray(blockCount); } - if (tileCount != _tileFrustaTransformed.size()) { - _tileFrustaTransformed = DynamicArray(tileCount); + if (tileCount != _cpuHierarchy._tileFrusta.size()) { + _cpuHierarchy._tileFrusta = DynamicArray(tileCount); } - - _gpuCamera->updateConfig(_outputMode, sampleData.imageLocationToSampleIndex.data(), - sampleData.blockedSamplePositions.data(), sampleData.blockedSampleExtents.data(), _lens, - sampleData.sampleCount, _renderTarget.width, _renderTarget.height, + const DynamicArray& samples = sampleData.samples.directionalSamples; + _gpuCamera->updateConfig(_outputFormat, sampleData.imageLocationToSampleIndex.data(), samples.data(), _lens, + uint32_t(samples.size()), _renderTarget.width, _renderTarget.height, uint32_t(_renderTarget.stride), sampleData.splitColorSamples); } @@ -160,19 +160,17 @@ const SampleData& Camera::getSampleData() const { return _sampleData; } +const uint32_t Camera::getSampleCount() const { + return _gpuCamera != nullptr ? _gpuCamera->validSampleCount : 0; +} + + matrix3x3 Camera::getSampleToCamera() const { return matrix3x3(vector3(_viewport.upper.x - _viewport.lower.x, 0, 0), vector3(0, _viewport.lower.y - _viewport.upper.y, 0), vector3(_viewport.lower.x, _viewport.upper.y, -1)); } -matrix4x4 Camera::getSampleToWorld() const { - return matrix4x4(_cameraToWorld) * matrix4x4(getSampleToCamera()); -} - -matrix4x4 Camera::getWorldToSample() const { - return invert(getSampleToWorld()); -} void Camera::setCameraToWorld(const transform& cameraToWorld) { _cameraToWorld = cameraToWorld; @@ -186,8 +184,37 @@ const vector3& Camera::getTranslation() const { return _cameraToWorld.translation; } -vector3 Camera::getForward() const { - return vector3(-normalize(getCameraToWorld().m2)); +void Camera::setupRenderTarget(GPUContext& context) { + if (!getEnabled()) + return; + GPUCamera* gpuCamera = _gpuCamera; + if (_renderTarget.isHardwareRenderTarget() && _newHardwareTarget) { + gpuCamera->bindTexture(context, _renderTarget); + _newHardwareTarget = false; + } +} + +void Camera::extractImage() { + GPUCamera* gpuCamera = _gpuCamera; + if (_renderTarget.isHardwareRenderTarget()) { + gpuCamera->copyImageToBoundTexture(); + } else { + gpuCamera->copyImageToCPU((uint32_t*)_renderTarget.data, _renderTarget.width, _renderTarget.height, + uint32_t(_renderTarget.stride)); + } +} + +Sample2Dto3DMappingSettings Camera::get2DSampleMappingSettings() const { + if (_fovXDegrees > 0.0f) { + return Sample2Dto3DMappingSettings::sphericalSection(getSampleToCamera(), _lens, _fovXDegrees, _fovYDegrees); + } else { + return Sample2Dto3DMappingSettings(getSampleToCamera(), _lens); + } +} + +void Camera::setSphericalWarpSettings(float fovXDegrees, float fovYDegrees) { + _fovXDegrees = fovXDegrees; + _fovYDegrees = fovYDegrees; } } // namespace hvvr diff --git a/libraries/hvvr/raycaster/camera.h b/libraries/hvvr/raycaster/camera.h index a7107bf..a406ef0 100644 --- a/libraries/hvvr/raycaster/camera.h +++ b/libraries/hvvr/raycaster/camera.h @@ -10,8 +10,9 @@ */ #include "dynamic_array.h" -#include "graphics_types.h" #include "foveated.h" +#include "gpu_samples.h" +#include "graphics_types.h" #include "sample_hierarchy.h" #include "samples.h" @@ -23,36 +24,36 @@ 
namespace hvvr { class GPUCamera; class GPUContext; + // preprocessed samples, ready for rendering struct SampleData { + SampleHierarchy2D samples2D; + Sample2Dto3DMappingSettings settings2DTo3D; SampleHierarchy samples; uint32_t splitColorSamples = 1; - uint32_t sampleCount; DynamicArray imageLocationToSampleIndex; - // Flat array of sample positions (in vector2 format) without fancy swizzling for CPU vectorization - DynamicArray blockedSamplePositions; - DynamicArray blockedSampleExtents; FloatRect sampleBounds = {{0.0f, 0.0f}, {0.0f, 0.0f}}; uint32_t validSampleCount = 0; - ThinLens lens = {0.0f, 5.0f}; SampleData(){}; SampleData(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples, - const matrix3x3& sampleToCamera, - ThinLens lens, + Sample2Dto3DMappingSettings settings2DTo3D, uint32_t rtWidth, uint32_t rtHeight); + void generate3Dfrom2D(Sample2Dto3DMappingSettings settings); }; + // TODO(anankervis): merge with GPU version of this class class Camera { friend class Raycaster; // TODO(anankervis): remove friend void polarSpaceFoveatedSetup(Raycaster* raycaster); + public: Camera(const FloatRect& viewport, float apertureRadius, GPUContext& gpuContext); ~Camera(); @@ -84,27 +85,35 @@ class Camera { void setRenderTarget(const ImageResourceDescriptor& newRenderTarget); void setSamples(const Sample* rawSamples, uint32_t rawSampleCount, uint32_t splitColorSamples); + // If called with nonzero values, this camera uses a spherical section for ray generation + // (instead of the standard perspective transform). + void setSphericalWarpSettings(float fovXDegrees, float fovYDegrees); + void setSampleData(const SampleData& sampleData); const SampleData& getSampleData() const; + const uint32_t getSampleCount() const; matrix3x3 getSampleToCamera() const; - // Beware - this isn't actually suitable for taking a 2D sample coordinate + Z and converting to world space. - // Samples can be in any arbitrary space, packing, or function we choose. What's important is that when we - // unpack them, they turn into camera-relative 3D rays (origin offset + direction). From there, we can convert - // into world space using cameraToWorld. 
- matrix4x4 getSampleToWorld() const; - matrix4x4 getWorldToSample() const; + void setCameraToWorld(const transform& cameraToWorld); matrix4x4 getCameraToWorld() const; const vector3& getTranslation() const; - vector3 getForward() const; + + void setupRenderTarget(GPUContext& context); + void extractImage(); protected: + Sample2Dto3DMappingSettings get2DSampleMappingSettings() const; + + float _fovXDegrees = 0.0f; + float _fovYDegrees = 0.0f; + // TODO(anankervis): clean up direct access of protected members by Raycaster GPUCamera* _gpuCamera; - matrix4x4 _worldToEyePrevious = matrix4x4::identity(); + // Initialize to an invalid transform since there is no previous frame on the initial frame + matrix4x4 _worldToEyePrevious = matrix4x4::zero(); matrix3x3 _eyePreviousToSamplePrevious = matrix3x3::identity(); // Incremeted on every render @@ -116,13 +125,16 @@ class Camera { ThinLens _lens = {0.0f, 1.0f}; bool _enabled = true; ImageResourceDescriptor _renderTarget; - RaycasterOutputMode _outputMode = RaycasterOutputMode::COLOR_RGBA8; + RaycasterOutputFormat _outputFormat = RaycasterOutputFormat::COLOR_RGBA8; FoveatedSampleData _foveatedSampleData; + // Only for polar foveated sampling std::vector _polarRemapToPixel; - DynamicArray _blockFrustaTransformed; - DynamicArray _tileFrustaTransformed; + struct CPUHierarchy { + DynamicArray _blockFrusta; + DynamicArray _tileFrusta; + } _cpuHierarchy; transform _cameraToWorld = transform::identity(); diff --git a/libraries/hvvr/raycaster/cuda_util.h b/libraries/hvvr/raycaster/cuda_util.h index d5820d9..11f6ad8 100644 --- a/libraries/hvvr/raycaster/cuda_util.h +++ b/libraries/hvvr/raycaster/cuda_util.h @@ -19,8 +19,9 @@ #define cutilSafeCall(error) __cudaSafeCall(error, __FILE__, __LINE__) inline void __cudaSafeCall(cudaError_t error, const char* file, const int line) { if (error != cudaSuccess) { - fprintf(stderr, "error: CudaSafeCall() failed at %s:%d with %s\n", file, line, cudaGetErrorString(error)); -#ifdef _WIN32 + fprintf(stderr, "error %d: CudaSafeCall() failed at %s:%d with %s\n", error, file, line, + cudaGetErrorString(error)); +#if defined(_WIN32) __debugbreak(); #else exit(error); @@ -56,4 +57,75 @@ struct KernelDim { } }; -#define CUDA_INF __int_as_float(0x7f800000) \ No newline at end of file +#define CUDA_INF __int_as_float(0x7f800000) + + +// Based on https://stackoverflow.com/questions/52286202/dynamic-dispatch-to-template-function-c +// Use to generate all template function permutations and dispatch properly at runtime for a prefix of template booleans +// Makes calling cuda kernels with many permutations concise. +// Example: +// Change +// if (b0) { +// if (b1) { +// if (b2) { +// myFunc(args); +// } +// else { +// myFunc(args); +// } +// } else { +// if (b2) { +// myFunc(args); +// } +// else { +// myFunc(args); +// } +// } +// } else { +// if (b1) { +// if (b2) { +// myFunc(args); +// } +// else { +// myFunc(args); +// } +// } else { +// if (b2) { +// myFunc(args); +// } +// else { +// myFunc(args); +// } +// } +// } +// into: +// std::array bargs = { { b0, b1, b2 } }; +// dispatch_bools<3>{}(bargs, [&](auto...Bargs) { +// myFunc(args); +// }); +// +// You may want to #pragma warning( disable : 4100) around the call, since there will be unrefenced Bargs in the call +// chain +template +using kbool = std::integral_constant; + +#pragma warning(push) +#pragma warning(disable : 4100) +template +struct dispatch_bools { + template + void operator()(std::array const& input, F&& continuation, Bools...) 
{ + if (input[max - 1]) + dispatch_bools{}(input, continuation, kbool{}, Bools{}...); + else + dispatch_bools{}(input, continuation, kbool{}, Bools{}...); + } +}; +template <> +struct dispatch_bools<0> { + template + void operator()(std::array const& input, F&& continuation, Bools...) { + continuation(Bools{}...); + } +}; +#pragma warning(pop) diff --git a/libraries/hvvr/raycaster/foveated.cpp b/libraries/hvvr/raycaster/foveated.cpp index e8d47ca..1ac4ddb 100644 --- a/libraries/hvvr/raycaster/foveated.cpp +++ b/libraries/hvvr/raycaster/foveated.cpp @@ -17,7 +17,7 @@ namespace hvvr { -void generateRemapForFoveatedSamples(ArrayView unsortedSamples, +void generateRemapForFoveatedSamples(ArrayView unsortedSamples, ArrayView remap) { // Create a remapping from the original directional // samples to ones that are binned decently @@ -25,14 +25,18 @@ void generateRemapForFoveatedSamples(ArrayView unsortedSamples, vector2 position; // eccentricity,theta (octahedral is a bad match for the naive clustering algorithm) uint32_t originalIndex; }; + // There is a massive performance difference between using eccentricity for the y coordinate and the x coordinate in + // this clustering This points to this clustering being unstable and we might want a better algorithm. ~3.5x + // performance improvement in intersection when eccentricity is the y coordinate, which leads to better distribution + // (for default foveation settings at least) DynamicArray toSort(unsortedSamples.size()); for (size_t i = 0; i < toSort.size(); ++i) { - vector3 v = unsortedSamples[i].direction; + vector3 v = unsortedSamples[i].centerRay; // Eccentricity angle - toSort[i].position.x = acosf(-v.z); - vector3 dir = vector3(v.x, v.y, 0.0f); + toSort[i].position.y = acosf(-v.z); + vector3 dir = normalize(vector3(v.x, v.y, 0.0f)); // Angle of rotation about z, measured from x - toSort[i].position.y = atan2f(normalize(dir).y, normalize(dir).x); + toSort[i].position.x = atan2f(dir.y, dir.x); toSort[i].originalIndex = (uint32_t)i; } auto blockCount = (toSort.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; @@ -43,82 +47,31 @@ void generateRemapForFoveatedSamples(ArrayView unsortedSamples, } } -void getEccentricityRemap(std::vector& eccentricityRemap, - const std::vector& ringEccentricities, - float maxEccentricityRadians, - size_t mapSize) { - eccentricityRemap.resize(mapSize); - for (int i = 0; i < eccentricityRemap.size(); ++i) { - float eccentricity = ((i + 0.5f) / eccentricityRemap.size()) * maxEccentricityRadians; - int j = 0; - bool inBetween = false; - while (j < ringEccentricities.size()) { - if (eccentricity < ringEccentricities[j]) { - inBetween = true; - break; - } - ++j; - } - // TODO: fit spline instead of piecewise linear approx - if (j == 0) { - eccentricityRemap[i] = ((eccentricity / ringEccentricities[0]) * 0.5f) / ringEccentricities.size(); - } else if (inBetween) { - float lower = ringEccentricities[j - 1]; - float higher = ringEccentricities[j]; - float alpha = (eccentricity - lower) / (higher - lower); - eccentricityRemap[i] = (((j - 1) * (1.0f - alpha) + j * alpha) + 0.5f) / ringEccentricities.size(); - } else { - size_t lastIndex = ringEccentricities.size() - 1; - float lastDiff = (ringEccentricities[lastIndex] - ringEccentricities[lastIndex - 1]); - float extrapolation = (eccentricity - ringEccentricities[lastIndex]) / lastDiff; - eccentricityRemap[i] = - min(1.0f, (ringEccentricities.size() - 0.5f + extrapolation) / ringEccentricities.size()); - } - } -} - void generateEyeSpacePolarFoveatedSampleData(FoveatedSampleData& 
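// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] The dispatch_bools
// helper added to cuda_util.h above turns an array of runtime bools into
// compile-time template arguments, so a single call site can reach every
// template permutation of a kernel. The angle-bracketed template parameter
// lists were lost in this copy of the diff, so the sketch below is a
// self-contained host C++14 reconstruction of the same idiom; kernelStub and
// the two feature-flag names are hypothetical stand-ins for the real CUDA
// kernels.
#include <array>
#include <cstddef>
#include <cstdio>
#include <type_traits>

template <bool b>
using kbool = std::integral_constant<bool, b>;

template <std::size_t max>
struct dispatch_bools {
    template <std::size_t N, class F, class... Bools>
    void operator()(std::array<bool, N> const& input, F&& continuation, Bools... bools) {
        if (input[max - 1])
            dispatch_bools<max - 1>{}(input, continuation, kbool<true>{}, bools...);
        else
            dispatch_bools<max - 1>{}(input, continuation, kbool<false>{}, bools...);
    }
};
template <>
struct dispatch_bools<0> {
    template <std::size_t N, class F, class... Bools>
    void operator()(std::array<bool, N> const&, F&& continuation, Bools... bools) {
        continuation(bools...);
    }
};

// Stand-in for a kernel that is templated on two feature flags.
template <bool EnableDoF, bool EnableFog>
void kernelStub() {
    std::printf("instantiated with DoF=%d, fog=%d\n", int(EnableDoF), int(EnableFog));
}

int main() {
    bool dofOn = true, fogOn = false;             // runtime flags
    std::array<bool, 2> bargs = {{dofOn, fogOn}}; // order matches the template parameters
    dispatch_bools<2>{}(bargs, [&](auto... Bargs) {
        kernelStub<decltype(Bargs)::value...>();  // expands to kernelStub<true, false>()
    });
    return 0;
}
// ---------------------------------------------------------------------------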
foveatedSampleData, std::vector& polarRemapToPixel, - std::vector& ringEccentricities, - std::vector& eccentricityRemap, + EccentricityMap& eccentricityMap, size_t& samplesPerRing, RayCasterSpecification::FoveatedSamplePattern pattern) { - if (foveatedSampleData.eyeSpaceSamples.size() == 0) { - DynamicArray unsortedEyeSpaceSamples = getEyeSpacePolarFoveatedSamples( - ringEccentricities, samplesPerRing, pattern.degreeTrackingError, pattern.minMAR, pattern.maxMAR, - pattern.maxFOVDegrees, pattern.marSlope, pattern.fovealMARDegrees, pattern.zenithJitterStrength, - pattern.ringJitterStrength); + if (foveatedSampleData.samples.directionalSamples.size() == 0) { + DynamicArray unsortedEyeSpaceSamples = + getEyeSpacePolarFoveatedSamples(samplesPerRing, eccentricityMap, pattern.degreeTrackingError, + pattern.maxFOVDegrees, pattern.marSlope, pattern.fovealMARDegrees); + printf("Generated eyes space foveated samples: %d, in %dx%d polar grid (azimuth x zenith)\n", uint32_t(unsortedEyeSpaceSamples.size()), uint32_t(samplesPerRing), uint32_t(unsortedEyeSpaceSamples.size() / samplesPerRing)); - float maxEccentricityRadians = pattern.maxFOVDegrees * RadiansPerDegree; - getEccentricityRemap(eccentricityRemap, ringEccentricities, maxEccentricityRadians, 10000); { DynamicArray oldToNewRemap(unsortedEyeSpaceSamples.size()); generateRemapForFoveatedSamples(unsortedEyeSpaceSamples, oldToNewRemap); - foveatedSampleData.eyeSpaceSamples = DynamicArray(unsortedEyeSpaceSamples.size()); - for (size_t i = 0; i < unsortedEyeSpaceSamples.size(); ++i) { - foveatedSampleData.eyeSpaceSamples[oldToNewRemap[i]] = unsortedEyeSpaceSamples[i]; - } + foveatedSampleData.samples.directionalSamples = DynamicArray(unsortedEyeSpaceSamples.size()); polarRemapToPixel.resize(unsortedEyeSpaceSamples.size()); for (size_t i = 0; i < unsortedEyeSpaceSamples.size(); ++i) { + foveatedSampleData.samples.directionalSamples[oldToNewRemap[i]] = unsortedEyeSpaceSamples[i]; polarRemapToPixel[oldToNewRemap[i]] = {(uint32_t)(i % samplesPerRing), (uint32_t)(i / samplesPerRing)}; } } - { - foveatedSampleData.precomputedEyeSpaceSamples = - DynamicArray(foveatedSampleData.eyeSpaceSamples.size()); - for (size_t i = 0; i < unsortedEyeSpaceSamples.size(); ++i) { - auto& p = foveatedSampleData.precomputedEyeSpaceSamples[i]; - p.center = foveatedSampleData.eyeSpaceSamples[i].direction; - p.d1 = p.center + foveatedSampleData.eyeSpaceSamples[i].azimuthalDifferential; - p.d2 = p.center + foveatedSampleData.eyeSpaceSamples[i].zenithDifferential; - } - } auto blockCount = (unsortedEyeSpaceSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; // Allocate the most you will ever need to prevent per-frame allocation - foveatedSampleData.samples.blockFrusta2D = DynamicArray(blockCount); - foveatedSampleData.samples.tileFrusta2D = DynamicArray(blockCount * TILES_PER_BLOCK); foveatedSampleData.samples.blockFrusta3D = DynamicArray(blockCount); foveatedSampleData.samples.tileFrusta3D = DynamicArray(blockCount * TILES_PER_BLOCK); } @@ -129,21 +82,20 @@ void polarSpaceFoveatedSetup(Raycaster* raycaster) { if (!camera->getEnabled()) continue; // Generate eye space samples if necessary - if (camera->_foveatedSampleData.eyeSpaceSamples.size() == 0) { + if (camera->_foveatedSampleData.samples.directionalSamples.size() == 0) { size_t samplesPerRing; - std::vector eccentricityRemap; - std::vector ringEccentricities; + EccentricityMap eccentricityMap; generateEyeSpacePolarFoveatedSampleData(camera->_foveatedSampleData, camera->_polarRemapToPixel, - ringEccentricities, eccentricityRemap, 
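// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] The remap built a few
// hunks above keys the naive clustering on (theta, eccentricity), with
// eccentricity on the y axis, which the new comment reports is roughly 3.5x
// faster to intersect than the transposed layout. The sketch below only shows
// that coordinate mapping for eye-space directions (gaze along -z), mirroring
// the + lines in generateRemapForFoveatedSamples; naiveXYCluster itself is not
// part of this diff and is not reproduced.
#include <cmath>
#include <cstdio>

struct vec3 { float x, y, z; };

// Map a unit direction to the clustering coordinates used above:
//   y = eccentricity angle measured from the -z (gaze) axis
//   x = rotation about z, measured from +x
static void toClusterCoords(vec3 v, float& xTheta, float& yEcc) {
    yEcc = std::acos(-v.z);
    float len = std::sqrt(v.x * v.x + v.y * v.y);
    // Degenerate exactly at the gaze direction; pick theta = 0 there.
    xTheta = (len > 0.0f) ? std::atan2(v.y / len, v.x / len) : 0.0f;
}

int main() {
    // A few sample directions: straight ahead, ~10 degrees up, ~45 degrees right.
    vec3 dirs[] = {{0.0f, 0.0f, -1.0f},
                   {0.0f, std::sin(0.1745f), -std::cos(0.1745f)},
                   {std::sin(0.7854f), 0.0f, -std::cos(0.7854f)}};
    for (const vec3& d : dirs) {
        float theta, ecc;
        toClusterCoords(d, theta, ecc);
        std::printf("dir(%.2f, %.2f, %.2f) -> theta %.3f rad, eccentricity %.3f rad\n",
                    d.x, d.y, d.z, theta, ecc);
    }
    return 0;
}
// ---------------------------------------------------------------------------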
samplesPerRing, + eccentricityMap, samplesPerRing, raycaster->_spec.foveatedSamplePattern); float maxEccentricityRadians = raycaster->_spec.foveatedSamplePattern.maxFOVDegrees * RadiansPerDegree; size_t paddedSampleCount = - ((camera->_foveatedSampleData.eyeSpaceSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; + ((camera->_foveatedSampleData.samples.directionalSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; camera->_gpuCamera->registerPolarFoveatedSamples(camera->_polarRemapToPixel, maxEccentricityRadians, - ringEccentricities, eccentricityRemap, - uint32_t(samplesPerRing), uint32_t(paddedSampleCount)); - camera->_gpuCamera->updateEyeSpaceFoveatedSamples(camera->_foveatedSampleData.precomputedEyeSpaceSamples); + eccentricityMap, uint32_t(samplesPerRing), + uint32_t(paddedSampleCount)); + camera->_gpuCamera->updateEyeSpaceFoveatedSamples(camera->_foveatedSampleData.samples.directionalSamples); camera->_foveatedSampleData.simpleBlockFrusta = DynamicArray(camera->_foveatedSampleData.samples.blockFrusta3D.size()); camera->_foveatedSampleData.simpleTileFrusta = diff --git a/libraries/hvvr/raycaster/foveated.h b/libraries/hvvr/raycaster/foveated.h index 24de722..66621e9 100644 --- a/libraries/hvvr/raycaster/foveated.h +++ b/libraries/hvvr/raycaster/foveated.h @@ -9,37 +9,117 @@ * of patent rights can be found in the PATENTS file in the same directory. */ +#include "graphics_types.h" #include "raycaster_spec.h" #include "sample_hierarchy.h" -#include "graphics_types.h" +#include +#include namespace hvvr { class Raycaster; -// precomputed eye-space derivatives of the direction sample -struct PrecomputedDirectionSample { - vector3 center; // original ray - vector3 d1; // differential ray 1 - vector3 d2; // differential ray 2 +struct FoveatedSampleData { + size_t validSampleCount = 0; + size_t blockCount = 0; + SampleHierarchy samples; + DynamicArray simpleTileFrusta; + DynamicArray simpleBlockFrusta; }; -struct FoveatedSampleData { - uint32_t triangleCount = 0; - size_t validSampleCount = 0; - size_t blockCount = 0; - SampleHierarchy samples; - DynamicArray simpleTileFrusta; - DynamicArray simpleBlockFrusta; - DynamicArray eyeSpaceSamples; - DynamicArray precomputedEyeSpaceSamples; +/* Abstraction of the mapping R -> E where: +R is the space of ring coordinates and +E is the eccentricity, given in degrees +*/ +struct EccentricityMap { + float m; + float w_0; + float S; + + + // Precomputed potentially expensive intermediates + // log(m+1) + float invLogA; + // 1 / w_0 + float invW_0; + // 1 / m + float invM; + + EccentricityMap() {} + EccentricityMap(float marSlope, float maxMARDegrees, float maxResolutionDegrees) { + m = marSlope; + w_0 = maxMARDegrees * RadiansPerDegree; + S = maxResolutionDegrees * RadiansPerDegree; + + // Compute in double precision since it only happens once + invLogA = float(1.0 / log(1.0 + (double)m)); + invW_0 = float(1.0 / w_0); + invM = float(1.0 / m); + } + /** + Mapping from ring coordinates to eccentricity, where E(0) = 0, where we don't start + the resolution falloff until the eccentricity is S + let S = maxEyeTrackingUncertaintyDegrees + + This is a function that is a continuous generalization of this recurrence: + w(n) = m * max(E(n) - S, 0) + w_0 + E(n+1) = E(n) + w(n) + + The character of the function is obviously different starting at E(n) - S = 0, so we can + split it piecewise + w(n) = w_0 | E(n) <= S + w(n) = m*(E(n)-S) + w_0 | E(n) > S + + For E(n) <= S, this results in the linear equation: + E(n) = w_0*n + + Eccentricity intially 
increases at a constant rate (within the radius of uncertainty). + + Now for the more complicated case + + Solving a recurrence relation for the hyperbolic falloff (after a simple coordinate transform): + let x = (w_0*n-S) / w_0 + let a = 1+m + Then we are solving for g(x) where: + g(x+1) = a*g(x)+w_0 + and + g(0) = 0 + Which gives us (https://m.wolframalpha.com/input/?i=g%28n%2B1%29%3Da*g%28n%29%2Bw%2C+g%280%29%3D0): + g(x) = (w_0*(a^x-1))/(a-1) + + Then we can transform the coordinates back: + E(n) = g((w_0*n-S) / w_0)+S + + We can invert g to help get a map from eccentricity to ring location: + g = (w_0*(a^x-1))/(a-1) + (a-1)g/w_0=(a^x-1) + a^x=(a-1)g/w_0+1 + x = ln((a-1)g/w_0+1)/ln(a) + + */ + // Transform ring coordinates to eccentricity, given in radians + CHD float apply(float i) { + float e0 = i * w_0; + float x = (e0 - S) * invW_0; + float g_x = (w_0 * (powf(1.0f + m, x) - 1.0f)) * invM; + float e1 = g_x + S; + return (e0 < S) ? e0 : e1; + }; + // Transform eccentricity, given in radians, to ring coordinates + CHD float applyInverse(float E) { + float i0 = E * invW_0; + float sDivW = S * invW_0; + float x = logf((m * (i0 - sDivW)) + 1.0f) * invLogA; + float i1 = x + sDivW; + return (E < S) ? i0 : i1; + }; }; + // TODO(anankervis): merge into Raycaster class void generateEyeSpacePolarFoveatedSampleData(FoveatedSampleData& foveatedSampleData, std::vector& polarRemapToPixel, - std::vector& ringEccentricities, - std::vector& eccentricityRemap, + EccentricityMap& eccentricityMap, size_t& samplesPerRing, RayCasterSpecification::FoveatedSamplePattern pattern); diff --git a/libraries/hvvr/raycaster/frusta.cu b/libraries/hvvr/raycaster/frusta.cu index 45edb9c..aace58a 100644 --- a/libraries/hvvr/raycaster/frusta.cu +++ b/libraries/hvvr/raycaster/frusta.cu @@ -7,7 +7,6 @@ * of patent rights can be found in the PATENTS file in the same directory. 
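// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] A host-only sanity
// check of the EccentricityMap closed form above: apply() and applyInverse()
// should be mutual inverses, and the two branches should meet continuously at
// E == S. The formulas are copied from the constructor/apply code in this diff
// (note that invLogA holds 1/log(1+m), the reciprocal of what its one-line
// comment suggests); the parameter values below are arbitrary, hypothetical
// test inputs, not the raycaster's defaults.
#include <cmath>
#include <cstdio>

struct EccMapCheck {
    float m, w_0, S, invLogA, invW_0, invM;
    EccMapCheck(float marSlope, float maxMARDegrees, float maxResolutionDegrees) {
        const float RadiansPerDegree = 3.14159265358979323846f / 180.0f;
        m = marSlope;
        w_0 = maxMARDegrees * RadiansPerDegree;
        S = maxResolutionDegrees * RadiansPerDegree;
        invLogA = float(1.0 / std::log(1.0 + double(m)));
        invW_0 = 1.0f / w_0;
        invM = 1.0f / m;
    }
    // ring coordinate -> eccentricity (radians): linear inside S, then hyperbolic falloff
    float apply(float i) const {
        float e0 = i * w_0;
        float x = (e0 - S) * invW_0;
        float g_x = (w_0 * (std::pow(1.0f + m, x) - 1.0f)) * invM;
        return (e0 < S) ? e0 : g_x + S;
    }
    // eccentricity (radians) -> ring coordinate
    float applyInverse(float E) const {
        float i0 = E * invW_0;
        float sDivW = S * invW_0;
        float x = std::log(m * (i0 - sDivW) + 1.0f) * invLogA;
        return (E < S) ? i0 : x + sDivW;
    }
};

int main() {
    EccMapCheck map(0.02f, 1.0f / 60.0f, 5.0f); // hypothetical slope / MAR / falloff-start values
    float worst = 0.0f;
    for (float i = 0.0f; i <= 600.0f; i += 7.0f) {
        float roundTrip = map.applyInverse(map.apply(i));
        worst = std::fmax(worst, std::fabs(roundTrip - i));
    }
    std::printf("worst round-trip error: %g rings\n", worst);
    // Continuity at the switch point: ring coordinate S/w_0 maps to eccentricity S.
    float iSwitch = map.S * map.invW_0;
    std::printf("E(S/w_0) = %f rad, S = %f rad\n", map.apply(iSwitch), map.S);
    return 0;
}
// ---------------------------------------------------------------------------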
*/ -#include "traversal.h" #include "constants_math.h" #include "cuda_decl.h" #include "foveated.h" @@ -18,40 +17,36 @@ #include "kernel_constants.h" #include "memory_helpers.h" #include "sort.h" +#include "traversal.h" #include "vector_math.h" #include "warp_ops.h" namespace hvvr { -// TODO(mmara): Handle beyond single point of origin rays -void ComputeEyeSpaceFrusta(const GPUBuffer& dirSamples, +void ComputeEyeSpaceFrusta(const GPUBuffer& dirSamples, GPUBuffer& tileFrusta, GPUBuffer& blockFrusta) { - DynamicArray samples = makeDynamicArray(dirSamples); + DynamicArray samples = makeDynamicArray(dirSamples); DynamicArray tFrusta = makeDynamicArray(tileFrusta); DynamicArray bFrusta = makeDynamicArray(blockFrusta); + const bool checkFrustaAccuracy = false; + const bool printStats = false; + auto generateFrusta = [](DynamicArray& frusta, unsigned int frustaSampleCount, - const DynamicArray& samples, float slopFactor, - int numOrientationsToTry) { - auto toDir = [](const matrix3x3& rot, float u, float v) { - return rot * normalize(vector3(u, v, 1.0f)); - }; + const DynamicArray& samples, float slopFactor, int numOrientationsToTry) { + auto toDir = [](const matrix3x3& rot, float u, float v) { return rot * normalize(vector3(u, v, 1.0f)); }; for (int i = 0; i < frusta.size(); ++i) { int sBegin = i * frustaSampleCount; int sEnd = min((int)((i + 1) * frustaSampleCount), (int)samples.size()); vector3 dominantDirection(0.0f); for (int s = sBegin; s < sEnd; ++s) { - // printf("samples[%d].center : %f, %f, %f\n", s, samples[s].center.x, samples[s].center.y, - // samples[s].center.z); - dominantDirection += samples[s].center; + dominantDirection += samples[s].centerRay; } dominantDirection = normalize(dominantDirection); - // printf("Dominant Direction %d: %f, %f, %f\n", i, dominantDirection.x, dominantDirection.y, - // dominantDirection.z); - // Try several different orientations for the plane, pick the one that + // Try several different orientations for the plane, pick the one that // gives the smallest bounding box in uv space matrix3x3 rot(matrix3x3::rotationFromZAxis(dominantDirection)); float bestUVArea = INFINITY; @@ -59,22 +54,23 @@ void ComputeEyeSpaceFrusta(const GPUBuffer& dirSampl vector2 bestMinUV = vector2(INFINITY); vector2 bestMaxUV = vector2(-INFINITY); for (int o = 0; o < numOrientationsToTry; ++o) { - matrix3x3 currRot = matrix3x3::axisAngle(vector3(0, 0, 1), (Pi * o / float(numOrientationsToTry))) * rot; + const float range = (Pi / 2.0f) * 0.8f; + matrix3x3 currRot = + matrix3x3::axisAngle(vector3(0, 0, 1), (range * o / float(numOrientationsToTry)) - (range / 2.0f)) * + rot; matrix3x3 invCurrRot = invert(currRot); vector2 minUV = vector2(INFINITY); vector2 maxUV = vector2(-INFINITY); for (int s = sBegin; s < sEnd; ++s) { - vector3 v = invCurrRot * samples[s].center; + vector3 v = invCurrRot * samples[s].centerRay; vector2 uv = vector2(v.x / v.z, v.y / v.z); - // TODO: check math here - v = invCurrRot * samples[s].d1; + v = invCurrRot * (samples[s].du + samples[s].centerRay); float uvRadius = length(uv - vector2(v.x / v.z, v.y / v.z)); - v = invCurrRot * samples[s].d2; + v = invCurrRot * (samples[s].dv + samples[s].centerRay); uvRadius = max(uvRadius, length(uv - vector2(v.x / v.z, v.y / v.z))); // slop; TODO: is this necessary, or can we do something more principled? 
uvRadius *= slopFactor; - minUV = min(minUV, uv - uvRadius); maxUV = max(maxUV, uv + uvRadius); } @@ -96,147 +92,44 @@ void ComputeEyeSpaceFrusta(const GPUBuffer& dirSampl f.directions[2] = toDir(bestRot, bestMaxUV.x, bestMinUV.y); f.directions[3] = toDir(bestRot, bestMinUV.x, bestMinUV.y); -#if 0 - // Make sure all samples points are within the frustum... - RayPacketFrustum3D checker(f); - for (int s = sBegin; s < sEnd; ++s) { - vector4 v = toVec(samples[s].center); - if (!checker.testPoint(v)) { - printf("TROUBLE!\n"); + if (printStats) { + for (int o = 0; o < 4; ++o) { + printf("f[%d].directions[%d]: %f, %f, %f\n", i, o, f.directions[o].x, f.directions[o].y, + f.directions[o].z); } + printf("f[%d].bestUVArea: %f\n", i, bestUVArea); + printf("Dominant Direction: %f %f %f\n", dominantDirection.x, dominantDirection.y, dominantDirection.z); } -#endif - - for (int o = 0; o < 4; ++o) { - printf("f[%d].directions[%d]: %f, %f, %f\n", i, o, f.directions[o].x, f.directions[o].y, - f.directions[o].z); + if (checkFrustaAccuracy) { + // Make sure all samples points are within the frustum... + RayPacketFrustum3D checker(f); + for (int s = sBegin; s < sEnd; ++s) { + auto C = samples[s].centerRay; + if (!checker.testPoint(C)) { + printf("TROUBLE: f[%d]: s[%d]:%f %f %f \n", i, s, C.x, C.y, C.z); + } + } } - printf("f[%d].bestUVArea: %f\n", i, bestUVArea); frusta[i] = f; } }; - generateFrusta(tFrusta, TILE_SIZE, samples, 4.0f, 64); - generateFrusta(bFrusta, BLOCK_SIZE, samples, 4.0f, 64); + generateFrusta(tFrusta, TILE_SIZE, samples, 2.0f, 63); + generateFrusta(bFrusta, BLOCK_SIZE, samples, 2.0f, 63); tileFrusta = makeGPUBuffer(tFrusta); blockFrusta = makeGPUBuffer(bFrusta); } -CUDA_KERNEL void CalculateSampleCullFrustaKernel(GPURayPacketFrustum* d_blockFrusta, - GPURayPacketFrustum* d_tileFrusta, - SampleInfo sampleInfo, - const uint32_t sampleCount) { - uint32_t index = blockIdx.x * blockDim.x + threadIdx.x; - uint32_t tileIndex, blockIndex; - float minLocX = CUDA_INF; - float minLocY = CUDA_INF; - float negMaxLocX = CUDA_INF; - float negMaxLocY = CUDA_INF; - if (index < sampleCount) { - tileIndex = index / TILE_SIZE; - blockIndex = index / BLOCK_SIZE; - // TODO: see if we can cut down on work here - UnpackedSample s = GetFullSample(index, sampleInfo); - vector2 location = s.center; - float radius = sampleInfo.extents[index].majorAxisLength; - if (location.x != CUDA_INF) { - minLocX = location.x - radius; - minLocY = location.y - radius; - negMaxLocX = -(location.x + radius); - negMaxLocY = -(location.y + radius); - } - } - // Do warp reduction - auto minOp = [](float a, float b) -> float { return min(a, b); }; - minLocX = warpReduce(minLocX, minOp); - minLocY = warpReduce(minLocY, minOp); - negMaxLocX = warpReduce(negMaxLocX, minOp); - negMaxLocY = warpReduce(negMaxLocY, minOp); - int lane = threadIdx.x % WARP_SIZE; - // All min values are in lane 0, go ahead and atomic min the results - if (lane == 0 && (index < sampleCount)) { - // No native float atomics, so we need to encode to handle our floats - atomicMin((uint32_t*)(&d_tileFrusta[tileIndex].xMin), FloatFlipF(minLocX)); - atomicMin((uint32_t*)(&d_tileFrusta[tileIndex].yMin), FloatFlipF(minLocY)); - atomicMin((uint32_t*)(&d_tileFrusta[tileIndex].xNegMax), FloatFlipF(negMaxLocX)); - atomicMin((uint32_t*)(&d_tileFrusta[tileIndex].yNegMax), FloatFlipF(negMaxLocY)); - - atomicMin((uint32_t*)(&d_blockFrusta[blockIndex].xMin), FloatFlipF(minLocX)); - atomicMin((uint32_t*)(&d_blockFrusta[blockIndex].yMin), FloatFlipF(minLocY)); - 
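// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] ComputeEyeSpaceFrusta
// above fits each tile/block frustum by trying a number of orientations around
// the dominant direction and keeping the one whose projected UV bounding box
// has the smallest area. The sketch below shows the same brute-force
// "scan rotations, keep the minimum-area axis-aligned box" idea in 2D, which
// is the core of that fitting step (the real code works on projected ray
// directions and pads by a slop radius for the differentials).
#include <cmath>
#include <cstdio>
#include <vector>

struct vec2 { float x, y; };

// Area of the axis-aligned bounding box of pts after rotating them by angle.
static float rotatedBoundsArea(const std::vector<vec2>& pts, float angle) {
    float c = std::cos(angle), s = std::sin(angle);
    float minX = INFINITY, minY = INFINITY, maxX = -INFINITY, maxY = -INFINITY;
    for (const vec2& p : pts) {
        float x = c * p.x - s * p.y;
        float y = s * p.x + c * p.y;
        minX = std::fmin(minX, x); maxX = std::fmax(maxX, x);
        minY = std::fmin(minY, y); maxY = std::fmax(maxY, y);
    }
    return (maxX - minX) * (maxY - minY);
}

int main() {
    // An elongated, tilted point cloud: the best box is clearly not axis aligned.
    std::vector<vec2> pts;
    for (int i = 0; i < 100; ++i) {
        float t = i * 0.1f;
        pts.push_back({t, 0.5f * t + 0.05f * std::sin(7.0f * t)});
    }
    const int numOrientationsToTry = 63;    // same spirit as the kernel's 63 trial rotations
    const float range = 3.14159265f * 0.5f; // bounding boxes repeat every 90 degrees
    float bestAngle = 0.0f, bestArea = INFINITY;
    for (int o = 0; o < numOrientationsToTry; ++o) {
        float angle = range * o / float(numOrientationsToTry) - range * 0.5f;
        float area = rotatedBoundsArea(pts, angle);
        if (area < bestArea) { bestArea = area; bestAngle = angle; }
    }
    std::printf("axis-aligned area %.3f, best rotated area %.3f at %.1f deg\n",
                rotatedBoundsArea(pts, 0.0f), bestArea, bestAngle * 180.0f / 3.14159265f);
    return 0;
}
// ---------------------------------------------------------------------------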
atomicMin((uint32_t*)(&d_blockFrusta[blockIndex].xNegMax), FloatFlipF(negMaxLocX)); - atomicMin((uint32_t*)(&d_blockFrusta[blockIndex].yNegMax), FloatFlipF(negMaxLocY)); - } -} -CUDA_KERNEL void DecodeSampleCullFrustaKernel(GPURayPacketFrustum* d_blockFrusta, - uint32_t blockCount, - GPURayPacketFrustum* d_tileFrusta, - uint32_t tileCount) { - const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < blockCount + tileCount) { - uint32_t* ptr; - if (index < blockCount) { - ptr = (uint32_t*)(&d_blockFrusta[index]); - } else { - ptr = (uint32_t*)(&d_tileFrusta[index - blockCount]); - } - for (int i = 0; i < 4; ++i) { - ptr[i] = IFloatFlip(ptr[i]); - } - } -} - -CUDA_KERNEL void ResetCullFrustaKernel(GPURayPacketFrustum* d_blockFrusta, - uint32_t blockCount, - GPURayPacketFrustum* d_tileFrusta, - uint32_t tileCount) { - const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < blockCount + tileCount) { - GPURayPacketFrustum* ptr; - if (index < blockCount) { - ptr = &d_blockFrusta[index]; - } else { - ptr = &d_tileFrusta[index - blockCount]; - } - // Set to -INFINITY, as its the FlipFloat encoding of +INFINITY - ptr[0].xMin = -CUDA_INF; - ptr[0].xNegMax = -CUDA_INF; - ptr[0].yMin = -CUDA_INF; - ptr[0].yNegMax = -CUDA_INF; - } -} -void ResetCullFrusta(GPURayPacketFrustum* d_blockFrusta, - GPURayPacketFrustum* d_tileFrusta, - const uint32_t tileCount, - const uint32_t blockCount, - cudaStream_t stream) { - { - size_t combinedBlockAndTileCount = tileCount + blockCount; - KernelDim dim = KernelDim(combinedBlockAndTileCount, CUDA_GROUP_SIZE); - ResetCullFrustaKernel<<>>(d_blockFrusta, blockCount, d_tileFrusta, tileCount); - } -} - -void CalculateSampleCullFrusta(GPURayPacketFrustum* d_blockFrusta, - GPURayPacketFrustum* d_tileFrusta, - SampleInfo sampleInfo, - const uint32_t sampleCount, - const uint32_t tileCount, - const uint32_t blockCount, - cudaStream_t stream) { - static_assert((TILE_SIZE % 32 == 0), "TILE_SIZE must be a multiple of 32, the CUDA warp size"); - { - KernelDim dim = KernelDim(sampleCount, CUDA_GROUP_SIZE); - CalculateSampleCullFrustaKernel<<>>(d_blockFrusta, d_tileFrusta, sampleInfo, - sampleCount); - } - { - size_t combinedBlockAndTileCount = tileCount + blockCount; - KernelDim dim = KernelDim(combinedBlockAndTileCount, CUDA_GROUP_SIZE); - DecodeSampleCullFrustaKernel<<>>(d_blockFrusta, blockCount, d_tileFrusta, - tileCount); +CUDA_HOST_DEVICE_INL bool planeCullsFrustum(const Plane plane, const SimpleRayFrustum& frustum) { + bool allout = true; + for (int i = 0; i < 4; ++i) { + allout = allout && dot(plane.normal, frustum.origins[i]) > plane.dist; + // Extend rays far-out + allout = allout && dot(plane.normal, frustum.origins[i] + frustum.directions[i] * 10000.0f) > plane.dist; } + return allout; } CUDA_KERNEL void CalculateWorldSpaceFrustaKernel(SimpleRayFrustum* blockFrustaWS, @@ -244,12 +137,12 @@ CUDA_KERNEL void CalculateWorldSpaceFrustaKernel(SimpleRayFrustum* blockFrustaWS SimpleRayFrustum* blockFrustaES, SimpleRayFrustum* tileFrustaES, matrix4x4 eyeToWorldMatrix, + FourPlanes cullPlanes, uint32_t blockCount, uint32_t tileCount) { const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < blockCount + tileCount) { - SimpleRayFrustum* inPtr; - SimpleRayFrustum* outPtr; + SimpleRayFrustum *inPtr, *outPtr; if (index < blockCount) { inPtr = (&blockFrustaES[index]); outPtr = (&blockFrustaWS[index]); @@ -257,19 +150,25 @@ CUDA_KERNEL void CalculateWorldSpaceFrustaKernel(SimpleRayFrustum* blockFrustaWS inPtr = 
(&tileFrustaES[index - blockCount]); outPtr = (&tileFrustaWS[index - blockCount]); } + bool culled = false; + SimpleRayFrustum f; + for (int i = 0; i < 4; ++i) { + f.origins[i] = vector3(eyeToWorldMatrix * vector4((*inPtr).origins[i], 1.0f)); + // We do not handle non-uniform scaling (could use inverse transpose of eyeToWorld to do so) + f.directions[i] = normalize(vector3(eyeToWorldMatrix * vector4((*inPtr).directions[i], 0.0f))); + } for (int i = 0; i < 4; ++i) { - vector4 origin = - eyeToWorldMatrix * vector4((*inPtr).origins[i].x, (*inPtr).origins[i].y, (*inPtr).origins[i].z, 1.0f); - (*outPtr).origins[i].x = origin.x; - (*outPtr).origins[i].y = origin.y; - (*outPtr).origins[i].z = origin.z; - // TODO(mmara): use inverse transpose to handle non-uniform scale? - vector4 direction = eyeToWorldMatrix * vector4((*inPtr).directions[i].x, (*inPtr).directions[i].y, - (*inPtr).directions[i].z, 0.0f); - (*outPtr).directions[i].x = direction.x; - (*outPtr).directions[i].y = direction.y; - (*outPtr).directions[i].z = direction.z; + Plane p = cullPlanes.data[i]; + culled = culled || planeCullsFrustum(p, f); + } + if (culled) { + for (int i = 0; i < 4; ++i) { + // Signal degenerate frustum + f.origins[i] = vector3(INFINITY, INFINITY, INFINITY); + f.directions[i] = vector3(0, 0, 0); + } } + (*outPtr) = f; } } @@ -278,6 +177,7 @@ void CalculateWorldSpaceFrusta(SimpleRayFrustum* blockFrustaWS, SimpleRayFrustum* blockFrustaES, SimpleRayFrustum* tileFrustaES, matrix4x4 eyeToWorldMatrix, + FourPlanes cullPlanes, uint32_t blockCount, uint32_t tileCount, cudaStream_t stream) { @@ -285,7 +185,7 @@ void CalculateWorldSpaceFrusta(SimpleRayFrustum* blockFrustaWS, size_t combinedBlockAndTileCount = tileCount + blockCount; KernelDim dim = KernelDim(combinedBlockAndTileCount, CUDA_GROUP_SIZE); CalculateWorldSpaceFrustaKernel<<>>( - blockFrustaWS, tileFrustaWS, blockFrustaES, tileFrustaES, eyeToWorldMatrix, blockCount, tileCount); + blockFrustaWS, tileFrustaWS, blockFrustaES, tileFrustaES, eyeToWorldMatrix, cullPlanes, blockCount, tileCount); } } // namespace hvvr diff --git a/libraries/hvvr/raycaster/frusta.h b/libraries/hvvr/raycaster/frusta.h index b211bfb..ece8ad7 100644 --- a/libraries/hvvr/raycaster/frusta.h +++ b/libraries/hvvr/raycaster/frusta.h @@ -17,29 +17,21 @@ namespace hvvr { -void ComputeEyeSpaceFrusta(const GPUBuffer& dirSamples, +// For conveniently passing four planes by value to the world space transformation kernel. 
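// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] The new cullPlanes
// path above marks a whole tile/block frustum as degenerate when all four of
// its corner rays lie entirely on the far side of any single cull plane. The
// sketch below restates planeCullsFrustum in plain host C++ with a tiny test
// case; as in the kernel, each ray is approximated conservatively by its
// origin plus a far point 10000 units along the direction.
#include <cstdio>

struct vec3 { float x, y, z; };
static float dot(vec3 a, vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
static vec3 madd(vec3 o, vec3 d, float t) { return {o.x + d.x * t, o.y + d.y * t, o.z + d.z * t}; }

struct Plane { vec3 normal; float dist; };           // "outside" when dot(normal, p) > dist
struct SimpleRayFrustum { vec3 origins[4]; vec3 directions[4]; };

static bool planeCullsFrustum(const Plane& plane, const SimpleRayFrustum& f) {
    bool allout = true;
    for (int i = 0; i < 4; ++i) {
        allout = allout && dot(plane.normal, f.origins[i]) > plane.dist;
        allout = allout && dot(plane.normal, madd(f.origins[i], f.directions[i], 10000.0f)) > plane.dist;
    }
    return allout; // true only if every representative point is outside this plane
}

int main() {
    // Frustum of four parallel rays starting at z = 1 and heading toward +z.
    SimpleRayFrustum f;
    for (int i = 0; i < 4; ++i) {
        f.origins[i] = {float(i % 2), float(i / 2), 1.0f};
        f.directions[i] = {0.0f, 0.0f, 1.0f};
    }
    Plane behind = {{0.0f, 0.0f, 1.0f}, 0.5f};  // outside means z > 0.5: culls f
    Plane ahead = {{0.0f, 0.0f, -1.0f}, -0.5f}; // outside means z < 0.5: does not cull f
    std::printf("behind culls: %d, ahead culls: %d\n", int(planeCullsFrustum(behind, f)),
                int(planeCullsFrustum(ahead, f)));
    return 0;
}
// ---------------------------------------------------------------------------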
+struct FourPlanes { + Plane data[4]; +}; + +void ComputeEyeSpaceFrusta(const GPUBuffer& dirSamples, GPUBuffer& tileFrusta, GPUBuffer& blockFrusta); -void ResetCullFrusta(GPURayPacketFrustum* d_blockFrusta, - GPURayPacketFrustum* d_tileFrusta, - const uint32_t tileCount, - const uint32_t blockCount, - cudaStream_t stream); - -void CalculateSampleCullFrusta(GPURayPacketFrustum* d_blockFrusta, - GPURayPacketFrustum* d_tileFrusta, - SampleInfo sampleInfo, - const uint32_t sampleCount, - const uint32_t tileCount, - const uint32_t blockCount, - cudaStream_t stream); - void CalculateWorldSpaceFrusta(SimpleRayFrustum* blockFrustaWS, SimpleRayFrustum* tileFrustaWS, SimpleRayFrustum* blockFrustaES, SimpleRayFrustum* tileFrustaES, matrix4x4 eyeToWorldMatrix, + FourPlanes cullPlanes, uint32_t blockCount, uint32_t tileCount, cudaStream_t stream); diff --git a/libraries/hvvr/raycaster/gbuffer.h b/libraries/hvvr/raycaster/gbuffer.h deleted file mode 100644 index 660b24d..0000000 --- a/libraries/hvvr/raycaster/gbuffer.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -/** - * Copyright (c) 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. - */ - -#include "cuda_decl.h" - -#include - -namespace hvvr { - -// TODO(anankervis): merge this file into another file -struct RaycasterGBufferSubsample { - uint32_t triIndex; - uint32_t sampleMask; - - template - CUDA_HOST_DEVICE static constexpr uint32_t getSampleMaskAll() { - return (AARate < 32) ? ((uint32_t(1) << AARate) - 1) : ~uint32_t(0); - } -}; - -} // namespace hvvr diff --git a/libraries/hvvr/raycaster/gpu_camera.cu b/libraries/hvvr/raycaster/gpu_camera.cu index b236313..fe8d977 100644 --- a/libraries/hvvr/raycaster/gpu_camera.cu +++ b/libraries/hvvr/raycaster/gpu_camera.cu @@ -14,18 +14,10 @@ #include "kernel_constants.h" #include "magic_constants.h" #include "memory_helpers.h" - -#include -#include - -#ifdef DX_SUPPORTED #include -#else -struct ID3D11Texture2D; -struct ID3D11Buffer; -#endif - #include +#include +#include namespace hvvr { @@ -34,6 +26,8 @@ uint32_t pixelFormatSize(PixelFormat pixelFormat) { switch (pixelFormat) { case PixelFormat::RGBA8_SRGB: return 4; + case PixelFormat::RGBA16: + return 8; case PixelFormat::RGBA32F: return 16; default: @@ -66,7 +60,8 @@ GPUCamera::GPUCamera(const Camera* cameraPtr) : streamedIndexCPU(0), streamedInd void GPUCamera::initLookupTables(int _MSAARate) { // getSubsampleUnitOffset needs a compile-time constant for MSAARate enum { MSAARate = COLOR_MODE_MSAA_RATE }; - assert(MSAARate == _MSAARate); + if (MSAARate != _MSAARate) + fail("MSAARate for lookup table must match compile-time constant\n"); std::uniform_real_distribution uniformRandomDist(0.0f, 1.0f); std::mt19937 generator; @@ -132,28 +127,23 @@ void GPUCamera::streamedDataGpuDone() { } void GPUCamera::setCameraJitter(vector2 jitter) { - frameJitter.x = jitter.x; - frameJitter.y = jitter.y; + frameJitter = jitter; } -void GPUCamera::updatePerFrame(vector3 cameraPos, - vector3 cameraLookVector, - const matrix3x3& _sampleToCamera, - const matrix4x4& _cameraToWorld) { - position = vector3(cameraPos.x, cameraPos.y, cameraPos.z); - lookVector = vector3(cameraLookVector.x, cameraLookVector.y, cameraLookVector.z); - sampleToCamera = _sampleToCamera; +void GPUCamera::updateTransform(const matrix4x4& _cameraToWorld) { 
cameraToWorld = _cameraToWorld; } -static int getMSAARate(RaycasterOutputMode outputMode) { - return (outputMode == RaycasterOutputMode::COLOR_RGBA8) ? COLOR_MODE_MSAA_RATE : 1; +static int getMSAARate(RaycasterOutputFormat outputMode) { + return (outputMode == RaycasterOutputFormat::COLOR_RGBA8) ? COLOR_MODE_MSAA_RATE : 1; } static TextureFormat pixelFormatToTextureFormat(PixelFormat format) { switch (format) { case PixelFormat::RGBA8_SRGB: return TextureFormat::r8g8b8a8_unorm_srgb; + case PixelFormat::RGBA16: + return TextureFormat::r16g16b16a16_unorm; case PixelFormat::RGBA32F: return TextureFormat::r32g32b32a32_float; default: @@ -163,10 +153,9 @@ static TextureFormat pixelFormatToTextureFormat(PixelFormat format) { } // TODO(anankervis): merge the different functions that duplicate camera resource creation -void GPUCamera::updateConfig(RaycasterOutputMode _outputMode, +void GPUCamera::updateConfig(RaycasterOutputFormat _outputMode, int32_t* sampleRemap, - float* sampleLocations, - Sample::Extents* sampleExtents, + DirectionalBeam* directionalSamples, ThinLens _lens, uint32_t _sampleCount, uint32_t imageWidth, @@ -180,9 +169,7 @@ void GPUCamera::updateConfig(RaycasterOutputMode _outputMode, validSampleCount = imageWidth * imageHeight * splitColorSamples; d_sampleRemap = GPUBuffer(sampleRemap, sampleRemap + validSampleCount); sampleCount = _sampleCount; - d_sampleLocations = GPUBuffer((vector2*)sampleLocations, (vector2*)(sampleLocations) + sampleCount); - d_sampleExtents = - GPUBuffer((Sample::Extents*)sampleExtents, (Sample::Extents*)(sampleExtents) + sampleCount); + d_directionalBeams = GPUBuffer(directionalSamples, directionalSamples + sampleCount); outputMode = _outputMode; int msaaRate = getMSAARate(outputMode); @@ -191,17 +178,16 @@ void GPUCamera::updateConfig(RaycasterOutputMode _outputMode, PixelFormat outputFormat = outputModeToPixelFormat(outputMode); TextureFormat textureFormat = pixelFormatToTextureFormat(outputFormat); - previousResultTexture = - createEmptyTexture(imageWidth, imageHeight, textureFormat, cudaAddressModeClamp, cudaAddressModeClamp); - resultTexture = - createEmptyTexture(imageWidth, imageHeight, textureFormat, cudaAddressModeClamp, cudaAddressModeClamp); + auto createImageSizedTexture = [&]() { + return createEmptyTexture(imageWidth, imageHeight, textureFormat, cudaAddressModeClamp, cudaAddressModeClamp); + }; + previousResultTexture = createImageSizedTexture(); + resultTexture = createImageSizedTexture(); contrastEnhancementSettings.enable = true; - contrastEnhancementSettings.f_e = 0.2f; - contrastEnhancementBuffers.horizontallyFiltered = - createEmptyTexture(imageWidth, imageHeight, textureFormat, cudaAddressModeClamp, cudaAddressModeClamp); - contrastEnhancementBuffers.fullyFiltered = - createEmptyTexture(imageWidth, imageHeight, textureFormat, cudaAddressModeClamp, cudaAddressModeClamp); + contrastEnhancementSettings.f_e = 1.0f; + contrastEnhancementBuffers.horizontallyFiltered = createImageSizedTexture(); + contrastEnhancementBuffers.fullyFiltered = createImageSizedTexture(); auto pixelFormat = outputModeToPixelFormat(outputMode); d_sampleResults = @@ -214,24 +200,19 @@ void GPUCamera::updateConfig(RaycasterOutputMode _outputMode, void GPUCamera::registerPolarFoveatedSamples(const std::vector& polarRemapToPixel, float _maxEccentricityRadians, - const std::vector& ringEccentricities, - const std::vector& eccentricityCoordinateMap, + const EccentricityMap& eMap, uint32_t samplesPerRing, uint32_t paddedSampleCount) { PixelFormat outputFormat = 
outputModeToPixelFormat(outputMode); sampleCount = paddedSampleCount; d_sampleResults = GPUBuffer((paddedSampleCount * pixelFormatSize(outputFormat) + sizeof(uint32_t) - 1) / sizeof(uint32_t)); - d_sampleLocations = GPUBuffer(paddedSampleCount); - d_sampleExtents = GPUBuffer(paddedSampleCount); d_sampleRemap = GPUBuffer(paddedSampleCount); // For temporal filtering d_tMaxBuffer = GPUBuffer(paddedSampleCount); - + eccentricityMap = eMap; maxEccentricityRadians = _maxEccentricityRadians; - d_eccentricityCoordinateMap = makeGPUBuffer(eccentricityCoordinateMap); - d_ringEccentricities = makeGPUBuffer(ringEccentricities); int msaaRate = getMSAARate(outputMode); size_t totalSubsampleCount = paddedSampleCount * msaaRate; @@ -243,15 +224,15 @@ void GPUCamera::registerPolarFoveatedSamples(const std::vector& polar TextureFormat textureFormat = pixelFormatToTextureFormat(outputFormat); - polarFoveatedImage = createEmptyTexture(samplesPerRing, uint32_t(polarRemapToPixel.size() / samplesPerRing), - textureFormat, cudaAddressModeWrap, cudaAddressModeClamp); - previousPolarFoveatedImage = createEmptyTexture(samplesPerRing, uint32_t(polarRemapToPixel.size() / samplesPerRing), - textureFormat, cudaAddressModeWrap, cudaAddressModeClamp); - rawPolarFoveatedImage = createEmptyTexture(samplesPerRing, uint32_t(polarRemapToPixel.size() / samplesPerRing), - textureFormat, cudaAddressModeWrap, cudaAddressModeClamp, false); - polarFoveatedDepthImage = - createEmptyTexture(samplesPerRing, uint32_t(polarRemapToPixel.size() / samplesPerRing), - TextureFormat::r32_float, cudaAddressModeWrap, cudaAddressModeClamp, false); + uint32_t ringCount = uint32_t(polarRemapToPixel.size() / samplesPerRing); + auto createFoveatedImage = [&](TextureFormat format, bool linearFilter = true) { + return createEmptyTexture(samplesPerRing, ringCount, format, cudaAddressModeWrap, cudaAddressModeClamp, + linearFilter); + }; + polarTextures.raw = createFoveatedImage(textureFormat); + polarTextures.depth = createFoveatedImage(TextureFormat::r32_float, false); + polarTextures.moment1 = createFoveatedImage(TextureFormat::r16g16b16a16_unorm); + polarTextures.moment2 = createFoveatedImage(TextureFormat::r16g16b16a16_unorm); initLookupTables(msaaRate); } @@ -262,14 +243,15 @@ bool GPUCamera::bindTexture(GPUContext& gpuContext, ImageResourceDescriptor text cutilSafeCall(cudaGraphicsUnregisterResource(resultsResource)); resultsResource = nullptr; } -#ifdef DX_SUPPORTED if (texture.memoryType == ImageResourceDescriptor::MemoryType::DX_TEXTURE) { +#if defined(_WIN32) // cudaGraphicsRegisterFlagsNone is only valid flag as of 7/22/2016 cutilSafeCall(cudaGraphicsD3D11RegisterResource(&resultsResource, (ID3D11Texture2D*)texture.data, cudaGraphicsRegisterFlagsNone)); - } +#else + assert(false, "Cannot do DirectX interop on non-windows platforms"); #endif - if (texture.memoryType == ImageResourceDescriptor::MemoryType::OPENGL_TEXTURE) { + } else if (texture.memoryType == ImageResourceDescriptor::MemoryType::OPENGL_TEXTURE) { cutilSafeCall(cudaGraphicsGLRegisterImage(&resultsResource, (GLuint)(uint64_t)texture.data, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard)); } @@ -296,7 +278,7 @@ void GPUCamera::copyImageToCPU(uint32_t* imageData, uint32_t imageWidth, uint32_ void GPUCamera::acquireTileCullData(SimpleRayFrustum* tileFrusta, SimpleRayFrustum* blockFrusta) { cutilSafeCall(cudaEventSynchronize(transferTileToCPUEvent)); - size_t blockCount = d_cullBlockFrusta.size(); + size_t blockCount = d_foveatedWorldSpaceBlockFrusta.size(); memcpy(blockFrusta, 
foveatedWorldSpaceBlockFrustaPinned, sizeof(SimpleRayFrustum) * blockCount); memcpy(tileFrusta, foveatedWorldSpaceTileFrustaPinned, sizeof(SimpleRayFrustum) * blockCount * TILES_PER_BLOCK); } @@ -316,10 +298,10 @@ void GPUCamera::intersectShadeResolve(GPUSceneState& sceneState) { clearEmpty(); } - SampleInfo sampleInfo(*this); + CameraBeams cameraBeams(*this); if (streamedData.tileCountOccupied > 0) { - intersect(sceneState, sampleInfo); - shadeAndResolve(sceneState, sampleInfo); + intersect(sceneState, cameraBeams); + shadeAndResolve(sceneState, cameraBeams); } streamedDataGpuDone(); diff --git a/libraries/hvvr/raycaster/gpu_camera.h b/libraries/hvvr/raycaster/gpu_camera.h index 18557f7..e321504 100644 --- a/libraries/hvvr/raycaster/gpu_camera.h +++ b/libraries/hvvr/raycaster/gpu_camera.h @@ -10,24 +10,36 @@ */ #include "cuda_decl.h" -#include "gbuffer.h" +#include "foveated.h" #include "gpu_buffer.h" #include "gpu_image.h" +#include "gpu_samples.h" #include "raycaster_spec.h" #include "samples.h" #include "texture_internal.h" #include "vector_math.h" -#include "foveated.h" +#include namespace hvvr { class Camera; class GPUContext; class GPUSceneState; -struct SampleInfo; +struct CameraBeams; struct EccentricityToTexCoordMapping; -// Keep in sync with blockcull.h/RayPacketFrustum + +struct RaycasterGBufferSubsample { + uint32_t triIndex; + uint32_t sampleMask; + + template + CUDA_HOST_DEVICE static constexpr uint32_t getSampleMaskAll() { + return (AARate < 32) ? ((uint32_t(1) << AARate) - 1) : ~uint32_t(0); + } +}; + +// Keep in sync with blockcull.h/RayPacketFrustum2D struct GPURayPacketFrustum { float xMin; float xNegMax; @@ -37,9 +49,8 @@ struct GPURayPacketFrustum { struct TemporalFilterSettings { // 1 entirely use current sample - float alpha = 1.f; - float stddevMultiplier = 4.0f; - bool inPolarSpace = false; + float alpha = .05f; + float stddevMultiplier = 1.0f; }; struct TileTriRange { @@ -86,7 +97,7 @@ struct ContrastEnhancementSettings { float f_e; }; -inline PixelFormat outputModeToPixelFormat(RaycasterOutputMode mode) { +inline PixelFormat outputModeToPixelFormat(RaycasterOutputFormat mode) { (void)mode; return PixelFormat::RGBA8_SRGB; } @@ -101,8 +112,7 @@ class GPUCamera { TemporalFilterSettings temporalFilterSettings; GPUBuffer d_gBuffer; - GPUBuffer d_sampleLocations; - GPUBuffer d_sampleExtents; + GPUBuffer d_directionalBeams; GPUBuffer d_sampleRemap; uint32_t splitColorSamples = 1; @@ -126,7 +136,7 @@ class GPUCamera { GPUBuffer d_polarRemapToPixel; - GPUBuffer d_foveatedDirectionalSamples; + GPUBuffer d_foveatedEyeDirectionalSamples; GPUBuffer d_foveatedEyeSpaceTileFrusta; GPUBuffer d_foveatedEyeSpaceBlockFrusta; GPUBuffer d_foveatedWorldSpaceTileFrusta; @@ -134,24 +144,23 @@ class GPUCamera { SimpleRayFrustum* foveatedWorldSpaceTileFrustaPinned = nullptr; SimpleRayFrustum* foveatedWorldSpaceBlockFrustaPinned = nullptr; - GPUBuffer d_tileFrusta; - GPUBuffer d_cullBlockFrusta; - GPURayPacketFrustum* tileFrustaPinned = nullptr; - GPURayPacketFrustum* cullBlockFrustaPinned = nullptr; - - GPUBuffer d_ringEccentricities; - cudaEvent_t transferTileToCPUEvent = nullptr; GPUBuffer d_sampleResults; + GPUBuffer d_sampleResultsRemapped; + GPUBuffer d_foveaMask; uint32_t sampleCount; GPUBuffer d_tMaxBuffer; + GPUImage resultImage; // For polarFoveatedReconstruction - Texture2D rawPolarFoveatedImage; - Texture2D previousPolarFoveatedImage; - Texture2D polarFoveatedImage; - Texture2D polarFoveatedDepthImage; + struct PolarTextures { + Texture2D raw; + Texture2D depth; + + Texture2D 
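// ---------------------------------------------------------------------------
// [Editor's illustrative aside - not part of this patch] getSampleMaskAll in
// the RaycasterGBufferSubsample struct above (moved into gpu_camera.h from the
// deleted gbuffer.h) builds a bitmask with one bit per MSAA subsample, and the
// AARate < 32 guard exists because a 32-bit shift by 32 would be undefined.
// A quick compile-time check of both branches, using a free-function copy of
// the same expression:
#include <cstdint>

template <uint32_t AARate>
constexpr uint32_t sampleMaskAll() {
    return (AARate < 32) ? ((uint32_t(1) << AARate) - 1) : ~uint32_t(0);
}

static_assert(sampleMaskAll<1>() == 0x1u, "single sample -> one bit set");
static_assert(sampleMaskAll<8>() == 0xFFu, "8x MSAA -> low eight bits set");
static_assert(sampleMaskAll<32>() == 0xFFFFFFFFu, "32x takes the all-ones branch");

int main() { return 0; }
// ---------------------------------------------------------------------------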
moment1; + Texture2D moment2; + } polarTextures; // For temporal filtering in polarFoveatedReconstruction Texture2D previousResultTexture; @@ -160,22 +169,18 @@ class GPUCamera { ContrastEnhancementBuffers contrastEnhancementBuffers; ContrastEnhancementSettings contrastEnhancementSettings; + EccentricityMap eccentricityMap; float maxEccentricityRadians; - GPUBuffer d_eccentricityCoordinateMap; cudaGraphicsResource_t resultsResource = NULL; uint32_t validSampleCount = 0; - vector3 position; - vector3 lookVector; - matrix3x3 sampleToCamera; matrix4x4 cameraToWorld; - ThinLens lens; cudaStream_t stream = 0; - RaycasterOutputMode outputMode = RaycasterOutputMode::COLOR_RGBA8; + RaycasterOutputFormat outputMode = RaycasterOutputFormat::COLOR_RGBA8; // how far to jitter the samples this frame. length() < 1 vector2 frameJitter = {0.0f, 0.0f}; @@ -188,20 +193,16 @@ class GPUCamera { // per-frame updates void setCameraJitter(vector2 jitter); - void updatePerFrame(vector3 cameraPos, - vector3 cameraLookVector, - const matrix3x3& _sampleToCamera, - const matrix4x4& _cameraToWorld); + void updateTransform(const matrix4x4& _cameraToWorld); void updatePerFrameFoveatedData(const FloatRect& sampleBounds, const matrix3x3& cameraToSample, const matrix3x3& eyeToCamera, const matrix4x4& eyeToWorld); // sample config updates - void updateConfig(RaycasterOutputMode _outputMode, + void updateConfig(RaycasterOutputFormat _outputMode, int32_t* sampleRemap, - float* sampleLocations, - Sample::Extents* sampleExtents, + DirectionalBeam* directionalSamples, ThinLens _lens, uint32_t _sampleCount, uint32_t imageWidth, @@ -210,13 +211,12 @@ class GPUCamera { uint32_t _splitColorSamples); void registerPolarFoveatedSamples(const std::vector& polarRemapToPixel, float _maxEccentricityRadians, - const std::vector& ringEccentricities, - const std::vector& eccentricityCoordinateMap, + const EccentricityMap& eccentricityMap, uint32_t samplesPerRing, uint32_t paddedSampleCount); - void updateEyeSpaceFoveatedSamples(const ArrayView precomputedDirectionalSamples); + void updateEyeSpaceFoveatedSamples(const ArrayView cameraBeams); - // attach a texture from a 3D API, used when OUTPUT_MODE = OUTPUT_MODE_3D_API + // attach a texture from a 3D API bool bindTexture(GPUContext& gpuContext, ImageResourceDescriptor texture); // GPUContext::graphicsResourcesMapped must be true before calling this function void copyImageToBoundTexture(); @@ -231,6 +231,7 @@ class GPUCamera { // convert from linear post-resolve results buffer to results image void remap(); + void remapPolarFoveated(); void foveatedPolarToScreenSpace(const matrix4x4& eyeToEyePrevious, @@ -241,17 +242,15 @@ class GPUCamera { void dumpRays(std::vector& rays, bool outputScanlineOrder); protected: - // intersect triangles - void intersect(GPUSceneState& sceneState, const SampleInfo& sampleInfo); + void intersect(GPUSceneState& sceneState, const CameraBeams& cameraBeams); // fill empty tiles with default clear value void clearEmpty(); // shade occupied tiles and resolve AA subsamples - void shadeAndResolve(GPUSceneState& sceneState, const SampleInfo& sampleInfo); + void shadeAndResolve(GPUSceneState& sceneState, const CameraBeams& cameraBeams); void getEccentricityMap(EccentricityToTexCoordMapping& map) const; - void foveatedPolarTemporalFilter(const matrix4x4& eyeToEyePrevious); }; #pragma warning(pop) diff --git a/libraries/hvvr/raycaster/gpu_context.cu b/libraries/hvvr/raycaster/gpu_context.cu index 68626a7..48f886e 100644 --- a/libraries/hvvr/raycaster/gpu_context.cu +++ 
b/libraries/hvvr/raycaster/gpu_context.cu @@ -15,28 +15,31 @@ namespace hvvr { -bool GPUContext::cudaInit() { +bool GPUContext::cudaInit(bool forceNonTCC) { int deviceCount = 0; cutilSafeCall(cudaGetDeviceCount(&deviceCount)); int device = 0; -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - cudaDeviceProp deviceProps = {}; - - // if we're on Windows, search for a non-TCC device - for (int n = 0; n < deviceCount; n++) { - cudaGetDeviceProperties(&deviceProps, n); - if (deviceProps.tccDriver == 0) { - device = n; - break; + + if (forceNonTCC) { + cudaDeviceProp deviceProps = {}; + + // if we're on Windows, search for a non-TCC device + for (int n = 0; n < deviceCount; n++) { + cudaGetDeviceProperties(&deviceProps, n); + if (deviceProps.tccDriver == 0) { + device = n; + break; + } } } -#endif cutilSafeCall(cudaSetDevice(device)); uint32_t deviceFlags = 0; deviceFlags |= cudaDeviceMapHost; - if (cudaSuccess != cudaSetDeviceFlags(deviceFlags)) { + auto error = cudaSetDeviceFlags(deviceFlags); + if (cudaSuccess != error) { + fprintf(stderr, "error %d: cuda call failed with %s\n", error, cudaGetErrorString(error)); assert(false); return false; } @@ -108,8 +111,6 @@ void GPUContext::cleanup() { safeCudaEventDestroy(c->transferTileToCPUEvent); safeCudaStreamDestroy(c->stream); - safeCudaFreeHost(c->tileFrustaPinned); - safeCudaFreeHost(c->cullBlockFrustaPinned); safeCudaFreeHost(c->foveatedWorldSpaceTileFrustaPinned); safeCudaFreeHost(c->foveatedWorldSpaceBlockFrustaPinned); } diff --git a/libraries/hvvr/raycaster/gpu_context.h b/libraries/hvvr/raycaster/gpu_context.h index b38101b..85e7538 100644 --- a/libraries/hvvr/raycaster/gpu_context.h +++ b/libraries/hvvr/raycaster/gpu_context.h @@ -24,7 +24,8 @@ class GPUContext { GPUSceneState sceneState; bool graphicsResourcesMapped; - static bool cudaInit(); + // If forceNonTcc is true, select a cuda device with a non-TCC driver. 
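// (TCC-mode devices cannot share resources with D3D/OpenGL on Windows, so interop output needs a
//  WDDM device, while compute-only runs can use any device. Usage sketch, mirroring the raycaster
//  setup elsewhere in this patch: GPUContext::cudaInit(/*forceNonTcc*/ spec.outputTo3DApi); )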
+ static bool cudaInit(bool forceNonTcc); static void cudaCleanup(); GPUContext(); diff --git a/libraries/hvvr/raycaster/gpu_foveated.cu b/libraries/hvvr/raycaster/gpu_foveated.cu index d31a112..1db5922 100644 --- a/libraries/hvvr/raycaster/gpu_foveated.cu +++ b/libraries/hvvr/raycaster/gpu_foveated.cu @@ -13,7 +13,6 @@ #include "frusta.h" #include "gpu_camera.h" #include "gpu_context.h" -#include "gpu_foveated.h" #include "gpu_samples.h" #include "graphics_types.h" #include "kernel_constants.h" @@ -24,36 +23,53 @@ namespace hvvr { -// TODO: move to helper header, make generic; -// potentially optimize down to 4 taps using smart tap placement -CUDA_DEVICE vector4 bicubic(Texture2D tex, vector2 coord) { +template +CUDA_DEVICE void writeSurface(vector4 val, Texture2D tex, unsigned int x, unsigned int y) { + if (PixelFormat == PixelFormat::RGBA32F) { + surf2Dwrite(float4(val), tex.d_surfaceObject, x * sizeof(float4), y); + } else if (PixelFormat == PixelFormat::RGBA16) { + surf2Dwrite(ToColor4Unorm16(val), tex.d_surfaceObject, x * sizeof(uint64_t), y); + } else { + surf2Dwrite(ToColor4Unorm8SRgb(val), tex.d_surfaceObject, x * sizeof(uchar4), y); + } +} + +// 4-tap B-spline, based on http://vec3.ca/bicubic-filtering-in-fewer-taps/ +CUDA_DEVICE vector4 bicubicFast(Texture2D tex, vector2 coord) { vector2 pixCoord = coord * vector2(tex.width, tex.height); vector2 pixCenter = vector2(floorf(pixCoord.x - 0.5f), floorf(pixCoord.y - 0.5f)) + 0.5f; vector2 iDim = vector2(1.0f / tex.width, 1.0f / tex.height); + vector2 one = vector2(1.0f, 1.0f); + // fractionalOffset vector2 f = pixCoord - pixCenter; vector2 f2 = f * f; vector2 f3 = f2 * f; - vector2 omf2 = (vector2(1.0f) - f) * (vector2(1.0f) - f); - vector2 omf3 = omf2 * (vector2(1.0f) - f); + + vector2 omf2 = (one - f) * (one - f); + vector2 omf3 = omf2 * (one - f); float sixth = (1.0f / 6.0f); - // B-spline - vector2 w[4] = {sixth * omf3, sixth * (4.0f + 3.0f * f3 - 6.0f * f2), sixth * (4.0f + 3.0f * omf3 - 6.0f * omf2), - sixth * f3}; + vector2 w0 = sixth * omf3; + vector2 w1 = ((4.0f / 6.0f) * one + 0.5f * f3 - f2); + vector2 w3 = sixth * f3; + vector2 w2 = one - w0 - w1 - w3; - vector2 tc[4] = {pixCenter + vector2(-1), pixCenter, pixCenter + vector2(1), pixCenter + vector2(2)}; + vector2 s0 = w0 + w1; + vector2 s1 = w2 + w3; - vector4 result = vector4(0.0f); - for (int y = 0; y < 4; ++y) { - for (int x = 0; x < 4; ++x) { - result += vector4(tex2D(tex.d_texObject, tc[x].x * iDim.x, tc[y].y * iDim.y)) * w[x].x * w[y].y; - } - } - return result; -} + vector2 f0 = w1 / (w0 + w1); + vector2 f1 = w3 / (w2 + w3); + vector2 t0 = (pixCenter - one + f0) * iDim; + vector2 t1 = (pixCenter + one + f1) * iDim; + + auto T = tex.d_texObject; + // and sample and blend + return vector4(tex2D(T, t0.x, t0.y)) * s0.x * s0.y + vector4(tex2D(T, t1.x, t0.y)) * s1.x * s0.y + + vector4(tex2D(T, t0.x, t1.y)) * s0.x * s1.y + vector4(tex2D(T, t1.x, t1.y)) * s1.x * s1.y; +} CUDA_DEVICE vector2 directionToSampleSpaceSample(const matrix3x3& eyeSpaceToSampleSpaceMatrix, const vector3& direction) { @@ -67,100 +83,65 @@ CUDA_DEVICE_INL bool rectContains(const FloatRect r, const vector2 p) { return (p.x >= r.lower.x) && (p.x <= r.upper.x) && (p.y >= r.lower.y) && (p.y <= r.upper.y); } -CUDA_KERNEL void TransformFoveatedSamplesToSampleSpaceKernel( - const matrix3x3 eyeSpaceToSampleSpaceMatrix, - const matrix3x3 eyeSpaceToCameraSpace, - const FloatRect cullRect, - const PrecomputedDirectionSample* precomputedEyeSpaceSamples, - SampleInfo sampleInfo, - int* remap, - const uint32_t 
sampleCount) { +CUDA_KERNEL void TransformFoveatedSamplesToCameraSpaceKernel(const matrix3x3 eyeSpaceToSampleSpaceMatrix, + const matrix3x3 eyeSpaceToCameraSpace, + const FloatRect cullRect, + const DirectionalBeam* eyeSpaceSamples, + CameraBeams cameraBeams, + int* remap, + const uint32_t sampleCount) { unsigned index = blockIdx.x * blockDim.x + threadIdx.x; if (index < sampleCount) { - vector3 direction = precomputedEyeSpaceSamples[index].center; + DirectionalBeam eyeBeam = eyeSpaceSamples[index]; vector2 c = {CUDA_INF, CUDA_INF}; - if ((eyeSpaceToCameraSpace * direction).z < 0.0f) { - vector2 s = directionToSampleSpaceSample(eyeSpaceToSampleSpaceMatrix, direction); + DirectionalBeam cameraBeam = eyeSpaceToCameraSpace * eyeBeam; + if (cameraBeam.centerRay.z < 0.0f) { + vector2 s = directionToSampleSpaceSample(eyeSpaceToSampleSpaceMatrix, eyeBeam.centerRay); if (rectContains(cullRect, s)) { - c = s; - vector2 d1 = - directionToSampleSpaceSample(eyeSpaceToSampleSpaceMatrix, precomputedEyeSpaceSamples[index].d1) - c; - vector2 d2 = - directionToSampleSpaceSample(eyeSpaceToSampleSpaceMatrix, precomputedEyeSpaceSamples[index].d2) - c; - float sqExtent1 = dot(d1, d1); - float sqExtent2 = dot(d2, d2); - ; - Sample::Extents extent; - if (sqExtent1 > sqExtent2) { - extent.minorAxis = d2; - extent.majorAxisLength = sqrtf(sqExtent1); - } else if (sqExtent2 == 0) { - extent.minorAxis.x = 0; - extent.minorAxis.y = 0; - extent.majorAxisLength = 0; - } else { // sqExtent2 >= sqExtent1, sqExtent2 != 0 - extent.minorAxis = d1; - extent.majorAxisLength = sqrtf(sqExtent2); - } remap[index] = (int)index; - sampleInfo.extents[index] = extent; } } - sampleInfo.centers[index] = c; + cameraBeams.directionalBeams[index] = cameraBeam; } } -void TransformFoveatedSamplesToSampleSpace(const matrix3x3& eyeSpaceToSampleSpaceMatrix, +void TransformFoveatedSamplesToCameraSpace(const matrix3x3& eyeSpaceToSampleSpaceMatrix, const matrix3x3& eyeSpaceToCameraSpaceMatrix, const FloatRect& sampleBounds, - const PrecomputedDirectionSample* d_precomputedEyeSpaceSamples, - SampleInfo sampleInfo, + const DirectionalBeam* d_eyeBeams, + CameraBeams cameraBeams, int* d_remap, const uint32_t sampleCount, cudaStream_t stream) { KernelDim dim = KernelDim(sampleCount, CUDA_GROUP_SIZE); - TransformFoveatedSamplesToSampleSpaceKernel<<>>( - eyeSpaceToSampleSpaceMatrix, eyeSpaceToCameraSpaceMatrix, sampleBounds, d_precomputedEyeSpaceSamples, - sampleInfo, d_remap, sampleCount); + TransformFoveatedSamplesToCameraSpaceKernel<<>>( + eyeSpaceToSampleSpaceMatrix, eyeSpaceToCameraSpaceMatrix, sampleBounds, d_eyeBeams, cameraBeams, d_remap, + sampleCount); } struct EccentricityToTexCoordMapping { - float maxEccentricityRadians; - float* forwardMap; - float forwardMapSize; - float* backwardMap; - float backwardMapSize; + EccentricityMap eccentricityMap; + float texMapSize; + float invTexMapSize; + float invMaxEccentricity; }; void GPUCamera::getEccentricityMap(EccentricityToTexCoordMapping& map) const { - map.maxEccentricityRadians = maxEccentricityRadians; - map.forwardMap = d_eccentricityCoordinateMap; - map.forwardMapSize = (float)d_eccentricityCoordinateMap.size(); - map.backwardMap = (float*)d_ringEccentricities; - map.backwardMapSize = (float)d_ringEccentricities.size(); -} - -CUDA_DEVICE float bilinearRead1D(float coord, float* map, float mapSize) { - float pixelCoord = coord * mapSize - 0.5f; - float integralPart = floor(pixelCoord); - int maxCoord = (int)mapSize - 1; - int lowerCoord = clamp((int)integralPart, 0, maxCoord); - int 
upperCoord = clamp((int)integralPart + 1, 0, maxCoord); - float alpha = pixelCoord - integralPart; - return lerp(map[lowerCoord], map[upperCoord], alpha); + map.eccentricityMap = eccentricityMap; + map.texMapSize = (float)polarTextures.raw.height; + map.invTexMapSize = 1.0f / polarTextures.raw.height; + map.invMaxEccentricity = 1.0f / maxEccentricityRadians; } // eccentricity is in the range [0,maxEccentricityRadians] CUDA_DEVICE float eccentricityToTexCoord(float eccentricity, EccentricityToTexCoordMapping eToTexMap) { - float normalizedE = eccentricity / eToTexMap.maxEccentricityRadians; - return bilinearRead1D(normalizedE, eToTexMap.forwardMap, eToTexMap.forwardMapSize); + return (eToTexMap.eccentricityMap.applyInverse(eccentricity) + 0.5f) * eToTexMap.invTexMapSize; } CUDA_DEVICE vector2 getNormalizedCoord(int x, int y, int width, int height) { return vector2(((float)x + 0.5f) / (float)width, ((float)y + 0.5f) / (float)height); } -// TODO: Canonicalize the negations... // Aligned along z axis CUDA_DEVICE vector3 angularEyeCoordToDirection(float theta, float e) { float z = -cosf(e); @@ -169,7 +150,6 @@ CUDA_DEVICE vector3 angularEyeCoordToDirection(float theta, float e) { return {xy.x, xy.y, z}; } CUDA_DEVICE void eyeSpaceDirectionToAngularEyeCoord(vector3 dir, float& theta, float& eccentricity) { - // TODO: get rid of transcendentals in calculation eccentricity = acosf(-dir.z); // Angle of rotation about z, measured from x theta = -atan2f(dir.y, dir.x); @@ -179,7 +159,7 @@ CUDA_DEVICE void polarTextureCoordToAngularEyeCoord(vector2 coord, EccentricityToTexCoordMapping eToTexMap, float& theta, float& eccentricity) { - eccentricity = bilinearRead1D(coord.y, eToTexMap.backwardMap, eToTexMap.backwardMapSize); + eccentricity = eToTexMap.eccentricityMap.apply(coord.y * eToTexMap.texMapSize - 0.5f); theta = (2.0f * Pi * coord.x) - Pi; } CUDA_DEVICE vector2 angularEyeCoordToPolarTextureCoord(float theta, @@ -207,15 +187,17 @@ CUDA_DEVICE void computeMoments3x3Window( m_2 *= weight; } -CUDA_DEVICE vector4 sqrtf(vector4 v) { - return vector4(v.x, v.y, v.z, v.w); +CUDA_DEVICE vector4 sqrt(vector4 v) { + return vector4(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w)); } -CUDA_DEVICE vector4 clampToNeighborhood( - vector4 oldValue, cudaTextureObject_t tex, vector2 coord, vector2 invDim, TemporalFilterSettings settings) { - vector4 m_1, m_2; - computeMoments3x3Window(tex, coord, invDim, m_1, m_2); - vector4 stdDev = sqrtf(m_2 - (m_1 * m_1)); +CUDA_DEVICE vector4 clampToNeighborhood(vector4 oldValue, + GPUCamera::PolarTextures polarTex, + vector2 coord, + TemporalFilterSettings settings) { + vector4 m_1 = vector4(tex2D(polarTex.moment1.d_texObject, coord.x, coord.y)); + vector4 m_2 = vector4(tex2D(polarTex.moment2.d_texObject, coord.x, coord.y)); + vector4 stdDev = sqrt(m_2 - (m_1 * m_1)); // Arbitrary float scaleFactor = settings.stddevMultiplier; vector4 minC = m_1 - (stdDev * scaleFactor); @@ -224,13 +206,27 @@ CUDA_DEVICE vector4 clampToNeighborhood( } template -CUDA_KERNEL void FoveatedPolarToScreenSpaceKernel(Texture2D polarImage, +CUDA_KERNEL void ComputeMoments(GPUCamera::PolarTextures polarTex) { + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; + + if (i < polarTex.raw.width && j < polarTex.raw.height) { + vector4 m_1, m_2; + vector2 invDim = vector2(1.0f / polarTex.raw.width, 1.0f / polarTex.raw.height); + vector2 coord = vector2(invDim.x * i, invDim.y * j); + computeMoments3x3Window(polarTex.raw.d_texObject, coord, invDim, m_1, 
m_2); + writeSurface(m_1, polarTex.moment1, i, j); + writeSurface(m_2, polarTex.moment2, i, j); + } +} + +template +CUDA_KERNEL void FoveatedPolarToScreenSpaceKernel(GPUCamera::PolarTextures polarTex, Texture2D resultTexture, GPUImage resultImage, matrix3x3 sampleSpaceToEyeSpaceMatrix, EccentricityToTexCoordMapping eToTexMap, - Texture2D previousImage, - Texture2D polarFoveatedTMaxImage, + Texture2D previousResultTexture, matrix4x4 eyeSpaceToPreviousSampleSpaceMatrix, TemporalFilterSettings settings) { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; @@ -241,19 +237,23 @@ CUDA_KERNEL void FoveatedPolarToScreenSpaceKernel(Texture2D polarImage, vector3 sampleSpacePoint = vector3(normalizedCoord, 1.0f); vector3 eyeSpaceDirection = normalize(sampleSpaceToEyeSpaceMatrix * sampleSpacePoint); + float theta, eccentricity; eyeSpaceDirectionToAngularEyeCoord(eyeSpaceDirection, theta, eccentricity); + /** Display full mapping, for debugging + theta = normalizedCoord.x * 2.0f * Pi; + eccentricity = normalizedCoord.y * eToTexMap.invMaxEccentricity; + */ + vector2 coord = angularEyeCoordToPolarTextureCoord(theta, eccentricity, eToTexMap); - vector4 newValue = bicubic(polarImage, coord); + vector4 newValue = bicubicFast(polarTex.raw, coord); vector4 result = newValue; vector4 surfaceResult = result; - if (!settings.inPolarSpace) { - float tValue = tex2D(polarFoveatedTMaxImage.d_texObject, coord.x, coord.y); - // Clamp infinity. TODO: something more robust - tValue = min(tValue, 1000.0f); + float tValue = tex2D(polarTex.depth.d_texObject, coord.x, coord.y); + if (tValue < CUDA_INF) { vector3 currentEyePosition = angularEyeCoordToDirection(theta, eccentricity) * tValue; vector4 prevSamplePosition = eyeSpaceToPreviousSampleSpaceMatrix * vector4(currentEyePosition, 1.0f); @@ -262,81 +262,38 @@ CUDA_KERNEL void FoveatedPolarToScreenSpaceKernel(Texture2D polarImage, float alpha = settings.alpha; vector4 oldValue = newValue; if (oldTexCoord.x > 0 && oldTexCoord.y > 0 && oldTexCoord.x < 1 && oldTexCoord.y < 1 && tValue > 0) { - oldValue = vector4(tex2D(previousImage.d_texObject, oldTexCoord.x, oldTexCoord.y)); + oldValue = vector4(tex2D(previousResultTexture.d_texObject, oldTexCoord.x, oldTexCoord.y)); } - // TODO: could compute the moments in a prepass - vector2 invDim = vector2(1.0f / polarImage.width, 1.0f / polarImage.height); - vector4 clampedOldValue = clampToNeighborhood(oldValue, polarImage.d_texObject, coord, invDim, settings); + + + vector4 clampedOldValue = clampToNeighborhood(oldValue, polarTex, coord, settings); + + // Make alpha settings be dependent on eccentricity. Make it higher in fovea and lower toward periphery + float normalizedE = eccentricity * eToTexMap.invMaxEccentricity; + float mn = 0.2f, mx = 0.35f; + float t = clamp((normalizedE - mn) / (mx - mn), 0.f, 1.f); + alpha = lerp(0.5f, alpha, t); + + // Heuristic hack! 
Turn down TAA clamping in the periphery + normalizedE = sqrtf(sqrtf(sqrtf(sqrtf(normalizedE)))); + clampedOldValue.x = lerp(clampedOldValue.x, oldValue.x, normalizedE); + clampedOldValue.y = lerp(clampedOldValue.y, oldValue.y, normalizedE); + clampedOldValue.z = lerp(clampedOldValue.z, oldValue.z, normalizedE); + clampedOldValue.w = lerp(clampedOldValue.w, oldValue.w, normalizedE); + surfaceResult = alpha * newValue + (1.0f - alpha) * clampedOldValue; result = surfaceResult; } - // DEBUG CODE: - // surfaceResult = vector4(tex2D(polarImage.d_texObject, normalizedCoord.x, normalizedCoord.y)); - // result = surfaceResult; if (PixelFormat == PixelFormat::RGBA32F) { vector4* output = (vector4*)resultImage.data(); output[resultImage.stride() * j + i] = result; - if (!settings.inPolarSpace) { - surf2Dwrite(float4(surfaceResult), resultTexture.d_surfaceObject, i * sizeof(float4), j); - } } else { uint32_t* output = (uint32_t*)resultImage.data(); output[resultImage.stride() * j + i] = ToColor4Unorm8SRgb(result); - if (!settings.inPolarSpace) { - surf2Dwrite(ToColor4Unorm8SRgb(surfaceResult), resultTexture.d_surfaceObject, i * sizeof(uchar4), j); - } - } - } -} - -template -CUDA_KERNEL void FoveatedTemporalFilterKernel(Texture2D rawImage, - Texture2D previousImage, - Texture2D polarFoveatedTMaxImage, - EccentricityToTexCoordMapping eToTexMap, - matrix4x4 eyeToEyePrevious, - TemporalFilterSettings settings, - Texture2D resultImage) { - unsigned i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned j = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < rawImage.width && j < rawImage.height) { - vector2 invDim = vector2(1.0f / rawImage.width, 1.0f / rawImage.height); - vector2 coord = vector2(i + 0.5f, j + 0.5f) * invDim; - float tValue = tex2D(polarFoveatedTMaxImage.d_texObject, coord.x, coord.y); - vector4 newValue(tex2D(rawImage.d_texObject, coord.x, coord.y)); - vector4 result = newValue; - if (tValue < CUDA_INF) { - float theta, e; - polarTextureCoordToAngularEyeCoord(coord, eToTexMap, theta, e); - - vector3 currentEyePosition = angularEyeCoordToDirection(theta, e) * tValue; - - vector4 prevEyePosition = eyeToEyePrevious * vector4(currentEyePosition, 1.0f); - - float prevTheta, prevE; - eyeSpaceDirectionToAngularEyeCoord(normalize(vector3(prevEyePosition)), prevTheta, prevE); - - vector2 oldTexCoord = angularEyeCoordToPolarTextureCoord(prevTheta, prevE, eToTexMap); - - float alpha = settings.alpha; - - vector4 oldValue(tex2D(previousImage.d_texObject, oldTexCoord.x, oldTexCoord.y)); - - // TODO: could compute the moments in a prepass - vector4 clampedOldValue = clampToNeighborhood(oldValue, rawImage.d_texObject, coord, invDim, settings); - result = alpha * newValue + (1.0f - alpha) * clampedOldValue; - } - - // vector2 diff = (oldTexCoord - coord); - // result = newValue;// {tValue, -tValue, 0.0f, 0.0f}; - if (PixelFormat == PixelFormat::RGBA32F) { - surf2Dwrite(float4(result), resultImage.d_surfaceObject, i * sizeof(float4), j); - } else { - surf2Dwrite(ToColor4Unorm8SRgb(result), resultImage.d_surfaceObject, i * sizeof(uchar4), j); } + writeSurface(surfaceResult, resultTexture, i, j); } } @@ -354,6 +311,9 @@ CUDA_DEVICE vector4 texelFetch(Texture2D tex, unsigned i, unsigned j) { return vector4(tex2D(tex.d_texObject, coord.x, coord.y)); } +// The reliance on eccentricity is a pure guess, a better implementation would make this more principled or +// at least try and obtain the formula used (but not published) in +// https://research.nvidia.com/sites/default/files/publications/supplementary.pdf 
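// Concretely, the kernel below weights tap R as
//   weight = powf(1.0f - fabsf(float(R)) / float(filterRadius + 0.1), sqrtf(eccentricity));
// so at eccentricity near zero every tap weighs ~1 (a plain box average), and the falloff steepens
// toward the periphery, concentrating weight on the central taps.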
template CUDA_KERNEL void SeparableFilterUsingEccentricity(Texture2D output, Texture2D input, @@ -365,16 +325,15 @@ CUDA_KERNEL void SeparableFilterUsingEccentricity(Texture2D output, if (i < output.width && j < output.height) { float eccentricity = getEccentricity(i, j, output, sampleSpaceToEyeSpaceMatrix); - // TODO: compute filter radius - int filterRadius = 5; + int filterRadius = 5; vector4 valueSum = vector4(0.0f, 0.0f, 0.0f, 0.0f); float weightSum = 0.0f; for (int R = -filterRadius; R <= filterRadius; ++R) { vector2i tapLoc(clamp((int)i + R * step.x, (int)0, (int)output.width - 1), clamp((int)j + R * step.y, (int)0, (int)output.height - 1)); - // TODO: compute filter weight - float weight = 1.0f; + float normDist = fabsf(float(R)) / float(filterRadius + 0.1); + float weight = powf(1.0f - normDist, sqrtf(eccentricity)); valueSum += texelFetch(input, tapLoc.x, tapLoc.y) * weight; weightSum += weight; } @@ -398,14 +357,19 @@ CUDA_KERNEL void FinishConstrastEnhancement(GPUImage resultImage, Texture2D unfilteredTexture, Texture2D filteredTexture, ContrastEnhancementSettings settings, - matrix3x3 sampleSpaceToEyeSpaceMatrix) { + matrix3x3 sampleSpaceToEyeSpaceMatrix, + EccentricityToTexCoordMapping eToTexMap) { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; unsigned j = blockIdx.y * blockDim.y + threadIdx.y; if (i < resultImage.width() && j < resultImage.height()) { // TODO: compute sigma float eccentricity = getEccentricity(i, j, unfilteredTexture, sampleSpaceToEyeSpaceMatrix); - float sigma = 1.0f; + float sigma = 8.0f; + + float t = eccentricity * eToTexMap.invMaxEccentricity; + + sigma *= max(0.001f, clamp(t * t, 0.f, 1.f)); vector4 pix = texelFetch(unfilteredTexture, i, j); vector4 pmean = texelFetch(filteredTexture, i, j); @@ -417,76 +381,34 @@ CUDA_KERNEL void FinishConstrastEnhancement(GPUImage resultImage, } } -static void textureCopy(Texture2D dst, Texture2D src) { - assert(dst.width == src.width && dst.height == src.height && dst.format == src.format); - cutilSafeCall(cudaMemcpy2DArrayToArray(dst.d_rawMemory, 0, 0, src.d_rawMemory, 0, 0, dst.width * dst.elementSize, - dst.height)); -} - -void GPUCamera::foveatedPolarTemporalFilter(const matrix4x4& eyeToEyePrevious) { - size_t width = polarFoveatedImage.width; - size_t height = polarFoveatedImage.height; - KernelDim dim = KernelDim(width, height, CUDA_GROUP_WIDTH, CUDA_GROUP_HEIGHT); - - EccentricityToTexCoordMapping eToTexMap; - getEccentricityMap(eToTexMap); - - switch (outputModeToPixelFormat(outputMode)) { - case PixelFormat::RGBA32F: - FoveatedTemporalFilterKernel<<>>( - rawPolarFoveatedImage, previousPolarFoveatedImage, polarFoveatedDepthImage, eToTexMap, eyeToEyePrevious, - temporalFilterSettings, polarFoveatedImage); - break; - case PixelFormat::RGBA8_SRGB: - FoveatedTemporalFilterKernel<<>>( - rawPolarFoveatedImage, previousPolarFoveatedImage, polarFoveatedDepthImage, eToTexMap, eyeToEyePrevious, - temporalFilterSettings, polarFoveatedImage); - break; - default: - assert(false); - } - // TODO: could ping-pong buffers to save a copy - textureCopy(previousPolarFoveatedImage, polarFoveatedImage); -} - void GPUCamera::foveatedPolarToScreenSpace(const matrix4x4& eyeToEyePrevious, const matrix3x3& eyePreviousToSamplePrevious, const matrix3x3& sampleToEye) { - bool filterInPolarSpace = temporalFilterSettings.inPolarSpace; - if (filterInPolarSpace) { - foveatedPolarTemporalFilter(eyeToEyePrevious); - } - KernelDim dim = KernelDim(resultImage.width(), resultImage.height(), CUDA_GROUP_WIDTH, CUDA_GROUP_HEIGHT); - Texture2D 
currentPolarFoveatedImage = filterInPolarSpace ? polarFoveatedImage : rawPolarFoveatedImage; + KernelDim polDim = + KernelDim(polarTextures.raw.width, polarTextures.raw.height, CUDA_GROUP_WIDTH, CUDA_GROUP_HEIGHT); matrix4x4 eyeToSamplePrevious = matrix4x4(eyePreviousToSamplePrevious) * eyeToEyePrevious; EccentricityToTexCoordMapping eToTexMap; getEccentricityMap(eToTexMap); - + ComputeMoments<<>>(polarTextures); switch (outputModeToPixelFormat(outputMode)) { case PixelFormat::RGBA32F: FoveatedPolarToScreenSpaceKernel<<>>( - currentPolarFoveatedImage, resultTexture, resultImage, sampleToEye, eToTexMap, previousResultTexture, - polarFoveatedDepthImage, eyeToSamplePrevious, temporalFilterSettings); + polarTextures, resultTexture, resultImage, sampleToEye, eToTexMap, previousResultTexture, + eyeToSamplePrevious, temporalFilterSettings); break; case PixelFormat::RGBA8_SRGB: FoveatedPolarToScreenSpaceKernel<<>>( - currentPolarFoveatedImage, resultTexture, resultImage, sampleToEye, eToTexMap, previousResultTexture, - polarFoveatedDepthImage, eyeToSamplePrevious, temporalFilterSettings); + polarTextures, resultTexture, resultImage, sampleToEye, eToTexMap, previousResultTexture, + eyeToSamplePrevious, temporalFilterSettings); break; default: assert(false); } - // TODO: could ping-pong buffers to save a copy - textureCopy(previousResultTexture, resultTexture); - if (contrastEnhancementSettings.enable) { assert(outputModeToPixelFormat(outputMode) == PixelFormat::RGBA8_SRGB); - assert(filterInPolarSpace == false); - // TODO: Probably cheaper to compute eccentricity once per pixel and store in - // single channel 16F texture; and reuse in all three kernels SeparableFilterUsingEccentricity <<>>(contrastEnhancementBuffers.horizontallyFiltered, resultTexture, {0, 1}, contrastEnhancementSettings, sampleToEye); @@ -495,37 +417,27 @@ void GPUCamera::foveatedPolarToScreenSpace(const matrix4x4& eyeToEyePrevious, contrastEnhancementSettings, sampleToEye); FinishConstrastEnhancement <<>>(resultImage, resultTexture, contrastEnhancementBuffers.fullyFiltered, - contrastEnhancementSettings, sampleToEye); + contrastEnhancementSettings, sampleToEye, eToTexMap); } + + std::swap(previousResultTexture, resultTexture); } -void GPUCamera::updateEyeSpaceFoveatedSamples( - const ArrayView precomputedDirectionalSamples) { - d_foveatedDirectionalSamples = GPUBuffer(precomputedDirectionalSamples.cbegin(), - precomputedDirectionalSamples.cend()); +void GPUCamera::updateEyeSpaceFoveatedSamples(const ArrayView eyeBeams) { + d_foveatedEyeDirectionalSamples = GPUBuffer(eyeBeams.cbegin(), eyeBeams.cend()); // Allocate and calculate eye-space frusta - uint32_t blockCount = ((uint32_t)precomputedDirectionalSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; + uint32_t blockCount = ((uint32_t)eyeBeams.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; d_foveatedEyeSpaceTileFrusta = GPUBuffer(blockCount * TILES_PER_BLOCK); d_foveatedEyeSpaceBlockFrusta = GPUBuffer(blockCount); d_foveatedWorldSpaceTileFrusta = GPUBuffer(blockCount * TILES_PER_BLOCK); d_foveatedWorldSpaceBlockFrusta = GPUBuffer(blockCount); - ComputeEyeSpaceFrusta(d_foveatedDirectionalSamples, d_foveatedEyeSpaceTileFrusta, d_foveatedEyeSpaceBlockFrusta); - - d_tileFrusta = GPUBuffer(blockCount * TILES_PER_BLOCK); - d_cullBlockFrusta = GPUBuffer(blockCount); + ComputeEyeSpaceFrusta(d_foveatedEyeDirectionalSamples, d_foveatedEyeSpaceTileFrusta, d_foveatedEyeSpaceBlockFrusta); safeCudaEventDestroy(transferTileToCPUEvent); cutilSafeCall(cudaEventCreateWithFlags(&transferTileToCPUEvent, 
cudaEventDisableTiming)); - safeCudaFreeHost(tileFrustaPinned); - safeCudaFreeHost(cullBlockFrustaPinned); - - cutilSafeCall( - cudaMallocHost((void**)&tileFrustaPinned, sizeof(GPURayPacketFrustum) * blockCount * TILES_PER_BLOCK)); - cutilSafeCall(cudaMallocHost((void**)&cullBlockFrustaPinned, sizeof(GPURayPacketFrustum) * blockCount)); - safeCudaFreeHost(foveatedWorldSpaceTileFrustaPinned); safeCudaFreeHost(foveatedWorldSpaceBlockFrustaPinned); @@ -534,31 +446,48 @@ void GPUCamera::updateEyeSpaceFoveatedSamples( cutilSafeCall(cudaMallocHost((void**)&foveatedWorldSpaceBlockFrustaPinned, sizeof(SimpleRayFrustum) * blockCount)); } +Plane operator*(matrix4x4 M, Plane p) { + vector4 O = vector4(p.normal * p.dist, 1.0f); + O = M * O; + vector3 N = vector3(transpose(invert(M)) * vector4(p.normal, 0)); + return Plane{N, dot(vector3(O), N)}; +} + void GPUCamera::updatePerFrameFoveatedData(const FloatRect& sampleBounds, const matrix3x3& cameraToSample, const matrix3x3& eyeToCamera, const matrix4x4& eyeToWorld) { - validSampleCount = uint32_t(d_foveatedDirectionalSamples.size()); - SampleInfo sampleInfo(*this); - uint32_t tileCount = uint32_t(d_tileFrusta.size()); - uint32_t blockCount = uint32_t(d_cullBlockFrusta.size()); - assert(d_foveatedWorldSpaceBlockFrusta.size() == d_cullBlockFrusta.size()); + validSampleCount = uint32_t(d_foveatedEyeDirectionalSamples.size()); + CameraBeams cameraBeams(*this); + uint32_t tileCount = uint32_t(d_foveatedWorldSpaceTileFrusta.size()); + uint32_t blockCount = uint32_t(d_foveatedWorldSpaceBlockFrusta.size()); - ResetCullFrusta(d_cullBlockFrusta, d_tileFrusta, tileCount, blockCount, stream); matrix3x3 eyeToSample = cameraToSample * eyeToCamera; - TransformFoveatedSamplesToSampleSpace(eyeToSample, eyeToCamera, sampleBounds, d_foveatedDirectionalSamples, - sampleInfo, d_sampleRemap, validSampleCount, stream); + TransformFoveatedSamplesToCameraSpace(eyeToSample, eyeToCamera, sampleBounds, d_foveatedEyeDirectionalSamples, + cameraBeams, d_sampleRemap, validSampleCount, stream); - CalculateSampleCullFrusta(d_cullBlockFrusta, d_tileFrusta, sampleInfo, validSampleCount, tileCount, blockCount, - stream); - // Queue the copy back - d_cullBlockFrusta.readbackAsync(cullBlockFrustaPinned, stream); - d_tileFrusta.readbackAsync(tileFrustaPinned, stream); + auto sampleToCamera = invert(cameraToSample); + + auto U = sampleBounds.upper; + auto L = sampleBounds.lower; + + vector2 sampleDirs[4] = {{U.x, U.y}, {L.x, U.y}, {L.x, L.y}, {U.x, L.y}}; + + const float EPSILON = -0.01f; + FourPlanes cullPlanes; + for (int i = 0; i < 4; ++i) { + vector3 dir0 = sampleToCamera * vector3(sampleDirs[i], 1.0f); + vector3 dir1 = sampleToCamera * vector3(sampleDirs[(i + 1) % 4], 1.0f); + Plane eyeSpacePlane; + eyeSpacePlane.normal = invert(eyeToCamera) * normalize(cross(dir1, dir0)); + eyeSpacePlane.dist = EPSILON; + cullPlanes.data[i] = eyeToWorld * eyeSpacePlane; + } CalculateWorldSpaceFrusta(d_foveatedWorldSpaceBlockFrusta, d_foveatedWorldSpaceTileFrusta, - d_foveatedEyeSpaceBlockFrusta, d_foveatedEyeSpaceTileFrusta, eyeToWorld, blockCount, - tileCount, stream); + d_foveatedEyeSpaceBlockFrusta, d_foveatedEyeSpaceTileFrusta, eyeToWorld, cullPlanes, + blockCount, tileCount, stream); // Queue the copy back d_foveatedWorldSpaceBlockFrusta.readbackAsync(foveatedWorldSpaceBlockFrustaPinned, stream); d_foveatedWorldSpaceTileFrusta.readbackAsync(foveatedWorldSpaceTileFrustaPinned, stream); diff --git a/libraries/hvvr/raycaster/gpu_foveated.h b/libraries/hvvr/raycaster/gpu_foveated.h deleted file mode 
100644 index 20cee67..0000000 --- a/libraries/hvvr/raycaster/gpu_foveated.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -/** - * Copyright (c) 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. An additional grant - * of patent rights can be found in the PATENTS file in the same directory. - */ diff --git a/libraries/hvvr/raycaster/gpu_image.h b/libraries/hvvr/raycaster/gpu_image.h index b75e995..3be8b55 100644 --- a/libraries/hvvr/raycaster/gpu_image.h +++ b/libraries/hvvr/raycaster/gpu_image.h @@ -46,7 +46,16 @@ class GPUImage { }; uint32_t bytesPerPixel() const { - return (m_format == PixelFormat::RGBA8_SRGB) ? 4 : 16; + switch (m_format) { + case PixelFormat::RGBA8_SRGB: + return 4; + case PixelFormat::RGBA16: + return 8; + case PixelFormat::RGBA32F: + return 16; + } + assert(false); + return 0; } size_t sizeInMemory() const { return size_t(bytesPerPixel()) * m_stride * m_height; diff --git a/libraries/hvvr/raycaster/gpu_samples.cu b/libraries/hvvr/raycaster/gpu_samples.cu index a114a91..8e9b3fe 100644 --- a/libraries/hvvr/raycaster/gpu_samples.cu +++ b/libraries/hvvr/raycaster/gpu_samples.cu @@ -13,9 +13,8 @@ namespace hvvr { -SampleInfo::SampleInfo(const GPUCamera& camera) { - centers = camera.d_sampleLocations; - extents = camera.d_sampleExtents; +CameraBeams::CameraBeams(const GPUCamera& camera) { + directionalBeams = camera.d_directionalBeams; frameJitter = camera.frameJitter; lens = camera.lens; } diff --git a/libraries/hvvr/raycaster/gpu_samples.h b/libraries/hvvr/raycaster/gpu_samples.h index 7766978..a4df3c2 100644 --- a/libraries/hvvr/raycaster/gpu_samples.h +++ b/libraries/hvvr/raycaster/gpu_samples.h @@ -46,8 +46,8 @@ CUDA_HOST_DEVICE_INL vector2 tapLocation(int subsampleIndex, float spinAngle, fl template CUDA_HOST_DEVICE_INL vector2 getSubsampleUnitOffset(vector2 sampleJitter, - int subsampleIndex, - float extraSpinAngle = 0.0f) { + int subsampleIndex, + float extraSpinAngle = 0.0f) { (void)sampleJitter; float spinAngle = extraSpinAngle; #if JITTER_SAMPLES @@ -61,24 +61,19 @@ CUDA_HOST_DEVICE_INL vector2 getSubsampleUnitOffset(vector2 sampleJitter, return vector2(unitDiskLoc.x * radius, unitDiskLoc.y * radius); } -struct SampleInfo { - vector2* centers; - Sample::Extents* extents; +CHDI DirectionalBeam operator*(matrix3x3 M, DirectionalBeam beam) { + DirectionalBeam mBeam; + mBeam.centerRay = M * beam.centerRay; + mBeam.du = M * beam.du; + mBeam.dv = M * beam.dv; + return mBeam; +} + +struct CameraBeams { + DirectionalBeam* directionalBeams; vector2 frameJitter; ThinLens lens; - SampleInfo(const GPUCamera& camera); -}; - -struct UnpackedSample { - vector2 center; - vector2 majorAxis; - vector2 minorAxis; -}; - -struct UnpackedDirectionalSample { - vector3 centerDir; - vector3 majorDirDiff; - vector3 minorDirDiff; + CameraBeams(const GPUCamera& camera); }; struct SampleDoF { @@ -86,84 +81,10 @@ struct SampleDoF { vector3 dir; }; -// sqrt(2)/2, currently a hack so that the ellipses blobs of diagonally adjacent pixels on a uniform grid are tangent -#define EXTENT_MODIFIER 0.70710678118f - -CUDA_DEVICE_INL UnpackedSample GetFullSample(uint32_t sampleIndex, SampleInfo sampleInfo) { - UnpackedSample sample; - sample.center = sampleInfo.centers[sampleIndex]; - - Sample::Extents extents = sampleInfo.extents[sampleIndex]; - sample.minorAxis.x = extents.minorAxis.x * EXTENT_MODIFIER; - sample.minorAxis.y = extents.minorAxis.y * 
EXTENT_MODIFIER; - - // 90 degree Rotation, and rescale - float minorAxisLengthInv = - rsqrtf(sample.minorAxis.x * sample.minorAxis.x + sample.minorAxis.y * sample.minorAxis.y); - float rescale = extents.majorAxisLength * EXTENT_MODIFIER * minorAxisLengthInv; - sample.majorAxis.x = -sample.minorAxis.y * rescale; - sample.majorAxis.y = sample.minorAxis.x * rescale; - - return sample; -} - -CUDA_DEVICE_INL UnpackedDirectionalSample GetDirectionalSample3D(uint32_t sampleIndex, - SampleInfo sampleInfo, - matrix4x4 sampleToWorld, - matrix3x3 sampleToCamera, - matrix4x4 cameraToWorld) { - UnpackedSample sample = GetFullSample(sampleIndex, sampleInfo); - -#if ENABLE_HACKY_WIDE_FOV - matrix3x3 cameraToWorldRotation = matrix3x3(cameraToWorld); - - UnpackedDirectionalSample sample3D; - - float u = sample.center.x; - float v = sample.center.y; - - float yaw = (u - .5f) * (HACKY_WIDE_FOV_W * RadiansPerDegree); - float pitch = -(v - .5f) * (HACKY_WIDE_FOV_H * RadiansPerDegree); - - float newX = sin(yaw) * cos(pitch); - float newY = sin(pitch); - float newZ = -cos(yaw) * cos(pitch); - sample3D.centerDir = vector3(newX, newY, newZ); - - // making something up... - const float invWidth = 1.0f / 2160.0f; - const float invHeight = 1.0f / 1200.0f; - float majorAxisMag = sin(.5f * invHeight * (HACKY_WIDE_FOV_H * RadiansPerDegree)); - float minorAxisMag = sin(.5f * invWidth * (HACKY_WIDE_FOV_W * RadiansPerDegree)); - - sample3D.majorDirDiff.x = sin(yaw) * sin(pitch); - sample3D.majorDirDiff.y = -cos(pitch); - sample3D.majorDirDiff.z = -cos(yaw) * sin(pitch); - - sample3D.minorDirDiff = cross(sample3D.majorDirDiff, sample3D.centerDir); - - sample3D.majorDirDiff *= majorAxisMag; - sample3D.minorDirDiff *= minorAxisMag; - - if (HACKY_WIDE_FOV_H > HACKY_WIDE_FOV_W) { - vector3 temp = sample3D.minorDirDiff; - sample3D.minorDirDiff = sample3D.majorDirDiff; - sample3D.majorDirDiff = temp; - } - - sample3D.centerDir = cameraToWorldRotation * sample3D.centerDir; - sample3D.majorDirDiff = cameraToWorldRotation * sample3D.majorDirDiff; - sample3D.minorDirDiff = cameraToWorldRotation * sample3D.minorDirDiff; -#else - matrix3x3 sampleToWorldRotation(sampleToWorld); - - UnpackedDirectionalSample sample3D; - sample3D.centerDir = sampleToWorldRotation * vector3(sample.center.x, sample.center.y, 1.0f); - sample3D.majorDirDiff = sampleToWorldRotation * vector3(sample.majorAxis.x, sample.majorAxis.y, 0.0f); - sample3D.minorDirDiff = sampleToWorldRotation * vector3(sample.minorAxis.x, sample.minorAxis.y, 0.0f); -#endif - - return sample3D; +CUDA_DEVICE_INL DirectionalBeam GetDirectionalSample3D(uint32_t sampleIndex, + CameraBeams cameraBeams, + matrix4x4 cameraToWorld) { + return matrix3x3(cameraToWorld) * cameraBeams.directionalBeams[sampleIndex]; } template diff --git a/libraries/hvvr/raycaster/intersect.cu b/libraries/hvvr/raycaster/intersect.cu index 0b99577..b7c246b 100644 --- a/libraries/hvvr/raycaster/intersect.cu +++ b/libraries/hvvr/raycaster/intersect.cu @@ -8,7 +8,6 @@ */ #include "cuda_decl.h" -#include "gbuffer.h" #include "gpu_camera.h" #include "gpu_context.h" #include "kernel_constants.h" @@ -37,7 +36,7 @@ CUDA_DEVICE int EmitGBuffer(int laneIndex, const uint32_t* sampleTriIndex, Rayca oneTri = false; } } - bool oneTriPerLane = (ballot(oneTri) == laneGetMaskAll()); + bool oneTriPerLane = (warpBallot(oneTri) == laneGetMaskAll()); if (oneTriPerLane) { // for all threads in the warp, only a single triangle is hit @@ -96,8 +95,8 @@ struct TriCache { // TODO(anankervis): don't waste space by unioning the DoF and 
non-DoF variants (getting the bigger size of the two) union { - IntersectTriangleTile data[maxSize]; - IntersectTriangleTileDoF dataDoF[maxSize]; + IntersectTriangleTile data[maxSize]; + IntersectTriangleTile dataDoF[maxSize]; }; uint32_t index[maxSize]; @@ -108,40 +107,40 @@ struct TriCache { FrustumPlanes frustumPlanes; union { - TileData tile; - TileDataDoF tileDoF; + TileData tile; + TileData tileDoF; }; uint32_t sampleTriIndex[BlockSize * AARate]; }; + template CUDA_DEVICE void IntersectSamples(const PrecomputedTriangleIntersect* CUDA_RESTRICT trianglesIntersect, - SampleInfo sampleInfo, - const UnpackedDirectionalSample& sample, - matrix4x4 sampleToWorld, + CameraBeams cameraBeams, + const DirectionalBeam& sample, + matrix4x4 cameraToWorld, const vector2* CUDA_RESTRICT tileSubsampleLensPos, uint32_t sampleOffset, TriCache& triCache, int triCount, float* sampleTMax) { - UnpackedSample sample2D = GetFullSample(sampleOffset, sampleInfo); - matrix3x3 sampleToWorldRotation = matrix3x3(sampleToWorld); - vector3 lensCenterToFocalCenter = - sampleInfo.lens.focalDistance * (sampleToWorldRotation * vector3(sample2D.center.x, sample2D.center.y, 1.0f)); + vector3 centerDir = normalize(sample.centerRay); + float zed = dot(matrix3x3(cameraToWorld) * vector3(0, 0, -1.0f), centerDir); + vector3 lensCenterToFocalCenter = (cameraBeams.lens.focalDistance / zed) * centerDir; for (int j = 0; j < triCount; ++j) { uint32_t triIndex = triCache.index[j]; if (EnableDoF) { - IntersectTriangleTileDoF triTileDoF = triCache.dataDoF[j]; + IntersectTriangleTile triTileDoF = triCache.dataDoF[j]; IntersectTriangleThreadDoF triThreadDoF(triTileDoF, lensCenterToFocalCenter); #pragma unroll for (int i = 0; i < AARate; ++i) { vector2 lensUV; vector2 dirUV; - GetSampleUVsDoF(tileSubsampleLensPos, sampleInfo.frameJitter, + GetSampleUVsDoF(tileSubsampleLensPos, cameraBeams.frameJitter, triCache.tileDoF.focalToLensScale, i, lensUV, dirUV); if (triThreadDoF.test(triTileDoF, lensCenterToFocalCenter, triCache.tileDoF.lensU, @@ -152,12 +151,12 @@ CUDA_DEVICE void IntersectSamples(const PrecomputedTriangleIntersect* CUDA_RESTR } } } else { - IntersectTriangleTile triTile = triCache.data[j]; - IntersectTriangleThread triThread(triTile, sample.centerDir); + IntersectTriangleTile triTile = triCache.data[j]; + IntersectTriangleThread triThread(triTile, sample.centerRay, sample.du, sample.dv); #pragma unroll for (int i = 0; i < AARate; ++i) { - vector2 alpha = getSubsampleUnitOffset(sampleInfo.frameJitter, i); + vector2 alpha = getSubsampleUnitOffset(cameraBeams.frameJitter, i); if (triThread.test(triTile, alpha, sampleTMax[i])) { // ray intersected triangle and passed depth test, sampleTMax[i] has been updated @@ -171,9 +170,7 @@ CUDA_DEVICE void IntersectSamples(const PrecomputedTriangleIntersect* CUDA_RESTR // intersect a whole tile of rays template -CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, - matrix4x4 sampleToWorld, - matrix3x3 sampleToCamera, +CUDA_DEVICE void IntersectTile(CameraBeams cameraBeams, matrix4x4 cameraToWorld, const vector2* CUDA_RESTRICT tileSubsampleLensPos, const unsigned* CUDA_RESTRICT triIndices, @@ -182,17 +179,13 @@ CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, TileTriRange triRange, TriCache& sMemTriCache, float* sampleTMax) { - // TODO: switch to full 3D sample to allow different directions per subsample - UnpackedDirectionalSample sample = - GetDirectionalSample3D(sampleOffset, sampleInfo, sampleToWorld, sampleToCamera, cameraToWorld); + DirectionalBeam sample = 
GetDirectionalSample3D(sampleOffset, cameraBeams, cameraToWorld); - // TODO(anankervis): precompute this with more accurate values, and load from a per-tile buffer - // (but watch out for the foveated path) if (threadIdx.x == BlockSize / 2) { if (EnableDoF) { - sMemTriCache.tileDoF.load(sampleInfo, sampleToWorld, sampleOffset); + sMemTriCache.tileDoF.load(cameraBeams, cameraToWorld, sample); } else { - sMemTriCache.tile.load(sampleToWorld, sample); + sMemTriCache.tile.load(cameraToWorld, sample); } // make sure there is a __syncthreads somewhere between here and first use (tri cache init, for example) } @@ -208,8 +201,8 @@ CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, // each thread cooperates to populate the shared mem triangle cache bool outputTri = false; uint32_t triIndex = badTriIndex; - IntersectTriangleTile triTile; - IntersectTriangleTileDoF triTileDoF; + IntersectTriangleTile triTile; + IntersectTriangleTile triTileDoF; if (threadIdx.x < sMemTriCache.maxSize) { uint32_t triIndirectIndex = triRangeCurrent + threadIdx.x; if (triIndirectIndex < triRange.end) { @@ -221,10 +214,10 @@ CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, if (EnableDoF) { // test for backfacing and intersection before ray origin IntersectResult intersectResultSetup = - triTileDoF.setup(triIntersect, sMemTriCache.tileDoF.lensCenter, sMemTriCache.tileDoF.lensU, + triTileDoF.setup(triIntersect, vector3(cameraToWorld.m3), sMemTriCache.tileDoF.lensU, sMemTriCache.tileDoF.lensV); if (intersectResultSetup != intersect_all_out) { -#if FOVEATED_TRIANGLE_FRUSTA_TEST_DISABLE == 0 +#if USE_TILE_FRUSTA_TEST == 1 // test the tile frustum against the triangle's edges // only perform this test if all rays within the tile can be guaranteed to intersect the // front side of the triangle's plane @@ -240,11 +233,9 @@ CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, } } } else { - IntersectResult intersectResultSetup = - triTile.setup(triIntersect, sMemTriCache.tile.rayOrigin, sMemTriCache.tile.majorDirDiff, - sMemTriCache.tile.minorDirDiff); + IntersectResult intersectResultSetup = triTile.setup(triIntersect, sMemTriCache.tile.rayOrigin); if (intersectResultSetup != intersect_all_out) { -#if FOVEATED_TRIANGLE_FRUSTA_TEST_DISABLE == 0 +#if USE_TILE_FRUSTA_TEST == 1 IntersectResult intersectResultUVW = TestTriangleFrustaUVW(sMemTriCache.frustumRays, triIntersect); @@ -288,7 +279,7 @@ CUDA_DEVICE void IntersectTile(SampleInfo sampleInfo, } __syncthreads(); - IntersectSamples(trianglesIntersect, sampleInfo, sample, sampleToWorld, + IntersectSamples(trianglesIntersect, cameraBeams, sample, cameraToWorld, tileSubsampleLensPos, sampleOffset, sMemTriCache, outputCount, sampleTMax); @@ -307,9 +298,7 @@ union IntersectTileSharedMem { template CUDA_KERNEL void IntersectKernel(RaycasterGBufferSubsample* gBuffer, - SampleInfo sampleInfo, - matrix4x4 sampleToWorld, - matrix3x3 sampleToCamera, + CameraBeams cameraBeams, matrix4x4 cameraToWorld, const vector2* CUDA_RESTRICT tileSubsampleLensPos, const uint32_t* CUDA_RESTRICT tileIndexRemapOccupied, @@ -336,9 +325,8 @@ CUDA_KERNEL void IntersectKernel(RaycasterGBufferSubsample* gBuffer, } float sampleTMax[AARate]; - IntersectTile(sampleInfo, sampleToWorld, sampleToCamera, cameraToWorld, - tileSubsampleLensPos, triIndices, trianglesIntersect, sampleOffset, - triRange, sMem.triCache, sampleTMax); + IntersectTile(cameraBeams, cameraToWorld, tileSubsampleLensPos, triIndices, + trianglesIntersect, sampleOffset, triRange, sMem.triCache, sampleTMax); uint32_t sampleTriIndex[AARate]; 
for (int i = 0; i < AARate; i++) { @@ -368,7 +356,7 @@ CUDA_KERNEL void GenerateHeat() { } #endif -void GPUCamera::intersect(GPUSceneState& sceneState, const SampleInfo& sampleInfo) { +void GPUCamera::intersect(GPUSceneState& sceneState, const CameraBeams& cameraBeams) { Camera_StreamedData& streamedData = streamed[streamedIndexGPU]; uint32_t occupiedTileCount = streamedData.tileCountOccupied; @@ -389,18 +377,16 @@ void GPUCamera::intersect(GPUSceneState& sceneState, const SampleInfo& sampleInf #endif KernelDim dimIntersect(occupiedTileCount * TILE_SIZE, TILE_SIZE); - if (sampleInfo.lens.radius > 0.0f) { + if (cameraBeams.lens.radius > 0.0f) { // Enable depth of field IntersectKernel<<>>( - d_gBuffer, sampleInfo, cameraToWorld * matrix4x4(sampleToCamera), sampleToCamera, cameraToWorld, - d_tileSubsampleLensPos, local.tileIndexRemapOccupied, local.tileTriRanges, streamedData.triIndices, - local.tileFrusta3D, sceneState.trianglesIntersect); + d_gBuffer, cameraBeams, cameraToWorld, d_tileSubsampleLensPos, local.tileIndexRemapOccupied, + local.tileTriRanges, streamedData.triIndices, local.tileFrusta3D, sceneState.trianglesIntersect); } else { // No depth of field, assume all rays have the same origin IntersectKernel<<>>( - d_gBuffer, sampleInfo, cameraToWorld * matrix4x4(sampleToCamera), sampleToCamera, cameraToWorld, - d_tileSubsampleLensPos, local.tileIndexRemapOccupied, local.tileTriRanges, streamedData.triIndices, - local.tileFrusta3D, sceneState.trianglesIntersect); + d_gBuffer, cameraBeams, cameraToWorld, d_tileSubsampleLensPos, local.tileIndexRemapOccupied, + local.tileTriRanges, streamedData.triIndices, local.tileFrusta3D, sceneState.trianglesIntersect); } #if PROFILE_INTERSECT @@ -414,16 +400,14 @@ void GPUCamera::intersect(GPUSceneState& sceneState, const SampleInfo& sampleInf } frameIndex++; - // I need more of a workload to get consistent clocks out of the GPU... + // Need more of a workload to get consistent clocks out of the GPU... 
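// (An idle GPU drops its core clocks between short launches, so back-to-back PROFILE_INTERSECT
//  timings drift; keeping the device busy here keeps the measured intersect times comparable.)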
GenerateHeat<<<1024, 32, 0, stream>>>(); #endif } template CUDA_KERNEL void DumpRaysKernel(SimpleRay* rayBuffer, - SampleInfo sampleInfo, - matrix4x4 sampleToWorld, - matrix3x3 sampleToCamera, + CameraBeams cameraBeams, matrix4x4 cameraToWorld, const vector2* CUDA_RESTRICT tileSubsampleLensPos, int sampleCount, @@ -438,20 +422,14 @@ CUDA_KERNEL void DumpRaysKernel(SimpleRay* rayBuffer, return; } } - UnpackedDirectionalSample sample3D = - GetDirectionalSample3D(sampleOffset, sampleInfo, sampleToWorld, sampleToCamera, cameraToWorld); + DirectionalBeam sample3D = GetDirectionalSample3D(sampleOffset, cameraBeams, cameraToWorld); for (int i = 0; i < AARate; ++i) { - vector2 alpha = getSubsampleUnitOffset(sampleInfo.frameJitter, i); - vector3 dir = - normalize(sample3D.centerDir + sample3D.majorDirDiff * alpha.x + sample3D.minorDirDiff * alpha.y); - vector3 pos = vector3(cameraToWorld * vector4(0, 0, 0, 1)); - if (sampleInfo.lens.radius > 0.0f) { + vector2 alpha = getSubsampleUnitOffset(cameraBeams.frameJitter, i); + vector3 dir = normalize(sample3D.centerRay + sample3D.du * alpha.x + sample3D.dv * alpha.y); + vector3 pos = vector3(cameraToWorld.m3); + if (cameraBeams.lens.radius > 0.0f) { // TODO: implement - /*SampleDoF sampleDoF = GetSampleDoF(sampleOffset, i, sampleInfo, sampleToCamera, - cameraToWorld, tileSubsampleLensPos); - pos = sampleDoF.pos; - dir = sampleDoF.dir;*/ } uint32_t index = outputOffset + i; rayBuffer[index].direction.x = dir.x; @@ -469,13 +447,13 @@ void GPUCamera::dumpRays(std::vector& rays, bool outputScanlineOrder) GPUBuffer d_rays(rayCount); rays.resize(rayCount); - SampleInfo sampleInfo(*this); + CameraBeams cameraBeams(*this); uint32_t tileCount = (validSampleCount + TILE_SIZE - 1) / TILE_SIZE; KernelDim dim(tileCount * TILE_SIZE, TILE_SIZE); - DumpRaysKernel<<>>( - d_rays, sampleInfo, cameraToWorld * matrix4x4(sampleToCamera), sampleToCamera, cameraToWorld, - d_tileSubsampleLensPos.data(), validSampleCount, d_sampleRemap.data(), outputScanlineOrder); + DumpRaysKernel + <<>>(d_rays, cameraBeams, cameraToWorld, d_tileSubsampleLensPos.data(), + validSampleCount, d_sampleRemap.data(), outputScanlineOrder); d_rays.readback(rays.data()); } diff --git a/libraries/hvvr/raycaster/kernel_constants.h b/libraries/hvvr/raycaster/kernel_constants.h index d1dd021..a42ddb2 100644 --- a/libraries/hvvr/raycaster/kernel_constants.h +++ b/libraries/hvvr/raycaster/kernel_constants.h @@ -14,15 +14,14 @@ #define MSAA_SHADE 0 #define SSAA_SHADE 1 #define SUPERSHADING_MODE MSAA_SHADE + #define JITTER_SAMPLES 0 -// TODO(anankervis): GPU foveated path doesn't pass along correct tile culling frusta to intersect (yet) -#define FOVEATED_TRIANGLE_FRUSTA_TEST_DISABLE 0 -#define DOF_LENS_POS_LOOKUP_TABLE_TILES 4 +// GPU foveated path doesn't pass along correct tile culling frusta to intersect (yet), so disable +// If enabled, increases intersection perf on our test scenes ~4% +#define USE_TILE_FRUSTA_TEST 1 -#define ENABLE_HACKY_WIDE_FOV 0 -#define HACKY_WIDE_FOV_W 210.0f -#define HACKY_WIDE_FOV_H 130.0f +#define DOF_LENS_POS_LOOKUP_TABLE_TILES 4 #define SM_BARYCENTRIC 2 #define SM_TRI_ID 3 diff --git a/libraries/hvvr/raycaster/material.h b/libraries/hvvr/raycaster/material.h index c8a00df..7afff52 100644 --- a/libraries/hvvr/raycaster/material.h +++ b/libraries/hvvr/raycaster/material.h @@ -14,8 +14,8 @@ namespace hvvr { enum class ShadingModel : uint32_t { - none = 0, - phong, + none = 0, + phong, emissive, }; @@ -40,12 +40,13 @@ struct SimpleMaterial { } vector4 emissive; - vector4 diffuse; - 
vector4 specular; - float glossiness; - float opacity; - // upper 16 bits diffuse/emissive | next 16 bits specularTexture | next 16 bits glossinessTexture | lower 16 bits shadingModel - uint64_t textureIDsAndShadingModel; + vector4 diffuse; + vector4 specular; + float glossiness; + float opacity; + // upper 16 bits diffuse/emissive | next 16 bits specularTexture | next 16 bits glossinessTexture | lower 16 bits + // shadingModel + uint64_t textureIDsAndShadingModel; }; } // namespace hvvr diff --git a/libraries/hvvr/raycaster/prim_tests.h b/libraries/hvvr/raycaster/prim_tests.h index 324bc79..ba86579 100644 --- a/libraries/hvvr/raycaster/prim_tests.h +++ b/libraries/hvvr/raycaster/prim_tests.h @@ -116,23 +116,37 @@ CUDA_DEVICE_INL IntersectResult TestTriangleFrustaUVW(const SimpleRayFrustum& ra return intersect_all_in; } +CUDA_DEVICE_INL void GetDifferentials(vector3 edge0, + vector3 edge1, + vector3 v0ToRayOrigin, + vector3 majorDirDiff, + vector3 minorDirDiff, + vector2& dDenomDAlpha, + vector2& dVdAlpha, + vector2& dWdAlpha) { + vector3 normal = cross(edge0, edge1); + dDenomDAlpha = vector2(dot(-majorDirDiff, normal), dot(-minorDirDiff, normal)); + + dVdAlpha = + vector2(dot(edge1, cross(-majorDirDiff, v0ToRayOrigin)), dot(edge1, cross(-minorDirDiff, v0ToRayOrigin))); + + dWdAlpha = + vector2(-dot(edge0, cross(-majorDirDiff, v0ToRayOrigin)), -dot(edge0, cross(-minorDirDiff, v0ToRayOrigin))); +} + // tile-wide values -struct IntersectTriangleTile { - float t; +template +struct IntersectTriangleTile; +template <> +struct IntersectTriangleTile { + float t; vector3 edge0; vector3 edge1; vector3 v0ToRayOrigin; - vector2 dDenomDAlpha; - vector2 dVdAlpha; - vector2 dWdAlpha; - // returns false if no rays originating at rayOrigin can intersect the triangle's plane (assumes backface testing) - CUDA_DEVICE IntersectResult setup(const PrecomputedTriangleIntersect& triPrecomp, - vector3 rayOrigin, - vector3 majorDirDiff, - vector3 minorDirDiff) { + CUDA_DEVICE IntersectResult setup(const PrecomputedTriangleIntersect& triPrecomp, vector3 rayOrigin) { vector3 v0 = triPrecomp.v0; edge0 = triPrecomp.edge0; edge1 = triPrecomp.edge1; @@ -146,14 +160,6 @@ struct IntersectTriangleTile { if (t < 0.0f) return intersect_all_out; // ray origin is behind the triangle's plane - dDenomDAlpha = vector2(dot(-majorDirDiff, normal), dot(-minorDirDiff, normal)); - - dVdAlpha = - vector2(dot(edge1, cross(-majorDirDiff, v0ToRayOrigin)), dot(edge1, cross(-minorDirDiff, v0ToRayOrigin))); - - dWdAlpha = - vector2(-dot(edge0, cross(-majorDirDiff, v0ToRayOrigin)), -dot(edge0, cross(-minorDirDiff, v0ToRayOrigin))); - return intersect_all_in; } }; @@ -163,8 +169,14 @@ struct IntersectTriangleThread { float denomCenter; float vCenter; float wCenter; + vector2 dDenomDAlpha; + vector2 dVdAlpha; + vector2 dWdAlpha; - CUDA_DEVICE IntersectTriangleThread(const IntersectTriangleTile& triTile, vector3 rayDirCenter) { + CUDA_DEVICE IntersectTriangleThread(const IntersectTriangleTile& triTile, + vector3 rayDirCenter, + vector3 majorDirDiff, + vector3 minorDirDiff) { vector3 normal = cross(triTile.edge0, triTile.edge1); denomCenter = dot(-rayDirCenter, normal); @@ -172,15 +184,17 @@ struct IntersectTriangleThread { vector3 eCenter = cross(-rayDirCenter, triTile.v0ToRayOrigin); vCenter = dot(triTile.edge1, eCenter); wCenter = -dot(triTile.edge0, eCenter); + GetDifferentials(triTile.edge0, triTile.edge1, triTile.v0ToRayOrigin, majorDirDiff, minorDirDiff, dDenomDAlpha, + dVdAlpha, dWdAlpha); } // returns true if the ray intersects the 
triangle and the intersection distance is less than depth // also updates the value of depth - CUDA_DEVICE bool test(const IntersectTriangleTile& triTile, vector2 alpha, float& depth) { + CUDA_DEVICE bool test(const IntersectTriangleTile& triTile, vector2 alpha, float& depth) { // it seems that the CUDA compiler is missing an opportunity to merge multiply + add across function calls into // FMA, so no call to dot product function here... // 2 FMA - float denom = denomCenter + triTile.dDenomDAlpha.x * alpha.x + triTile.dDenomDAlpha.y * alpha.y; + float denom = denomCenter + dDenomDAlpha.x * alpha.x + dDenomDAlpha.y * alpha.y; // t still needs to be divided by denom to get the correct distance // this is a combination of two tests: @@ -197,9 +211,9 @@ struct IntersectTriangleThread { // compute scaled barycentrics // 2 FMA - float v = vCenter + triTile.dVdAlpha.x * alpha.x + triTile.dVdAlpha.y * alpha.y; + float v = vCenter + dVdAlpha.x * alpha.x + dVdAlpha.y * alpha.y; // 2 FMA - float w = wCenter + triTile.dWdAlpha.x * alpha.x + triTile.dWdAlpha.y * alpha.y; + float w = wCenter + dWdAlpha.x * alpha.x + dWdAlpha.y * alpha.y; // 2 ADD float u = denom - v - w; @@ -217,12 +231,12 @@ struct IntersectTriangleThread { return true; } - CUDA_DEVICE void calcUVW(const IntersectTriangleTile& triTile, vector2 alpha, vector3& uvw) { - float denom = denomCenter + triTile.dDenomDAlpha.x * alpha.x + triTile.dDenomDAlpha.y * alpha.y; + CUDA_DEVICE void calcUVW(const IntersectTriangleTile& triTile, vector2 alpha, vector3& uvw) { + float denom = denomCenter + dDenomDAlpha.x * alpha.x + dDenomDAlpha.y * alpha.y; // compute scaled barycentrics - float v = vCenter + triTile.dVdAlpha.x * alpha.x + triTile.dVdAlpha.y * alpha.y; - float w = wCenter + triTile.dWdAlpha.x * alpha.x + triTile.dWdAlpha.y * alpha.y; + float v = vCenter + dVdAlpha.x * alpha.x + dVdAlpha.y * alpha.y; + float w = wCenter + dWdAlpha.x * alpha.x + dWdAlpha.y * alpha.y; float u = denom - v - w; float denomInv = 1.0f / denom; @@ -233,7 +247,8 @@ struct IntersectTriangleThread { }; // tile-wide values -struct IntersectTriangleTileDoF { +template <> +struct IntersectTriangleTile { vector2 dDenomDAlpha; vector3 edge0; @@ -281,14 +296,14 @@ struct IntersectTriangleTileDoF { struct IntersectTriangleThreadDoF { float denomCenter; - CUDA_DEVICE IntersectTriangleThreadDoF(const IntersectTriangleTileDoF& triTile, vector3 rayDirCenter) { + CUDA_DEVICE IntersectTriangleThreadDoF(const IntersectTriangleTile& triTile, vector3 rayDirCenter) { vector3 normal = cross(triTile.edge0, triTile.edge1); denomCenter = dot(-rayDirCenter, normal); } // returns true if the ray intersects the triangle and the intersection distance is less than depth // also updates the value of depth - CUDA_DEVICE bool test(const IntersectTriangleTileDoF& triTile, + CUDA_DEVICE bool test(const IntersectTriangleTile& triTile, vector3 lensCenterToFocalCenter, vector3 lensU, vector3 lensV, @@ -349,7 +364,7 @@ struct IntersectTriangleThreadDoF { return true; } - CUDA_DEVICE void calcUVW(const IntersectTriangleTileDoF& triTile, + CUDA_DEVICE void calcUVW(const IntersectTriangleTile& triTile, vector3 lensCenterToFocalCenter, vector3 lensU, vector3 lensV, diff --git a/libraries/hvvr/raycaster/raycaster.cpp b/libraries/hvvr/raycaster/raycaster.cpp index e7fa592..53a4121 100644 --- a/libraries/hvvr/raycaster/raycaster.cpp +++ b/libraries/hvvr/raycaster/raycaster.cpp @@ -37,7 +37,7 @@ Raycaster::Raycaster(const RayCasterSpecification& spec) : _spec(spec), _sceneDi } _threadPool = 
std::make_unique(numThreads, threadInit); - if (!GPUContext::cudaInit()) { + if (!GPUContext::cudaInit(_spec.outputTo3DApi)) { assert(false); } @@ -55,6 +55,27 @@ Raycaster::~Raycaster() { GPUContext::cudaCleanup(); } +void Raycaster::reinit(RayCasterGPUMode mode) { + cutilSafeCall(cudaDeviceSynchronize()); + + _cameras.clear(); + cleanupScene(); + if (_gpuContext) + _gpuContext->cleanup(); + GPUContext::cudaCleanup(); + _gpuContext.release(); + + cutilSafeCall(cudaDeviceReset()); + + _spec.mode = mode; + + if (!GPUContext::cudaInit(_spec.outputTo3DApi)) { + assert(false); + } + + _gpuContext = std::make_unique(); +} + Camera* Raycaster::createCamera(const FloatRect& viewport, float apertureRadius) { _cameras.emplace_back(std::make_unique(viewport, apertureRadius, *_gpuContext)); return (_cameras.end() - 1)->get(); diff --git a/libraries/hvvr/raycaster/raycaster.h b/libraries/hvvr/raycaster/raycaster.h index 273a51d..d6e3c31 100644 --- a/libraries/hvvr/raycaster/raycaster.h +++ b/libraries/hvvr/raycaster/raycaster.h @@ -9,9 +9,9 @@ * of patent rights can be found in the PATENTS file in the same directory. */ -#include "raycaster_spec.h" #include "dynamic_array.h" #include "graphics_types.h" +#include "raycaster_spec.h" #include #include @@ -19,8 +19,9 @@ struct PrecomputedTriangleShade; struct BVHNode; -struct BlockInfo; +struct RayHierarchy; struct Camera_StreamedData; +struct RayPacketFrustum3D; namespace hvvr { @@ -63,6 +64,8 @@ class Raycaster { //// rendering void render(double elapsedTime); + void reinit(RayCasterGPUMode mode); + protected: Raycaster(const Raycaster&) = delete; Raycaster(Raycaster&&) = delete; @@ -94,12 +97,26 @@ class Raycaster { void uploadScene(); void cleanupScene(); + void setupAllRenderTargets(); + void blitAllRenderTargets(); + + void interopMapResources(); + void interopUnmapResources(); + + //// rendering + void renderCameraGPUIntersectAndReconstructDeferredMSAAResolve(std::unique_ptr& camera); void renderGPUIntersectAndReconstructDeferredMSAAResolve(); void renderFoveatedPolarSpaceCudaReconstruct(); + void renderCamera(std::unique_ptr& camera); // traverse BVH and generate lists of triangles to intersect on the GPU - void buildTileTriangleLists(const BlockInfo& blockInfo, Camera_StreamedData* streamed); + void buildTileTriangleLists(const RayHierarchy& rayHierarchy, Camera_StreamedData* streamed); + void transformHierarchyCameraToWorld(std::unique_ptr& camera, + const RayPacketFrustum3D* tilesSrc, + const RayPacketFrustum3D* blocksSrc, + uint32_t blockCount, + Camera_StreamedData* streamed); }; } // namespace hvvr diff --git a/libraries/hvvr/raycaster/raycaster.props b/libraries/hvvr/raycaster/raycaster.props index 5c5f632..e1c43b9 100644 --- a/libraries/hvvr/raycaster/raycaster.props +++ b/libraries/hvvr/raycaster/raycaster.props @@ -1,4 +1,4 @@ - + $(MSBuildThisFileDirectory);$(IncludePath) diff --git a/libraries/hvvr/raycaster/raycaster.vcxproj b/libraries/hvvr/raycaster/raycaster.vcxproj index 9df24a1..850891b 100644 --- a/libraries/hvvr/raycaster/raycaster.vcxproj +++ b/libraries/hvvr/raycaster/raycaster.vcxproj @@ -18,11 +18,9 @@ - - @@ -114,6 +112,7 @@ v140 MultiByte false + compute_60,sm_60 true @@ -128,11 +127,12 @@ false - compute_61,sm_61 + $(CodeGenerationComputeVersionString) 64 true true -Xcompiler "/wd 4201 /wd 4127" %(AdditionalOptions) + false \ No newline at end of file diff --git a/libraries/hvvr/raycaster/raycaster.vcxproj.filters b/libraries/hvvr/raycaster/raycaster.vcxproj.filters index 93b8b65..03b0b98 100644 --- 
a/libraries/hvvr/raycaster/raycaster.vcxproj.filters +++ b/libraries/hvvr/raycaster/raycaster.vcxproj.filters @@ -19,9 +19,6 @@ cuda\samples - - cuda\samples - cuda\samples @@ -58,9 +55,6 @@ cuda\util - - cuda - cuda diff --git a/libraries/hvvr/raycaster/raycaster_common.h b/libraries/hvvr/raycaster/raycaster_common.h index baafc62..e62b41a 100644 --- a/libraries/hvvr/raycaster/raycaster_common.h +++ b/libraries/hvvr/raycaster/raycaster_common.h @@ -9,18 +9,6 @@ * of patent rights can be found in the PATENTS file in the same directory. */ -// TODO: Remove all dependencies on DX -#ifdef _WIN32 -#define DX_SUPPORTED -#endif - -// TODO(anankervis): move this elsewhere - the raycaster core shouldn't care what the output mode is -#define OUTPUT_MODE_NONE 0 -#define OUTPUT_MODE_3D_API 1 -// not yet implemented -#define OUTPUT_MODE_PNG 2 -#define OUTPUT_MODE OUTPUT_MODE_3D_API - namespace hvvr { // Rays per tile diff --git a/libraries/hvvr/raycaster/raycaster_spec.h b/libraries/hvvr/raycaster/raycaster_spec.h index 2ea5a60..7075575 100644 --- a/libraries/hvvr/raycaster/raycaster_spec.h +++ b/libraries/hvvr/raycaster/raycaster_spec.h @@ -13,24 +13,15 @@ #include -enum class RaycasterOutputMode { COLOR_RGBA8 }; +enum class RaycasterOutputFormat { COLOR_RGBA8 }; -struct FoveatedReconstructionSpecification { - enum class Mode { BARYCENTRIC, KNN } mode = Mode::BARYCENTRIC; - size_t computeK = 9; - // Empirically, computeK 9 or 8 requires triangleSearchK = 15 - // computeK 7 or 6 requires triangleSearch = 14 - // For "high-quality" choose 9,15 - // For (slightly) better perf, use 7,14 - static const size_t triangleSearchK = 15; +enum class RayCasterGPUMode { + GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE, + GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT }; struct RayCasterSpecification { - enum class GPUMode { - GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE, - GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT - }; - GPUMode mode = GPUMode::GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE; + RayCasterGPUMode mode = RayCasterGPUMode::GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE; // limit the number of threads used by the raycaster when going wide // 0 = default = number of hardware threads in the system @@ -38,31 +29,23 @@ struct RayCasterSpecification { struct FoveatedSamplePattern { float degreeTrackingError = 0.5f; - float minMAR = 1.0f / 60.0f; - float maxMAR = INFINITY; float maxFOVDegrees = 110.0f; float marSlope = 0.022f; float fovealMARDegrees = 1.0f / 60.0f; - float zenithJitterStrength = 0.0f; - float ringJitterStrength = 0.0f; } foveatedSamplePattern; - FoveatedReconstructionSpecification reconstruction; - - RaycasterOutputMode outputMode = RaycasterOutputMode::COLOR_RGBA8; + RaycasterOutputFormat outputFormat = RaycasterOutputFormat::COLOR_RGBA8; static RayCasterSpecification feb2017FoveatedDemoSettings() { RayCasterSpecification spec; - spec.mode = RayCasterSpecification::GPUMode::GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT; + spec.mode = RayCasterGPUMode::GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT; spec.foveatedSamplePattern.degreeTrackingError = 7.0f; - spec.foveatedSamplePattern.minMAR = 1.0f / 20.0f; spec.foveatedSamplePattern.fovealMARDegrees = 1.0f / 20.0f; spec.foveatedSamplePattern.marSlope = 0.015f; - spec.foveatedSamplePattern.zenithJitterStrength = 0.5f; - spec.foveatedSamplePattern.ringJitterStrength = 1.0f; spec.foveatedSamplePattern.maxFOVDegrees = 90.0f; - // spec.foveatedSamplePattern.maxMAR = 1.0f / 4.0f; // less than half pixel density of DK1 - spec.outputMode = 
RaycasterOutputMode::COLOR_RGBA8; + spec.outputFormat = RaycasterOutputFormat::COLOR_RGBA8; return spec; }; + + bool outputTo3DApi = true; }; diff --git a/libraries/hvvr/raycaster/remap.cu b/libraries/hvvr/raycaster/remap.cu index 0d5daea..d70fc97 100644 --- a/libraries/hvvr/raycaster/remap.cu +++ b/libraries/hvvr/raycaster/remap.cu @@ -129,7 +129,7 @@ void GPUCamera::remap() { } } -// TODO: switch to a gather approach to improve perf? +// Switching to gather might improve performance, but this will likely never be a bottleneck CUDA_KERNEL void RemapPolarFoveatedKernel(uint32_t* src, float* tmaxSrc, vector2ui* remap, @@ -147,14 +147,14 @@ CUDA_KERNEL void RemapPolarFoveatedKernel(uint32_t* src, } void GPUCamera::remapPolarFoveated() { - uint32_t rawSampleCount = rawPolarFoveatedImage.width * rawPolarFoveatedImage.height; + uint32_t rawSampleCount = polarTextures.raw.width * polarTextures.raw.height; KernelDim dim = KernelDim(rawSampleCount, CUDA_GROUP_SIZE); switch (outputModeToPixelFormat(outputMode)) { case PixelFormat::RGBA8_SRGB: RemapPolarFoveatedKernel<<>>(d_sampleResults.data(), d_tMaxBuffer, - d_polarRemapToPixel, rawPolarFoveatedImage, - polarFoveatedDepthImage, rawSampleCount); + d_polarRemapToPixel, polarTextures.raw, + polarTextures.depth, rawSampleCount); break; default: assert(false); diff --git a/libraries/hvvr/raycaster/render.cpp b/libraries/hvvr/raycaster/render.cpp index 9d6d0ae..4d340e6 100644 --- a/libraries/hvvr/raycaster/render.cpp +++ b/libraries/hvvr/raycaster/render.cpp @@ -13,12 +13,13 @@ #include "model.h" #include "raycaster.h" #include "raycaster_common.h" -#include "timer.h" #include "thread_pool.h" +#include "timer.h" +#include "../shared/3rdparty/lodepng.h" #define DUMP_SCENE_AND_RAYS 0 - // If disabled, rays are blocked the same way they are during tracing, - // which should improve coherence but makes visualization more difficult +// If disabled, rays are blocked the same way they are during tracing, +// which should improve coherence but makes visualization more difficult #define DUMP_IN_SCANLINE_ORDER 0 // dump in binary or text format? 
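Editor's note: with the jitter and MAR clamp knobs removed, a foveated configuration reduces to the fields kept above. A construction sketch mirroring feb2017FoveatedDemoSettings(); the numbers are the demo preset's, not requirements:

    RayCasterSpecification spec;
    spec.mode = RayCasterGPUMode::GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT;
    spec.foveatedSamplePattern.degreeTrackingError = 7.0f;      // eye-tracking slack, degrees
    spec.foveatedSamplePattern.fovealMARDegrees = 1.0f / 20.0f; // w_0
    spec.foveatedSamplePattern.marSlope = 0.015f;               // m
    spec.foveatedSamplePattern.maxFOVDegrees = 90.0f;
    spec.outputFormat = RaycasterOutputFormat::COLOR_RGBA8;
    spec.outputTo3DApi = true;                                  // enable the DX11/CUDA interop path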
#define DUMP_BINARY 1 @@ -123,6 +124,27 @@ static void dumpSceneToOFF(const std::string& filename, const std::vector>& models) { + static bool dumped = false; + if (!dumped) { + Timer timer; + + dumpSceneToOFF("scene_dump.off", models); + + std::vector rays; + gpuCamera->dumpRays(rays, DUMP_IN_SCANLINE_ORDER == 1); + dumpRaysToRFF("ray_dump.rff", rays); + + double dumpTime = timer.get(); + printf("dump time %f\n", dumpTime); + + dumped = true; + } +} + + #endif // DUMP_SCENE_AND_RAYS void Raycaster::render(double elapsedTime) { @@ -131,10 +153,10 @@ void Raycaster::render(double elapsedTime) { return; // no scene geometry is loaded switch (_spec.mode) { - case RayCasterSpecification::GPUMode::GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE: + case RayCasterGPUMode::GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE: renderGPUIntersectAndReconstructDeferredMSAAResolve(); break; - case RayCasterSpecification::GPUMode::GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT: + case RayCasterGPUMode::GPU_FOVEATED_POLAR_SPACE_CUDA_RECONSTRUCT: renderFoveatedPolarSpaceCudaReconstruct(); break; default: @@ -146,18 +168,25 @@ static inline void debugPrintTileCost(const SampleHierarchy& samples, size_t blo double maxCost = 0.0f; double meanCost = 0.0f; + auto frustumCost = [](RayPacketFrustum3D f) { + vector3* dirs = f.pointDir; + double a0 = (double)length(cross(dirs[0] - dirs[1], dirs[0] - f.pointDir[3])); + double a1 = (double)length(cross(dirs[2] - dirs[1], dirs[2] - f.pointDir[3])); + return 0.5 * (a0 + a1); + }; + double maxBlockCost = 0.0f; double meanBlockCost = 0.0f; for (size_t i = 0; i < blockCount; ++i) { for (size_t j = 0; j < TILES_PER_BLOCK; ++j) { - const auto& f = samples.tileFrusta2D[i * TILES_PER_BLOCK + j]; - double cost = (f.xMax() - f.xMin()) + (f.yMax() - f.yMin()); + const auto& f = samples.tileFrusta3D[i * TILES_PER_BLOCK + j]; + double cost = frustumCost(f); maxCost = std::max(cost, maxCost); meanCost += cost; } - const auto& f = samples.blockFrusta2D[i]; - double cost = (f.xMax() - f.xMin()) + (f.yMax() - f.yMin()); + const auto& f = samples.blockFrusta3D[i]; + double cost = frustumCost(f); maxBlockCost = std::max(cost, maxBlockCost); meanBlockCost += cost; } @@ -176,175 +205,166 @@ inline static FloatRect expandRect(const FloatRect& rect, const float fractionTo return {newLower, newUpper}; } -void Raycaster::renderGPUIntersectAndReconstructDeferredMSAAResolve() { -#if OUTPUT_MODE == OUTPUT_MODE_3D_API + +void Raycaster::setupAllRenderTargets() { // Do DX11/CUDA interop setup if necessary + for (auto& camera : _cameras) { + camera->setupRenderTarget(*_gpuContext); + } +} + +void Raycaster::blitAllRenderTargets() { + // Copy the results to the camera's DX texture for (auto& camera : _cameras) { if (!camera->getEnabled()) continue; - GPUCamera* gpuCamera = camera->_gpuCamera; - - if (camera->_renderTarget.isHardwareRenderTarget() && camera->_newHardwareTarget) { - gpuCamera->bindTexture(*_gpuContext, camera->_renderTarget); - camera->_newHardwareTarget = false; - } + camera->extractImage(); } +} + +void Raycaster::interopMapResources() { _gpuContext->interopMapResources(); -#endif - { - // Render and reconstruct from each camera - for (auto& camera : _cameras) { - if (!camera->getEnabled()) - continue; +} - const SampleHierarchy& samples = camera->getSampleData().samples; - uint32_t tileCount = uint32_t(samples.tileFrusta3D.size()); +void Raycaster::interopUnmapResources() { + _gpuContext->interopUnmapResources(); +} - GPUCamera* gpuCamera = camera->_gpuCamera; - // begin filling 
data for the GPU - Camera_StreamedData* streamed = gpuCamera->streamedDataLock(tileCount); +void Raycaster::transformHierarchyCameraToWorld(std::unique_ptr& camera, + const RayPacketFrustum3D* tilesSrc, + const RayPacketFrustum3D* blocksSrc, + uint32_t blockCount, + Camera_StreamedData* streamed) { + matrix4x4 cameraToWorld = camera->getCameraToWorld(); + matrix4x4 cameraToWorldInvTrans = transpose(invert(cameraToWorld)); - vector2 jitter = camera->_frameJitters[camera->_frameCount % camera->_frameJitters.size()]; - gpuCamera->setCameraJitter(vector2(jitter.x * 0.5f + 0.5f, jitter.y * 0.5f + 0.5f)); - ++camera->_frameCount; - - matrix4x4 cameraToWorld = camera->getCameraToWorld(); - matrix4x4 cameraToWorldInvTrans = transpose(invert(cameraToWorld)); - - // transform from camera to world space - { - uint32_t blockCount = uint32_t(samples.blockFrusta3D.size()); - const RayPacketFrustum3D* blocksSrc = samples.blockFrusta3D.data(); - RayPacketFrustum3D* blocksDst = camera->_blockFrustaTransformed.data(); - - const RayPacketFrustum3D* tilesSrc = samples.tileFrusta3D.data(); - RayPacketFrustum3D* tilesDst = camera->_tileFrustaTransformed.data(); - - SimpleRayFrustum* simpleTileFrusta = streamed->tileFrusta3D.dataHost(); - - auto blockTransformTask = [&](uint32_t startBlock, uint32_t endBlock) -> void { - assert((_mm_getcsr() & 0x8040) == 0x8040); // make sure denormals are being treated as zero - - for (uint32_t blockIndex = startBlock; blockIndex < endBlock; blockIndex++) { - blocksDst[blockIndex] = blocksSrc[blockIndex].transform(cameraToWorld, cameraToWorldInvTrans); - - uint32_t startTile = blockIndex * TILES_PER_BLOCK; - uint32_t endTile = startTile + TILES_PER_BLOCK; - for (uint32_t tileIndex = startTile; tileIndex < endTile; tileIndex++) { - RayPacketFrustum3D& tileDst = tilesDst[tileIndex]; - tileDst = tilesSrc[tileIndex].transform(cameraToWorld, cameraToWorldInvTrans); - - SimpleRayFrustum& simpleFrustum = simpleTileFrusta[tileIndex]; - for (int n = 0; n < RayPacketFrustum3D::pointCount; n++) { - simpleFrustum.origins[n].x = tileDst.pointOrigin[n].x; - simpleFrustum.origins[n].y = tileDst.pointOrigin[n].y; - simpleFrustum.origins[n].z = tileDst.pointOrigin[n].z; - - simpleFrustum.directions[n].x = tileDst.pointDir[n].x; - simpleFrustum.directions[n].y = tileDst.pointDir[n].y; - simpleFrustum.directions[n].z = tileDst.pointDir[n].z; - } - } - } - }; - - enum { maxTasks = 4096 }; - enum { blocksPerThread = 16 }; - uint32_t numTasks = (blockCount + blocksPerThread - 1) / blocksPerThread; - assert(numTasks <= maxTasks); - numTasks = min(maxTasks, numTasks); - - std::future taskResults[maxTasks]; - for (uint32_t i = 0; i < numTasks; ++i) { - uint32_t startBlock = min(blockCount, i * blocksPerThread); - uint32_t endBlock = min(blockCount, (i + 1) * blocksPerThread); - - taskResults[i] = _threadPool->addTask(blockTransformTask, startBlock, endBlock); - } - for (uint32_t i = 0; i < numTasks; ++i) { - taskResults[i].get(); + RayPacketFrustum3D* blocksDst = camera->_cpuHierarchy._blockFrusta.data(); + + RayPacketFrustum3D* tilesDst = camera->_cpuHierarchy._tileFrusta.data(); + + SimpleRayFrustum* simpleTileFrusta = streamed->tileFrusta3D.dataHost(); + + auto blockTransformTask = [&](uint32_t startBlock, uint32_t endBlock) -> void { + assert((_mm_getcsr() & 0x8040) == 0x8040); // make sure denormals are being treated as zero + + for (uint32_t blockIndex = startBlock; blockIndex < endBlock; blockIndex++) { + blocksDst[blockIndex] = blocksSrc[blockIndex].transform(cameraToWorld, 
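Editor's note: transformHierarchyCameraToWorld passes both cameraToWorld and its inverse transpose into RayPacketFrustum3D::transform. Points and ray directions move with the former; bounding-plane normals need the latter so they stay perpendicular under a general affine transform. A sketch of the rule using the types already in this file (planeNormalCamera/dirCamera are illustrative names, not RayPacketFrustum3D fields):

    matrix4x4 cameraToWorld = camera->getCameraToWorld();
    matrix4x4 cameraToWorldInvTrans = transpose(invert(cameraToWorld));
    // directions/points: transform directly
    vector3 dirWorld = matrix3x3(cameraToWorld) * dirCamera;
    // plane normals: transform by the inverse transpose, then renormalize
    vector3 normalWorld = normalize(matrix3x3(cameraToWorldInvTrans) * planeNormalCamera);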
cameraToWorldInvTrans); + + uint32_t startTile = blockIndex * TILES_PER_BLOCK; + uint32_t endTile = startTile + TILES_PER_BLOCK; + for (uint32_t tileIndex = startTile; tileIndex < endTile; tileIndex++) { + RayPacketFrustum3D& tileDst = tilesDst[tileIndex]; + tileDst = tilesSrc[tileIndex].transform(cameraToWorld, cameraToWorldInvTrans); + SimpleRayFrustum& simpleFrustum = simpleTileFrusta[tileIndex]; + for (int n = 0; n < RayPacketFrustum3D::pointCount; n++) { + simpleFrustum.origins[n] = tileDst.pointOrigin[n]; + simpleFrustum.directions[n] = tileDst.pointDir[n]; } } + } + }; - gpuCamera->updatePerFrame(camera->getTranslation(), camera->getForward(), camera->getSampleToCamera(), - cameraToWorld); + enum { maxTasks = 4096 }; + enum { blocksPerThread = 16 }; + uint32_t numTasks = (blockCount + blocksPerThread - 1) / blocksPerThread; + assert(numTasks <= maxTasks); + numTasks = min(maxTasks, numTasks); - BlockInfo blockInfo; - blockInfo.blockFrusta = camera->_blockFrustaTransformed; - blockInfo.tileFrusta = camera->_tileFrustaTransformed; + std::future taskResults[maxTasks]; + for (uint32_t i = 0; i < numTasks; ++i) { + uint32_t startBlock = min(blockCount, i * blocksPerThread); + uint32_t endBlock = min(blockCount, (i + 1) * blocksPerThread); -#if DUMP_SCENE_AND_RAYS - static bool dumped = false; - if (!dumped) { - Timer timer; + taskResults[i] = _threadPool->addTask(blockTransformTask, startBlock, endBlock); + } + for (uint32_t i = 0; i < numTasks; ++i) { + taskResults[i].get(); + } +}; + +void Raycaster::renderCameraGPUIntersectAndReconstructDeferredMSAAResolve(std::unique_ptr& camera) { + if (!camera->getEnabled()) + return; + + const SampleHierarchy& samples = camera->getSampleData().samples; + uint32_t tileCount = uint32_t(samples.tileFrusta3D.size()); + + GPUCamera* gpuCamera = camera->_gpuCamera; + // begin filling data for the GPU + Camera_StreamedData* streamed = gpuCamera->streamedDataLock(tileCount); + { + vector2 jitter = camera->_frameJitters[camera->_frameCount % camera->_frameJitters.size()]; + gpuCamera->setCameraJitter(vector2(jitter.x * 0.5f + 0.5f, jitter.y * 0.5f + 0.5f)); + ++camera->_frameCount; - dumpSceneToOFF("scene_dump.off", _models); + transformHierarchyCameraToWorld(camera, samples.tileFrusta3D.data(), samples.blockFrusta3D.data(), + uint32_t(samples.blockFrusta3D.size()), streamed); - std::vector rays; - gpuCamera->dumpRays(rays, DUMP_IN_SCANLINE_ORDER == 1); - dumpRaysToRFF("ray_dump.rff", rays); + gpuCamera->updateTransform(camera->getCameraToWorld()); - double dumpTime = timer.get(); - printf("dump time %f\n", dumpTime); + RayHierarchy rayHierachy; + rayHierachy.blockFrusta = camera->_cpuHierarchy._blockFrusta; + rayHierachy.tileFrusta = camera->_cpuHierarchy._tileFrusta; - dumped = true; - } +#if DUMP_SCENE_AND_RAYS + dumpSceneAndRays(gpuCamera, _models); #endif - buildTileTriangleLists(blockInfo, streamed); + buildTileTriangleLists(rayHierachy, streamed); - // end filling data for the GPU - gpuCamera->streamedDataUnlock(); + // end filling data for the GPU + } + gpuCamera->streamedDataUnlock(); - gpuCamera->intersectShadeResolve(_gpuContext->sceneState); - gpuCamera->remap(); + gpuCamera->intersectShadeResolve(_gpuContext->sceneState); + gpuCamera->remap(); +} - // size_t totalTris = _bvhScene.triangles.size; - // size_t trisIntersected = camera->tileCullInfo.triIndexCount; - // printf("Intersection Ratio %f: %d/%d\n", (float)trisIntersected / totalTris, trisIntersected, totalTris); - } +void Raycaster::renderCamera(std::unique_ptr& camera) { + if 
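Editor's note: the block-transform work is fanned out in fixed-size chunks (blocksPerThread = 16, ceil-divided into at most maxTasks futures) and joined before the streamed data is handed to the GPU. A self-contained sketch of the same chunking pattern, using std::async in place of HVVR's ThreadPool:

    #include <algorithm>
    #include <cstdint>
    #include <future>
    #include <vector>

    template <typename Fn>
    void forEachBlockChunk(uint32_t blockCount, uint32_t blocksPerTask, Fn&& fn) {
        uint32_t numTasks = (blockCount + blocksPerTask - 1) / blocksPerTask; // ceil divide
        std::vector<std::future<void>> results;
        results.reserve(numTasks);
        for (uint32_t i = 0; i < numTasks; ++i) {
            uint32_t start = std::min(blockCount, i * blocksPerTask);
            uint32_t end   = std::min(blockCount, (i + 1) * blocksPerTask);
            results.emplace_back(std::async(std::launch::async, fn, start, end));
        }
        for (auto& r : results)
            r.get(); // join all chunks before streamedDataUnlock()
    }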
(!camera->getEnabled()) + return; + + if (_spec.outputTo3DApi == true) { + camera->setupRenderTarget(*_gpuContext); + interopMapResources(); } -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - // Copy the results to the camera's DX texture + + renderCameraGPUIntersectAndReconstructDeferredMSAAResolve(camera); + + if (_spec.outputTo3DApi == true) { + camera->extractImage(); + interopUnmapResources(); + } +} + +void Raycaster::renderGPUIntersectAndReconstructDeferredMSAAResolve() { + if (_spec.outputTo3DApi == true) { + setupAllRenderTargets(); + interopMapResources(); + } + + // Render and reconstruct from each camera for (auto& camera : _cameras) { - if (!camera->getEnabled()) - continue; - GPUCamera* gpuCamera = camera->_gpuCamera; + renderCameraGPUIntersectAndReconstructDeferredMSAAResolve(camera); + } -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - if (camera->_renderTarget.isHardwareRenderTarget()) { - gpuCamera->copyImageToBoundTexture(); - } else { - gpuCamera->copyImageToCPU((uint32_t*)camera->_renderTarget.data, camera->_renderTarget.width, - camera->_renderTarget.height, uint32_t(camera->_renderTarget.stride)); - } -#endif + if (_spec.outputTo3DApi == true) { + blitAllRenderTargets(); + interopUnmapResources(); } - _gpuContext->interopUnmapResources(); -#endif } // TODO(anankervis): merge into a single render path void Raycaster::renderFoveatedPolarSpaceCudaReconstruct() { polarSpaceFoveatedSetup(this); -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - for (auto& camera : _cameras) { - if (!camera->getEnabled()) - continue; - GPUCamera* gpuCamera = camera->_gpuCamera; - - if (camera->_renderTarget.isHardwareRenderTarget() && camera->_newHardwareTarget) { - gpuCamera->bindTexture(*_gpuContext, camera->_renderTarget); - camera->_newHardwareTarget = false; - } + if (_spec.outputTo3DApi == true) { + setupAllRenderTargets(); } -#endif for (auto& camera : _cameras) { if (!camera->getEnabled()) continue; GPUCamera* gpuCamera = camera->_gpuCamera; - const SampleData& sampleData = camera->getSampleData(); const vector3& eyeDirection = camera->getEyeDir(); vector2 jitter = camera->_frameJitters[camera->_frameCount % camera->_frameJitters.size()]; @@ -354,18 +374,17 @@ void Raycaster::renderFoveatedPolarSpaceCudaReconstruct() { matrix3x3 cameraToSample = invert(camera->getSampleToCamera()); matrix3x3 eyeToCamera = matrix3x3::rotationFromZAxis(-eyeDirection); matrix4x4 eyeToWorld = camera->getCameraToWorld() * matrix4x4(eyeToCamera); - // TODO: principled derivation of magic constant - FloatRect cullRect = expandRect(sampleData.sampleBounds, 0.15f); - gpuCamera->updatePerFrameFoveatedData(cullRect, cameraToSample, eyeToCamera, eyeToWorld); + gpuCamera->updatePerFrameFoveatedData(camera->getSampleData().sampleBounds, cameraToSample, eyeToCamera, + eyeToWorld); - auto blockCount = (camera->_foveatedSampleData.eyeSpaceSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; + auto blockCount = (camera->_foveatedSampleData.samples.directionalSamples.size() + BLOCK_SIZE - 1) / BLOCK_SIZE; camera->_foveatedSampleData.blockCount = blockCount; } -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - _gpuContext->interopMapResources(); -#endif + if (_spec.outputTo3DApi == true) { + interopMapResources(); + } { for (auto& camera : _cameras) { if (!camera->getEnabled()) @@ -394,19 +413,18 @@ void Raycaster::renderFoveatedPolarSpaceCudaReconstruct() { simpleTileFrusta[i] = camera->_foveatedSampleData.simpleTileFrusta[i]; } - gpuCamera->updatePerFrame(camera->getTranslation(), camera->getForward(), camera->getSampleToCamera(), - 
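Editor's note: the foveated path works in an "eye space" whose -Z axis is the gaze direction, so the per-frame matrices are built by rotating eye space into camera space and then applying the camera's world transform. The same three calls as in the loop above, isolated and commented for clarity:

    // eyeDirection is the gaze direction in camera space; eye space has -Z along the gaze.
    matrix3x3 cameraToSample = invert(camera->getSampleToCamera());            // camera -> 2D sample space
    matrix3x3 eyeToCamera = matrix3x3::rotationFromZAxis(-eyeDirection);       // gaze-aligned -> camera
    matrix4x4 eyeToWorld = camera->getCameraToWorld() * matrix4x4(eyeToCamera); // gaze-aligned -> world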
camera->getCameraToWorld()); + gpuCamera->updateTransform(camera->getCameraToWorld()); - BlockInfo blockInfo; - blockInfo.blockFrusta = ArrayView( + RayHierarchy rayHierarchy; + rayHierarchy.blockFrusta = ArrayView( camera->_foveatedSampleData.samples.blockFrusta3D.data(), camera->_foveatedSampleData.blockCount); - blockInfo.tileFrusta = + rayHierarchy.tileFrusta = ArrayView(camera->_foveatedSampleData.samples.tileFrusta3D.data(), camera->_foveatedSampleData.blockCount * TILES_PER_BLOCK); // Uncomment to get stats on how good the block/tile clustering is working // debugPrintTileCost(camera->polarFoveatedSampleData.samples, camera->polarFoveatedSampleData.blockCount); - buildTileTriangleLists(blockInfo, streamed); + buildTileTriangleLists(rayHierarchy, streamed); // size_t totalTris = _bvhScene.triangles.size; // size_t trisIntersected = camera->tileCullInfo.triIndexCount; @@ -432,22 +450,10 @@ void Raycaster::renderFoveatedPolarSpaceCudaReconstruct() { camera->_eyePreviousToSamplePrevious = invert(sampleToEye); } } -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - // Copy the results to the camera's DX texture - for (auto& camera : _cameras) { - if (!camera->getEnabled()) - continue; - GPUCamera* gpuCamera = camera->_gpuCamera; - - if (camera->_renderTarget.isHardwareRenderTarget()) { - gpuCamera->copyImageToBoundTexture(); - } else { - gpuCamera->copyImageToCPU((uint32_t*)camera->_renderTarget.data, camera->_renderTarget.width, - camera->_renderTarget.height, uint32_t(camera->_renderTarget.stride)); - } + if (_spec.outputTo3DApi == true) { + blitAllRenderTargets(); + interopUnmapResources(); } - _gpuContext->interopUnmapResources(); -#endif } } // namespace hvvr diff --git a/libraries/hvvr/raycaster/resolve.cu b/libraries/hvvr/raycaster/resolve.cu index b739fca..40a58e1 100644 --- a/libraries/hvvr/raycaster/resolve.cu +++ b/libraries/hvvr/raycaster/resolve.cu @@ -8,7 +8,6 @@ */ #include "cuda_decl.h" -#include "gbuffer.h" #include "gpu_camera.h" #include "gpu_context.h" #include "kernel_constants.h" @@ -33,8 +32,8 @@ struct ResolveSMem { ResolveSMem() {} union { - TileData tile; - TileDataDoF tileDoF; + TileData tile; + TileData tileDoF; }; }; @@ -42,7 +41,7 @@ template CUDA_DEVICE vector4 ShadeSSAA(ResolveSMem& sMem, const RaycasterGBufferSubsample* CUDA_RESTRICT gBufferWarp, int laneIndex, - UnpackedDirectionalSample sample3D, + DirectionalBeam sample3D, vector3 lensCenterToFocalCenter, vector2 frameJitter, const vector2* CUDA_RESTRICT tileSubsampleLensPos, @@ -55,7 +54,6 @@ CUDA_DEVICE vector4 ShadeSSAA(ResolveSMem& sMem, cudaTextureObject_t* textures, const LightingEnvironment& env, uint32_t sampleOffset, - const SampleInfo& sampleInfo, ResolveStats* resolveStats) { enum : uint32_t { badTriIndex = ~uint32_t(0) }; float derivativeMultiplier = rsqrtf(float(AARate)); @@ -80,13 +78,13 @@ CUDA_DEVICE vector4 ShadeSSAA(ResolveSMem& sMem, const PrecomputedTriangleIntersect& triIntersect = trianglesIntersect[triIndex]; const PrecomputedTriangleShade& triShade = trianglesShade[triIndex]; - IntersectTriangleTileDoF triTileDoF; - triTileDoF.setup(triIntersect, sMem.tileDoF.lensCenter, sMem.tileDoF.lensU, sMem.tileDoF.lensV); + IntersectTriangleTile triTileDoF; + triTileDoF.setup(triIntersect, cameraPos, sMem.tileDoF.lensU, sMem.tileDoF.lensV); IntersectTriangleThreadDoF triThreadDoF(triTileDoF, lensCenterToFocalCenter); - IntersectTriangleTile triTile; - triTile.setup(triIntersect, sMem.tile.rayOrigin, sMem.tile.majorDirDiff, sMem.tile.minorDirDiff); - IntersectTriangleThread triThread(triTile, 
sample3D.centerDir); + IntersectTriangleTile triTile; + triTile.setup(triIntersect, sMem.tile.rayOrigin); + IntersectTriangleThread triThread(triTile, sample3D.centerRay, sample3D.du, sample3D.dv); while (sampleMask) { int subsampleIndex = __ffs(sampleMask) - 1; @@ -165,7 +163,7 @@ template CUDA_DEVICE vector4 ShadeMSAA(ResolveSMem& sMem, const RaycasterGBufferSubsample* CUDA_RESTRICT gBufferWarp, int laneIndex, - UnpackedDirectionalSample sample3D, + DirectionalBeam sample3D, vector3 lensCenterToFocalCenter, vector2 frameJitter, const vector2* CUDA_RESTRICT tileSubsampleLensPos, @@ -178,7 +176,6 @@ CUDA_DEVICE vector4 ShadeMSAA(ResolveSMem& sMem, cudaTextureObject_t* textures, const LightingEnvironment& env, uint32_t sampleOffset, - const SampleInfo& sampleInfo, ResolveStats* resolveStats) { enum : uint32_t { badTriIndex = ~uint32_t(0) }; @@ -229,13 +226,13 @@ CUDA_DEVICE vector4 ShadeMSAA(ResolveSMem& sMem, const PrecomputedTriangleIntersect& triIntersect = trianglesIntersect[triIndex]; const PrecomputedTriangleShade& triShade = trianglesShade[triIndex]; - IntersectTriangleTileDoF triTileDoF; - triTileDoF.setup(triIntersect, sMem.tileDoF.lensCenter, sMem.tileDoF.lensU, sMem.tileDoF.lensV); + IntersectTriangleTile triTileDoF; + triTileDoF.setup(triIntersect, cameraPos, sMem.tileDoF.lensU, sMem.tileDoF.lensV); IntersectTriangleThreadDoF triThreadDoF(triTileDoF, lensCenterToFocalCenter); - IntersectTriangleTile triTile; - triTile.setup(triIntersect, sMem.tile.rayOrigin, sMem.tile.majorDirDiff, sMem.tile.minorDirDiff); - IntersectTriangleThread triThread(triTile, sample3D.centerDir); + IntersectTriangleTile triTile; + triTile.setup(triIntersect, sMem.tile.rayOrigin); + IntersectTriangleThread triThread(triTile, sample3D.centerRay, sample3D.du, sample3D.dv); vector3 b; vector3 bOffX; @@ -283,6 +280,7 @@ CUDA_DEVICE vector4 ShadeMSAA(ResolveSMem& sMem, } result *= 1.0f / AARate; result.w = 1.0f; + // result = hvvr::vector4(sample3D.centerDir, 1.0f); // Visualize ray directions return result; } @@ -291,8 +289,8 @@ CUDA_DEVICE vector4 ShadeAndResolve(ResolveSMem& sMem, const RaycasterGBufferSubsample* CUDA_RESTRICT gBufferBlock, int laneIndex, uint32_t sampleOffset, - SampleInfo sampleInfo, - UnpackedDirectionalSample sample3D, + CameraBeams cameraBeams, + DirectionalBeam sample3D, vector3 lensCenterToFocalCenter, const vector2* CUDA_RESTRICT tileSubsampleLensPos, vector3 cameraPos, @@ -311,31 +309,27 @@ CUDA_DEVICE vector4 ShadeAndResolve(ResolveSMem& sMem, vector4 result = #if SUPERSHADING_MODE == SSAA_SHADE ShadeSSAA(sMem, gBufferBlock, laneIndex, sample3D, lensCenterToFocalCenter, - sampleInfo.frameJitter, tileSubsampleLensPos, cameraPos, + cameraBeams.frameJitter, tileSubsampleLensPos, cameraPos, cameraLookVector, trianglesIntersect, trianglesShade, verts, materials, - textures, env, sampleOffset, sampleInfo, resolveStats); + textures, env, sampleOffset, resolveStats); #else ShadeMSAA(sMem, gBufferBlock, laneIndex, sample3D, lensCenterToFocalCenter, - sampleInfo.frameJitter, tileSubsampleLensPos, cameraPos, + cameraBeams.frameJitter, tileSubsampleLensPos, cameraPos, cameraLookVector, trianglesIntersect, trianglesShade, verts, materials, - textures, env, sampleOffset, sampleInfo, resolveStats); + textures, env, sampleOffset, resolveStats); #endif return result; } -template +template CUDA_KERNEL void ResolveKernel(uint32_t* sampleResults, float* tMaxBuffer, const RaycasterGBufferSubsample* CUDA_RESTRICT gBuffer, - SampleInfo sampleInfo, - matrix4x4 sampleToWorld, - matrix3x3 sampleToCamera, + 
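Editor's note: the shading paths now consume a DirectionalBeam (centerRay, du, dv) instead of a packed 2D sample, so an individual subsample direction is just the center ray offset along the two differentials. A sketch, assuming alpha is the per-subsample offset returned by getSubsampleUnitOffset<AARate>() and the usual vector3 operators:

    // Reconstruct one subsample's (unnormalized) ray direction from a beam.
    vector3 subsampleDirection(const DirectionalBeam& beam, vector2 alpha) {
        return beam.centerRay + beam.du * alpha.x + beam.dv * alpha.y;
    }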
CameraBeams cameraBeams, matrix4x4 cameraToWorld, const vector2* CUDA_RESTRICT tileSubsampleLensPos, const unsigned* CUDA_RESTRICT tileIndexRemapOccupied, - vector3 cameraPos, - vector3 cameraLookVector, const PrecomputedTriangleIntersect* CUDA_RESTRICT trianglesIntersect, const PrecomputedTriangleShade* CUDA_RESTRICT trianglesShade, const ShadingVertex* CUDA_RESTRICT verts, @@ -357,37 +351,42 @@ CUDA_KERNEL void ResolveKernel(uint32_t* sampleResults, uint32_t warpIndex = sampleOffset / WARP_SIZE; uint32_t warpOffset = warpIndex * WARP_SIZE * AARate; - UnpackedDirectionalSample sample3D = - GetDirectionalSample3D(sampleOffset, sampleInfo, sampleToWorld, sampleToCamera, cameraToWorld); + DirectionalBeam sample3D = GetDirectionalSample3D(sampleOffset, cameraBeams, cameraToWorld); - UnpackedSample sample2D = GetFullSample(sampleOffset, sampleInfo); - matrix3x3 sampleToWorldRotation = matrix3x3(sampleToWorld); - vector3 lensCenterToFocalCenter = - sampleInfo.lens.focalDistance * (sampleToWorldRotation * vector3(sample2D.center.x, sample2D.center.y, 1.0f)); + vector3 centerDir = normalize(sample3D.centerRay); + float zed = dot(matrix3x3(cameraToWorld) * vector3(0, 0, -1.0f), centerDir); + vector3 lensCenterToFocalCenter = (cameraBeams.lens.focalDistance / zed) * centerDir; // TODO(anankervis): precompute this with more accurate values, and load from a per-tile buffer // (but watch out for the foveated path) __shared__ ResolveSMem sMem; if (threadIdx.x == BlockSize / 2) { if (EnableDoF) { - sMem.tileDoF.load(sampleInfo, sampleToWorld, sampleOffset); + sMem.tileDoF.load(cameraBeams, cameraToWorld, sample3D); } else { - sMem.tile.load(sampleToWorld, sample3D); + sMem.tile.load(cameraToWorld, sample3D); } } __syncthreads(); + vector3 cameraPos = vector3(cameraToWorld.m3); + vector3 cameraLookVector = -vector3(cameraToWorld.m2); vector4 result = ShadeAndResolve( - sMem, gBuffer + warpOffset, laneGetIndex(), sampleOffset, sampleInfo, sample3D, lensCenterToFocalCenter, + sMem, gBuffer + warpOffset, laneGetIndex(), sampleOffset, cameraBeams, sample3D, lensCenterToFocalCenter, tileSubsampleLensPos, cameraPos, cameraLookVector, trianglesIntersect, trianglesShade, verts, materials, textures, env, resolveStats); result = ACESFilm(result); + + /* + result = vector4((tileIndex * 189 % 256)/(255.0f)); + result.y = ((tileIndex/TILES_PER_BLOCK)*9 % 256) / (255.0f); + */ sampleResults[sampleOffset] = ToColor4Unorm8SRgb(result); if (TMaxBuffer) { enum { tMaxSubsampleIndex = 0 }; - vector2 alpha = getSubsampleUnitOffset(sampleInfo.frameJitter, tMaxSubsampleIndex); + vector2 alpha = getSubsampleUnitOffset(cameraBeams.frameJitter, tMaxSubsampleIndex); // scan through the compressed gbuffer until we find the subsample we care about enum : uint32_t { badTriIndex = ~uint32_t(0) }; @@ -412,14 +411,14 @@ CUDA_KERNEL void ResolveKernel(uint32_t* sampleResults, PrecomputedTriangleIntersect triIntersect = trianglesIntersect[triIndex]; if (EnableDoF) { - IntersectTriangleTileDoF triTileDoF; - triTileDoF.setup(triIntersect, sMem.tileDoF.lensCenter, sMem.tileDoF.lensU, sMem.tileDoF.lensV); + IntersectTriangleTile triTileDoF; + triTileDoF.setup(triIntersect, cameraPos, sMem.tileDoF.lensU, sMem.tileDoF.lensV); IntersectTriangleThreadDoF triThreadDoF(triTileDoF, lensCenterToFocalCenter); // should lensUV be forced to zero (centered)? 
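Editor's note: the kernel replaces the old sample-space focal computation with a purely directional one. zed is the cosine between the camera forward axis and the unit center ray, so scaling that ray by focalDistance / zed lands exactly on the focal plane. The same lines as above, annotated, with a worked check:

    vector3 forward   = matrix3x3(cameraToWorld) * vector3(0, 0, -1.0f); // camera forward, world space
    vector3 centerDir = normalize(sample3D.centerRay);
    float zed = dot(forward, centerDir);                                  // cos(angle off the forward axis)
    vector3 lensCenterToFocalCenter = (cameraBeams.lens.focalDistance / zed) * centerDir;
    // dot(forward, lensCenterToFocalCenter) == focalDistance up to rounding; e.g. a 2 m focal
    // distance and zed = 0.5 (60 degrees off-axis) yield an offset vector 4 m long.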
vector2 lensUV; vector2 dirUV; - GetSampleUVsDoF(tileSubsampleLensPos, sampleInfo.frameJitter, + GetSampleUVsDoF(tileSubsampleLensPos, cameraBeams.frameJitter, sMem.tileDoF.focalToLensScale, tMaxSubsampleIndex, lensUV, dirUV); @@ -436,9 +435,9 @@ CUDA_KERNEL void ResolveKernel(uint32_t* sampleResults, vector3 posDelta = pos - cameraPos; tMaxValue = dot(posDelta, cameraLookVector); } else { - IntersectTriangleTile triTile; - triTile.setup(triIntersect, sMem.tile.rayOrigin, sMem.tile.majorDirDiff, sMem.tile.minorDirDiff); - IntersectTriangleThread triThread(triTile, sample3D.centerDir); + IntersectTriangleTile triTile; + triTile.setup(triIntersect, sMem.tile.rayOrigin); + IntersectTriangleThread triThread(triTile, sample3D.centerRay, sample3D.du, sample3D.dv); vector3 uvw; triThread.calcUVW(triTile, alpha, uvw); @@ -459,7 +458,8 @@ CUDA_KERNEL void ResolveKernel(uint32_t* sampleResults, } } -void GPUCamera::shadeAndResolve(GPUSceneState& sceneState, const SampleInfo& sampleInfo) { + +void GPUCamera::shadeAndResolve(GPUSceneState& sceneState, const CameraBeams& cameraBeams) { Camera_StreamedData& streamedData = streamed[streamedIndexGPU]; static_assert(TILE_SIZE % WARP_SIZE == 0, "Tile size must be a multiple of warp size in the current architecture. " @@ -487,33 +487,24 @@ void GPUCamera::shadeAndResolve(GPUSceneState& sceneState, const SampleInfo& sam } #endif -#define RESOLVE_LAUNCH(AARate, BlockSize, TMaxBuffer, EnableDoF, dim, stream) \ - ResolveKernel<<>>( \ - d_sampleResults, d_tMaxBuffer, d_gBuffer, sampleInfo, cameraToWorld * matrix4x4(sampleToCamera), \ - sampleToCamera, cameraToWorld, d_tileSubsampleLensPos, local.tileIndexRemapOccupied.data(), position, \ - lookVector, sceneState.trianglesIntersect, sceneState.trianglesShade, sceneState.worldSpaceVertices, \ - sceneState.materials, gDeviceTextureArray, sceneState.lightingEnvironment, resolveStatsPtr) - - KernelDim dimResolve(streamedData.tileCountOccupied * TILE_SIZE, TILE_SIZE); - if (d_tMaxBuffer.size() != 0) { - // output a tMax depth buffer for reprojection - if (sampleInfo.lens.radius > 0.0f) { - // Enable depth of field - RESOLVE_LAUNCH(COLOR_MODE_MSAA_RATE, TILE_SIZE, true, true, dimResolve, stream); - } else { - // No depth of field, assume all rays have the same origin - RESOLVE_LAUNCH(COLOR_MODE_MSAA_RATE, TILE_SIZE, true, false, dimResolve, stream); - } - } else { - if (sampleInfo.lens.radius > 0.0f) { - // Enable depth of field - RESOLVE_LAUNCH(COLOR_MODE_MSAA_RATE, TILE_SIZE, false, true, dimResolve, stream); - } else { - // No depth of field, assume all rays have the same origin - RESOLVE_LAUNCH(COLOR_MODE_MSAA_RATE, TILE_SIZE, false, false, dimResolve, stream); - } - } -#undef RESOLVE_LAUNCH + + KernelDim dim(streamedData.tileCountOccupied * TILE_SIZE, TILE_SIZE); + + bool hasTMax = (d_tMaxBuffer.size() != 0); + bool enableDoF = cameraBeams.lens.radius > 0.0f; + std::array bargs = {{hasTMax, enableDoF}}; + +#pragma warning(push) +#pragma warning(disable : 4100) + dispatch_bools<2>{}(bargs, [&](auto... 
Bargs) { + ResolveKernel<<>>( + d_sampleResults, d_tMaxBuffer, d_gBuffer, cameraBeams, cameraToWorld, d_tileSubsampleLensPos, + local.tileIndexRemapOccupied.data(), sceneState.trianglesIntersect, sceneState.trianglesShade, + sceneState.worldSpaceVertices, sceneState.materials, gDeviceTextureArray, sceneState.lightingEnvironment, + resolveStatsPtr); + }); +#pragma warning(pop) + #if PROFILE_RESOLVE if (frameIndex % profileFrameSkip == 0) { @@ -534,7 +525,6 @@ void GPUCamera::shadeAndResolve(GPUSceneState& sceneState, const SampleInfo& sam #endif } -template CUDA_KERNEL void ClearEmptyKernel(uint32_t* sampleResults, float* tMaxBuffer, const uint32_t* CUDA_RESTRICT tileIndexRemapEmpty, @@ -546,7 +536,7 @@ CUDA_KERNEL void ClearEmptyKernel(uint32_t* sampleResults, uint32_t tileIndex = tileIndexRemapEmpty[compactedTileIndex]; uint32_t sampleOffset = tileIndex * TILE_SIZE + threadIndex; sampleResults[sampleOffset] = 0xFF000000; - if (TMaxBuffer) { + if (tMaxBuffer) { tMaxBuffer[sampleOffset] = CUDA_INF; } } @@ -562,13 +552,9 @@ void GPUCamera::clearEmpty() { dim3 dimGrid(blockCount, 1, 1); dim3 dimBlock(CUDA_GROUP_SIZE, 1, 1); - if (d_tMaxBuffer.size() != 0) { - ClearEmptyKernel - <<>>(d_sampleResults, d_tMaxBuffer, d_emptyTileIndexRemap, tileCount); - } else { - ClearEmptyKernel - <<>>(d_sampleResults, nullptr, d_emptyTileIndexRemap, tileCount); - } + + float* tMaxBuffer = (d_tMaxBuffer.size() != 0) ? d_tMaxBuffer.data() : nullptr; + ClearEmptyKernel<<>>(d_sampleResults, tMaxBuffer, d_emptyTileIndexRemap, tileCount); } } // namespace hvvr diff --git a/libraries/hvvr/raycaster/sample_hierarchy.cpp b/libraries/hvvr/raycaster/sample_hierarchy.cpp index f959a6c..e223149 100644 --- a/libraries/hvvr/raycaster/sample_hierarchy.cpp +++ b/libraries/hvvr/raycaster/sample_hierarchy.cpp @@ -15,9 +15,6 @@ #pragma warning(disable : 4505) // unreferenced local function has been removed -#if ENABLE_HACKY_WIDE_FOV -#pragma warning(disable: 4702) // unreachable code -#endif namespace hvvr { @@ -27,6 +24,17 @@ static std::string toString(vector4 v) { return stringStream.str(); } + +static vector3 sphericalUVToDirection(vector2 uv, float fovX, float fovY) { + float yaw = (uv.x - .5f) * (fovX * RadiansPerDegree); + float pitch = -(uv.y - .5f) * (fovY * RadiansPerDegree); + + float newX = sin(yaw) * cos(pitch); + float newY = sin(pitch); + float newZ = -cos(yaw) * cos(pitch); + return normalize(vector3(newX, newY, newZ)); +} + // http://cseweb.ucsd.edu/~ravir/whitted.pdf outlines a basic technique for generating bounding frusta over a packet of // non-point origin rays Step 1: Pick a major axis for the rays Step 2: Choose a near and far plane for the rays, // perpendicular to the major axis Step 3: Compute AABB of the ray intersection points on both of the planes Step 4: @@ -37,9 +45,12 @@ static std::string toString(vector4 v) { // Choose the major axis to be the Z axis. The near plane is z=0, the far plane can just be a camera parameter (negative // Z) The AABB for the near plane is just the AABB of the lens. 
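Editor's note: dispatch_bools collapses the old RESOLVE_LAUNCH macro ladder by turning an array of runtime booleans (hasTMax, enableDoF) into compile-time values, invoking the callback once with one std::integral_constant per flag so the kernel's TMaxBuffer/EnableDoF template parameters can be fed from them. The actual helper lives in HVVR's utility headers and is not part of this patch; the sketch below is one plausible shape for it, not the shipping implementation:

    #include <array>
    #include <type_traits>
    #include <utility>

    // Peel one runtime bool per recursion level; at depth 0, call f with one
    // std::integral_constant<bool, ...> per original flag.
    template <std::size_t N>
    struct dispatch_bools {
        template <std::size_t M, typename F, typename... Ts>
        void operator()(const std::array<bool, M>& flags, F&& f, Ts... picked) const {
            if (flags[M - N])
                dispatch_bools<N - 1>{}(flags, f, picked..., std::true_type{});
            else
                dispatch_bools<N - 1>{}(flags, f, picked..., std::false_type{});
        }
    };
    template <>
    struct dispatch_bools<0> {
        template <std::size_t M, typename F, typename... Ts>
        void operator()(const std::array<bool, M>&, F&& f, Ts... picked) const {
            f(picked...); // e.g. f = [&](auto... Bs) { Kernel<..., decltype(Bs)::value...><<<...>>>(...); }
        }
    };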
static RayPacketFrustum3D get3DFrustumFrom2D(const RayPacketFrustum2D& frustum2D, - const matrix3x3& sampleToCamera, - ThinLens lens, - float farPlane) { + Sample2Dto3DMappingSettings settings) { + auto lens = settings.thinLens; + auto sampleToCamera = settings.sampleToCamera; + + const float farPlane = -100.0f; + vector3 nearPoints[4]; nearPoints[0] = vector3(-lens.radius, -lens.radius, 0); nearPoints[1] = vector3(+lens.radius, -lens.radius, 0); @@ -55,37 +66,20 @@ static RayPacketFrustum3D get3DFrustumFrom2D(const RayPacketFrustum2D& frustum2D rayDirections[2] = sampleToCamera * vector3(frustum2D.xMax(), frustum2D.yMin(), 1); rayDirections[3] = sampleToCamera * vector3(frustum2D.xMin(), frustum2D.yMin(), 1); -#if ENABLE_HACKY_WIDE_FOV - const float invWidth = 1.0f / 2160.0f; - const float invHeight = 1.0f / 1200.0f; - // TODO: undo sample-space padding of tile extents, and calculate correct padding in camera space - float uv[4][2] = { - frustum2D.xMin(), frustum2D.yMax(), frustum2D.xMax(), frustum2D.yMax(), - frustum2D.xMax(), frustum2D.yMin(), frustum2D.xMin(), frustum2D.yMin(), - }; - - for (int i = 0; i < 4; i++) { - float u = uv[i][0]; - float v = uv[i][1]; - - float yaw = (u - .5f) * (HACKY_WIDE_FOV_W * RadiansPerDegree); - float pitch = -(v - .5f) * (HACKY_WIDE_FOV_H * RadiansPerDegree); - - float newX = sin(yaw) * cos(pitch); - float newY = sin(pitch); - float newZ = -cos(yaw) * cos(pitch); - rayDirections[i] = vector3(newX, newY, newZ); - } + if (settings.type == Sample2Dto3DMappingSettings::MappingType::SphericalSection) { + // TODO: undo sample-space padding of tile extents, and calculate correct padding in camera space + float uv[4][2] = { + frustum2D.xMin(), frustum2D.yMax(), frustum2D.xMax(), frustum2D.yMax(), + frustum2D.xMax(), frustum2D.yMin(), frustum2D.xMin(), frustum2D.yMin(), + }; - return RayPacketFrustum3D(nearPoints[0], rayDirections[0], nearPoints[1], rayDirections[1], nearPoints[2], - rayDirections[2], nearPoints[3], rayDirections[3]); -#endif + for (int i = 0; i < 4; i++) { + vector2 uvCurrent = {uv[i][0], uv[i][1]}; + rayDirections[i] = sphericalUVToDirection(uvCurrent, settings.fovXDegrees, settings.fovYDegrees); + } - for (int i = 0; i < 4; ++i) { - // printf("rayDirections[%d] = %s\n", i, toString(rayDirections[i]).c_str()); - } - for (int i = 0; i < 4; ++i) { - // printf("normalize(rayDirections[%d]) = %s\n", i, toString(normalize(rayDirections[i])).c_str()); + return RayPacketFrustum3D(nearPoints[0], rayDirections[0], nearPoints[1], rayDirections[1], nearPoints[2], + rayDirections[2], nearPoints[3], rayDirections[3]); } // Compute extrema points on the focal plane @@ -121,17 +115,11 @@ static RayPacketFrustum3D get3DFrustumFrom2D(const RayPacketFrustum2D& frustum2D farPoints[1] = vector3(farXMax, farYMin, farPlane); farPoints[2] = vector3(farXMax, farYMax, farPlane); farPoints[3] = vector3(farXMin, farYMax, farPlane); - for (int i = 0; i < 4; ++i) { - // printf("farPoints[%d] = %s\n", i, toString(farPoints[i]).c_str()); - } vector3 finalDirections[4]; for (int i = 0; i < 4; ++i) { finalDirections[i] = normalize(farPoints[i] - nearPoints[i]); } - for (int i = 0; i < 4; ++i) { - // printf("finalDirections[%d] = %s\n", i, toString(finalDirections[i]).c_str()); - } return RayPacketFrustum3D(nearPoints[0], finalDirections[0], nearPoints[1], finalDirections[1], nearPoints[2], finalDirections[2], nearPoints[3], finalDirections[3]); @@ -141,49 +129,60 @@ static RayPacketFrustum3D get3DFrustumFrom2D(const RayPacketFrustum2D& frustum2D // When switching to a general 
fit of non-pinhole camera space rays, we'll need to consider how // the ray thickness (majorAxisLength) works in camera space (it's not a uniform thickness in // camera space, unlike sample space). -void SampleHierarchy::populate3DFrom2D(uint32_t blockCount, const matrix3x3& sampleToCamera, ThinLens lens) { - // TODO(mmara) set this on the camera itself? - const float farPlane = -100.0f; - - for (uint32_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) { - const auto& blockFrustum2D = blockFrusta2D[blockIndex]; +void SampleHierarchy::generateFrom2D(const SampleHierarchy2D& hierarchy2D, Sample2Dto3DMappingSettings settings) { + for (uint32_t blockIndex = 0; blockIndex < hierarchy2D.blockFrusta.size(); ++blockIndex) { + const auto& blockFrustum2D = hierarchy2D.blockFrusta[blockIndex]; for (uint32_t tileIndex = 0; tileIndex < TILES_PER_BLOCK; ++tileIndex) { - const auto& frustum2D = tileFrusta2D[blockIndex * TILES_PER_BLOCK + tileIndex]; - tileFrusta3D[blockIndex * TILES_PER_BLOCK + tileIndex] = - get3DFrustumFrom2D(frustum2D, sampleToCamera, lens, farPlane); + const auto& frustum2D = hierarchy2D.tileFrusta[blockIndex * TILES_PER_BLOCK + tileIndex]; + tileFrusta3D[blockIndex * TILES_PER_BLOCK + tileIndex] = get3DFrustumFrom2D(frustum2D, settings); + } + blockFrusta3D[blockIndex] = get3DFrustumFrom2D(blockFrustum2D, settings); + } + for (uint32_t sampleIndex = 0; sampleIndex < hierarchy2D.samples.size(); ++sampleIndex) { + UnpackedSample us = hierarchy2D.samples[sampleIndex]; + DirectionalBeam& ds = directionalSamples[sampleIndex]; + ds.centerRay = settings.sampleToCamera * vector3(us.center, 1.0f); + ds.du = settings.sampleToCamera * vector3(us.majorAxis, 0.0f); + ds.dv = settings.sampleToCamera * vector3(us.minorAxis, 0.0f); + if (settings.type == Sample2Dto3DMappingSettings::MappingType::SphericalSection) { + ds.centerRay = sphericalUVToDirection(us.center, settings.fovXDegrees, settings.fovYDegrees); + ds.du = sphericalUVToDirection(us.center + us.majorAxis, settings.fovXDegrees, settings.fovYDegrees) - + ds.centerRay; + ds.dv = sphericalUVToDirection(us.center + us.minorAxis, settings.fovXDegrees, settings.fovYDegrees) - + ds.centerRay; } - blockFrusta3D[blockIndex] = get3DFrustumFrom2D(blockFrustum2D, sampleToCamera, lens, farPlane); } } -void SampleHierarchy::generate(ArrayView sortedSamples, - uint32_t blockCount, - uint32_t validSampleCount, - const FloatRect& cullRect, - ArrayView blockedSamplePositions, - ArrayView blockedSampleExtents, - ThinLens lens, - const matrix3x3& sampleToCamera) { +SampleHierarchy2D::SampleHierarchy2D(ArrayView sortedSamples, + uint32_t blockCount, + uint32_t validSampleCount, + const FloatRect& cullRect, + ThinLens lens, + const matrix3x3& sampleToCamera) { + (void)lens; + (void)sampleToCamera; uint32_t maxIndex = validSampleCount - 1; uint32_t sampleIndex = 0; RayPacketFrustum2D cullFrustum2D(cullRect.lower.x, cullRect.upper.x, cullRect.lower.y, cullRect.upper.y); + blockFrusta = DynamicArray(blockCount); + tileFrusta = DynamicArray(blockCount * TILES_PER_BLOCK); + samples = DynamicArray(blockCount * BLOCK_SIZE); for (uint32_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) { - auto& blockFrustum2D = blockFrusta2D[blockIndex]; + auto& blockFrustum2D = blockFrusta[blockIndex]; blockFrustum2D.setEmpty(); for (uint32_t tileIndex = 0; tileIndex < TILES_PER_BLOCK; ++tileIndex) { - auto& frustum2D = tileFrusta2D[blockIndex * TILES_PER_BLOCK + tileIndex]; + auto& frustum2D = tileFrusta[blockIndex * TILES_PER_BLOCK + tileIndex]; 
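Editor's note: generateFrom2D maps each 2D sample linearly through sampleToCamera and, for the spherical-section mapping, overwrites the beam with finite differences of the nonlinear mapping: each differential is the mapped neighbor minus the mapped center. The same logic pulled out as a standalone helper (helper name is illustrative; the usual vector2/vector3 operators are assumed):

    // Finite-difference beam construction for any nonlinear uv -> direction mapping,
    // e.g. sphericalUVToDirection with the section's FOV baked in.
    template <typename MapFn>
    DirectionalBeam beamFromMapping(const UnpackedSample& us, MapFn map) {
        DirectionalBeam ds;
        ds.centerRay = map(us.center);
        ds.du = map(us.center + us.majorAxis) - ds.centerRay; // forward difference along the major axis
        ds.dv = map(us.center + us.minorAxis) - ds.centerRay; // forward difference along the minor axis
        return ds;
    }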
frustum2D.setEmpty(); for (uint32_t tileSample = 0; tileSample < TILE_SIZE; tileSample++) { - float x = sortedSamples[sampleIndex].position.x; - float y = sortedSamples[sampleIndex].position.y; - float major = sortedSamples[sampleIndex].extents.majorAxisLength; + auto s = sortedSamples[sampleIndex]; + float x = s.position.x; + float y = s.position.y; + float major = s.extents.majorAxisLength; frustum2D.merge(x + major, y + major); frustum2D.merge(x - major, y - major); - blockedSamplePositions[sampleIndex * 2] = x; - blockedSamplePositions[sampleIndex * 2 + 1] = y; - blockedSampleExtents[sampleIndex] = sortedSamples[sampleIndex].extents; - + samples[sampleIndex] = unpackSample(sortedSamples[sampleIndex]); // Copy the final sample to pad out the block sampleIndex = std::min(sampleIndex + 1, maxIndex); } @@ -191,8 +190,6 @@ void SampleHierarchy::generate(ArrayView sortedSamples, blockFrustum2D.merge(frustum2D); } } - - populate3DFrom2D(blockCount, sampleToCamera, lens); } } // namespace hvvr diff --git a/libraries/hvvr/raycaster/sample_hierarchy.h b/libraries/hvvr/raycaster/sample_hierarchy.h index 290659a..6dd2b0d 100644 --- a/libraries/hvvr/raycaster/sample_hierarchy.h +++ b/libraries/hvvr/raycaster/sample_hierarchy.h @@ -9,28 +9,54 @@ * of patent rights can be found in the PATENTS file in the same directory. */ -#include "traversal.h" -#include "samples.h" +#include "gpu_samples.h" #include "graphics_types.h" +#include "samples.h" +#include "traversal.h" namespace hvvr { +struct SampleHierarchy2D { + DynamicArray tileFrusta; + DynamicArray blockFrusta; + DynamicArray samples; + SampleHierarchy2D() {} + SampleHierarchy2D(ArrayView sortedSamples, + uint32_t blockCount, + uint32_t validSampleCount, + const FloatRect& cullRect, + ThinLens thinLens, + const matrix3x3& sampleToCamera); +}; + +struct Sample2Dto3DMappingSettings { + matrix3x3 sampleToCamera; + ThinLens thinLens; + enum class MappingType { Perspective, SphericalSection }; + MappingType type = MappingType::Perspective; + // Only used for SphericalSection mapping + float fovXDegrees; + float fovYDegrees; + Sample2Dto3DMappingSettings() {} + Sample2Dto3DMappingSettings(matrix3x3 _sampleToCamera, ThinLens _thinLens) + : sampleToCamera(_sampleToCamera), thinLens(_thinLens) {} + static Sample2Dto3DMappingSettings sphericalSection(matrix3x3 _sampleToCamera, + ThinLens _thinLens, + float _fovXDegrees, + float _fovYDegrees) { + Sample2Dto3DMappingSettings result(_sampleToCamera, _thinLens); + result.type = MappingType::SphericalSection; + result.fovXDegrees = _fovXDegrees; + result.fovYDegrees = _fovYDegrees; + return result; + } +}; + struct SampleHierarchy { - DynamicArray tileFrusta2D; - DynamicArray blockFrusta2D; DynamicArray tileFrusta3D; DynamicArray blockFrusta3D; - - void generate(ArrayView sortedSamples, - uint32_t blockCount, - uint32_t validSampleCount, - const FloatRect& cullRect, - ArrayView blockedSamplePositions, - ArrayView blockedSampleExtents, - ThinLens thinLens, - const matrix3x3& sampleToCamera); - - void populate3DFrom2D(uint32_t blockCount, const matrix3x3& sampleToCamera, ThinLens thinLens); + DynamicArray directionalSamples; + void generateFrom2D(const SampleHierarchy2D& hierarchy2D, Sample2Dto3DMappingSettings settings); }; } // namespace hvvr diff --git a/libraries/hvvr/raycaster/samples.cpp b/libraries/hvvr/raycaster/samples.cpp index 015d060..091e253 100644 --- a/libraries/hvvr/raycaster/samples.cpp +++ b/libraries/hvvr/raycaster/samples.cpp @@ -9,7 +9,7 @@ #include "samples.h" #include "constants_math.h" - 
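Editor's note: Sample2Dto3DMappingSettings is what now distinguishes the standard perspective mapping from the wide-FOV spherical section (replacing the old ENABLE_HACKY_WIDE_FOV path). A usage sketch; sampleToCamera and thinLens come from the camera, and the FOV numbers are illustrative:

    // Default perspective mapping:
    Sample2Dto3DMappingSettings persp(sampleToCamera, thinLens);

    // Wide-FOV spherical-section mapping:
    Sample2Dto3DMappingSettings spherical =
        Sample2Dto3DMappingSettings::sphericalSection(sampleToCamera, thinLens, 210.0f, 130.0f);

    // Rebuild the 3D hierarchy from the cached 2D one whenever the mapping changes:
    hierarchy3D.generateFrom2D(hierarchy2D, spherical);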
+#include "foveated.h" #include #include #include @@ -42,52 +42,41 @@ DynamicArray getGridSamples(size_t width, size_t height) { return samples; } -DynamicArray getEyeSpacePolarFoveatedSamples(std::vector& ringEccentricities, - size_t& samplesPerRing, - float maxEyeTrackingUncertaintyDegrees, - float minMAR, - float maxMAR, - float maxFOVDegrees, - float marSlope, - float fovealMARDegrees, - float zenithJitterStrength, - float ringJitterStrength) { - assert(zenithJitterStrength <= 1.0f); - assert(ringJitterStrength <= 1.0f); +DynamicArray getEyeSpacePolarFoveatedSamples(size_t& samplesPerRing, + EccentricityMap& emap, + float maxEyeTrackingUncertaintyDegrees, + float maxFOVDegrees, + float marSlope, + float fovealMARDegrees) { const float m = marSlope; const float w_0 = fovealMARDegrees; + const float switchPoint1 = maxEyeTrackingUncertaintyDegrees / w_0; + const float S = maxEyeTrackingUncertaintyDegrees; + emap = EccentricityMap(marSlope, fovealMARDegrees, maxEyeTrackingUncertaintyDegrees); samplesPerRing = 0; - int ringCount = 0; + size_t ringCount = 0; size_t irregularGridSampleCount = 0; { // Calculate number of samples so we can allocate before generation - // w = m e + w_0 - // e = (w - w_0 / m) - float e = (minMAR - w_0) / m; - float w = w_0; - while (e - w <= maxFOVDegrees) { + float E = 0.0f; + while (E <= maxFOVDegrees * RadiansPerDegree) { // Angular distance (in degrees) between samples on this annulus - w = std::min(maxMAR, m * std::max(e - maxEyeTrackingUncertaintyDegrees, 0.0f) + w_0); - float ringRadius = sinf(e * RadiansPerDegree); + float w = emap.apply(ringCount + 1.0f) - E; + float ringRadius = sinf(E); float angularDistanceAroundRing = 2.0f * Pi * ringRadius; - float angularDistanceAroundRingDegrees = angularDistanceAroundRing / RadiansPerDegree; - size_t samplesOnAnnulus = (size_t)(std::ceil(angularDistanceAroundRingDegrees / w)); - printf("New samplesOnAnnulus: %zu = ceil(%f/%f)\n", samplesOnAnnulus, angularDistanceAroundRingDegrees, w); + size_t samplesOnAnnulus = (size_t)(std::ceil(angularDistanceAroundRing / w)); + printf("New samplesOnAnnulus: %zu = ceil(%f/%f)\n", samplesOnAnnulus, angularDistanceAroundRing, w); samplesPerRing = std::max(samplesPerRing, samplesOnAnnulus); irregularGridSampleCount += samplesOnAnnulus; - e += w; ++ringCount; + E = emap.apply((float)ringCount); } } - printf("(%zu*%d=%d)/%d %f times the minimal sample count\n", samplesPerRing, ringCount, + printf("(%zu*%zu=%d)/%d %f times the minimal sample count\n", samplesPerRing, ringCount, (int)(samplesPerRing * ringCount), (int)irregularGridSampleCount, (samplesPerRing * ringCount) / (float)irregularGridSampleCount); - - int index = 0; - DynamicArray samples(ringCount * samplesPerRing); - float e = (minMAR - w_0) / m; - float w = w_0; + DynamicArray samples(ringCount * samplesPerRing); /** A note on differentials: The zenith differential is found by taking the nearest point on the next concentric ring outwards, and @@ -118,28 +107,19 @@ DynamicArray getEyeSpacePolarFoveatedSamples(std::vector and then scaling it to the distance along the ring to the next sample */ // Generate concentric circles of samples with spacing equal or less than MAR of eye at the eccentricity - - std::uniform_real_distribution uniformRandomDist(0.0f, 1.0f); - std::mt19937 generator; - auto rand = std::bind(uniformRandomDist, std::ref(generator)); - - while (e - w <= maxFOVDegrees) { + float E = 0.0f; + for (size_t r = 0; r < ringCount; ++r) { // Angular distance (in degrees) between samples on this annulus - w = 
std::min(maxMAR, m * std::max(e - maxEyeTrackingUncertaintyDegrees, 0.0f) + w_0); - float ringRadius = sinf(e * RadiansPerDegree); + float E_next = emap.apply(r + 1.0f); + float ringRadius = sinf(E); float angularDistanceAroundRing = 2.0f * Pi * ringRadius; - ringEccentricities.push_back(e * RadiansPerDegree); - float ringRotation = (rand() - 0.5f) * ringJitterStrength; - for (int i = 0; i < samplesPerRing; ++i) { - float zenithJitter = w * (rand() - 0.5f) * zenithJitterStrength * 0.5f; - vector3 baseVector = normalize( - quaternion::fromAxisAngle(vector3(0, 1, 0), (e + zenithJitter) * RadiansPerDegree) * vector3(0, 0, -1)); - vector3 zenithDiffBaseVector = - normalize(quaternion::fromAxisAngle(vector3(0, 1, 0), (e + w + zenithJitter) * RadiansPerDegree) * - vector3(0, 0, -1)); - - float rotationRadians = (i + ringRotation + 0.5f) / float(samplesPerRing) * 2.0f * Pi; + vector3 baseVector = normalize(quaternion::fromAxisAngle(vector3(0, 1, 0), E) * vector3(0, 0, -1)); + vector3 zenithDiffBaseVector = + normalize(quaternion::fromAxisAngle(vector3(0, 1, 0), E_next) * vector3(0, 0, -1)); + + for (size_t i = 0; i < samplesPerRing; ++i) { + float rotationRadians = (i + 0.5f) / float(samplesPerRing) * 2.0f * Pi; vector3 p = normalize(quaternion::fromAxisAngle(vector3(0, 0, -1), rotationRadians) * baseVector); vector3 zenithDiffDirection = normalize(quaternion::fromAxisAngle(vector3(0, 0, -1), rotationRadians) * zenithDiffBaseVector); @@ -150,15 +130,62 @@ DynamicArray getEyeSpacePolarFoveatedSamples(std::vector vector3 zenithDiff = zenithDiffDirectionOnTangentPlane - p; vector3 azimuthalDiffUnit = cross(p, normalize(zenithDiff)); vector3 azimuthalDiff = azimuthalDiffUnit * angularDistanceAroundRing / (float)samplesPerRing; - samples[index].direction = p; - samples[index].zenithDifferential = zenithDiff; - samples[index].azimuthalDifferential = azimuthalDiff; - ++index; + size_t idx = samplesPerRing * r + i; + samples[idx].centerRay = p; + samples[idx].du = zenithDiff; + samples[idx].dv = azimuthalDiff; } - e += w; + E = E_next; } + return samples; } +UnpackedSample unpackSample(Sample s) { + UnpackedSample sample; + // sqrt(2)/2, currently a hack so that the ellipses blobs of diagonally adjacent pixels on a uniform grid are + // tangent +#define EXTENT_MODIFIER 0.70710678118f + sample.center = s.position; + sample.minorAxis.x = s.extents.minorAxis.x * EXTENT_MODIFIER; + sample.minorAxis.y = s.extents.minorAxis.y * EXTENT_MODIFIER; + + // 90 degree Rotation, and rescale + float rescale = s.extents.majorAxisLength * EXTENT_MODIFIER / length(s.extents.minorAxis); + sample.majorAxis.x = -sample.minorAxis.y * rescale; + sample.majorAxis.y = sample.minorAxis.x * rescale; + return sample; +#undef EXTENT_MODIFIER +} + +void saveSamples(const std::vector& samples, const std::string& filename) { + auto file = fopen(filename.c_str(), "wb"); + if (!file) { + hvvr::fail("Unable to open output sample file %s", filename.c_str()); + } + SampleFileHeader header; + header.sampleCount = uint32_t(samples.size()); + fwrite(&header, sizeof(SampleFileHeader), 1, file); + fwrite(&samples[0], sizeof(hvvr::Sample), header.sampleCount, file); + fclose(file); +} + +void loadSamples(hvvr::DynamicArray& samples, const std::string& filename) { + auto file = fopen(filename.c_str(), "rb"); + if (!file) { + hvvr::fail("Unable to find sample file %s\nMake sure to generate them using GenerateSamplesFromDistortion " + "then copy them to this project's folder", + filename.c_str()); + } + SampleFileHeader header; + fread(&header, 
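Editor's note: unpackSample stores only the minor axis plus the major-axis length, so the major axis is recovered by rotating the minor axis 90 degrees, (x, y) -> (-y, x), and rescaling to the stored length; the sqrt(2)/2 factor then shrinks both axes so diagonally adjacent grid samples get tangent ellipses. A quick numeric check of the rotate-and-rescale (the 0.7071 factor is dropped for clarity):

    vector2 minorAxis(0.3f, 0.4f);                                     // length 0.5
    float majorAxisLength = 1.0f;
    float rescale = majorAxisLength / length(minorAxis);               // = 2.0
    vector2 majorAxis(-minorAxis.y * rescale, minorAxis.x * rescale);  // = (-0.8, 0.6)
    // |majorAxis| = 1.0 and dot(majorAxis, minorAxis) = -0.24 + 0.24 = 0:
    // perpendicular to the minor axis, with the requested length.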
sizeof(SampleFileHeader), 1, file); + assert(header.magic == SampleFileHeader().magic); + assert(header.version == SampleFileHeader().version); + samples = hvvr::DynamicArray(header.sampleCount); + fread(samples.data(), sizeof(hvvr::Sample), samples.size(), file); + fclose(file); +} + + } // namespace hvvr diff --git a/libraries/hvvr/raycaster/samples.h b/libraries/hvvr/raycaster/samples.h index 8c42d9c..0579340 100644 --- a/libraries/hvvr/raycaster/samples.h +++ b/libraries/hvvr/raycaster/samples.h @@ -11,10 +11,10 @@ #include "dynamic_array.h" #include "graphics_types.h" - +#include namespace hvvr { - +struct EccentricityMap; struct Sample { vector2ui pixelLocation; vector2 position; @@ -29,6 +29,29 @@ struct Sample { } extents; }; +struct UnpackedSample { + vector2 center; + vector2 majorAxis; + vector2 minorAxis; +}; + +UnpackedSample unpackSample(Sample s); + +// Header for binary file of Samples +struct SampleFileHeader { + uint32_t magic = 0x0CD1; // Oculus Distortion + uint32_t version = 0; + uint32_t reserved0; + uint32_t sampleCount; +}; + +// Save to file using a simple binary format +void loadSamples(hvvr::DynamicArray& samples, const std::string& filename); + +// Load from file using a simple binary format +void saveSamples(const std::vector& samples, const std::string& filename); + + struct SortedSample : Sample { uint32_t channel; @@ -38,14 +61,14 @@ struct SortedSample : Sample { DynamicArray getGridSamples(size_t width, size_t height); -// -Z is along the axis of the eye -struct DirectionSample { - vector3 direction; - vector3 zenithDifferential; - vector3 azimuthalDifferential; +struct DirectionalBeam { + vector3 centerRay; + vector3 du; + vector3 dv; }; /* + Generate eye-space samples matching the eyes resolution with density using the linear model from https://www.microsoft.com/en-us/research/wp-content/uploads/2012/11/foveated_final15.pdf @@ -67,36 +90,25 @@ struct DirectionSample { They use w_0 = 1/48 degrees (20/10 vision is 1/60 degrees) - \param ringStarts: The starting index for every concentric circle (along with the final sample count at the end). - Useful for ad-hoc meshing - \param maxEyeTrackingUncertaintyDegrees If set to > 0 then e acts like max(e - maxEyeTrackingUncertaintyDegrees, 0). This ensures that we conservatively calculate the sampling positions so that we don't accidentally undersample regions of the screen due to error in eye tracking. The higher this value, the more samples are generated. - \param minMAR We do not generate samples for regions of the distribution with lower calculated MAR than minMAR. - Useful if you want to render the region of the screen you can fully resolve with an exact technique instead of - reconstructing from these foveated samples - \param maxMAR We do not space samples any further away than the maxMAR in degrees \param maxFOVDegrees is the diagonal FOV of the display, we generate samples such that even if you are looking in the far corner of the display, there is one ring of samples beyond the opposite corner of the display, so that the entire screen can still be reconstructed from the foveate samples without issue. 
\param marSlope corresponds to m, we default it to the more conservative m_B \param fovealMARDegrees corresponds to w_0, we default it to 1/60 degrees (20/10) to be even more conservative than the microsoft paper - \param zenithJitterStrength How much to jitter each sample along the zenith direction - \param ringJitterStrength How much to jitter each ring along the azimuthal direction - (TODO: support to jitter samples individually along the azimuthal direction) + + Consider replacing this space with a transformed log-polar coordinate system: + Weiman, Chaikin, Logarithmic Spiral Grids for Image Processing and Display, Computer Graphics and Image Processing + 11, 197–226 (1979). */ -// Use the model from getEyeSpaceFoveatedSamples, but require identical number of samples around each annulus -DynamicArray getEyeSpacePolarFoveatedSamples(std::vector& ringEccentricities, - size_t& samplesPerRing, - float maxEyeTrackingUncertaintyDegrees, - float minMAR, - float maxMAR, - float maxFOVDegrees, - float marSlope, - float fovealMARDegrees, - float zenithJitterStrength, - float ringJitterStrength); +DynamicArray getEyeSpacePolarFoveatedSamples(size_t& samplesPerRing, + EccentricityMap& emap, + float maxEyeTrackingUncertaintyDegrees, + float maxFOVDegrees, + float marSlope, + float fovealMARDegrees); } // namespace hvvr diff --git a/libraries/hvvr/raycaster/scene.cpp b/libraries/hvvr/raycaster/scene.cpp index 6da4023..3f3354e 100644 --- a/libraries/hvvr/raycaster/scene.cpp +++ b/libraries/hvvr/raycaster/scene.cpp @@ -25,6 +25,7 @@ void Raycaster::cleanupScene() { DestroyAllTextures(); } + Texture* Raycaster::createTexture(const TextureData& textureData) { _textures.emplace_back(std::make_unique(textureData)); return (_textures.end() - 1)->get(); diff --git a/libraries/hvvr/raycaster/scene_update.cpp b/libraries/hvvr/raycaster/scene_update.cpp index 5c7e250..288a567 100644 --- a/libraries/hvvr/raycaster/scene_update.cpp +++ b/libraries/hvvr/raycaster/scene_update.cpp @@ -151,7 +151,6 @@ void Raycaster::uploadScene() { // if we update CUDA's copy of the scene, we must also call AnimateScene to supply the transforms gpuSceneState.setGeometry(*this); - // TODO(anankervis): don't dynamically allocate memory here DynamicArray modelToWorld(_models.size()); for (size_t i = 0; i < _models.size(); ++i) modelToWorld[i] = matrix4x4(_models[i]->getTransform()); diff --git a/libraries/hvvr/raycaster/shading_helpers.h b/libraries/hvvr/raycaster/shading_helpers.h index c4cedb0..df74b33 100644 --- a/libraries/hvvr/raycaster/shading_helpers.h +++ b/libraries/hvvr/raycaster/shading_helpers.h @@ -199,6 +199,14 @@ CHDI uint32_t ToColor4Unorm8SRgb(vector4 color) { uint32_t a = uint32_t(color.w * 255.0f); return (r) | (g << 8) | (b << 16) | (a << 24); } +CHDI uint64_t ToColor4Unorm16(vector4 color) { + const float C = 65535.0f; // 2^16-1 + uint64_t r = uint64_t(color.x * C); + uint64_t g = uint64_t(color.y * C); + uint64_t b = uint64_t(color.z * C); + uint64_t a = uint64_t(color.w * C); + return (r) | (g << 16) | (b << 32) | (a << 48); +} CHDI vector4 FromColor4Unorm8(uint32_t c) { float r = (c & 0xFF) / 255.0f; diff --git a/libraries/hvvr/raycaster/sort.h b/libraries/hvvr/raycaster/sort.h index de5f288..9ff826d 100644 --- a/libraries/hvvr/raycaster/sort.h +++ b/libraries/hvvr/raycaster/sort.h @@ -17,18 +17,19 @@ CUDA_DEVICE_INL uint32_t FloatFlip(uint32_t f) { return f ^ mask; } CUDA_DEVICE_INL uint32_t IFloatFlip(uint32_t f) { - uint32_t mask = ((f >> 31) - 1) | 0x80000000; + // TODO: CUDA 8.0 -> 9.1 transition... 
PTX is OK (unchanged), SASS is busted + //uint32_t mask = ((f >> 31) - 1) | 0x80000000; + + // this seems to work on 9.1 + uint32_t mask = ((f & 0x80000000) == 0) ? 0xffffffff : 0x80000000; + return f ^ mask; } CUDA_DEVICE_INL uint32_t FloatFlipF(float f) { - int fAsInt = __float_as_int(f); - uint32_t fAsUInt = *((uint32_t*)&f); - return FloatFlip(fAsUInt); + return FloatFlip(__float_as_int(f)); } CUDA_DEVICE_INL float IFloatFlipF(uint32_t f) { - uint32_t resU = IFloatFlip(f); - int32_t resI = *((uint32_t*)&f); - return __int_as_float(resI); + return __int_as_float(IFloatFlip(f)); } // bitonic sort within a single thread, see: diff --git a/libraries/hvvr/raycaster/texture.cu b/libraries/hvvr/raycaster/texture.cu index e4439d5..3feb825 100644 --- a/libraries/hvvr/raycaster/texture.cu +++ b/libraries/hvvr/raycaster/texture.cu @@ -43,6 +43,8 @@ static CudaFormatDescriptor formatToDescriptor(TextureFormat format) { return {8u, 8u, 8u, 8u, cudaChannelFormatKindUnsigned, cudaReadModeNormalizedFloat, true, 4}; case TextureFormat::r8g8b8a8_unorm: return {8u, 8u, 8u, 8u, cudaChannelFormatKindUnsigned, cudaReadModeNormalizedFloat, false, 4}; + case TextureFormat::r16g16b16a16_unorm: + return{ 16u, 16u, 16u, 16u, cudaChannelFormatKindUnsigned, cudaReadModeNormalizedFloat, false, 4 }; case TextureFormat::r32g32b32a32_float: return {32u, 32u, 32u, 32u, cudaChannelFormatKindFloat, cudaReadModeElementType, false, 16}; case TextureFormat::r16g16b16a16_float: @@ -241,8 +243,8 @@ uint32_t CreateTexture(const TextureData& textureData) { texDesc.maxAnisotropy = 8; printf("width: %u, height: %u, stride: %u, elementSize: %u\n", textureData.width, textureData.height, - textureData.stride, desc.elementSize); - cutilSafeCall(cudaMemcpy2DToArray(tex.d_rawMemory, 0, 0, textureData.data, textureData.stride * desc.elementSize, + textureData.strideElements, desc.elementSize); + cutilSafeCall(cudaMemcpy2DToArray(tex.d_rawMemory, 0, 0, textureData.data, textureData.strideElements * desc.elementSize, textureData.width * desc.elementSize, textureData.height, cudaMemcpyHostToDevice)); @@ -276,6 +278,7 @@ void DestroyAllTextures() { cutilSafeCall(cudaDestroyTextureObject(gTextureAtlas[i].d_texObject)); } cutilSafeCall(cudaFree(gDeviceTextureArray)); + gTextureCount = 0; } Texture2D createEmptyTexture(uint32_t width, @@ -318,4 +321,17 @@ Texture2D createEmptyTexture(uint32_t width, return tex; } +CUDA_KERNEL void ClearKernel(Texture2D tex) { + uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < tex.width*tex.elementSize && y < tex.height) { + surf2Dwrite(0, tex.d_surfaceObject, x, y); + } +} + +void clearTexture(Texture2D tex) { + KernelDim dim(tex.width*tex.elementSize, tex.height, 16, 8); + ClearKernel<<>>(tex); +} + } // namespace hvvr diff --git a/libraries/hvvr/raycaster/texture.h b/libraries/hvvr/raycaster/texture.h index 972d753..e5efeea 100644 --- a/libraries/hvvr/raycaster/texture.h +++ b/libraries/hvvr/raycaster/texture.h @@ -19,6 +19,7 @@ enum class TextureFormat : uint32_t { none = 0, r8g8b8a8_unorm_srgb, r8g8b8a8_unorm, + r16g16b16a16_unorm, r32g32b32a32_float, r16g16b16a16_float, r11g11b10_float, @@ -32,7 +33,7 @@ struct TextureData { TextureFormat format; uint32_t width; uint32_t height; - uint32_t stride; // TODO(anankervis): rename to strideElements + uint32_t strideElements; }; class Texture { @@ -60,6 +61,8 @@ inline size_t getTextureSize(uint32_t strideElements, uint32_t height, TextureFo return elementCount * 4; case 
TextureFormat::r8g8b8a8_unorm: return elementCount * 4; + case TextureFormat::r16g16b16a16_unorm: + return elementCount * 8; case TextureFormat::r32g32b32a32_float: return elementCount * 16; case TextureFormat::r16g16b16a16_float: diff --git a/libraries/hvvr/raycaster/texture_internal.h b/libraries/hvvr/raycaster/texture_internal.h index 5f72985..4d4195c 100644 --- a/libraries/hvvr/raycaster/texture_internal.h +++ b/libraries/hvvr/raycaster/texture_internal.h @@ -35,4 +35,6 @@ Texture2D createEmptyTexture(uint32_t width, cudaTextureAddressMode yWrapMode, bool linearFilter = true); +void clearTexture(Texture2D tex); + } // namespace hvvr diff --git a/libraries/hvvr/raycaster/tile_data.h b/libraries/hvvr/raycaster/tile_data.h index 18f4a68..b6900a7 100644 --- a/libraries/hvvr/raycaster/tile_data.h +++ b/libraries/hvvr/raycaster/tile_data.h @@ -14,40 +14,42 @@ namespace hvvr { -struct TileData { - vector3 rayOrigin; - vector3 majorDirDiff; - vector3 minorDirDiff; +template +struct TileData; + - CUDA_DEVICE void load(matrix4x4 sampleToWorld, UnpackedDirectionalSample sample) { - rayOrigin = vector3(sampleToWorld * vector4(0.0f, 0.0f, 0.0f, 1.0f)); +template <> +struct TileData { + vector3 rayOrigin; + vector3 dirDu; + vector3 dirDv; - majorDirDiff = sample.majorDirDiff; - minorDirDiff = sample.minorDirDiff; + CUDA_DEVICE_INL void load(matrix4x4 cameraToWorld, DirectionalBeam sample) { + rayOrigin = vector3(cameraToWorld.m3); + dirDu = sample.du; + dirDv = sample.dv; } }; -struct TileDataDoF { +template <> +struct TileData { vector2 focalToLensScale; - - vector3 lensCenter; vector3 lensU; vector3 lensV; - CUDA_DEVICE void load(SampleInfo sampleInfo, matrix4x4 sampleToWorld, uint32_t sampleOffset) { - UnpackedSample sample2D = GetFullSample(sampleOffset, sampleInfo); - matrix3x3 sampleToWorldRotation = matrix3x3(sampleToWorld); - - lensCenter = vector3(sampleToWorld * vector4(0.0f, 0.0f, 0.0f, 1.0f)); + CUDA_DEVICE void load(CameraBeams cameraBeams, matrix4x4 cameraToWorld, DirectionalBeam sample) { + matrix3x3 cameraToWorldRotation = matrix3x3(cameraToWorld); - // actual focal derivatives should be multiplier by sampleInfo.lens.focalDistance (see scale below) - vector3 focalU = sampleToWorldRotation * vector3(sample2D.majorAxis.x, sample2D.majorAxis.y, 0.0f); - vector3 focalV = sampleToWorldRotation * vector3(sample2D.minorAxis.x, sample2D.minorAxis.y, 0.0f); + // actual focal derivatives should be multiplier by cameraBeams.lens.focalDistance (see scale below) + vector3 focalU = + cameraToWorldRotation * sample.du; // vector3(sample2D.majorAxis.x, sample2D.majorAxis.y, 0.0f); + vector3 focalV = + cameraToWorldRotation * sample.dv; // vector3(sample2D.minorAxis.x, sample2D.minorAxis.y, 0.0f); // If we force the lens and focal planes to be parallel, and their derivatives to be identical except for a // scale factor, we can optimize the inner loop and test. 
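// Roughly, under that constraint the lens basis (lensU, lensV) is just the focal basis (focalU, focalV) rescaled to // the lens radius (on the order of focalU * lensRadius * focalUMagInv), so a single per-axis factor // (focalToLensScale) relates lens-disk offsets to focal-plane offsets, and the per-ray depth-of-field test reduces // to a couple of multiplies rather than a full change of basis. 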
- float focalDistance = sampleInfo.lens.focalDistance; - float lensRadius = sampleInfo.lens.radius; + float focalDistance = cameraBeams.lens.focalDistance; + float lensRadius = cameraBeams.lens.radius; float focalUMagInv = rsqrtf(dot(focalU, focalU)); float focalVMagInv = rsqrtf(dot(focalV, focalV)); float focalUMag = 1.0f / focalUMagInv; diff --git a/libraries/hvvr/raycaster/traversal.cpp b/libraries/hvvr/raycaster/traversal.cpp index 96aea30..eabe0ba 100644 --- a/libraries/hvvr/raycaster/traversal.cpp +++ b/libraries/hvvr/raycaster/traversal.cpp @@ -29,14 +29,22 @@ namespace hvvr { // TODO(anankervis): optimize this - it gets called a lot // find the major axis, and precompute some values for the intersection tests void RayPacketFrustum3D::updatePrecomputed() { - plane[0] = vector4(cross(pointDir[0], pointDir[1]), 0.0f); - plane[1] = vector4(cross(pointDir[1], pointDir[2]), 0.0f); - plane[2] = vector4(cross(pointDir[2], pointDir[3]), 0.0f); - plane[3] = vector4(cross(pointDir[3], pointDir[0]), 0.0f); - plane[0].w = dot(pointOrigin[0], vector3(plane[0])); - plane[1].w = dot(pointOrigin[1], vector3(plane[1])); - plane[2].w = dot(pointOrigin[2], vector3(plane[2])); - plane[3].w = dot(pointOrigin[3], vector3(plane[3])); + // If we have an infinite origin, assume a fully degenerate frustum that encompasses nothing. + if (isinf(pointOrigin[0].x)) { + plane[0] = vector4(0, 0, 0, -INFINITY); + plane[1] = vector4(0, 0, 0, -INFINITY); + plane[2] = vector4(0, 0, 0, -INFINITY); + plane[3] = vector4(0, 0, 0, -INFINITY); + } else { + plane[0] = vector4(cross(pointDir[0], pointDir[1]), 0.0f); + plane[1] = vector4(cross(pointDir[1], pointDir[2]), 0.0f); + plane[2] = vector4(cross(pointDir[2], pointDir[3]), 0.0f); + plane[3] = vector4(cross(pointDir[3], pointDir[0]), 0.0f); + plane[0].w = dot(pointOrigin[0], vector3(plane[0])); + plane[1].w = dot(pointOrigin[1], vector3(plane[1])); + plane[2].w = dot(pointOrigin[2], vector3(plane[2])); + plane[3].w = dot(pointOrigin[3], vector3(plane[3])); + } vector3 avgDir = pointDir[0] + pointDir[1] + pointDir[2] + pointDir[3]; if (fabsf(avgDir.x) > fabsf(avgDir.y)) { @@ -417,6 +425,9 @@ void StackFrameBlock::sort(uint32_t stackSize) { static __forceinline uint32_t blockCull3D(StackFrameBlock* frame, const BVHNode* node, const RayPacketFrustum3D& frustum) { + if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) { + return 0; + } auto negateMask = m256(-0.0f); uint32_t top = 0; auto tMin = 0.0f; @@ -512,6 +523,10 @@ static __forceinline uint32_t tileCull3D(uint32_t* triIndices, StackFrameTile* frame, uint32_t top, const RayPacketFrustum3D& frustum) { + (void)maxTriCount; + if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) { + return 0; + } auto negateMask = m256(-0.0f); auto tMin = 0.0f; uint32_t triCount = 0; @@ -680,8 +695,11 @@ struct TaskData { } }; -static void cullThread( - const BlockInfo& blockInfo, uint32_t startBlock, uint32_t endBlock, const BVHNode* nodes, TaskData* perThread) { +static void cullThread(const RayHierarchy& rayHierarchy, + uint32_t startBlock, + uint32_t endBlock, + const BVHNode* nodes, + TaskData* perThread) { #if DEBUG_STATS auto startTime = (double)__rdtsc(); #endif @@ -694,7 +712,7 @@ static void cullThread( StackFrameBlock frameBlock; for (uint32_t b = startBlock; b < endBlock; ++b) { - const RayPacketFrustum3D& blockFrustum = blockInfo.blockFrusta[b]; + const RayPacketFrustum3D& blockFrustum = rayHierarchy.blockFrusta[b]; uint32_t stackSize = blockCull3D(&frameBlock, nodes, blockFrustum); if (!stackSize) { // we hit 
nothing? @@ -719,7 +737,7 @@ static void cullThread( uint32_t* triIndices = perThread->triIndices.data() + perThread->triIndexCount; uint32_t maxTriCount = uint32_t(perThread->triIndices.size()) - perThread->triIndexCount; - const RayPacketFrustum3D& tileFrustum = blockInfo.tileFrusta[globalTileIndex]; + const RayPacketFrustum3D& tileFrustum = rayHierarchy.tileFrusta[globalTileIndex]; uint32_t outputTriCount = tileCull3D(triIndices, maxTriCount, &frameTile, stackSize, tileFrustum); if (outputTriCount) { @@ -741,9 +759,9 @@ static void cullThread( std::vector blockFrustaAngle(endBlock - startBlock); std::vector tileFrustaAngle((endBlock - startBlock) * TILES_PER_BLOCK); for (size_t b = startBlock; b < endBlock; ++b) { - blockFrustaAngle[b - startBlock] = solidAngle(blockInfo.blockFrusta[b]); + blockFrustaAngle[b - startBlock] = solidAngle(rayHierarchy.blockFrusta[b]); for (size_t t = 0; t < TILES_PER_BLOCK; ++t) { - tileFrustaAngle[t] = solidAngle(blockInfo.tileFrusta[b * TILES_PER_BLOCK + t]); + tileFrustaAngle[t] = solidAngle(rayHierarchy.tileFrusta[b * TILES_PER_BLOCK + t]); } } size_t validBlocks, validTiles; @@ -757,24 +775,24 @@ static void cullThread( #endif } -void Raycaster::buildTileTriangleLists(const BlockInfo& blockInfo, Camera_StreamedData* streamed) { +void Raycaster::buildTileTriangleLists(const RayHierarchy& rayHierarchy, Camera_StreamedData* streamed) { const BVHNode* nodes = _nodes.data(); ArrayView triIndices(streamed->triIndices.dataHost(), streamed->triIndices.size()); #if DEBUG_STATS - std::vector blockFrustaAngle(blockInfo.blockFrusta.size()); - for (int i = 0; i < blockInfo.blockFrusta.size(); ++i) { - blockFrustaAngle[i] = solidAngle(blockInfo.blockFrusta[i]); + std::vector blockFrustaAngle(rayHierarchy.blockFrusta.size()); + for (int i = 0; i < rayHierarchy.blockFrusta.size(); ++i) { + blockFrustaAngle[i] = solidAngle(rayHierarchy.blockFrusta[i]); } size_t validBlocks; vector4 m4X = minMaxMeanMedian(blockFrustaAngle, validBlocks); printf("Block: Min,Max,Mean,Median: %g, %g, %g, %g\n", m4X.x, m4X.y, m4X.z, m4X.w); printf("Percent of sphere covered by block frusta: %g\n", 100.0 * m4X.z * validBlocks / (4 * M_PI)); - std::vector tileFrustaAngle(blockInfo.tileFrusta.size()); - for (int i = 0; i < blockInfo.tileFrusta.size(); ++i) { + std::vector tileFrustaAngle(rayHierarchy.tileFrusta.size()); + for (int i = 0; i < rayHierarchy.tileFrusta.size(); ++i) { // Convert to square degrees from steradians - tileFrustaAngle[i] = solidAngle(blockInfo.tileFrusta[i]); + tileFrustaAngle[i] = solidAngle(rayHierarchy.tileFrusta[i]); } size_t validTiles; m4X = minMaxMeanMedian(tileFrustaAngle, validTiles); @@ -792,7 +810,7 @@ void Raycaster::buildTileTriangleLists(const BlockInfo& blockInfo, Camera_Stream // 2 seems the fastest (though this could vary depending on the scene and sample distribution) // 3+ seems to become less efficient due to workload balancing enum { blocksPerThread = 2 }; - uint32_t blockCount = uint32_t(blockInfo.blockFrusta.size()); + uint32_t blockCount = uint32_t(rayHierarchy.blockFrusta.size()); uint32_t numTasks = (blockCount + blocksPerThread - 1) / blocksPerThread; assert(numTasks <= maxTasks); numTasks = min(maxTasks, numTasks); @@ -811,8 +829,7 @@ void Raycaster::buildTileTriangleLists(const BlockInfo& blockInfo, Camera_Stream if (i == numTasks - 1) assert(endBlock == blockCount); - taskResults[i] = _threadPool->addTask(cullThread, blockInfo, startBlock, endBlock, - nodes, &taskData[i]); + taskResults[i] = _threadPool->addTask(cullThread, rayHierarchy, 
startBlock, endBlock, nodes, &taskData[i]); } #if DEBUG_STATS diff --git a/libraries/hvvr/raycaster/traversal.h b/libraries/hvvr/raycaster/traversal.h index 0b6d1cf..a833ea2 100644 --- a/libraries/hvvr/raycaster/traversal.h +++ b/libraries/hvvr/raycaster/traversal.h @@ -10,12 +10,12 @@ */ #include "bvh_node.h" +#include "dynamic_array.h" #include "graphics_types.h" #include "magic_constants.h" #include "raycaster_common.h" #include "samples.h" #include "vector_math.h" -#include "dynamic_array.h" #include @@ -85,6 +85,9 @@ struct RayPacketFrustum2D { // -where p are the vertices of a polyhedron // -then p[k] is outside the plane // -if all p are outside any single plane, then the polyhedron is rejected +// +// We allow for specifying a fully-culling degenerate frustum by setting the plane offsets b[i] to -infinity. +// (If the b[i] are -infinity, then the frustum rejects all intersection tests.) struct RayPacketFrustum3D { // no near or far plane enum { planeCount = 4 }; @@ -177,11 +180,10 @@ struct RayPacketFrustum3D { // --as opposed to recomputing all the cross products and such, use transpose of inverse for normals/planes RayPacketFrustum3D transform(const matrix4x4& m, const matrix4x4& mInvTranspose) const { (void)mInvTranspose; - return RayPacketFrustum3D( - vector3(m * vector4(pointOrigin[0], 1.0f)), matrix3x3(m) * pointDir[0], - vector3(m * vector4(pointOrigin[1], 1.0f)), matrix3x3(m) * pointDir[1], - vector3(m * vector4(pointOrigin[2], 1.0f)), matrix3x3(m) * pointDir[2], - vector3(m * vector4(pointOrigin[3], 1.0f)), matrix3x3(m) * pointDir[3]); + return RayPacketFrustum3D(vector3(m * vector4(pointOrigin[0], 1.0f)), matrix3x3(m) * pointDir[0], + vector3(m * vector4(pointOrigin[1], 1.0f)), matrix3x3(m) * pointDir[1], + vector3(m * vector4(pointOrigin[2], 1.0f)), matrix3x3(m) * pointDir[2], + vector3(m * vector4(pointOrigin[3], 1.0f)), matrix3x3(m) * pointDir[3]); } // intersect against the four child AABBs, corresponding bit index is set if the AABB passes @@ -202,7 +204,7 @@ struct RayPacketFrustum3D { } }; -struct BlockInfo { +struct RayHierarchy { ArrayView blockFrusta; ArrayView tileFrusta; }; diff --git a/libraries/hvvr/raycaster/warp_ops.h b/libraries/hvvr/raycaster/warp_ops.h index c5fdc2c..c990bc8 100644 --- a/libraries/hvvr/raycaster/warp_ops.h +++ b/libraries/hvvr/raycaster/warp_ops.h @@ -24,13 +24,13 @@ CUDA_DEVICE_INL int laneGetMaskLT() { template CUDA_DEVICE_INL T laneBroadcast(T v, int laneIndex) { - return __shfl(v, laneIndex); + return __shfl_sync(__activemask(), v, laneIndex); } // avoid calling this multiple times for the same value of pred // the compiler doesn't like to optimize this intrinsic CUDA_DEVICE_INL int warpBallot(bool pred) { - return __ballot(pred); + return __ballot_sync(__activemask(), pred); } CUDA_DEVICE_INL int warpGetFirstActiveIndex(int predMask) { @@ -69,6 +69,6 @@ CUDA_DEVICE_INL T warpAppend(bool pred, T* counter) { template CUDA_DEVICE_INL T warpReduce(T val, const Op& op) { for (int xorMask = WARP_SIZE / 2; xorMask >= 1; xorMask /= 2) - val = op(val, __shfl_xor(val, xorMask)); + val = op(val, __shfl_xor_sync(__activemask(), val, xorMask)); return val; } diff --git a/libraries/hvvr/samples_shared/model_import_bin.cpp b/libraries/hvvr/samples_shared/model_import_bin.cpp index db9afc9..215bb72 100644 --- a/libraries/hvvr/samples_shared/model_import_bin.cpp +++ b/libraries/hvvr/samples_shared/model_import_bin.cpp @@ -28,14 +28,14 @@ struct BinTexture { BinTexture() {} explicit BinTexture(const Texture& tex) - : format(tex.tex.format), width(tex.tex.width), 
height(tex.tex.height), stride(tex.tex.stride) {} + : format(tex.tex.format), width(tex.tex.width), height(tex.tex.height), stride(tex.tex.strideElements) {} explicit operator Texture() const { hvvr::TextureData texDesc = {}; texDesc.format = format; texDesc.width = width; texDesc.height = height; - texDesc.stride = stride; + texDesc.strideElements = stride; return Texture(texDesc); } @@ -88,7 +88,7 @@ bool loadBin(const char* path, Model& model) { fread(&binTex, sizeof(binTex), 1, file); Texture tex(binTex); - size_t sizeBytes = hvvr::getTextureSize(tex.tex.stride, tex.tex.height, tex.tex.format); + size_t sizeBytes = hvvr::getTextureSize(tex.tex.strideElements, tex.tex.height, tex.tex.format); tex.tex.data = new uint8_t [sizeBytes]; fread((void*)tex.tex.data, sizeBytes, 1, file); diff --git a/libraries/hvvr/samples_shared/samples_shared.props b/libraries/hvvr/samples_shared/samples_shared.props index e043264..27adc0c 100644 --- a/libraries/hvvr/samples_shared/samples_shared.props +++ b/libraries/hvvr/samples_shared/samples_shared.props @@ -1,4 +1,4 @@ - + diff --git a/libraries/hvvr/samples_shared/samples_shared.vcxproj b/libraries/hvvr/samples_shared/samples_shared.vcxproj index 6b4ef26..d1a9a0a 100644 --- a/libraries/hvvr/samples_shared/samples_shared.vcxproj +++ b/libraries/hvvr/samples_shared/samples_shared.vcxproj @@ -1,4 +1,4 @@ - + @@ -52,4 +52,4 @@ - \ No newline at end of file + diff --git a/libraries/hvvr/samples_shared/window_d3d11.cpp b/libraries/hvvr/samples_shared/window_d3d11.cpp index 780e395..7d34137 100644 --- a/libraries/hvvr/samples_shared/window_d3d11.cpp +++ b/libraries/hvvr/samples_shared/window_d3d11.cpp @@ -219,7 +219,7 @@ void WindowD3D11::onResize() { initRenderTargets(); if (_resizeCallback) { - _resizeCallback(); + _resizeCallback(_width, _height); } } diff --git a/libraries/hvvr/samples_shared/window_d3d11.h b/libraries/hvvr/samples_shared/window_d3d11.h index 3e023c3..ec42df6 100644 --- a/libraries/hvvr/samples_shared/window_d3d11.h +++ b/libraries/hvvr/samples_shared/window_d3d11.h @@ -10,7 +10,7 @@ */ #include - +#include struct IDXGISwapChain; struct ID3D11Device; struct ID3D11DeviceContext; @@ -19,16 +19,24 @@ struct ID3D11RenderTargetView; class WindowD3D11 { public: - typedef void(*ResizeCallback)(); - typedef void(*RawMouseInputCallback)(int dx, int dy); + typedef std::function ResizeCallback; + typedef std::function RawMouseInputCallback; WindowD3D11(const char* name, uint32_t width, uint32_t height, - ResizeCallback resizeCallback, - RawMouseInputCallback rawMouseInputCallback); + ResizeCallback resizeCallback = nullptr, + RawMouseInputCallback rawMouseInputCallback = nullptr); ~WindowD3D11(); + void setResizeCallback(ResizeCallback cb) { + _resizeCallback = cb; + } + + void setRawMouseInputCallback(RawMouseInputCallback cb) { + _rawMouseInputCallback = cb; + } + void* getWindowHandle() const { return _windowHandle; } @@ -51,6 +59,9 @@ class WindowD3D11 { ID3D11Texture2D* getBackBufferTex() const { return _backBufferTex; } + ID3D11RenderTargetView* getBackBufferRTV() const { + return _backBufferRTV; + } ID3D11Texture2D* getRenderTargetTex() const { return _renderTargetTex; } diff --git a/libraries/hvvr/shared/cuda_decl.h b/libraries/hvvr/shared/cuda_decl.h index 71065ef..963a9eb 100644 --- a/libraries/hvvr/shared/cuda_decl.h +++ b/libraries/hvvr/shared/cuda_decl.h @@ -13,13 +13,14 @@ #if defined(__CUDA_ARCH__) // we're compiling for the GPU target -# define CUDA_COMPILE 1 -# define CUDA_ARCH __CUDA_ARCH__ +#define CUDA_COMPILE 1 +#define CUDA_ARCH 
__CUDA_ARCH__ #else -# define CUDA_COMPILE 0 -# define CUDA_ARCH 0 +#define CUDA_COMPILE 0 +#define CUDA_ARCH 0 #endif +#define CUDA_HOST __host__ #define CUDA_DEVICE __device__ #define CUDA_DEVICE_INL inline CUDA_DEVICE #define CUDA_HOST_DEVICE __host__ __device__ diff --git a/libraries/hvvr/shared/graphics_types.h b/libraries/hvvr/shared/graphics_types.h index 42a4d03..935bbb4 100644 --- a/libraries/hvvr/shared/graphics_types.h +++ b/libraries/hvvr/shared/graphics_types.h @@ -47,7 +47,7 @@ struct ImageViewR8G8B8A8 { size_t stride = 0; }; -enum class PixelFormat { RGBA8_SRGB, RGBA32F }; +enum class PixelFormat { RGBA8_SRGB, RGBA16, RGBA32F }; // A simple image resource handle: Does not own any memory. // Can wrap an arbitrary region in a 2-D buffer. @@ -185,6 +185,11 @@ struct SimpleRayFrustum { vector3 directions[4]; }; +struct Plane { + vector3 normal; + float dist; +}; + // A precomputed triangle, optimized for intersection. struct PrecomputedTriangleIntersect { vector3 v0; diff --git a/libraries/hvvr/shared/shared.props b/libraries/hvvr/shared/shared.props index 3ad20fe..0dd179c 100644 --- a/libraries/hvvr/shared/shared.props +++ b/libraries/hvvr/shared/shared.props @@ -1,4 +1,4 @@ - + diff --git a/libraries/hvvr/shared/shared.vcxproj b/libraries/hvvr/shared/shared.vcxproj index 9f85456..20b3ad0 100644 --- a/libraries/hvvr/shared/shared.vcxproj +++ b/libraries/hvvr/shared/shared.vcxproj @@ -1,4 +1,4 @@ - + @@ -15,6 +15,7 @@ + @@ -30,6 +31,7 @@ + @@ -57,4 +59,4 @@ - \ No newline at end of file + diff --git a/libraries/hvvr/shared/vector_math.h b/libraries/hvvr/shared/vector_math.h index 0125c6f..371d325 100644 --- a/libraries/hvvr/shared/vector_math.h +++ b/libraries/hvvr/shared/vector_math.h @@ -15,15 +15,15 @@ // CUDA vector types #include #if CUDA_COMPILE -# include -# include +#include +#include #endif #include #include #ifdef _MSC_VER -# include // _CountLeadingZeros +#include // _CountLeadingZeros #endif namespace hvvr { @@ -42,13 +42,6 @@ struct vector2 { return {x, y}; } - CHD const float& operator[](size_t index) const { - return *(&x + index); - } - CHD float& operator[](size_t index) { - return *(&x + index); - } - CHD vector2 operator-() const { return vector2(-x, -y); } @@ -157,9 +150,7 @@ CHDI vector2 max(const vector2& a, float b) { return vector2(max(a.x, b), max(a.y, b)); } CHDI vector2 clamp(const vector2& a, const vector2& lower, const vector2& upper) { - return vector2( - clamp(a.x, lower.x, upper.x), - clamp(a.y, lower.y, upper.y)); + return vector2(clamp(a.x, lower.x, upper.x), clamp(a.y, lower.y, upper.y)); } CHDI vector2 clamp(const vector2& a, float lower, float upper) { return vector2(clamp(a.x, lower, upper), clamp(a.y, lower, upper)); @@ -191,13 +182,6 @@ struct vector3 { return {x, y, z}; } - CHD const float& operator[](size_t index) const { - return *(&x + index); - } - CHD float& operator[](size_t index) { - return *(&x + index); - } - CHD vector3 operator-() const { return vector3(-x, -y, -z); } @@ -261,6 +245,14 @@ struct vector3 { CHD bool operator!=(const vector3& v) const { return x != v.x || y != v.y || z != v.z; } + + // Don't use in kernels! 
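+ // (dynamic indexing through (&x + index) makes the vector addressable, which can force it out of registers + // and into local memory in device code) 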
+ CUDA_HOST const float& operator[](size_t index) const { + return *(&x + index); + } + CUDA_HOST float& operator[](size_t index) { + return *(&x + index); + } }; CHDI vector3 operator+(float a, const vector3& b) { @@ -306,10 +298,7 @@ CHDI vector3 max(const vector3& a, float b) { return vector3(max(a.x, b), max(a.y, b), max(a.z, b)); } CHDI vector3 clamp(const vector3& a, const vector3& lower, const vector3& upper) { - return vector3( - clamp(a.x, lower.x, upper.x), - clamp(a.y, lower.y, upper.y), - clamp(a.z, lower.z, upper.z)); + return vector3(clamp(a.x, lower.x, upper.x), clamp(a.y, lower.y, upper.y), clamp(a.z, lower.z, upper.z)); } CHDI vector3 clamp(const vector3& a, float lower, float upper) { return vector3(clamp(a.x, lower, upper), clamp(a.y, lower, upper), clamp(a.z, lower, upper)); @@ -342,13 +331,6 @@ struct vector4 { return {x, y, z, w}; } - CHD const float& operator[](size_t index) const { - return *(&x + index); - } - CHD float& operator[](size_t index) { - return *(&x + index); - } - CHD vector4 operator-() const { return vector4(-x, -y, -z, -w); } @@ -454,15 +436,12 @@ CHDI vector4 max(const vector4& a, float b) { return vector4(max(a.x, b), max(a.y, b), max(a.z, b), max(a.w, b)); } CHDI vector4 clamp(const vector4& a, const vector4& lower, const vector4& upper) { - return vector4( - clamp(a.x, lower.x, upper.x), - clamp(a.y, lower.y, upper.y), - clamp(a.z, lower.z, upper.z), - clamp(a.w, lower.w, upper.w)); + return vector4(clamp(a.x, lower.x, upper.x), clamp(a.y, lower.y, upper.y), clamp(a.z, lower.z, upper.z), + clamp(a.w, lower.w, upper.w)); } CHDI vector4 clamp(const vector4& a, float lower, float upper) { return vector4(clamp(a.x, lower, upper), clamp(a.y, lower, upper), clamp(a.z, lower, upper), - clamp(a.w, lower, upper)); + clamp(a.w, lower, upper)); } CHDI vector4 abs(const vector4& v) { return vector4(fabsf(v.x), fabsf(v.y), fabsf(v.z), fabsf(v.w)); @@ -489,7 +468,7 @@ struct half { private: CHD static uint16_t floatToHalf(float x) { #if CUDA_COMPILE - return __float2half(x).x; + return __half_raw(__float2half(x)).x; #else #ifdef _MSC_VER return _mm_cvtps_ph(_mm_set_ps1(x), 0).m128i_u16[0]; @@ -524,13 +503,6 @@ struct vector2h { return vector2(float(x), float(y)); } - CHD const half& operator[](size_t index) const { - return *(&x + index); - } - CHD half& operator[](size_t index) { - return *(&x + index); - } - CHD vector2h operator-() const { return vector2h(-x, -y); } @@ -553,13 +525,6 @@ struct vector4h { return vector4(float(x), float(y), float(z), float(w)); } - CHD const half& operator[](size_t index) const { - return *(&x + index); - } - CHD half& operator[](size_t index) { - return *(&x + index); - } - CHD vector4h operator-() const { return vector4h(-x, -y, -z, -w); } @@ -584,13 +549,6 @@ struct vector2i { return {x, y}; } - CHD const int32_t& operator[](size_t index) const { - return *(&x + index); - } - CHD int32_t& operator[](size_t index) { - return *(&x + index); - } - CHD vector2i operator-() const { return vector2i(-x, -y); } @@ -685,9 +643,7 @@ CHDI vector2i max(const vector2i& a, int32_t b) { return vector2i(max(a.x, b), max(a.y, b)); } CHDI vector2i clamp(const vector2i& a, const vector2i& lower, const vector2i& upper) { - return vector2i( - clamp(a.x, lower.x, upper.x), - clamp(a.y, lower.y, upper.y)); + return vector2i(clamp(a.x, lower.x, upper.x), clamp(a.y, lower.y, upper.y)); } CHDI vector2i clamp(const vector2i& a, int32_t lower, int32_t upper) { return vector2i(clamp(a.x, lower, upper), clamp(a.y, lower, upper)); @@ -719,13 +675,6 @@ 
struct vector2ui { return {x, y}; } - CHD const uint32_t& operator[](size_t index) const { - return *(&x + index); - } - CHD uint32_t& operator[](size_t index) { - return *(&x + index); - } - CHD vector2ui operator+(const vector2ui& v) const { return vector2ui(x + v.x, y + v.y); } @@ -813,9 +762,7 @@ CHDI vector2ui max(const vector2ui& a, uint32_t b) { return vector2ui(max(a.x, b), max(a.y, b)); } CHDI vector2ui clamp(const vector2ui& a, const vector2ui& lower, const vector2ui& upper) { - return vector2ui( - clamp(a.x, lower.x, upper.x), - clamp(a.y, lower.y, upper.y)); + return vector2ui(clamp(a.x, lower.x, upper.x), clamp(a.y, lower.y, upper.y)); } CHDI vector2ui clamp(const vector2ui& a, uint32_t lower, uint32_t upper) { return vector2ui(clamp(a.x, lower, upper), clamp(a.y, lower, upper)); @@ -1053,6 +1000,7 @@ CHDI transform invert(const transform& t) { return transform(translation, rotation, scale); } +// Column-major struct matrix3x3 { vector3 m0, m1, m2; @@ -1132,10 +1080,7 @@ struct matrix3x3 { CHD static matrix3x3 crossProductMatrix(const vector3& v) { return matrix3x3(vector3(0.0f, v.z, -v.y), vector3(-v.z, 0, v.x), vector3(v.y, -v.x, 0.0f)); } - // Return a matrix which rotates around a unit axis vector. An optional translation - // point allows defining a full rigid transform matrix. - // Note that the t vector is simply used as the 4th column of the result, so it should - // be a point (x,y,z,1) rather than a vector (x,y,z,0). + // Return a matrix which rotates around a unit axis vector. CHD static matrix3x3 axisAngle(const vector3& axis, float radians) { float c = cosf(radians); float s = sinf(radians); @@ -1197,24 +1142,12 @@ CHDI matrix3x3 invert(const matrix3x3& m) { const vector3& y = m.m1; const vector3& z = m.m2; - float det = - x.x * (y.y * z.z - z.y * y.z) - - x.y * (y.x * z.z - y.z * z.x) + - x.z * (y.x * z.y - y.y * z.x); - - return matrix3x3( - vector3( - y.y * z.z - z.y * y.z, - x.z * z.y - x.y * z.z, - x.y * y.z - x.z * y.y), - vector3( - y.z * z.x - y.x * z.z, - x.x * z.z - x.z * z.x, - y.x * x.z - x.x * y.z), - vector3( - y.x * z.y - z.x * y.y, - z.x * x.y - x.x * z.y, - x.x * y.y - y.x * x.y)) / det; + float det = x.x * (y.y * z.z - z.y * y.z) - x.y * (y.x * z.z - y.z * z.x) + x.z * (y.x * z.y - y.y * z.x); + + return matrix3x3(vector3(y.y * z.z - z.y * y.z, x.z * z.y - x.y * z.z, x.y * y.z - x.z * y.y), + vector3(y.z * z.x - y.x * z.z, x.x * z.z - x.z * z.x, y.x * x.z - x.x * y.z), + vector3(y.x * z.y - z.x * y.y, z.x * x.y - x.x * z.y, x.x * y.y - y.x * x.y)) / + det; } struct matrix4x4 { diff --git a/projects/hvvr_samples/modelconvert/modelconvert.vcxproj b/projects/hvvr_samples/modelconvert/modelconvert.vcxproj index 07ddb32..a782c63 100644 --- a/projects/hvvr_samples/modelconvert/modelconvert.vcxproj +++ b/projects/hvvr_samples/modelconvert/modelconvert.vcxproj @@ -1,4 +1,4 @@ - + @@ -70,4 +70,4 @@ - \ No newline at end of file + diff --git a/projects/hvvr_samples/modelconvert/modelconvert.vcxproj.filters b/projects/hvvr_samples/modelconvert/modelconvert.vcxproj.filters index 757e033..3161cca 100644 --- a/projects/hvvr_samples/modelconvert/modelconvert.vcxproj.filters +++ b/projects/hvvr_samples/modelconvert/modelconvert.vcxproj.filters @@ -1,6 +1,6 @@ - + - \ No newline at end of file + diff --git a/projects/hvvr_samples/modelviewer/modelviewer.cpp b/projects/hvvr_samples/modelviewer/modelviewer.cpp index 358e5ea..8712441 100644 --- a/projects/hvvr_samples/modelviewer/modelviewer.cpp +++ b/projects/hvvr_samples/modelviewer/modelviewer.cpp @@ -19,13 
+19,22 @@ #include "timer.h" #include "vector_math.h" #include "window_d3d11.h" - #include +#include +#include #include #pragma comment(lib, "Shcore.lib") -// disable camera movement for benchmarking? +#define RT_WIDTH 2160 +#define RT_HEIGHT 1200 + +// for foveated +#define GAZE_CURSOR_MODE_NONE 0 // eye direction is locked forward +#define GAZE_CURSOR_MODE_MOUSE 1 // eye direction is set by clicking the mouse on the window +#define GAZE_CURSOR_MODE GAZE_CURSOR_MODE_NONE + +// disable camera movement (for benchmarking) #define DISABLE_MOVEMENT 0 #define CAMERA_SPEED 3.0 @@ -36,10 +45,11 @@ // you might also want to enable JITTER_SAMPLES in kernel_constants.h #define ENABLE_FOVEATED 0 -// for foveated -#define GAZE_CURSOR_MODE_NONE 0 // eye direction is locked forward -#define GAZE_CURSOR_MODE_MOUSE 1 // eye direction is set by clicking the mouse on the window -#define GAZE_CURSOR_MODE GAZE_CURSOR_MODE_NONE + +#define ENABLE_WIDE_FOV 0 + + + enum ModelviewerScene { scene_home = 0, scene_bunny, @@ -48,233 +58,167 @@ enum ModelviewerScene { scene_bistro_interior, SceneCount }; -// which scene to load? +// which scene to load? Can be overwritten in the command line static ModelviewerScene gSceneSelect = scene_sponza; -#define RT_WIDTH 2160 -#define RT_HEIGHT 1200 +using hvvr::vector3; +struct SceneSpecification { + vector3 defaultCameraOrigin; + float defaultCameraYaw; + float defaultCameraPitch; + float scale; + std::string filename; +}; +static SceneSpecification gSceneSpecs[ModelviewerScene::SceneCount] = { + {vector3(1.0f, 3.0f, -1.5f), -(float)M_PI * .7f, (float)M_PI * -.05f, 1.0f, "oculus_home.bin"}, // Oculus Home + {vector3(-0.253644f, 0.577575f, 1.081316f), -0.162111f, -0.453079f, 1.0f, "bunny.bin"}, // Stanford Bunny + {vector3(10.091616f, 4.139270f, 1.230567f), -5.378105f, -0.398078f, 1.0f, "conference.bin"}, // Conference Room + {vector3(4.198845f, 6.105420f, -0.400903f), -4.704108f, -0.200078f, .01f, "sponza.bin"}, // Crytek Sponza + {vector3(2.0f, 2.0f, -0.5f), -(float)M_PI * .5f, (float)M_PI * -.05f, 1.0f, "bistro.bin"} // Amazon Bistro +}; -static hvvr::Timer gTimer; +struct CameraSettings { + float lensRadius = (ENABLE_DEPTH_OF_FIELD == 1) ? 
0.0015f : 0.0f; + float focalDistance = 0.3f; + bool foveatedCamera = (ENABLE_FOVEATED == 1); + bool movable = (DISABLE_MOVEMENT == 0); + float maxSpeed = (float)CAMERA_SPEED; +}; -static std::unique_ptr gWindow; -static std::unique_ptr gRayCaster; +struct CameraControl { + vector3 pos = {}; + float yaw = 0.0f; + float pitch = 0.0f; + void locallyTranslate(vector3 delta) { + pos += hvvr::matrix3x3(hvvr::quaternion::fromEulerAngles(yaw, pitch, 0)) * delta; + } + hvvr::transform toTransform() const { + return hvvr::transform(pos, hvvr::quaternion::fromEulerAngles(yaw, pitch, 0), 1.0f); + } +}; -static hvvr::Camera* gCamera = nullptr; -static hvvr::vector3 gCameraPos = {}; -static float gCameraYaw = 0.0f; -static float gCameraPitch = 0.0f; +class GApp { +public: + enum OutputMode { OUTPUT_NONE, OUTPUT_3D_API }; + struct Settings { + std::string windowName = "Modelviewer"; + uint32_t width = RT_WIDTH; + uint32_t height = RT_HEIGHT; + std::string sceneBasePath = "../../../../libraries/hvvr/samples_shared/data/scenes/"; + // 0 = off, 1 = match monitor refresh, 2 = half monitor refresh + int vSync = ENABLE_VSYNC; + SceneSpecification initScene; + OutputMode outputMode = OUTPUT_3D_API; + }; + GApp(Settings settings); + virtual void onInit(); + virtual void onShutdown(); + virtual void onUserInput(); + virtual void onSimulation(double sceneTime, double deltaTime); + virtual void onRender(); + virtual void onLoadScene(SceneSpecification spec); + virtual void onAfterLoadScene(); + virtual void loadScene(SceneSpecification spec) { + onLoadScene(spec); + onAfterLoadScene(); + } + virtual void endFrame(); -void gOnInit(); -void gOnMain(); -void gOnShutdown(); + std::unique_ptr& window() { + return m_window; + } -void resizeCallback() { - uint32_t width = gWindow->getWidth(); - uint32_t height = gWindow->getHeight(); + void setResizeCallback(std::function callback); - hvvr::ImageViewR8G8B8A8 image((uint32_t*)gWindow->getRenderTargetTex(), width, height, width); - hvvr::ImageResourceDescriptor renderTarget(image); - renderTarget.memoryType = hvvr::ImageResourceDescriptor::MemoryType::DX_TEXTURE; + // Run until we get a quit message, on which we return + MSG run(); - hvvr::DynamicArray samples = hvvr::getGridSamples(width, height); +protected: + Settings m_settings; + hvvr::Timer m_timer; - gCamera->setViewport(hvvr::FloatRect{{-(float)width / height, -1}, {(float)width / height, 1}}); - gCamera->setRenderTarget(renderTarget); - gCamera->setSamples(samples.data(), uint32_t(samples.size()), 1); -} + std::unique_ptr m_window; + std::unique_ptr m_rayCaster; -void rawMouseInputCallback(int dx, int dy) { - (void)dx; - (void)dy; -#if !DISABLE_MOVEMENT - gCameraYaw += -dx * 0.001f; - gCameraPitch += -dy * 0.001f; -#endif -} + std::function m_resizeCallback; -int __stdcall WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, char* commandLine, int nCmdShow) { - (void)hInstance; - (void)hPrevInstance; - (void)commandLine; - (void)nCmdShow; + double m_prevElapsedTime; + double m_deltaTime; - // set the working directory to the executable's directory - char exePath[MAX_PATH] = {}; - GetModuleFileName(GetModuleHandle(nullptr), exePath, MAX_PATH); - char exeDir[MAX_PATH] = {}; - const char* dirTerminatorA = strrchr(exePath, '/'); - const char* dirTerminatorB = strrchr(exePath, '\\'); - const char* dirTerminator = hvvr::max(dirTerminatorA, dirTerminatorB); - if (dirTerminator > exePath) { - size_t dirLen = hvvr::min(size_t(dirTerminator - exePath), MAX_PATH - 1); - strncpy(exeDir, exePath, dirLen); - 
SetCurrentDirectory(exeDir); - } + uint64_t m_frameID = 0; + hvvr::Camera* m_camera = nullptr; + CameraControl m_cameraControl = {}; - // disable scaling of the output window - SetProcessDpiAwareness(PROCESS_PER_MONITOR_DPI_AWARE); + CameraSettings m_cameraSettings; +}; - // create a console output window - console::createStdOutErr(); - gWindow = std::make_unique("Simpleviewer", RT_WIDTH, RT_HEIGHT, resizeCallback, rawMouseInputCallback); - input::registerDefaultRawInputDevices(gWindow->getWindowHandle()); +GApp::GApp(Settings settings) { + m_settings = settings; + m_window = std::make_unique(settings.windowName.c_str(), settings.width, settings.height); + auto resizeCallback = [this](int width, int height) { + hvvr::ImageViewR8G8B8A8 image((uint32_t*)m_window->getRenderTargetTex(), width, height, width); + hvvr::ImageResourceDescriptor renderTarget(image); + renderTarget.memoryType = hvvr::ImageResourceDescriptor::MemoryType::DX_TEXTURE; - gOnInit(); + hvvr::DynamicArray samples = hvvr::getGridSamples(width, height); - // The main loop. - MSG msg; - for (;;) { - while (PeekMessageA(&msg, nullptr, 0, 0, PM_REMOVE)) { - if (msg.message == WM_QUIT) - goto shutdown; - TranslateMessage(&msg); - DispatchMessageA(&msg); + m_camera->setViewport(hvvr::FloatRect{{-(float)width / height, -1}, {(float)width / height, 1}}); + m_camera->setRenderTarget(renderTarget); + if (ENABLE_WIDE_FOV) { + m_camera->setSphericalWarpSettings(210.0f, 130.0f); } - gOnMain(); - } - -shutdown: - gOnShutdown(); - - return (int)msg.wParam; -} - -void gOnInit() { - RayCasterSpecification spec; -#if ENABLE_FOVEATED - spec = RayCasterSpecification::feb2017FoveatedDemoSettings(); -#else - spec.mode = RayCasterSpecification::GPUMode::GPU_INTERSECT_AND_RECONSTRUCT_DEFERRED_MSAA_RESOLVE; - spec.outputMode = RaycasterOutputMode::COLOR_RGBA8; -#endif - gRayCaster = std::make_unique(spec); - - std::string sceneBasePath = "../../../../libraries/hvvr/samples_shared/data/scenes/"; - std::string scenePath; - float sceneScale = 1.0f; - switch (gSceneSelect) { - case scene_home: - // Oculus Home - gCameraPos = hvvr::vector3(1.0f, 3.0f, -1.5f); - gCameraYaw = -3.14159f * .7f; - gCameraPitch = 3.14159f * -.05f; - scenePath = sceneBasePath + "oculus_home.bin"; - break; - - case scene_bunny: - // Stanford Bunny - gCameraPos = hvvr::vector3(-0.253644f, 0.577575f, 1.081316f); - gCameraYaw = -0.162111f; - gCameraPitch = -0.453079f; - scenePath = sceneBasePath + "bunny.bin"; - break; - - case scene_conference: - // Conference Room - gCameraPos = hvvr::vector3(10.091616f, 4.139270f, 1.230567f); - gCameraYaw = -5.378105f; - gCameraPitch = -0.398078f; - scenePath = sceneBasePath + "conference.bin"; - break; - - case scene_sponza: - // Crytek Sponza - gCameraPos = hvvr::vector3(4.198845f, 6.105420f, -0.400903f); - gCameraYaw = -4.704108f; - gCameraPitch = -0.200078f; - scenePath = sceneBasePath + "sponza.bin"; - sceneScale = .01f; - break; - - case scene_bistro_interior: - // Amazon Bistro - gCameraPos = hvvr::vector3(2.0f, 2.0f, -0.5f); - gCameraYaw = -3.14159f * .5f; - gCameraPitch = 3.14159f * -.05f; - scenePath = sceneBasePath + "bistro.bin"; - break; - - default: - hvvr::fail("invalid scene enum"); - return; - break; - } - - // add a default directional light - hvvr::LightUnion light = {}; - light.type = hvvr::LightType::directional; - light.directional.Direction = hvvr::normalize(hvvr::vector3(-.25f, 1.0f, 0.1f)); - light.directional.Power = hvvr::vector3(0.4f, 0.35f, 0.35f); - gRayCaster->createLight(light); - -#if ENABLE_DEPTH_OF_FIELD - 
const float lensRadius = 0.0015f; // avg 3mm diameter in light (narrow pupil) -#else - const float lensRadius = 0.0f; -#endif - const float focalDistance = .1f; // min focal dist is about .1m for average gamer - gCamera = gRayCaster->createCamera(hvvr::FloatRect(hvvr::vector2(-1, -1), hvvr::vector2(1, 1)), lensRadius); - gCamera->setFocalDepth(focalDistance); - resizeCallback(); // make sure we bind a render target and some samples to the camera - - // load the scene - model_import::Model importedModel; - if (!model_import::load(scenePath.c_str(), importedModel)) { - hvvr::fail("failed to load model %s", scenePath.c_str()); - } + m_camera->setSamples(samples.data(), uint32_t(samples.size()), 1); + }; + setResizeCallback(resizeCallback); + + auto mouseCallback = [this](int dx, int dy) { + (void)dx; + (void)dy; + if (m_cameraSettings.movable) { + m_cameraControl.yaw += -dx * 0.001f; + m_cameraControl.pitch += -dy * 0.001f; + } + }; + m_window->setRawMouseInputCallback(mouseCallback); - // apply scaling - for (auto& mesh : importedModel.meshes) { - mesh.transform.scale *= sceneScale; - } - // create the scene objects in the raycaster - if (!model_import::createObjects(*gRayCaster, importedModel)) { - hvvr::fail("failed to create model objects"); - } -} -void gOnShutdown() { - gCamera = nullptr; - gRayCaster = nullptr; + input::registerDefaultRawInputDevices(m_window->getWindowHandle()); } -void gOnMain() { - static uint64_t frameID = 0; - static double prevElapsedTime = gTimer.getElapsed(); - double elapsedTime = gTimer.getElapsed(); - float deltaTime = float(elapsedTime - prevElapsedTime); - prevElapsedTime = elapsedTime; +void GApp::onSimulation(double sceneTime, double deltaTime) { + (void)sceneTime; hvvr::vector3 posDelta(0.0f); -#if !DISABLE_MOVEMENT - float cardinalCameraSpeed = (float)(CAMERA_SPEED * deltaTime); - if (GetAsyncKeyState(VK_LSHIFT) & 0x8000) - cardinalCameraSpeed *= .05f; - - if (GetAsyncKeyState('W') & 0x8000) - posDelta.z -= cardinalCameraSpeed; - if (GetAsyncKeyState('A') & 0x8000) - posDelta.x -= cardinalCameraSpeed; - if (GetAsyncKeyState('S') & 0x8000) - posDelta.z += cardinalCameraSpeed; - if (GetAsyncKeyState('D') & 0x8000) - posDelta.x += cardinalCameraSpeed; - if (GetAsyncKeyState(VK_LCONTROL) & 0x8000) - posDelta.y -= cardinalCameraSpeed; - if (GetAsyncKeyState(VK_SPACE) & 0x8000) - posDelta.y += cardinalCameraSpeed; - gCameraPos += hvvr::matrix3x3(hvvr::quaternion::fromEulerAngles(gCameraYaw, gCameraPitch, 0)) * posDelta; -#endif + if (m_cameraSettings.movable) { + float cardinalCameraSpeed = (float)(m_cameraSettings.maxSpeed * deltaTime); + if (GetAsyncKeyState(VK_LSHIFT) & 0x8000) + cardinalCameraSpeed *= .05f; + + if (GetAsyncKeyState('W') & 0x8000) + posDelta.z -= cardinalCameraSpeed; + if (GetAsyncKeyState('A') & 0x8000) + posDelta.x -= cardinalCameraSpeed; + if (GetAsyncKeyState('S') & 0x8000) + posDelta.z += cardinalCameraSpeed; + if (GetAsyncKeyState('D') & 0x8000) + posDelta.x += cardinalCameraSpeed; + if (GetAsyncKeyState(VK_LCONTROL) & 0x8000) + posDelta.y -= cardinalCameraSpeed; + if (GetAsyncKeyState(VK_SPACE) & 0x8000) + posDelta.y += cardinalCameraSpeed; + + m_cameraControl.locallyTranslate(posDelta); + } #if GAZE_CURSOR_MODE == GAZE_CURSOR_MODE_MOUSE if (GetAsyncKeyState(VK_LBUTTON) & 0x8000) { POINT cursorCoord = {}; GetCursorPos(&cursorCoord); - ScreenToClient(HWND(gWindow->getWindowHandle()), &cursorCoord); + ScreenToClient(HWND(m_window->getWindowHandle()), &cursorCoord); RECT clientRect = {}; - 
GetClientRect(HWND(gWindow->getWindowHandle()), &clientRect); + GetClientRect(HWND(m_window->getWindowHandle()), &clientRect); int width = clientRect.right - clientRect.left; int height = clientRect.bottom - clientRect.top; @@ -283,27 +227,44 @@ void gOnMain() { if (fabsf(cursorX) <= 1.0f && fabsf(cursorY) <= 1.0f) { float screenDistance = 1.0f; - - // This doesn't attempt to take into account the actual field of view or mapping from samples to pixels, - // so it will be a bit off, especially as you get further from the screen center. - hvvr::vector3 cursorPosEye(cursorX, cursorY, -screenDistance); + hvvr::vector3 cursorPosEye(cursorX * width / height, cursorY, -screenDistance); hvvr::vector3 eyeDir = hvvr::normalize(cursorPosEye); - gCamera->setEyeDir(eyeDir); + m_camera->setEyeDir(eyeDir); } } #endif +} - hvvr::transform worldFromCamera = - hvvr::transform(gCameraPos, hvvr::quaternion::fromEulerAngles(gCameraYaw, gCameraPitch, 0), 1.0f); - gCamera->setCameraToWorld(worldFromCamera); - - gRayCaster->render(elapsedTime); +MSG GApp::run() { + onInit(); + // The main loop. + MSG msg; + for (;;) { + while (PeekMessageA(&msg, nullptr, 0, 0, PM_REMOVE)) { + if (msg.message == WM_QUIT) + goto SHUTDOWN_APP; + TranslateMessage(&msg); + DispatchMessageA(&msg); + } -#if OUTPUT_MODE == OUTPUT_MODE_3D_API - uint32_t syncInterval = ENABLE_VSYNC; - gWindow->copyAndPresent(syncInterval); -#endif + double elapsedTime = m_timer.getElapsed(); + m_deltaTime = elapsedTime - m_prevElapsedTime; + m_prevElapsedTime = elapsedTime; + onUserInput(); + onSimulation(m_prevElapsedTime, m_deltaTime); + onRender(); + endFrame(); + } +SHUTDOWN_APP: + onShutdown(); + return msg; +} +void GApp::endFrame() { + if (m_settings.outputMode == OUTPUT_3D_API) { + uint32_t syncInterval = m_settings.vSync; + m_window->copyAndPresent(syncInterval); + } // collect some overall perf statistics { struct FrameStats { @@ -314,13 +275,13 @@ void gOnMain() { static FrameStats frameStats[frameStatsWindowSize] = {}; static int frameStatsPos = 0; - uint32_t rayCount = gCamera->getSampleData().sampleCount; + uint32_t rayCount = m_camera->getSampleCount(); - frameStats[frameStatsPos].deltaTime = deltaTime; + frameStats[frameStatsPos].deltaTime = (float)m_deltaTime; frameStats[frameStatsPos].rayCount = rayCount; // let it run for a bit before collecting numbers - if (frameStatsPos == 0 && frameID > frameStatsWindowSize * 4) { + if (frameStatsPos == 0 && m_frameID > frameStatsWindowSize * 4) { static double frameTimeAvgTotal = 0.0; static uint64_t frameTimeAvgCount = 0; @@ -352,5 +313,122 @@ void gOnMain() { frameStatsPos = (frameStatsPos + 1) % frameStatsWindowSize; } - frameID++; + m_frameID++; +} + +void GApp::setResizeCallback(std::function callback) { + m_resizeCallback = callback; + m_window->setResizeCallback(callback); +} + +void GApp::onAfterLoadScene() { + // Setup a regular camera + m_camera = m_rayCaster->createCamera(hvvr::FloatRect(hvvr::vector2(-1, -1), hvvr::vector2(1, 1)), + m_cameraSettings.lensRadius); + m_camera->setFocalDepth(m_cameraSettings.focalDistance); + + m_resizeCallback(m_window->getWidth(), + m_window->getHeight()); // make sure we bind a render target and some samples to the camera +} + +void GApp::onLoadScene(SceneSpecification spec) { + m_cameraControl.pos = spec.defaultCameraOrigin; + m_cameraControl.yaw = spec.defaultCameraYaw; + m_cameraControl.pitch = spec.defaultCameraPitch; + float sceneScale = spec.scale; + std::string scenePath = m_settings.sceneBasePath + spec.filename; + + // add a default directional 
light + hvvr::LightUnion light = {}; + light.type = hvvr::LightType::directional; + light.directional.Direction = hvvr::normalize(hvvr::vector3(-.25f, 1.0f, 0.1f)); + light.directional.Power = hvvr::vector3(0.4f, 0.35f, 0.35f); + m_rayCaster->createLight(light); + + // load the scene + model_import::Model importedModel; + if (!model_import::load(scenePath.c_str(), importedModel)) { + hvvr::fail("failed to load model %s", scenePath.c_str()); + } + // apply scaling + for (auto& mesh : importedModel.meshes) { + mesh.transform.scale *= sceneScale; + } + // create the scene objects in the raycaster + if (!model_import::createObjects(*m_rayCaster, importedModel)) { + hvvr::fail("failed to create model objects"); + } +} + +void GApp::onInit() { + RayCasterSpecification spec; + if (m_cameraSettings.foveatedCamera) { + spec = RayCasterSpecification::feb2017FoveatedDemoSettings(); + } + spec.outputTo3DApi = (m_settings.outputMode == OUTPUT_3D_API); + m_rayCaster = std::make_unique(spec); + + loadScene(m_settings.initScene); +} + +void GApp::onShutdown() { + m_camera = nullptr; + m_rayCaster = nullptr; } + +void GApp::onUserInput() {} + +void GApp::onRender() { + m_camera->setCameraToWorld(m_cameraControl.toTransform()); + m_rayCaster->render(m_prevElapsedTime); +} + + +int __stdcall WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, char* commandLine, int nCmdShow) { + (void)hInstance; + (void)hPrevInstance; + (void)commandLine; + (void)nCmdShow; + + // set the working directory to the executable's directory + char exePath[MAX_PATH] = {}; + GetModuleFileName(GetModuleHandle(nullptr), exePath, MAX_PATH); + char exeDir[MAX_PATH] = {}; + const char* dirTerminatorA = strrchr(exePath, '/'); + const char* dirTerminatorB = strrchr(exePath, '\\'); + const char* dirTerminator = hvvr::max(dirTerminatorA, dirTerminatorB); + if (dirTerminator > exePath) { + size_t dirLen = hvvr::min(size_t(dirTerminator - exePath), MAX_PATH - 1); + strncpy(exeDir, exePath, dirLen); + SetCurrentDirectory(exeDir); + } + + // disable scaling of the output window + SetProcessDpiAwareness(PROCESS_PER_MONITOR_DPI_AWARE); + + // create a console output window + console::createStdOutErr(); + + GApp::Settings settings = {}; + settings.windowName = "HVVR Modelviewer"; + settings.sceneBasePath = "../../../../libraries/hvvr/samples_shared/data/scenes/"; + + // The only command line argument is the (optional) scene index + if (__argc > 1) { + int sceneIndex = atoi(__argv[1]); + if (sceneIndex >= 0 && sceneIndex < ModelviewerScene::SceneCount) { + gSceneSelect = ModelviewerScene(sceneIndex); + printf("Set Scene index to %d, filename: %s\n", sceneIndex, gSceneSpecs[sceneIndex].filename.c_str()); + } + } + + if (gSceneSelect < 0 || gSceneSelect >= ModelviewerScene::SceneCount) { + hvvr::fail("invalid scene enum"); + } + + settings.initScene = gSceneSpecs[gSceneSelect]; + + GApp app(settings); + MSG msg = app.run(); + return (int)msg.wParam; +} \ No newline at end of file diff --git a/projects/hvvr_samples/modelviewer/modelviewer.vcxproj b/projects/hvvr_samples/modelviewer/modelviewer.vcxproj index 46633c4..fffae00 100644 --- a/projects/hvvr_samples/modelviewer/modelviewer.vcxproj +++ b/projects/hvvr_samples/modelviewer/modelviewer.vcxproj @@ -37,8 +37,8 @@ Application - v140 - MultiByte + v140 + MultiByte false diff --git a/vs2015/hvvr.sln b/vs2015/hvvr.sln index 5b5e6a1..3ad8265 100644 --- a/vs2015/hvvr.sln +++ b/vs2015/hvvr.sln @@ -1,4 +1,4 @@ - + Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 
VisualStudioVersion = 14.0.25420.1