In [1]:
%cd /scratch_net/biwidl214/ecetin_scratch/GSCodec/notebooks/

/scratch_net/biwidl214/ecetin_scratch/GSCodec/notebooks


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import torch
from torch.utils.cpp_extension import load, load_inline, is_ninja_available
import os

os.environ['CUDA_LAUNCH_BLOCKING']='1'
os.environ['TORCH_USE_CUDA_DSA']='1'

In [3]:
%load_ext wurlitzer

In [4]:
print(torch.cuda.is_available())
print(is_ninja_available())

True
True


In [5]:
cuda_src = r'''
#include <torch/types.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#include <vector_types.h>
#include <device_launch_parameters.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}


__global__ void aggregate_gaussians_forward(
    float *xyzs, 
    float *shs, 
    float *covariances, 
    float *opacities, 
    float *weights,
    int *node_ids, 
    float *node_locations, 
    float *node_harmonics, 
    float *node_covariances, 
    float *node_opacities,
    float *node_total_weights,
    int N,
    int octree_depth) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    int node_idx;
    float weight = weights[idx];
    
    for (int j = 0; j < octree_depth; j++) {
        node_idx = node_ids[idx * octree_depth + j];
        if (node_idx == -1) continue; // Skip empty slots

        atomicAdd(&node_opacities[node_idx], weight * opacities[idx]);
        atomicAdd(&node_total_weights[node_idx], weight);

        for (int k = 0; k < 3; k++) {
            atomicAdd(&node_locations[node_idx * 3 + k], weight * xyzs[idx * 3 + k]);
        }

        for (int k = 0; k < 48; k++) {
            atomicAdd(&node_harmonics[node_idx * 48 + k], weight * shs[idx * 48 + k]);
        }

        for (int k = 0; k < 9; k++) {
            atomicAdd(&node_covariances[node_idx * 9 + k], weight * covariances[idx * 9 + k]);
        }
    }
}

__global__ void aggregate_gaussians_backward(
    float *grad_xyzs, 
    float *grad_shs, 
    float *grad_covariances, 
    float *grad_opacities,
    float *grad_weights,
    float *xyzs, 
    float *shs, 
    float *covariances, 
    float *opacities, 
    float *weights,
    int *node_ids, 
    float *node_total_weights,
    float *node_grad_locations, 
    float *node_grad_harmonics, 
    float *node_grad_covariances, 
    float *node_grad_opacities,
    int N,
    int octree_depth) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    int node_idx;
    float weight = weights[idx];
    float weight_grad = 0.0f;

    for (int j = 0; j < octree_depth; j++) {
        node_idx = node_ids[idx * octree_depth + j];
        if (node_idx == -1) continue; // Skip empty slots

        float normalized_weight = weight / (node_total_weights[node_idx] + 1e-8); // Avoid division by zero
        
        atomicAdd(&grad_opacities[idx], normalized_weight * node_grad_opacities[node_idx]);
        weight_grad += opacities[idx] * node_grad_opacities[node_idx];
        
        for (int k = 0; k < 3; k++) {
            atomicAdd(&grad_xyzs[idx * 3 + k], normalized_weight * node_grad_locations[node_idx * 3 + k]);
            weight_grad += xyzs[idx * 3 + k] * node_grad_locations[node_idx * 3 + k];
        }
        
        for (int k = 0; k < 48; k++) {
            atomicAdd(&grad_shs[idx * 48 + k], normalized_weight * node_grad_harmonics[node_idx * 48 + k]);
            weight_grad += shs[idx * 48 + k] * node_grad_harmonics[node_idx * 48 + k];
        }

        for (int k = 0; k < 9; k++) {
            atomicAdd(&grad_covariances[idx * 9 + k], normalized_weight * node_grad_covariances[node_idx * 9 + k]);
            weight_grad += covariances[idx * 9 + k] * node_grad_covariances[node_idx * 9 + k];
        }
    }
    atomicAdd(&grad_weights[idx], weight_grad);
}


// Host function to invoke the forward kernel
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 
aggregate_gaussians_forward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const int num_unique_nodes,
    const int octree_depth) 
{
    CHECK_INPUT(xyzs);
    CHECK_INPUT(shs);
    CHECK_INPUT(covariances);
    CHECK_INPUT(opacities);
    CHECK_INPUT(weights);
    CHECK_INPUT(node_ids);

    int N = xyzs.size(0);
    int num_nodes = node_ids.size(0);

    torch::Tensor node_locations = torch::zeros({num_unique_nodes, 3}, torch::CUDA(torch::kFloat));
    torch::Tensor node_harmonics = torch::zeros({num_unique_nodes, 48}, torch::CUDA(torch::kFloat)); 
    torch::Tensor node_covariances = torch::zeros({num_unique_nodes, 9}, torch::CUDA(torch::kFloat));
    torch::Tensor node_opacities = torch::zeros({num_unique_nodes}, torch::CUDA(torch::kFloat));
    torch::Tensor node_total_weights = torch::zeros({num_unique_nodes}, torch::CUDA(torch::kFloat));

    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    aggregate_gaussians_forward<<<blocks, threads>>>(
        xyzs.data_ptr<float>(), 
        shs.data_ptr<float>(), 
        covariances.data_ptr<float>(), 
        opacities.data_ptr<float>(), 
        weights.data_ptr<float>(),
        node_ids.data_ptr<int>(), 
        node_locations.data_ptr<float>(), 
        node_harmonics.data_ptr<float>(), 
        node_covariances.data_ptr<float>(), 
        node_opacities.data_ptr<float>(),
        node_total_weights.data_ptr<float>(),
        N,
        octree_depth
    );

    return std::make_tuple(
        node_locations, node_harmonics, node_covariances, 
        node_opacities, node_total_weights);
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
aggregate_gaussians_backward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const torch::Tensor node_total_weights,
    const torch::Tensor node_grad_locations, 
    const torch::Tensor node_grad_harmonics, 
    const torch::Tensor node_grad_covariances, 
    const torch::Tensor node_grad_opacities,
    const int octree_depth) 
{
    CHECK_INPUT(xyzs);
    CHECK_INPUT(shs);
    CHECK_INPUT(covariances);
    CHECK_INPUT(opacities);
    CHECK_INPUT(weights);
    CHECK_INPUT(node_ids);
    CHECK_INPUT(node_grad_locations);
    CHECK_INPUT(node_grad_harmonics);
    CHECK_INPUT(node_grad_covariances);
    CHECK_INPUT(node_grad_opacities);
    CHECK_INPUT(node_total_weights);

    int N = xyzs.size(0);

    auto grad_xyzs = torch::zeros_like(xyzs);
    auto grad_shs = torch::zeros_like(shs);
    auto grad_covariances = torch::zeros_like(covariances);
    auto grad_opacities = torch::zeros_like(opacities);
    auto grad_weights = torch::zeros_like(weights);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    aggregate_gaussians_backward<<<blocks, threads>>>(
        grad_xyzs.data_ptr<float>(), 
        grad_shs.data_ptr<float>(), 
        grad_covariances.data_ptr<float>(), 
        grad_opacities.data_ptr<float>(),
        grad_weights.data_ptr<float>(),
        xyzs.data_ptr<float>(), 
        shs.data_ptr<float>(), 
        covariances.data_ptr<float>(), 
        opacities.data_ptr<float>(), 
        weights.data_ptr<float>(),
        node_ids.data_ptr<int>(),
        node_total_weights.data_ptr<float>(),
        node_grad_locations.data_ptr<float>(), 
        node_grad_harmonics.data_ptr<float>(), 
        node_grad_covariances.data_ptr<float>(), 
        node_grad_opacities.data_ptr<float>(),
        N,
        octree_depth
    );

    return std::make_tuple(grad_xyzs, grad_shs, grad_covariances, grad_opacities, grad_weights);
}
'''

In [6]:
cpp_src = r'''
#include <torch/extension.h>
#include <cstdio>
#include <tuple>
#include <string>

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 
aggregate_gaussians_forward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities,
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const int num_unique_nodes,
    const int octree_depth);

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
aggregate_gaussians_backward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const torch::Tensor node_total_weights,
    const torch::Tensor node_grad_locations, 
    const torch::Tensor node_grad_harmonics, 
    const torch::Tensor node_grad_covariances, 
    const torch::Tensor node_grad_opacities,
    const int octree_depth);
'''

In [7]:
module = load_inline(
    cuda_sources=[cuda_src], cpp_sources=[cpp_src], 
    functions=["aggregate_gaussians_forward_cuda", "aggregate_gaussians_backward_cuda"],
    build_directory="aggregate",
    extra_cuda_cflags=[],
    verbose=True, name="my_cuda_extension",
)

Detected CUDA files, patching ldflags
Emitting ninja build file aggregate/build.ninja...
Building extension module my_cuda_extension...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=my_cuda_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /scratch_net/biwidl214/ecetin/conda_envs/gscodec/lib/python3.10/site-packages/torch/include -isystem /scratch_net/biwidl214/ecetin/conda_envs/gscodec/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /scratch_net/biwidl214/ecetin/conda_envs/gscodec/lib/python3.10/site-packages/torch/include/TH -isystem /scratch_net/biwidl214/ecetin/conda_envs/gscodec/lib/python3.10/site-packages/torch/include/THC -isystem /include -isystem /scratch_net/biwidl214/ecetin/conda_envs/gscodec/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /scratch_net/biwidl214/ecetin_scratch/GSCodec/notebooks/aggregate/main.cpp -o main.o 
[2/3] /bin/nvcc  -DTORCH_EXTENSION_NAME=my_cuda_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND1

Loading extension module my_cuda_extension...


In [8]:
%cd /scratch_net/biwidl214/ecetin_scratch/GSCodec

/scratch_net/biwidl214/ecetin_scratch/GSCodec


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [9]:
# from models.splatting.hierarchical_model.hierarhical_utils import generate_octree
# import torch

In [10]:
N = 200000
points = torch.rand(N, 3).cuda()
covs = torch.rand(N, 3, 3).cuda()
shs = torch.rand(N, 16, 3).cuda()
opacities = torch.rand(N).cuda()
weights = torch.rand(N).cuda()

box_min = torch.min(points, dim=0)[0]
box_max = torch.max(points, dim=0)[0]
box_d = box_max - box_min
box_min = box_min - 0.1 * box_d
box_max = box_max + 0.1 * box_d
max_depth = 10
init_level=1

# point_level_bboxes, point_node_assignment = generate_octree(
#     points, max_depth)

In [11]:
# unique_nodes, inverse_indices = torch.unique(
#     point_node_assignment, return_inverse=True)
# node_ids = torch.arange(0, unique_nodes.size(0), device=point_node_assignment.device)
# node_ids = node_ids[inverse_indices]
# num_unique_nodes = unique_nodes.size(0)
node_ids = torch.load("notebooks/node_ids_200000.pt").cuda()
print(node_ids)
num_unique_nodes = torch.unique(node_ids).size(0)
print(num_unique_nodes)

tensor([[     3,     30,    241,  ..., 395293, 593547, 784532],
        [     0,      4,     33,  ..., 216635, 406623, 604848],
        [     5,     43,    344,  ..., 441626, 640065, 826282],
        ...,
        [     1,      9,     76,  ..., 321810, 519622, 717667],
        [     5,     42,    338,  ..., 437264, 635704, 822385],
        [     7,     62,    497,  ..., 512255, 710483, 886554]],
       device='cuda:0')
889655


In [12]:
node_xyzs, node_shs, node_covs, node_opacities, node_weights = \
    module.aggregate_gaussians_forward_cuda(
        points, shs, covs, opacities, weights, node_ids.type(torch.int32), 
        num_unique_nodes, max_depth
    )

In [17]:
node_xyzs / node_weights[:, None]

tensor([[0.4989, 0.5012, 0.5012],
        [0.4968, 0.4992, 0.4993],
        [0.4996, 0.4994, 0.4995],
        ...,
        [0.1703, 0.2481, 0.5655],
        [0.4204, 0.7861, 0.0779],
        [0.4590, 0.7854, 0.2827]], device='cuda:0')

In [14]:
node_xyzs

tensor([[6.6961e+03, 6.7276e+03, 6.7276e+03],
        [6.8676e+03, 6.9005e+03, 6.9017e+03],
        [6.8560e+03, 6.8533e+03, 6.8550e+03],
        ...,
        [1.5648e-01, 2.2798e-01, 5.1960e-01],
        [2.6751e-01, 5.0026e-01, 4.9569e-02],
        [4.5149e-01, 7.7249e-01, 2.7810e-01]], device='cuda:0')

In [15]:
points

tensor([[0.2781, 0.7842, 0.2118],
        [0.3426, 0.9232, 0.5663],
        [0.5632, 0.0404, 0.2173],
        ...,
        [0.2359, 0.8826, 0.4234],
        [0.0728, 0.8092, 0.9179],
        [0.6692, 0.9160, 0.6547]], device='cuda:0')

In [14]:
torch.save(node_ids, "notebooks/node_ids_200000.pt")

In [None]:
aggregate_gaussians(self, weights, gaussian_node_assignments, num_unique_nodes)

In [18]:
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

In [20]:
cuda_src = cuda_begin + r'''
#include <torch/types.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#include <vector_types.h>
#include <device_launch_parameters.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

__global__ void aggregate_gaussians_forward(
    float *xyzs, 
    float *shs, 
    float *covariances, 
    float *opacities, 
    float *weights,
    int *node_ids, 
    float *node_locations, 
    float *node_harmonics, 
    float *node_covariances, 
    float *node_opacities,
    float *node_total_weights,
    int N) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    int node_idx;
    float weight = weights[idx];
    
    for (int j = 0; j < 10; j++) {
        node_idx = node_ids[idx * 10 + j];
        if (node_idx == -1) continue; // Skip empty slots

        atomicAdd(&node_opacities[node_idx], weight * opacities[idx]);
        atomicAdd(&node_total_weights[node_idx], weight);

        for (int k = 0; k < 3; k++) {
            atomicAdd(&node_locations[node_idx * 3 + k], weight * xyzs[idx * 3 + k]);
        }

        for (int k = 0; k < 48; k++) {
            atomicAdd(&node_harmonics[node_idx * 48 + k], weight * shs[idx * 48 + k]);
        }

        // Aggregated all except covariances. Now we will do them
    }
}

__global__ void aggregate_gaussians_backward(
    float *grad_xyzs, 
    float *grad_shs, 
    float *grad_covariances, 
    float *grad_opacities,
    float *xyzs, 
    float *shs, 
    float *covariances, 
    float *opacities, 
    float *weights,
    int *node_ids, 
    float *node_grad_locations, 
    float *node_grad_harmonics, 
    float *node_grad_covariances, 
    float *node_grad_opacities,
    float *node_total_weights,
    int N) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    int node_idx;
    float weight = weights[idx];

    for (int j = 0; j < 10; j++) {
        node_idx = node_ids[idx * 10 + j];
        if (node_idx == -1) continue; // Skip empty slots

        float normalized_weight = weight / (node_total_weights[node_idx] + 1e-8); // Avoid division by zero
        
        atomicAdd(&grad_opacities[idx], normalized_weight * node_grad_opacities[node_idx]);
        
        for (int k = 0; k < 3; k++) {
            atomicAdd(&grad_xyzs[idx * 3 + k], normalized_weight * node_grad_locations[node_idx * 3 + k]);
        }
        
        for (int k = 0; k < 48; k++) {
            atomicAdd(&grad_shs[idx * 48 + k], normalized_weight * node_grad_harmonics[node_idx * 48 + k]);
        }

        for (int k = 0; k < 9; k++) {
            atomicAdd(&grad_covariances[idx * 9 + k], normalized_weight * node_grad_covariances[node_idx * 9 + k]);
        }
    }
}


// Host function to invoke the forward kernel
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 
aggregate_gaussians_forward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const int num_unique_nodes) 
{
    CHECK_INPUT(xyzs);
    CHECK_INPUT(shs);
    CHECK_INPUT(covariances);
    CHECK_INPUT(opacities);
    CHECK_INPUT(weights);
    CHECK_INPUT(node_ids);

    int N = xyzs.size(0);
    int num_nodes = node_ids.size(0);

    torch::Tensor node_locations = torch::zeros({num_unique_nodes, 3}, torch::CUDA(torch::kFloat));
    torch::Tensor node_harmonics = torch::zeros({num_unique_nodes, 48}, torch::CUDA(torch::kFloat)); 
    torch::Tensor node_covariances = torch::zeros({num_unique_nodes, 9}, torch::CUDA(torch::kFloat));
    torch::Tensor node_opacities = torch::zeros({num_unique_nodes}, torch::CUDA(torch::kFloat));
    torch::Tensor node_total_weights = torch::zeros({num_unique_nodes}, torch::CUDA(torch::kFloat));

    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    aggregate_gaussians_forward<<<blocks, threads>>>(
        xyzs.data_ptr<float>(), 
        shs.data_ptr<float>(), 
        covariances.data_ptr<float>(), 
        opacities.data_ptr<float>(), 
        weights.data_ptr<float>(),
        node_ids.data_ptr<int>(), 
        node_locations.data_ptr<float>(), 
        node_harmonics.data_ptr<float>(), 
        node_covariances.data_ptr<float>(), 
        node_opacities.data_ptr<float>(),
        node_total_weights.data_ptr<float>(),
        N
    );

    aggregate_gaussians_covariances_forward<<<blocks, threads>>>(
        xyzs.data_ptr<float>(),
        covariances.data_ptr<float>(),
        weights.data_ptr<float>(),
        node_ids.data_ptr<int>(),
        node covariances.data_ptr<float>());

    return std::make_tuple(
        node_locations, node_harmonics, node_covariances, 
        node_opacities, node_total_weights, node_ids);
}

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
aggregate_gaussians_backward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const torch::Tensor node_grad_locations, 
    const torch::Tensor node_grad_harmonics, 
    const torch::Tensor node_grad_covariances, 
    const torch::Tensor node_grad_opacities,
    const torch::Tensor node_total_weights) 
{
    CHECK_INPUT(xyzs);
    CHECK_INPUT(shs);
    CHECK_INPUT(covariances);
    CHECK_INPUT(opacities);
    CHECK_INPUT(weights);
    CHECK_INPUT(node_ids);
    CHECK_INPUT(node_grad_locations);
    CHECK_INPUT(node_grad_harmonics);
    CHECK_INPUT(node_grad_covariances);
    CHECK_INPUT(node_grad_opacities);
    CHECK_INPUT(node_total_weights);

    int N = xyzs.size(0);

    auto grad_xyzs = torch::zeros_like(xyzs);
    auto grad_shs = torch::zeros_like(shs);
    auto grad_covariances = torch::zeros_like(covariances);
    auto grad_opacities = torch::zeros_like(opacities);

    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    aggregate_gaussians_backward<<<blocks, threads>>>(
        grad_xyzs.data_ptr<float>(), 
        grad_shs.data_ptr<float>(), 
        grad_covariances.data_ptr<float>(), 
        grad_opacities.data_ptr<float>(),
        xyzs.data_ptr<float>(), 
        shs.data_ptr<float>(), 
        covariances.data_ptr<float>(), 
        opacities.data_ptr<float>(), 
        weights.data_ptr<float>(),
        node_ids.data_ptr<int>(),
        node_grad_locations.data_ptr<float>(), 
        node_grad_harmonics.data_ptr<float>(), 
        node_grad_covariances.data_ptr<float>(), 
        node_grad_opacities.data_ptr<float>(),
        node_total_weights.data_ptr<float>(),
        N
    );

    return std::make_tuple(grad_xyzs, grad_shs, grad_covariances, grad_opacities);
}
'''

In [21]:
cpp_src = r'''
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> 
aggregate_gaussians_forward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const int num_unique_nodes);

std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
aggregate_gaussians_backward_cuda(
    const torch::Tensor xyzs, 
    const torch::Tensor shs, 
    const torch::Tensor covariances, 
    const torch::Tensor opacities, 
    const torch::Tensor weights,
    const torch::Tensor node_ids,
    const torch::Tensor node_grad_locations, 
    const torch::Tensor node_grad_harmonics, 
    const torch::Tensor node_grad_covariances, 
    const torch::Tensor node_grad_opacities,
    const torch::Tensor node_total_weights);
'''

In [22]:
module = load_inline(
    cuda_sources=[cuda_src], cpp_sources=[cpp_src], 
    functions=["aggregate_gaussians_forward_cuda", "aggregate_gaussians_backward_cuda"],
    build_directory="aggregate",
    extra_cuda_cflags=[],
    verbose=True, name="my_cuda_extension",
)

No modifications detected for re-loaded extension module my_cuda_extension_v2, skipping build step...
Loading extension module my_cuda_extension_v2...


In [23]:
%cd /scratch_net/biwidl214/ecetin_scratch/GSCodec

/scratch_net/biwidl214/ecetin_scratch/GSCodec


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [24]:
from models.splatting.hierarchical_model.hierarhical_utils import generate_octree

Using /home/ecetin/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ecetin/.cache/torch_extensions/py310_cu118/pointops/build.ninja...
Building extension module pointops...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.


Loading extension module pointops...
Using /home/ecetin/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ecetin/.cache/torch_extensions/py310_cu118/octree_generation/build.ninja...
Building extension module octree_generation...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.


Loading extension module octree_generation...


In [25]:
N = 10
points = torch.rand(N, 3).cuda()
covs = torch.rand(N, 3, 3).cuda()
shs = torch.rand(N, 16, 3).cuda()
opacities = torch.rand(N).cuda()
weights = torch.rand(N).cuda()

box_min = torch.min(points, dim=0)[0]
box_max = torch.max(points, dim=0)[0]
box_d = box_max - box_min
box_min = box_min - 0.1 * box_d
box_max = box_max + 0.1 * box_d
max_depth = 8
init_level=1

point_level_bboxes, point_node_assignment = generate_octree(
    points, max_depth)

In [26]:
print(point_node_assignment.shape)

torch.Size([10, 8])


In [27]:
flattened_assignments = point_node_assignment.view(-1).type(torch.int64)

unique_nodes, inverse_indices = torch.unique(
    flattened_assignments, return_inverse=True, sorted=True)

In [28]:
unique_nodes, inverse_indices = torch.unique(point_node_assignment, return_inverse=True, sorted=True)
# unique_nodes[inverse_indices] == point_node_assignment

In [29]:
new_node_ids = torch.arange(0, unique_nodes.size(0), dtype=torch.int32).cuda()
# unique_nodes[new_node_ids][inverse_indices] == point_node_assignment

In [30]:
print(new_node_ids.shape)

torch.Size([76])


In [31]:
node_locations, node_harmonics, node_covariances, \
    node_opacities, node_total_weights, node_ids = module.aggregate_gaussians_forward_cuda(
    points, shs, covs, opacities, weights, new_node_ids, unique_nodes.size(0)
)

In [32]:
print(node_locations.shape)
print(node_harmonics.shape)
print(node_covariances.shape)
print(node_opacities.shape)
print(node_total_weights.shape)
print(node_ids.shape)

torch.Size([76, 3])
torch.Size([76, 48])
torch.Size([76, 9])
torch.Size([76])
torch.Size([76])
torch.Size([76])


torch.Size([43, 48])