In [6]:
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
// Integer type used for the per-dimension `stride` arrays passed to the
// reduction functions in this notebook.
using stride_t = int; 
// Element type of the input and output buffers handled by those functions.
using T = float;

In [7]:
// Computes a single output element of a sum-reduction along `axis`.
//
// This is the per-element body that the CPU driver calls once per output
// index, in place of one GPU thread.
//
// Parameters:
//   in      - input buffer, indexed in elements (not bytes).
//   out     - output buffer; only out[i] is written.
//   shape   - extent of each dimension; shape[axis] is the reduction length.
//   stride  - element stride of each dimension.
//   axis    - dimension to reduce over; must be < ndims.
//   i       - flat index of the output element to compute.
//   ndims   - number of entries in `shape` and `stride`.
//   outSize - number of output elements; calls with i >= outSize are no-ops.
//
// NOTE(review): the index mapping assumes a contiguous row-major layout, where
// stride[axis - 1] == stride[axis] * shape[axis] -- confirm before using it
// with other layouts.
void vecSumElement(const T* in, T* out, const size_t* shape, const stride_t* stride, size_t axis, size_t i, size_t ndims, size_t outSize) {
  // Surplus "threads" past the end of the output do nothing.
  if (i >= outSize) {
    return;
  }

  // `axis` is unsigned, so a test such as `axis - 1 < 0` can never be true.
  // Validate against ndims instead, and reject non-positive strides, which
  // would divide by zero or wrap around in the index arithmetic below.
  if (axis >= ndims || stride[axis] <= 0) {
    std::cerr << "vecSumElement: invalid axis or stride" << std::endl;
    return;
  }

  const size_t outer = static_cast<size_t>(stride[axis]);

  // Span of one full block of the reduced dimension. For axis 0 there is no
  // stride[axis - 1] to read, so derive the span from the axis itself.
  const size_t inner = (axis == 0)
                           ? outer * shape[axis]
                           : static_cast<size_t>(stride[axis - 1]);

  // Index arithmetic is done in size_t so large arrays do not narrow to int.
  const size_t base = (i / outer) * inner + (i % outer);

  T sum = 0.0;
  for (size_t j = 0; j < shape[axis]; ++j) {
    sum += in[base + j * outer];
  }
  out[i] = sum;
}

In [8]:
// CPU driver for a sum-reduction along `axis`: calls vecSumElement once for
// every output element, standing in for a GPU kernel launch.
//
// Parameters:
//   in     - input buffer of `size` elements.
//   out    - output buffer; must hold size / shape[axis] elements.
//   shape  - extent of each dimension (ndims entries).
//   stride - element stride of each dimension (ndims entries).
//   axis   - dimension to reduce over; must be < ndims.
//   ndims  - number of dimensions.
//   size   - total number of input elements.
//
// On invalid arguments a message is written to std::cerr and `out` is left
// untouched.
void vecSumCPU(const T* in, T* out, const size_t* shape, const stride_t* stride, size_t axis, size_t ndims, size_t size) {
  // Check the axis before shape[axis] is read.
  if (axis >= ndims) {
    std::cerr << "vecSumCPU: axis out of range" << std::endl;
    return;
  }

  // An empty reduction axis would make the division below divide by zero.
  const size_t axisLen = shape[axis];
  if (axisLen == 0) {
    std::cerr << "vecSumCPU: cannot reduce over an empty axis" << std::endl;
    return;
  }

  const size_t outSize = size / axisLen;

  // No device copies of shape/stride are needed on the CPU, so the caller's
  // arrays are passed straight through. One call per output element replaces
  // the thread/block grid; vecSumElement still does its own bounds check.
  for (size_t i = 0; i < outSize; ++i) {
    vecSumElement(in, out, shape, stride, axis, i, ndims, outSize);
  }
}


In [9]:

const size_t rows = 3;
const size_t cols = 2;
const size_t ndims = 2;  // 2D array

// Calculate total size
const size_t size = rows * cols;

// Allocate memory for the input array
float* array = (float*)malloc(size * sizeof(float));
assert(array != NULL);

// Initialize the array with values 1, 2, 3, 4, 5, 2
float values[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f};
memcpy(array, values, size * sizeof(float));

// Print the original array
printf("Original 3x2 array:\n");
for (size_t i = 0; i < rows; i++) {
    for (size_t j = 0; j < cols; j++) {
        printf("%.1f ", array[i * cols + j]);
    }
    printf("\n");
}

// Define the shape and stride for our array
size_t shape[ndims] = {rows, cols};
stride_t stride[ndims];

// Calculate strides (row-major order)
stride[ndims-1] = 1;
for (int i = ndims-2; i >= 0; i--) {
    stride[i] = stride[i+1] * shape[i+1];
}

// Allocate memory for the output array
// When summing along axis 1 (columns), the result will have shape (3,)
float* result = (float*)malloc(rows * sizeof(float));
assert(result != NULL);

// Sum along axis 1 (columns)
const size_t axis = 1;
vecSumCPU(array, result, shape, stride, axis, ndims, size);

// Print the result
printf("\nSum along axis 1 (columns):\n");
for (size_t i = 0; i < rows; i++) {
    printf("%.1f\n", result[i]);
}

// Free allocated memory
free(array);
free(result);

Original 3x2 array:
1.0 2.0 
3.0 4.0 
5.0 2.0 

Sum along axis 1 (columns):
3.0
7.0
7.0
