crc-org · kpouget · Nov 4, 2025 · Aug 29, 2025 · Aug 29, 2025 · Nov 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -44,9 +44,7 @@ lcov-report/
 
 tags
 .build/
-build*
-release
-debug
+build-*
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh

diff --git a/CMakePresets.json b/CMakePresets.json
@@ -30,6 +30,8 @@
     { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
     { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
+    { "name": "remoting_frontend",   "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND":      "ON" } },
+    { "name": "remoting_backend",   "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND":      "ON" } },
 
     {
         "name": "x64-windows-llvm", "hidden": true,

diff --git a/OWNERS b/OWNERS
@@ -0,0 +1,13 @@
+approvers:
+- kpouget
+- cfergeau
+- praveenkumar
+- vyasgun
+- gbraad
+options: {}
+reviewers:
+- kpouget
+- cfergeau
+- praveenkumar
+- vyasgun
+- gbraad
diff --git a/build.backend.sh b/build.backend.sh
@@ -0,0 +1,36 @@
+# force isatty-->true, so that $0 |& head -50 has colors ...
+rm -f READY_backend FAILED_backend
+
+echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc -
+export LD_PRELOAD=/tmp/isatty.so
+
+if [[ "${PERF_MODE:-}" ]]; then
+    FLAVOR="-prod"
+else
+    FLAVOR=""
+fi
+
+export SDKROOT=$(xcrun --sdk macosx --show-sdk-path)
+
+if [[ "$FLAVOR" == "-prod" ]]; then
+    cat <<EOF
+###
+### Building the prod flavor
+###
+EOF
+fi
+
+TARGETS="llama-run"
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    TARGETS="$TARGETS llama-bench"
+elif [[ "${BENCH_MODE:-}" == "perf" ]]; then
+    TARGETS="$TARGETS test-backend-ops"
+fi
+
+cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $TARGETS "$@"
+
+if [[ $? == 0 ]]; then
+    touch READY_backend
+else
+    touch FAILED_backend
+fi
diff --git a/build.remoting.sh b/build.remoting.sh
@@ -0,0 +1,25 @@
+# force isatty-->true, so that $0 |& head -50 has colors ...
+rm -f READY FAILED
+
+echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc -
+export LD_PRELOAD=/tmp/isatty.so
+
+TARGETS="ggml-remotingfrontend"
+
+TARGETS="$BUILD_TARGET llama-run"
+set -x
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    TARGETS="$TARGETS llama-bench"
+elif [[ "${BENCH_MODE:-}" == "server" ]]; then
+    TARGETS="$TARGETS llama-server"
+elif [[ "${BENCH_MODE:-}" == "perf" ]]; then
+    TARGETS="$TARGETS test-backend-ops"
+fi
+
+cmake --build ../build.remoting-frontend$FLAVOR --parallel 8 --target $TARGETS "$@"
+
+if [[ $? == 0 ]]; then
+    touch READY
+else
+    touch FAILED
+fi
diff --git a/build.sh b/build.sh
@@ -0,0 +1 @@
+cmake --build ./build/ --parallel 8  # build the default target in ./build with 8 parallel jobs
diff --git a/build.vulkan.sh b/build.vulkan.sh
@@ -0,0 +1,9 @@
+rm -f READY FAILED
+
+cmake --build ../build.vulkan --parallel 8 --target llama-run
+
+if [[ $? == 0 ]]; then
+    touch READY
+else
+    touch FAILED
+fi
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -226,6 +226,8 @@ option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
 
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
+option(GGML_REMOTING_FRONTEND               "ggml: use the API Remoting frontend"             OFF)
+option(GGML_REMOTING_BACKEND                "ggml: use the API Remoting backend"              OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
@@ -317,6 +319,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-remoting-frontend.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// name string of the API Remoting frontend (its users are outside this header)
+#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
+
+// returns the backend registration entry of the API Remoting frontend
+// (explicit `void`: in C, empty parentheses declare an unprototyped function)
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -406,6 +406,8 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
+ggml_add_backend(RemotingFrontend)
+ggml_add_backend(RemotingBackend)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
@@ -73,6 +73,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_REMOTINGFRONTEND
+#include "ggml-remoting-frontend.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -200,6 +204,10 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_ZDNN
         register_backend(ggml_backend_zdnn_reg());
 #endif
+#ifdef GGML_USE_REMOTINGFRONTEND
+        register_backend(ggml_backend_remoting_frontend_reg());
+#endif
+
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
@@ -604,6 +612,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("remotingfrontend", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);

diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
@@ -11,6 +11,7 @@ ggml_add_backend_library(ggml-metal
                          ggml-metal-common.cpp
                          ggml-metal-context.m
                          ggml-metal-ops.cpp
+                         $<$<BOOL:${GGML_REMOTING_BACKEND}>:ggml-metal-remoting.cpp>
                         )
 
 target_link_libraries(ggml-metal PRIVATE

diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -514,13 +514,14 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
 }
 
 void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
-    //const int64_t t_start = ggml_time_us();
+    const int64_t t_start = ggml_time_us();
 
     if (ctx->use_graph_optimize) {
         ggml_graph_optimize(gf);
     }
 
-    //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
+    // report through the ggml logger instead of writing to stdout directly
+    GGML_LOG_INFO("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
 }
 
 void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {

diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -333,9 +333,9 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
         NSError * error = nil;
 
         NSString * base_func = [NSString stringWithUTF8String:base];
-
+#if 0
         GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);
-
+#endif
         id<MTLFunction> mtl_function;
         if (!cv) {
             mtl_function = [lib->obj newFunctionWithName:base_func];
@@ -358,10 +358,11 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
         ggml_metal_pipelines_add(lib->pipelines, name, res);
 
         [mtl_function release];
-
+#if 0
         GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
                 (int) res->obj.maxTotalThreadsPerThreadgroup,
                 (int) res->obj.threadExecutionWidth);
+#endif
     }
 
     ggml_critical_section_end();

diff --git a/ggml/src/ggml-metal/ggml-metal-remoting.cpp b/ggml/src/ggml-metal/ggml-metal-remoting.cpp
@@ -0,0 +1,37 @@
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
+#include "ggml-metal-device.h"
+#include "ggml-metal-impl.h"
+#include "ggml-metal-context.h"
+
+extern "C" {
+  // Copies three capability flags of a Metal device into caller-provided
+  // booleans.
+  //
+  //   dev                      device whose context is a ggml_metal_device_t
+  //   has_simdgroup_mm         out: props->has_simdgroup_mm
+  //   has_simdgroup_reduction  out: props->has_simdgroup_reduction
+  //   has_bfloat               out: props->has_bfloat
+  //
+  // All three output pointers are written unconditionally and must be non-NULL.
+  GGML_BACKEND_API void ggml_backend_metal_get_device_context(ggml_backend_dev_t dev,
+                                                              bool *has_simdgroup_mm,
+                                                              bool *has_simdgroup_reduction,
+                                                              bool *has_bfloat);
+
+  GGML_BACKEND_API void
+  ggml_backend_metal_get_device_context(ggml_backend_dev_t dev,
+                                        bool *has_simdgroup_mm,
+                                        bool *has_simdgroup_reduction,
+                                        bool *has_bfloat) {
+    ggml_metal_device_t dev_ctx = (ggml_metal_device_t)dev->context;
+
+    const struct ggml_metal_device_props *props = ggml_metal_device_get_props(dev_ctx);
+
+    *has_simdgroup_mm = props->has_simdgroup_mm;
+    *has_simdgroup_reduction = props->has_simdgroup_reduction;
+    *has_bfloat = props->has_bfloat;
+  }
+}
diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.19)
+# NOTE(review): CMP0114 only affects ExternalProject step targets, and no
+# ExternalProject is used in this file — confirm it is still needed.
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting backend")
+
+ggml_add_backend_library(ggml-remotingbackend
+                         backend.cpp
+                         backend-dispatched.cpp
+                         backend-dispatched-backend.cpp
+                         backend-dispatched-device.cpp
+                         backend-dispatched-buffer.cpp
+                         backend-dispatched-buffer-type.cpp
+                         backend-dispatched-metal.cpp
+                         backend-utils.cpp
+                         shared/api_remoting.h
+                         shared/apir_backend.h
+                         shared/venus_cs.h
+                         venus_cs_ggml-rpc-back.cpp
+                        )
+
+# Request C++20 as a compile feature rather than a raw -std=c++20 flag, so that
+# CMake emits the right option for the active compiler.
+target_compile_features(ggml-remotingbackend PRIVATE cxx_std_20)
diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "shared/apir_backend.h"
+
+// shorthand for ggml_buffer_to_apir_handle()
+#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
+
+static inline apir_buffer_host_handle_t
+ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+  // in the backend, the buffer handle is the buffer pointer
+  return (apir_buffer_host_handle_t) buffer;
+}
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+  // in the backend, the buffer type handle is the buffer type pointer
+  return (apir_buffer_type_host_handle_t) buft;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp
@@ -0,0 +1,67 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+#include "shared/apir_backend.h"
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
+// Decodes a serialized compute graph from a shared-memory resource, runs it on
+// the backend and encodes the resulting ggml_status into the reply.
+//
+//   enc  encoder receiving the ggml_status
+//   dec  decoder providing the shmem resource id and the graph size
+//   ctx  context used to map the shmem resource id to a host pointer
+//
+// Always returns 0; a failed or unsupported graph is reported to the caller
+// through the encoded status only.
+uint32_t
+backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  start_timer(&graph_compute_timer);
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (!shmem_data) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+  size_t cgraph_size;
+  vn_decode_size_t(dec, &cgraph_size);
+
+  // NOTE(review): cgraph_size comes from the peer and is not checked against
+  // the size of the shmem mapping here — confirm it is validated elsewhere.
+  struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size);
+
+  ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size);
+
+  ggml_status status;
+#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
+  // refuse the whole graph as soon as one node is not supported by the device
+  for (int idx = 0; idx < cgraph->n_nodes; idx++) {
+    ggml_tensor *op = ggml_graph_node(cgraph, idx);
+    if (dev->iface.supports_op(dev, op)) {
+      continue;
+    }
+    ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op));
+
+    status = GGML_STATUS_ABORTED;
+    vn_encode_ggml_status(enc, &status);
+
+    stop_timer(&graph_compute_timer);
+    return 0;
+  }
+#endif
+  status = bck->iface.graph_compute(bck, cgraph);
+  bck->iface.synchronize(bck);
+
+  vn_encode_ggml_status(enc, &status);
+
+  stop_timer(&graph_compute_timer);
+
+  return 0;
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -44,9 +44,7 @@ lcov-report/ @@
     tags
     .build/
-    build*
-    release
-    debug
+    build-*
     !build-info.cmake
     !build-info.cpp.in
     !build-info.sh
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		cmake --build ./build/ --parallel 8
kpouget marked this conversation as resolved. Show resolved Hide resolved