Commit
Add RPC backend
The purpose of the RPC backend is to proxy all operations to another
host where they are implemented with one of the existing backends (e.g.
CUDA, Metal, etc.).
rgerganov committed Mar 11, 2024
1 parent 43a6d4a commit a066e35
Showing 9 changed files with 1,052 additions and 0 deletions.
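The new public entry point used by the example client is ggml_backend_rpc_init(). The header that declares it, ggml-rpc.h, is among the files whose diff is not rendered below, so the following is only a sketch inferred from how client.cpp calls it; the actual header added by this commit may differ:

// sketch of the relevant part of ggml-rpc.h, inferred from its use in
// client.cpp below (not the actual header from the commit)
#include "ggml/ggml-backend.h"

// connect to the given endpoint (e.g. "localhost:50051") and return a
// backend that proxies all operations to the remote host
ggml_backend_t ggml_backend_rpc_init(const char * endpoint);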
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -46,6 +46,7 @@ option(GGML_CLBLAST "ggml: use clBLAST" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
option(GGML_METAL "ggml: use Metal" OFF)
option(GGML_RPC "ggml: use RPC" OFF)

option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
@@ -26,3 +26,6 @@ add_subdirectory(sam)
add_subdirectory(yolo)
add_subdirectory(simple)
add_subdirectory(magika)
if (GGML_RPC)
    add_subdirectory(rpc)
endif()
5 changes: 5 additions & 0 deletions examples/rpc/CMakeLists.txt
@@ -0,0 +1,5 @@
add_executable(client client.cpp)
target_link_libraries(client PRIVATE ggml)

add_executable(server server.cpp)
target_link_libraries(server PRIVATE ggml)
182 changes: 182 additions & 0 deletions examples/rpc/client.cpp
@@ -0,0 +1,182 @@
#include "ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#include "ggml-rpc.h"
#include <cstdio>
#include <cstdlib>

// This is a simple model with two tensors a and b
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // RPC backend
    ggml_backend_t backend = NULL;

    // the backend buffer to store the tensor data of a and b
    ggml_backend_buffer_t buffer;

    // the context to define the tensor information (dimensions, size, memory address)
    struct ggml_context * ctx;
};

void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B)
{
    int num_tensors = 2;

    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    // create context
    model.ctx = ggml_init(params);

    // create tensors
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    // create a backend buffer (backend memory) and allocate the tensors from the context
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);

    // copy data from CPU memory to the backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}

// build the compute graph to perform a matrix multiplication
struct ggml_cgraph * build_graph(const simple_model & model) {
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };

    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // result = a*b^T
    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);

    // build the operation nodes
    ggml_build_forward_expand(gf, result);

    // free the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}
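Note on the ggml_mul_mat() convention, since the client prints the result as transposed: each element of the result is the dot product of one row of a with one row of b (result = a*b^T in row-major terms). For the matrices defined in main() below, the first element is the dot product of A's first row (2, 8) with B's first row (10, 5):

    2*10 + 8*5 = 60

which matches the first value of the expected output.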

// compute with backend
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
    struct ggml_cgraph * gf = build_graph(model);

    // allocate the graph tensors from the allocator; memory from a previous
    // computation is reused
    ggml_gallocr_alloc_graph(allocr, gf);

    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__);
        exit(1);
    }

    // in this case, the output tensor is the last one in the graph
    return gf->nodes[gf->n_nodes - 1];
}

int main(int argc, char * argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <server_addr>\n", argv[0]);
        return 1;
    }
    ggml_time_init();

    // initialize the data of the matrices to multiply
    const int rows_A = 4, cols_A = 2;

    float matrix_A[rows_A * cols_A] = {
        2, 8,
        5, 1,
        4, 2,
        8, 6
    };

    const int rows_B = 3, cols_B = 2;
    /* Transpose([
        10, 9, 5,
         5, 9, 4
       ]) 2 rows, 3 cols */
    float matrix_B[rows_B * cols_B] = {
        10, 5,
        9, 9,
        5, 4
    };

    simple_model model;
    model.backend = ggml_backend_rpc_init(argv[1]);
    if (!model.backend) {
        fprintf(stderr, "%s: ggml_backend_rpc_init() failed\n", __func__);
        exit(1);
    }
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    // calculate the temporary memory required for the computation
    ggml_gallocr_t allocr = NULL;

    {
        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));

        // create the worst-case graph for memory usage estimation
        struct ggml_cgraph * gf = build_graph(model);
        ggml_gallocr_reserve(allocr, gf);
        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);

        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
    }

    // perform the computation
    struct ggml_tensor * result = compute(model, allocr);

    // create an array to print the result
    std::vector<float> out_data(ggml_nelements(result));

    // copy the data back from the backend memory
    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));

    // expected result:
    // [ 60.00 110.00  54.00 29.00
    //   55.00  90.00 126.00 28.00
    //   50.00  54.00  42.00 64.00 ]

    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
    for (int j = 0; j < result->ne[1] /* rows */; j++) {
        if (j > 0) {
            printf("\n");
        }

        for (int i = 0; i < result->ne[0] /* cols */; i++) {
            printf(" %.2f", out_data[i * result->ne[1] + j]);
        }
    }
    printf(" ]\n");

    // release backend memory used for the computation
    ggml_gallocr_free(allocr);

    // free memory
    ggml_free(model.ctx);

    // release backend memory and free the backend
    ggml_backend_buffer_free(model.buffer);
    ggml_backend_free(model.backend);
    return 0;
}
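Apart from ggml_backend_rpc_init(), the client is backend-agnostic: the rest of the program closely mirrors ggml's existing simple example. To run the same computation locally, only the initialization would change (ggml_backend_cpu_init() is the standard CPU backend constructor):

// run locally instead of over RPC: only the backend initialization differs
model.backend = ggml_backend_cpu_init();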
33 changes: 33 additions & 0 deletions examples/rpc/server.cpp
@@ -0,0 +1,33 @@
#include <cstdio>
#include <iostream>
#include <memory>
#include <string>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>

#include "ggml-rpc.h"

int main(int argc, char * argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <port>\n", argv[0]);
        return 1;
    }
    int port = std::stoi(argv[1]);
    std::string server_address = "0.0.0.0:" + std::to_string(port);
    BackendImpl service;

    grpc::EnableDefaultHealthCheckService(true);
    grpc::reflection::InitProtoReflectionServerBuilderPlugin();
    grpc::ServerBuilder builder;
    // Listen on the given address without any authentication mechanism.
    builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
    builder.RegisterService(&service);
    std::unique_ptr<grpc::Server> server(builder.BuildAndStart());
    std::cout << "RPC backend listening on " << server_address << std::endl;

    // Wait for the server to shut down. Note that some other thread must be
    // responsible for shutting down the server for this call to ever return.
    server->Wait();

    return 0;
}
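BackendImpl comes from ggml-rpc.h, whose diff is not rendered here. Presumably it implements the gRPC service generated from ggml-rpc.proto by executing each request with a local backend; the sketch below is purely illustrative, and the service and message names in it are hypothetical:

// illustrative sketch only: the real BackendImpl is defined in ggml-rpc.h /
// ggml-rpc.cpp (not shown), and these service/message names are hypothetical
class BackendImpl final : public ggml::Backend::Service {
public:
    grpc::Status GraphCompute(grpc::ServerContext * /*context*/,
                              const ggml::GraphComputeRequest * request,
                              ggml::GraphComputeReply * reply) override {
        // deserialize the graph, compute it with a local backend
        // (CPU, CUDA, Metal, ...) and copy the results into the reply
        return grpc::Status::OK;
    }
};

A typical session (the port choice is arbitrary): start ./server 50051 on the host with the accelerator, then run ./client localhost:50051 from the machine driving the computation.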
38 changes: 38 additions & 0 deletions src/CMakeLists.txt
@@ -320,6 +320,43 @@ if (GGML_METAL)
)
endif()

if (GGML_RPC)
    find_package(protobuf CONFIG REQUIRED)
    message(STATUS "Using protobuf ${Protobuf_VERSION}")
    set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)

    find_package(gRPC CONFIG REQUIRED)
    message(STATUS "Using gRPC ${gRPC_VERSION}")
    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)

    # Proto file
    get_filename_component(ggml_proto "ggml-rpc.proto" ABSOLUTE)
    get_filename_component(ggml_proto_path "${ggml_proto}" PATH)

    # Generated sources
    set(ggml_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/ggml-rpc.pb.cc")
    set(ggml_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/ggml-rpc.pb.h")
    set(ggml_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/ggml-rpc.grpc.pb.cc")
    set(ggml_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/ggml-rpc.grpc.pb.h")

    add_custom_command(
        OUTPUT "${ggml_proto_srcs}" "${ggml_proto_hdrs}" "${ggml_grpc_srcs}" "${ggml_grpc_hdrs}"
        COMMAND ${_PROTOBUF_PROTOC}
        ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
             --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
             -I "${ggml_proto_path}"
             --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
             "${ggml_proto}"
        DEPENDS "${ggml_proto}")

    # Include generated *.pb.h files
    set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${CMAKE_CURRENT_BINARY_DIR})

    set(GGML_RPC_SOURCES ggml-rpc.cpp ggml-rpc.h ${ggml_grpc_srcs} ${ggml_grpc_hdrs} ${ggml_proto_srcs} ${ggml_proto_hdrs})
    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} gRPC::grpc++ gRPC::grpc gRPC::grpc++_reflection ${_PROTOBUF_LIBPROTOBUF})
endif()

if (GGML_PERF)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
endif()
@@ -337,6 +374,7 @@ add_library(${TARGET}
    ${GGML_CUDA_SOURCES}
    ${GGML_OPENCL_SOURCES}
    ${GGML_METAL_SOURCES}
    ${GGML_RPC_SOURCES}
)

target_include_directories(${TARGET} PUBLIC
(The diffs for the remaining files in this commit, presumably ggml-rpc.proto, ggml-rpc.h and ggml-rpc.cpp, are not rendered in this view.)
