Skip to content

Commit

Permalink
llama : ggml-backend integration
Browse files Browse the repository at this point in the history
  • Loading branch information
slaren committed Jan 4, 2024
1 parent 7bed7eb commit e712935
Show file tree
Hide file tree
Showing 8 changed files with 753 additions and 1,511 deletions.
11 changes: 11 additions & 0 deletions ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -779,10 +779,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif
return NULL;
}

ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
Expand Down
1 change: 1 addition & 0 deletions ggml-backend-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ extern "C" {
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};

ggml_backend_buffer_t ggml_backend_buffer_init(
Expand Down
62 changes: 55 additions & 7 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
};

return buffer;
Expand Down Expand Up @@ -109,6 +110,10 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
}

// Tag a backend buffer with a usage hint.
//
//   buffer - buffer to tag; dereferenced without a NULL check
//   usage  - new hint, stored verbatim in buffer->usage
//
// Nothing else is done here: the value is only recorded on the buffer.
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    buffer->usage = usage;
}

// Return the buffer type recorded in the buffer's `buft` field.
//
//   buffer - buffer to query; dereferenced without a NULL check
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t buft = buffer->buft;
    return buft;
}
Expand Down Expand Up @@ -773,7 +778,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
}

#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
Expand Down Expand Up @@ -808,17 +813,25 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
if (src == NULL) {
break;
}

ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
if (src_backend != NULL) {
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
// operations with weights are always on the same backend as the weights
cur_backend = src_backend;
SET_CAUSE(node, "1.wgt%d", i);
break;
}

//if (src_backend != NULL) {
int src_prio = sched_backend_prio(sched, src_backend);
size_t src_size = ggml_nbytes(src);
if (src_prio < cur_prio && src_size >= cur_size) {
if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
cur_prio = src_prio;
cur_size = src_size;
cur_backend = src_backend;
SET_CAUSE(node, "1.src%d", i);
}
}
//}
}
return cur_backend;
}
Expand Down Expand Up @@ -929,6 +942,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
//printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

#if 0
// pass 2: assign backends to ops from current assignments
// TODO:
// - reuse sched_backend_from_cur
Expand Down Expand Up @@ -960,6 +974,23 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
}
}
#else
// pass 2: assign backends to ops from current assignments
// start from the end and assign the same backend to previous ops
{
ggml_tallocr_t cur_allocr = NULL;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
ggml_tallocr_t node_allocr = node_allocr(node);
if (node_allocr != NULL) {
cur_allocr = node_allocr;
} else {
node_allocr(node) = cur_allocr;
SET_CAUSE(node, "2.cur");
}
}
}
#endif
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

// pass 3: assign backends to remaining src from dst (should only be leafs)
Expand Down Expand Up @@ -1025,9 +1056,21 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
ggml_tallocr_t src_allocr = node_allocr(src);
if (src_allocr != node_allocr) {
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
// check if the input is already in the split
bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
if (sched->splits[cur_split].inputs[k] == src) {
found = true;
break;
}
}

if (!found) {
int n_inputs = sched->splits[cur_split].n_inputs++;
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
}

// create copies
size_t id = hash_id(src);
Expand Down Expand Up @@ -1231,6 +1274,10 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
sched_reset(sched);
}

// Return the scheduler's current `n_splits` counter.
//
//   sched - scheduler to query; dereferenced without a NULL check
//
// NOTE(review): presumably this is the number of splits produced by the most
// recent graph split - confirm where n_splits is written and reset.
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
    const int n_splits = sched->n_splits;
    return n_splits;
}

ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
return sched->tallocs[backend_index];
Expand Down Expand Up @@ -1316,6 +1363,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor

struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst);
}
else {
Expand Down
8 changes: 8 additions & 0 deletions ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ extern "C" {
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

// buffer
// Usage hint attached to a backend buffer (see ggml_backend_buffer_set_usage).
enum ggml_backend_buffer_usage {
// no specific usage; this is the value ggml_backend_buffer_init assigns to new buffers
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
// buffer holds weights; the scheduler keeps operations that read such a source
// on the same backend as the weights
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};

GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
Expand All @@ -32,8 +37,10 @@ extern "C" {
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);


//
// Backend
//
Expand Down Expand Up @@ -146,6 +153,7 @@ extern "C" {

// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
Expand Down

0 comments on commit e712935

Please sign in to comment.