Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#include "unary-ops.h"
#include "vec.h"

#include <float.h>
#include <cfloat>
#include <algorithm>
#include <functional>

// ggml_compute_forward_dup

Expand Down Expand Up @@ -7682,24 +7683,24 @@ static void ggml_compute_forward_argsort_f32(
ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);

for (int64_t i = ith; i < nr; i += nth) {
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
const float * src_data = (float *)((char *) src0->data + i*nb01);

int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);

for (int64_t j = 0; j < ne0; j++) {
dst_data[j] = j;
}

// C doesn't have a functional sort, so we do a bubble sort instead
for (int64_t j = 0; j < ne0; j++) {
for (int64_t k = j + 1; k < ne0; k++) {
if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
(order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
int32_t tmp = dst_data[j];
dst_data[j] = dst_data[k];
dst_data[k] = tmp;
}
}
std::function<bool(int32_t, int32_t)> cmp;

// note: this might be causing memory allocations? ideally should be avoided if it's the case
Comment on lines +7694 to +7696
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah std::function can be quite heavy, it is better to avoid it if possible.

Copy link
Member Author

@ggerganov ggerganov Nov 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the past I've used an alloc-free alternative of std::function (based on this), but it's quite cumbersome (nvm, not relevant for this use case). Will probably just implement qsort and avoid std::sort all together.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both qsort() and std::function involves at least one indirect function call.
Perhaps better to simply inline the std::sort() call in the switch block as it's just an one-liner:

case GGML_SORT_ORDER_ASC: std::sort(dst_data, dst_data + ne0, [src_data](int32_t a, int32_t b) { return src_data[a] < src_data[b]; }); break;

This will specialize the std::sort template and avoids indirect function call.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative would be to do something like this:

template<enum ggml_sort_order order>
struct sort_cmp {
    const float * data;
    bool operator()(int32_t a, int32_t b) const {
        if constexpr (order == GGML_SORT_ORDER_ASC) {
            return data[a] < data[b];
        } else {
            return data[a] > data[b];
        }
    }
};
switch (order) {
    case GGML_SORT_ORDER_ASC:
        std::sort(dst_data, dst_data + ne0, sort_cmp<GGML_SORT_ORDER_ASC>{src_data});
        break;

    case GGML_SORT_ORDER_DESC:
        std::sort(dst_data, dst_data + ne0, sort_cmp<GGML_SORT_ORDER_DESC>{src_data});
        break;

    default: GGML_ABORT("invalid sort order");
}

switch (order) {
case GGML_SORT_ORDER_ASC: cmp = [src_data](int32_t a, int32_t b) { return src_data[a] < src_data[b]; }; break;
case GGML_SORT_ORDER_DESC: cmp = [src_data](int32_t a, int32_t b) { return src_data[a] > src_data[b]; }; break;
default: GGML_ABORT("invalid sort order");
}

std::sort(dst_data, dst_data + ne0, cmp);
}
}

Expand Down
Loading