Skip to content

Commit

Permalink
[add] himin and himax: find index of min or max elt
Browse files Browse the repository at this point in the history
  • Loading branch information
digikar99 committed Jan 20, 2024
1 parent 5ae1171 commit 542b93a
Show file tree
Hide file tree
Showing 22 changed files with 3,323 additions and 7,373 deletions.
6 changes: 4 additions & 2 deletions bmas/README.md
Expand Up @@ -68,8 +68,10 @@ SSE and AVX512 support exists to a limited extent due to limited developer time.
| min | + | + | + | + | + | + | + | + | + | + |
| max | + | + | + | + | + | + | + | + | + | + |
| sum (horizontal) | + | + | + | + | + | + | - | - | - | - |
| min (horizontal) | + | + | + | + | + | + | + | + | + | + |
| max (horizontal) | + | + | + | + | + | + | + | + | + | + |
| hmin (horizontal) | + | + | + | + | + | + | + | + | + | + |
| hmax (horizontal) | + | + | + | + | + | + | + | + | + | + |
| himin (index of min) | + | + | + | + | + | + | + | + | + | + |
| himax (index of max) | + | + | + | + | + | + | + | + | + | + |
| **Function \ Data type** | float32 | float64 | int64 | int32 | int16 | int8 | uint64 | uint32 | uint16 | uint8 |
| lt | + | + | + | + | + | + | + | + | + | + |
| le | + | + | + | + | + | + | + | + | + | + |
Expand Down
23 changes: 23 additions & 0 deletions bmas/bmas.c
Expand Up @@ -98,6 +98,7 @@ void static inline BMAS_ivec_store_multi(
#include "cast.h"
#include "copy.h"
#include "one_arg_reduce_fn_body.h"
#include "one_arg_ireduce_fn_body.h"
#include "dot_fn_body.h"

copy_fn_body(s, SIMD_SINGLE_STRIDE, float, BMAS_svec);
Expand Down Expand Up @@ -147,6 +148,28 @@ one_arg_reduce_fn_body(u32hmin, SIMD_SINGLE_STRIDE, uint32_t, BMAS_ivec, uint3
one_arg_reduce_fn_body(u16hmin, 2*SIMD_SINGLE_STRIDE, uint16_t, BMAS_ivec, uint16_t, uMAX, u16min, u16hmin);
one_arg_reduce_fn_body(u8hmin, 4*SIMD_SINGLE_STRIDE, uint8_t, BMAS_ivec, uint8_t, uMAX, u8min, u8hmin);

// Index-reduction kernels: each line below expands one_arg_ireduce_fn_body
// into a function BMAS_<name>(n, x, incx) returning a long index.
// Argument order:
//   name, stride, itype, vec, init_index_fn, init_fn,
//   vreduce_fn, hreduce_fn, reduce_fn_char, sreduce_fn
// The stride is SIMD_SINGLE_STRIDE for 32-bit elements, SIMD_DOUBLE_STRIDE
// for 64-bit elements, and 2x / 4x SIMD_SINGLE_STRIDE for 16-bit / 8-bit
// elements.

// himax (index of max element): reduce_fn_char is 'a' and the scalar tail
// comparison is the type's "gt" predicate. The accumulator is seeded with the
// type's *MIN initializer -- presumably the lowest value of the type, so any
// element replaces it; confirm against the vector headers.
one_arg_ireduce_fn_body(shimax, SIMD_SINGLE_STRIDE, float, svec, sINDEX, sMIN, sindex, shindex, 'a', sgt);
one_arg_ireduce_fn_body(dhimax, SIMD_DOUBLE_STRIDE, double, dvec, dINDEX, dMIN, dindex, dhindex, 'a', dgt);
one_arg_ireduce_fn_body(i64himax, SIMD_DOUBLE_STRIDE, int64_t, ivec, iINDEX, i64MIN, i64index, i64hindex, 'a', i64gt);
one_arg_ireduce_fn_body(i32himax, SIMD_SINGLE_STRIDE, int32_t, ivec, iINDEX, i32MIN, i32index, i32hindex, 'a', i32gt);
one_arg_ireduce_fn_body(i16himax, 2*SIMD_SINGLE_STRIDE, int16_t, ivec, iINDEX, i16MIN, i16index, i16hindex, 'a', i16gt);
one_arg_ireduce_fn_body(i8himax, 4*SIMD_SINGLE_STRIDE, int8_t, ivec, iINDEX, i8MIN, i8index, i8hindex, 'a', i8gt);
one_arg_ireduce_fn_body(u64himax, SIMD_DOUBLE_STRIDE, uint64_t, ivec, iINDEX, uMIN, u64index, u64hindex, 'a', u64gt);
one_arg_ireduce_fn_body(u32himax, SIMD_SINGLE_STRIDE, uint32_t, ivec, iINDEX, uMIN, u32index, u32hindex, 'a', u32gt);
one_arg_ireduce_fn_body(u16himax, 2*SIMD_SINGLE_STRIDE, uint16_t, ivec, iINDEX, uMIN, u16index, u16hindex, 'a', u16gt);
one_arg_ireduce_fn_body(u8himax, 4*SIMD_SINGLE_STRIDE, uint8_t, ivec, iINDEX, uMIN, u8index, u8hindex, 'a', u8gt);

// himin (index of min element): mirror image of the group above --
// reduce_fn_char is 'i', the scalar tail comparison is the type's "lt"
// predicate, and the accumulator is seeded with the type's *MAX initializer.
one_arg_ireduce_fn_body(shimin, SIMD_SINGLE_STRIDE, float, svec, sINDEX, sMAX, sindex, shindex, 'i', slt);
one_arg_ireduce_fn_body(dhimin, SIMD_DOUBLE_STRIDE, double, dvec, dINDEX, dMAX, dindex, dhindex, 'i', dlt);
one_arg_ireduce_fn_body(i64himin, SIMD_DOUBLE_STRIDE, int64_t, ivec, iINDEX, i64MAX, i64index, i64hindex, 'i', i64lt);
one_arg_ireduce_fn_body(i32himin, SIMD_SINGLE_STRIDE, int32_t, ivec, iINDEX, i32MAX, i32index, i32hindex, 'i', i32lt);
one_arg_ireduce_fn_body(i16himin, 2*SIMD_SINGLE_STRIDE, int16_t, ivec, iINDEX, i16MAX, i16index, i16hindex, 'i', i16lt);
one_arg_ireduce_fn_body(i8himin, 4*SIMD_SINGLE_STRIDE, int8_t, ivec, iINDEX, i8MAX, i8index, i8hindex, 'i', i8lt);
one_arg_ireduce_fn_body(u64himin, SIMD_DOUBLE_STRIDE, uint64_t, ivec, iINDEX, uMAX, u64index, u64hindex, 'i', u64lt);
one_arg_ireduce_fn_body(u32himin, SIMD_SINGLE_STRIDE, uint32_t, ivec, iINDEX, uMAX, u32index, u32hindex, 'i', u32lt);
one_arg_ireduce_fn_body(u16himin, 2*SIMD_SINGLE_STRIDE, uint16_t, ivec, iINDEX, uMAX, u16index, u16hindex, 'i', u16lt);
one_arg_ireduce_fn_body(u8himin, 4*SIMD_SINGLE_STRIDE, uint8_t, ivec, iINDEX, uMAX, u8index, u8hindex, 'i', u8lt);


dot_fn_body(sdot, SIMD_SINGLE_STRIDE, float, BMAS_svec, float, szero, sadd, smul, shadd);
dot_fn_body(ddot, SIMD_DOUBLE_STRIDE, double, BMAS_dvec, double, dzero, dadd, dmul, dhadd);
Expand Down
22 changes: 22 additions & 0 deletions bmas/bmas.h
Expand Up @@ -353,6 +353,28 @@ one_arg_reduce_fn(u16hmin, uint16_t, uint16_t);
one_arg_reduce_fn(u32hmin, uint32_t, uint32_t);
one_arg_reduce_fn(u64hmin, uint64_t, uint64_t);

/*
 * Prototype generator for the index-reduction entry points.
 *
 * one_arg_ireduce_fn(name, itype, otype) declares
 *     long BMAS_<name>(const long n, itype* x, const long incx)
 *
 *   n     - number of elements to inspect
 *   x     - pointer to the first element
 *   incx  - distance between consecutive elements, counted in elements
 *   otype - unused here; kept so call sites mirror one_arg_reduce_fn
 *
 * The returned long is the element index selected by the reduction
 * (himax: index of the maximum, himin: index of the minimum).
 *
 * incx is declared as `long` (not int64_t) so that the prototype matches the
 * definition emitted by one_arg_ireduce_fn_body on every ABI; int64_t is
 * `long long` on some platforms, which would make the two declarations
 * incompatible.  The macro deliberately carries no trailing ';' -- each use
 * below supplies its own, avoiding an empty file-scope declaration.
 */
#define one_arg_ireduce_fn(name, itype, otype) \
  long BMAS_##name(const long n, itype* x, const long incx)

/* Index of the maximum element. */
one_arg_ireduce_fn(shimax, float, float);
one_arg_ireduce_fn(dhimax, double, double);
one_arg_ireduce_fn(i8himax, int8_t, int8_t);
one_arg_ireduce_fn(i16himax, int16_t, int16_t);
one_arg_ireduce_fn(i32himax, int32_t, int32_t);
one_arg_ireduce_fn(i64himax, int64_t, int64_t);
one_arg_ireduce_fn(u8himax, uint8_t, uint8_t);
one_arg_ireduce_fn(u16himax, uint16_t, uint16_t);
one_arg_ireduce_fn(u32himax, uint32_t, uint32_t);
one_arg_ireduce_fn(u64himax, uint64_t, uint64_t);

/* Index of the minimum element. */
one_arg_ireduce_fn(shimin, float, float);
one_arg_ireduce_fn(dhimin, double, double);
one_arg_ireduce_fn(i8himin, int8_t, int8_t);
one_arg_ireduce_fn(i16himin, int16_t, int16_t);
one_arg_ireduce_fn(i32himin, int32_t, int32_t);
one_arg_ireduce_fn(i64himin, int64_t, int64_t);
one_arg_ireduce_fn(u8himin, uint8_t, uint8_t);
one_arg_ireduce_fn(u16himin, uint16_t, uint16_t);
one_arg_ireduce_fn(u32himin, uint32_t, uint32_t);
one_arg_ireduce_fn(u64himin, uint64_t, uint64_t);

#define dot_fn(name, itype, otype) \
otype BMAS_##name(const long n,\
Expand Down
51 changes: 51 additions & 0 deletions bmas/one_arg_ireduce_fn_body.h
@@ -0,0 +1,51 @@

// As currently implemented, giving the vector accumulator a larger otype
// would make no difference; for that, the vector accumulator and the
// store/load instructions themselves would need to be changed.

// An acc (accumulator) below consists of a struct made of a "value" vector
// and one or more "index" vector(s).

/*
 * one_arg_ireduce_fn_body: emits the definition of
 *     long BMAS_<name>(const long n, itype* x, const long incx)
 * which walks n elements of x (incx elements apart) and returns the index
 * chosen by the supplied reduction helpers.  The index counts elements
 * starting at 0; it is not a memory offset.
 *
 * Macro arguments:
 *   name            suffix of the generated function
 *   _stride         number of elements consumed per vector step
 *   itype           element type of x
 *   vec             vector type suffix (BMAS_<vec>, BMAS_<vec>_load/_make)
 *   init_index_fn   BMAS_vector_<..>: builds the initial accumulator from
 *                   the initial value vector
 *   init_fn         BMAS_vector_<..>: produces the initial value vector
 *   vreduce_fn      BMAS_vector_<..>(acc, v, idx, char): folds one vector,
 *                   whose first element has index idx, into the accumulator
 *   hreduce_fn      BMAS_vector_<..>(acc, char): collapses the accumulator
 *                   into a struct BMAS_ipair_<itype> {idx, value}
 *   reduce_fn_char  character forwarded to vreduce_fn / hreduce_fn
 *   sreduce_fn      BMAS_scalar_<..>(candidate, best): non-zero when the
 *                   candidate should replace the current best
 *
 * Flow: full vectors first (contiguous load when incx == 1, gather via
 * _make otherwise), then a horizontal reduction, then a scalar loop over the
 * remaining n % _stride elements.
 */
#define one_arg_ireduce_fn_body(                                        \
    name, _stride, itype, vec,                                          \
    init_index_fn, init_fn,                                             \
    vreduce_fn, hreduce_fn, reduce_fn_char, sreduce_fn)                 \
                                                                        \
  long BMAS_##name(const long n, itype* x, const long incx){            \
    const int lanes = _stride;                                          \
    /* number of elements covered by whole vectors */                   \
    const long vec_count = (n/lanes)*lanes;                             \
    itype* const stop = x + incx * n;                                   \
    long pos = 0;                                                       \
    BMAS_##vec chunk;                                                   \
    struct BMAS_ipair_##vec best                                        \
      = BMAS_vector_##init_index_fn(BMAS_vector_##init_fn());           \
    if (incx == 1){                                                     \
      for(; pos != vec_count; pos += lanes, x += lanes){                \
        chunk = BMAS_##vec##_load(x);                                   \
        best = BMAS_vector_##vreduce_fn(best, chunk, pos, reduce_fn_char); \
      }                                                                 \
    }else{                                                              \
      for(; pos != vec_count; pos += lanes, x += lanes * incx){         \
        chunk = BMAS_##vec##_make(x, incx, sizeof(itype));              \
        best = BMAS_vector_##vreduce_fn(best, chunk, pos, reduce_fn_char); \
      }                                                                 \
    }                                                                   \
    struct BMAS_ipair_##itype winner                                    \
      = BMAS_vector_##hreduce_fn(best, reduce_fn_char);                 \
    /* scalar tail: elements that did not fill a whole vector */        \
    for(; x != stop; x += incx, pos += 1){                              \
      if (BMAS_scalar_##sreduce_fn(x[0], winner.value)){                \
        winner.idx   = pos;                                             \
        winner.value = x[0];                                            \
      }                                                                 \
    }                                                                   \
    return winner.idx;                                                  \
  }
11 changes: 11 additions & 0 deletions bmas/scalar.h
@@ -1,6 +1,17 @@
#include <math.h>
#include "sleefinline_purec_scalar.h"

/*
 * Scalar (index, value) pairs, one per supported element type.
 * `idx` is the position of an element and `value` is the element itself;
 * the index-reduction kernels use these as their scalar running result.
 * Field order (idx first, value second) is part of the layout -- keep it.
 */

/* Floating-point element types. */
struct BMAS_ipair_float {
  long  idx;
  float value;
};
struct BMAS_ipair_double {
  long   idx;
  double value;
};

/* Signed integer element types. */
struct BMAS_ipair_int64_t {
  long    idx;
  int64_t value;
};
struct BMAS_ipair_int32_t {
  long    idx;
  int32_t value;
};
struct BMAS_ipair_int16_t {
  long    idx;
  int16_t value;
};
struct BMAS_ipair_int8_t {
  long   idx;
  int8_t value;
};

/* Unsigned integer element types. */
struct BMAS_ipair_uint64_t {
  long     idx;
  uint64_t value;
};
struct BMAS_ipair_uint32_t {
  long     idx;
  uint32_t value;
};
struct BMAS_ipair_uint16_t {
  long     idx;
  uint16_t value;
};
struct BMAS_ipair_uint8_t {
  long    idx;
  uint8_t value;
};

/* Single-precision scalar addition: returns a + b.
 * Storage-class specifier placed first (the `float static inline` ordering is
 * an obsolescent form in ISO C). */
static inline float BMAS_scalar_sadd(float a, float b){return a+b;}
/* Single-precision scalar subtraction: returns a - b.
 * Storage-class specifier placed first (the `float static inline` ordering is
 * an obsolescent form in ISO C). */
static inline float BMAS_scalar_ssub(float a, float b){return a-b;}
/* Single-precision scalar multiplication: returns a * b.
 * Storage-class specifier placed first (the `float static inline` ordering is
 * an obsolescent form in ISO C). */
static inline float BMAS_scalar_smul(float a, float b){return a*b;}
Expand Down

0 comments on commit 542b93a

Please sign in to comment.