VECT: Apply COND_LEN_* into vectorizable_operation
Hi Richard and Richi.
As we discussed before, COND_LEN_* patterns were added for multiple situations.
This patch applies COND_LEN_* to the following situation:

Support the situation in "vectorizable_operation" where:
  /* If operating on inactive elements could generate spurious traps,
     we need to restrict the operation to active lanes.  Note that this
     specifically doesn't apply to unhoisted invariants, since they
     operate on the same value for every lane.

     Similarly, if this operation is part of a reduction, a fully-masked
     loop should only change the active lanes of the reduction chain,
     keeping the inactive lanes as-is.  */
  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
			    || reduc_idx >= 0);

That is, handle the case where "mask_out_inactive" is true under length-based loop control.
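
For example (an illustrative sketch, not from the patch): if the scalar
loop below is vectorized with 4-element vectors and n == 3, a plain
full-width vector division would also divide by the tail lane b[3],
which holds an arbitrary value and may be zero:

   void vdiv (int *dst, int *a, int *b, int n)
   {
     for (int i = 0; i < n; i++)
       dst[i] = a[i] / b[i];   /* traps if b[i] == 0 */
   }

A COND_LEN_DIV restricted to the LEN active lanes cannot take such a
spurious trap.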

So we can now handle the following two cases:

1. Integer division/remainder:

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] % b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(int8_t)	\
   TEST_ALL()

With this patch:

  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

2. Floating-point arithmetic **WITHOUT** -ffast-math

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] + b[i];				\
   }
   #define TEST_ALL()	\
   TEST_TYPE(float)	\
   TEST_ALL()

With this patch:

  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

With this patch, operations for which "mask_out_inactive" is true are guaranteed not to trap on inactive elements.
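
Conceptually, the length-controlled loop above behaves like the scalar
code below (an illustrative model only; .SELECT_VL may choose any legal
length up to the vectorization factor VF, MIN is just the simplest
choice):

   for (int i = 0; i < n; )
     {
       int len = MIN (n - i, VF);       /* .SELECT_VL                */
       for (int j = 0; j < len; j++)    /* .COND_LEN_*, .LEN_MASK_*  */
         dst[i + j] = a[i + j] + b[i + j];
       i += len;                        /* lanes >= len are untouched */
     }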

gcc/ChangeLog:

	* internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
	(CASE): Ditto.
	(get_conditional_len_internal_fn): New function.
	* internal-fn.h (get_conditional_len_internal_fn): Ditto.
	* tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_*
	support.
zhongjuzhe authored and Incarnation-p-lee committed Jul 12, 2023
1 parent 13c3e29 commit 0d4dd7e
Showing 3 changed files with 91 additions and 29 deletions.
71 changes: 53 additions & 18 deletions gcc/internal-fn.cc
@@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
   0
 };
 
-/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
-   tree code CODE.  */
+/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
+   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
+   for each such IFN_COND_##SUFFIX.  */
 #define FOR_EACH_CODE_MAPPING(T) \
-  T (PLUS_EXPR, IFN_COND_ADD) \
-  T (MINUS_EXPR, IFN_COND_SUB) \
-  T (MULT_EXPR, IFN_COND_MUL) \
-  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
-  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
-  T (RDIV_EXPR, IFN_COND_RDIV) \
-  T (MIN_EXPR, IFN_COND_MIN) \
-  T (MAX_EXPR, IFN_COND_MAX) \
-  T (BIT_AND_EXPR, IFN_COND_AND) \
-  T (BIT_IOR_EXPR, IFN_COND_IOR) \
-  T (BIT_XOR_EXPR, IFN_COND_XOR) \
-  T (LSHIFT_EXPR, IFN_COND_SHL) \
-  T (RSHIFT_EXPR, IFN_COND_SHR) \
-  T (NEGATE_EXPR, IFN_COND_NEG)
+  T (PLUS_EXPR, ADD) \
+  T (MINUS_EXPR, SUB) \
+  T (MULT_EXPR, MUL) \
+  T (TRUNC_DIV_EXPR, DIV) \
+  T (TRUNC_MOD_EXPR, MOD) \
+  T (RDIV_EXPR, RDIV) \
+  T (MIN_EXPR, MIN) \
+  T (MAX_EXPR, MAX) \
+  T (BIT_AND_EXPR, AND) \
+  T (BIT_IOR_EXPR, IOR) \
+  T (BIT_XOR_EXPR, XOR) \
+  T (LSHIFT_EXPR, SHL) \
+  T (RSHIFT_EXPR, SHR) \
+  T (NEGATE_EXPR, NEG)
 
 /* Return a function that only performs CODE when a certain condition is met
    and that uses a given fallback value otherwise.  For example, if CODE is
@@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
 {
   switch (code)
     {
-#define CASE(CODE, IFN) case CODE: return IFN;
+#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4329,14 +4330,48 @@ conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
     {
-#define CASE(CODE, IFN) case IFN: return CODE;
+#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
       return ERROR_MARK;
     }
 }
 
+/* Like get_conditional_internal_fn, but return a function that
+   additionally restricts the operation to the leading elements
+   of a vector.  The number of elements to process is given by a length
+   and bias pair, as for IFN_LOAD_LEN.  The values of the remaining
+   elements are taken from the fallback ("else") argument.
+
+   For example, if CODE is a binary operation associated with FN:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C code:
+
+     for (int i = 0; i < NUNITS; i++)
+      {
+        if (i < LEN + BIAS && COND[i])
+          LHS[i] = A[i] CODE B[i];
+        else
+          LHS[i] = ELSE[i];
+      }
+*/
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
+      FOR_EACH_CODE_MAPPING(CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
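
Because the mapping table now stores bare suffixes, one table serves
both IFN families. As an illustrative expansion (not part of the diff),
the first row under get_conditional_len_internal_fn becomes:

   /* T (PLUS_EXPR, ADD) run through
      #define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
      expands to:  */
   case PLUS_EXPR: return IFN_COND_LEN_ADD;
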
1 change: 1 addition & 0 deletions gcc/internal-fn.h
@@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
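
A caller can query the new mapping just like the existing one; a minimal
illustrative use (not part of the diff):

   /* IFN_LAST signals that no COND_LEN_* form exists for this code.  */
   internal_fn ifn = get_conditional_len_internal_fn (TRUNC_MOD_EXPR);
   gcc_assert (ifn == IFN_COND_LEN_MOD);
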
48 changes: 37 additions & 11 deletions gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6730,19 +6732,24 @@
       && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && mask_out_inactive)
     {
-      if (cond_fn == IFN_LAST
-          || !direct_internal_fn_supported_p (cond_fn, vectype,
-                                              OPTIMIZE_FOR_SPEED))
+      if (cond_fn != IFN_LAST
+          && direct_internal_fn_supported_p (cond_fn, vectype,
+                                             OPTIMIZE_FOR_SPEED))
+        vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+                               vectype, NULL);
+      else if (cond_len_fn != IFN_LAST
+               && direct_internal_fn_supported_p (cond_len_fn, vectype,
+                                                  OPTIMIZE_FOR_SPEED))
+        vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
+                              1);
+      else
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                              "can't use a fully-masked loop because no"
                              " conditional operation is available.\n");
           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
         }
-      else
-        vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-                               vectype, NULL);
     }
 
   /* Put types on constant and invariant SLP children.  */
@@ -6805,6 +6812,7 @@
                          "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,11 +6979,16 @@
           gimple_assign_set_lhs (new_stmt, new_temp);
           vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
         }
-      else if (masked_loop_p && mask_out_inactive)
+      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
         {
-          tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-                                          vec_num * ncopies, vectype, i);
-          auto_vec<tree> vops (5);
+          tree mask;
+          if (masked_loop_p)
+            mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+                                       vec_num * ncopies, vectype, i);
+          else
+            /* Dummy mask.  */
+            mask = build_minus_one_cst (truth_type_for (vectype));
+          auto_vec<tree> vops (6);
           vops.quick_push (mask);
           vops.quick_push (vop0);
           if (vop1)
@@ -6995,7 +7008,20 @@
                 (cond_fn, vectype, vops.length () - 1, &vops[1]);
               vops.quick_push (else_value);
             }
-          gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
+          if (len_loop_p)
+            {
+              tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+                                            vec_num * ncopies, vectype, i, 1);
+              signed char biasval
+                = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+              tree bias = build_int_cst (intQI_type_node, biasval);
+              vops.quick_push (len);
+              vops.quick_push (bias);
+            }
+          gcall *call
+            = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
+                                                            : cond_len_fn,
+                                              vops);
           new_temp = make_ssa_name (vec_dest, call);
           gimple_call_set_lhs (call, new_temp);
           gimple_call_set_nothrow (call, true);
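
For a length-controlled loop the call built above thus carries the extra
length and bias operands, and the mask degenerates to all-ones because
the length already limits the active lanes. Illustrative operand order
for a binary operation:

   /* vops = { mask, op0, op1, else_value, len, bias }, printed as e.g.:
        lhs = .COND_LEN_ADD ({ -1, ... }, op0, op1, else_value, len, 0);  */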
