Skip to content

Commit

Permalink
Detect more optimizable cases for initializing arrays with memset, which gives a 5x speedup for make-u8vector
Browse files Browse the repository at this point in the history
  • Loading branch information
feeley committed Oct 22, 2023
1 parent c97a0f4 commit 8aec26d
Showing 1 changed file with 40 additions and 48 deletions.
88 changes: 40 additions & 48 deletions lib/_kernel.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2369,10 +2369,11 @@ if (!___FIXNUMP(result))
___VECTORSET(result,___FIX(i),fill)
#else
___SCMOBJ *body = ___CAST(___SCMOBJ*, ___BODY_AS(result,___tSUBTYPED));
if (fill == 0)
if (___CAST_U8(fill) * (~___CAST(___UWORD,0)/0xff) ==
___CAST(___UWORD,fill)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___SCMOBJ));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(fill), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -2456,10 +2457,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___C f = ___INT(fill);
___C *body = ___CAST(___C*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___C,0)/0xff) ==
f) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___C));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -2543,16 +2545,8 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___U8 f = ___INT(fill);
___U8 *body = ___CAST(___U8*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___U8));
}
else
{
for (i=0; i<n; i++)
body[i] = f;
}
/* it is usually faster to initialize an array with memset */
memset(body, f, n * sizeof (*body));
#endif
}
___RESULT = result;
Expand Down Expand Up @@ -2631,16 +2625,8 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___S8 f = ___INT(fill);
___S8 *body = ___CAST(___S8*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___S8));
}
else
{
for (i=0; i<n; i++)
body[i] = f;
}
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
#endif
}
___RESULT = result;
Expand Down Expand Up @@ -2719,10 +2705,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___U16 f = ___INT(fill);
___U16 *body = ___CAST(___U16*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U16,0)/0xff) ==
___CAST_U16(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___U16));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -2807,10 +2794,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___S16 f = ___INT(fill);
___S16 *body = ___CAST(___S16*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U16,0)/0xff) ==
___CAST_U16(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___S16));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -2895,10 +2883,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___U32 f = ___U32UNBOX(fill);
___U32 *body = ___CAST(___U32*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U32,0)/0xff) ==
___CAST_U32(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___U32));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -2983,10 +2972,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___S32 f = ___S32UNBOX(fill);
___S32 *body = ___CAST(___S32*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U32,0)/0xff) ==
___CAST_U32(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___S32));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -3080,10 +3070,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___U64 f = ___U64UNBOX(fill);
___U64 *body = ___CAST(___U64*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U64,0)/0xff) ==
___CAST_U64(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___U64));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -3177,10 +3168,11 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
#else
___S64 f = ___S64UNBOX(fill);
___S64 *body = ___CAST(___S64*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0)
if (___CAST_U8(f) * (~___CAST(___U64,0)/0xff) ==
___CAST_U64(f)) /* repetition of identical bytes? */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___S64));
/* it is usually faster to initialize an array with memset */
memset(body, ___CAST_U8(f), n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -3267,8 +3259,8 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
___F32 *body = ___CAST(___F32*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0.0 && ___copysign (1.0, f) > 0.0) /* detect positive 0 */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___F32));
/* it is usually faster to initialize an array with memset */
memset(body, 0, n * sizeof (*body));
}
else
{
Expand Down Expand Up @@ -3363,8 +3355,8 @@ if (!___FIXNUMP(result) && fill != ___ABSENT)
___F64 *body = ___CAST(___F64*, ___BODY_AS(result,___tSUBTYPED));
if (f == 0.0 && ___copysign (1.0, f) > 0.0) /* detect positive 0 */
{
/* it is usually faster to zero out an array with memset */
memset(body, 0, n * sizeof (___F64));
/* it is usually faster to initialize an array with memset */
memset(body, 0, n * sizeof (*body));
}
else
{
Expand Down

4 comments on commit 8aec26d

@gambiteer
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! This speeds up initializing a u1-storage-class in SRFI 231, whether it’s initialized with 1 or 0.

@feeley
Copy link
Member Author

@feeley feeley commented on 8aec26d Oct 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the initialization of u8vectors and s8vectors with the C memset function is straightforward. The code also detects when a wider homogeneous vector is initialized with a value whose bytes are all the same. This means that initializing to 0, -1, or 0.0 is done with memset, which can be quite a bit faster than a plain for loop. I wonder if ##bignum.make could similarly be optimized...

@gambiteer
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if ##bignum.make could similarly be optimized...

The answer is "yes" for the "left fill" of the new bignum (which might be the entire bignum if there is no bignum argument passed to copy or to complement).

I wonder if that might speed up small bignum addition, etc.

@gambiteer
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at the code, and I don't know how to set up the memset call in the middle of the body of the bignum.

Please sign in to comment.