Skip to content

Commit

Permalink
Reduce function size in fast & dfast
Browse files Browse the repository at this point in the history
Take the same approach as in PR #2828 [0] to remove functions that
force-inline many function bodies behind a `switch`. Instead, create one function per
"template" combination, and then switch between these functions. This
allows the compiler to break the large function into many small
functions, which generally helps codegen.

Also, in the `extDict` modes when there is no ext-dict, call the
top-level function instead of the force-inlined one, to save on code size.

I'm specifically doing this because gcc on the parisc architecture doesn't
handle the large function body well, and ends up using a lot of excess
stack space. Outlining these functions fixes it.
  • Loading branch information
terrelln committed Nov 16, 2021
1 parent ddae153 commit 908951a
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 27 deletions.
48 changes: 35 additions & 13 deletions lib/compress/zstd_double_fast.c
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
return (size_t)(iend - anchor);
}

/* ZSTD_GEN_FN(dictMode, mls) :
 * Defines a static function named
 * ZSTD_compressBlock_doubleFast_<dictMode>_<mls>() that takes
 * (ms, seqStore, rep, src, srcSize) and forwards them to
 * ZSTD_compressBlock_doubleFast_<dictMode>_generic(), passing the `mls`
 * token as the final argument.
 * Each invocation therefore produces one standalone wrapper per
 * (dictMode, mls) combination, which the dispatching functions can call
 * instead of invoking the _generic function directly.
 */
#define ZSTD_GEN_FN(dictMode, mls) \
static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
void const* src, size_t srcSize) \
{ \
return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
}

/* Wrappers around ZSTD_compressBlock_doubleFast_noDict_generic(), mls = 4..7 */
ZSTD_GEN_FN(noDict, 4)
ZSTD_GEN_FN(noDict, 5)
ZSTD_GEN_FN(noDict, 6)
ZSTD_GEN_FN(noDict, 7)

/* Wrappers around ZSTD_compressBlock_doubleFast_dictMatchState_generic(), mls = 4..7 */
ZSTD_GEN_FN(dictMatchState, 4)
ZSTD_GEN_FN(dictMatchState, 5)
ZSTD_GEN_FN(dictMatchState, 6)
ZSTD_GEN_FN(dictMatchState, 7)


size_t ZSTD_compressBlock_doubleFast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
Expand All @@ -478,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
}
}

Expand All @@ -498,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
}
}

Expand Down Expand Up @@ -540,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(

/* if extDict is invalidated due to maxDistance, switch to "regular" variant */
if (prefixStartIndex == dictStartIndex)
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, mls);
return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);

/* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */
Expand Down Expand Up @@ -653,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
return (size_t)(iend - anchor);
}

/* Generates ZSTD_compressBlock_doubleFast_extDict_{4,5,6,7}() :
 * one static wrapper per mls value, each forwarding to
 * ZSTD_compressBlock_doubleFast_extDict_generic(). */
ZSTD_GEN_FN(extDict, 4)
ZSTD_GEN_FN(extDict, 5)
ZSTD_GEN_FN(extDict, 6)
ZSTD_GEN_FN(extDict, 7)

size_t ZSTD_compressBlock_doubleFast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
Expand All @@ -663,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
}
}
50 changes: 36 additions & 14 deletions lib/compress/zstd_fast.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
* This is also the work we do at the beginning to enter the loop initially.
*/
FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_fast_generic(
ZSTD_compressBlock_fast_noDict_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize,
U32 const mls)
Expand Down Expand Up @@ -310,6 +310,18 @@ ZSTD_compressBlock_fast_generic(
goto _start;
}

/* ZSTD_GEN_FN(dictMode, mls) :
 * Defines a static function named
 * ZSTD_compressBlock_fast_<dictMode>_<mls>() that takes
 * (ms, seqStore, rep, src, srcSize) and forwards them to
 * ZSTD_compressBlock_fast_<dictMode>_generic(), passing the `mls`
 * token as the final argument.
 * Each invocation therefore produces one standalone wrapper per
 * (dictMode, mls) combination, which the dispatching functions can call
 * instead of invoking the _generic function directly.
 */
#define ZSTD_GEN_FN(dictMode, mls) \
static size_t ZSTD_compressBlock_fast_##dictMode##_##mls( \
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
void const* src, size_t srcSize) \
{ \
return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
}

/* Wrappers around ZSTD_compressBlock_fast_noDict_generic(), mls = 4..7 */
ZSTD_GEN_FN(noDict, 4)
ZSTD_GEN_FN(noDict, 5)
ZSTD_GEN_FN(noDict, 6)
ZSTD_GEN_FN(noDict, 7)

size_t ZSTD_compressBlock_fast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
Expand All @@ -321,13 +333,13 @@ size_t ZSTD_compressBlock_fast(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_fast_noDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_fast_noDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_fast_noDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_fast_noDict_7(ms, seqStore, rep, src, srcSize);
}
}

Expand Down Expand Up @@ -479,6 +491,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
return (size_t)(iend - anchor);
}


/* Generates ZSTD_compressBlock_fast_dictMatchState_{4,5,6,7}() :
 * one static wrapper per mls value, each forwarding to
 * ZSTD_compressBlock_fast_dictMatchState_generic(). */
ZSTD_GEN_FN(dictMatchState, 4)
ZSTD_GEN_FN(dictMatchState, 5)
ZSTD_GEN_FN(dictMatchState, 6)
ZSTD_GEN_FN(dictMatchState, 7)

size_t ZSTD_compressBlock_fast_dictMatchState(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
Expand All @@ -489,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_fast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_fast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_fast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_fast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
}
}

Expand Down Expand Up @@ -530,7 +548,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(

/* switch to "regular" variant if extDict is invalidated due to maxDistance */
if (prefixStartIndex == dictStartIndex)
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);

/* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */
Expand Down Expand Up @@ -603,6 +621,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
return (size_t)(iend - anchor);
}

/* Generates ZSTD_compressBlock_fast_extDict_{4,5,6,7}() :
 * one static wrapper per mls value, each forwarding to
 * ZSTD_compressBlock_fast_extDict_generic(). */
ZSTD_GEN_FN(extDict, 4)
ZSTD_GEN_FN(extDict, 5)
ZSTD_GEN_FN(extDict, 6)
ZSTD_GEN_FN(extDict, 7)

size_t ZSTD_compressBlock_fast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
Expand All @@ -613,12 +635,12 @@ size_t ZSTD_compressBlock_fast_extDict(
{
default: /* includes case 3 */
case 4 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
return ZSTD_compressBlock_fast_extDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
return ZSTD_compressBlock_fast_extDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
return ZSTD_compressBlock_fast_extDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
return ZSTD_compressBlock_fast_extDict_7(ms, seqStore, rep, src, srcSize);
}
}

0 comments on commit 908951a

Please sign in to comment.