Skip to content

Commit

Permalink
Reduce size of dctx by reutilizing dst buffer (#2751)
Browse files Browse the repository at this point in the history
* Reduce size of dctx by reutilizing dst buffer

Co-authored-by: Binh Vo <binhvo@fb.com>
  • Loading branch information
binhdvo and Binh Vo committed Oct 25, 2021
1 parent 0a794f5 commit 6a7ede3
Show file tree
Hide file tree
Showing 7 changed files with 663 additions and 103 deletions.
6 changes: 6 additions & 0 deletions lib/README.md
Expand Up @@ -155,6 +155,12 @@ The file structure is designed to make this selection manually achievable for an
- The build macro `ZSTD_NO_INTRINSICS` can be defined to disable all explicit intrinsics.
Compiler builtins are still used.

- The build macro `ZSTD_LITBUFFEREXTRASIZE` can be set to control the amount of extra memory used
during decompression to store literals. It defaults to 64kB. Reducing it lowers the memory
footprint required for decompression, because a larger portion of the literal buffer is then
stored in the unwritten portion of the dst buffer, at the cost of reduced decompression
performance.


#### Windows : using MinGW+MSYS to create DLL

Expand Down
4 changes: 1 addition & 3 deletions lib/common/zstd_internal.h
Expand Up @@ -181,7 +181,7 @@ static void ZSTD_copy16(void* dst, const void* src) {
#if defined(ZSTD_ARCH_ARM_NEON)
vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
#else
ZSTD_memcpy(dst, src, 16);
ZSTD_memmove(dst, src, 16);
#endif
}
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
Expand Down Expand Up @@ -210,8 +210,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
BYTE* op = (BYTE*)dst;
BYTE* const oend = op + length;

assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN));

if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
/* Handle short offset copies. */
do {
Expand Down
7 changes: 4 additions & 3 deletions lib/decompress/zstd_decompress.c
Expand Up @@ -916,7 +916,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
switch(blockProperties.blockType)
{
case bt_compressed:
decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1);
decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming);
break;
case bt_raw :
decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize);
Expand Down Expand Up @@ -1229,7 +1229,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
{
case bt_compressed:
DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1);
rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);
dctx->expected = 0; /* Streaming not supported */
break;
case bt_raw :
Expand Down Expand Up @@ -1824,7 +1824,8 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
{
size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2);
/* Space is needed to store the litBuffer after the output of a given block without overwriting the extDict of a previous run, as well as to cover both windows against wildcopy. */
unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
size_t const minRBSize = (size_t) neededSize;
RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
Expand Down
720 changes: 628 additions & 92 deletions lib/decompress/zstd_decompress_block.c

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion lib/decompress/zstd_decompress_block.h
Expand Up @@ -33,6 +33,12 @@
*/


/* streaming_operation :
 * Flag passed as the last argument of ZSTD_decompressBlock_internal().
 * The streaming state is used to inform allocation of the literal buffer.
 */
typedef enum {
not_streaming = 0, /* value passed by the bt_compressed case reached from ZSTD_decompressFrame() */
is_streaming = 1 /* value passed by the bt_compressed case of ZSTD_decompressContinue() */
} streaming_operation;

/* ZSTD_decompressBlock_internal() :
* decompress block, starting at `src`,
* into destination buffer `dst`.
Expand All @@ -41,7 +47,7 @@
*/
size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize, const int frame);
const void* src, size_t srcSize, const int frame, const streaming_operation streaming);

/* ZSTD_buildFSETable() :
* generate FSE decoding table for one symbol (ll, ml or off)
Expand Down
15 changes: 14 additions & 1 deletion lib/decompress/zstd_decompress_internal.h
Expand Up @@ -106,6 +106,16 @@ typedef struct {
size_t ddictPtrCount;
} ZSTD_DDictHashSet;

/* ZSTD_LITBUFFEREXTRASIZE :
 * Build-time tunable : the default below only applies when the macro
 * has not already been defined (e.g. on the compiler command line).
 * It sizes the litExtraBuffer scratch array of ZSTD_DCtx_s
 * (ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH bytes).
 * Default : 1 << 16 (64 KB).
 */
#ifndef ZSTD_LITBUFFEREXTRASIZE
#define ZSTD_LITBUFFEREXTRASIZE (1 << 16) /* extra buffer reduces amount of dst required to store litBuffer */
#endif

/* ZSTD_litLocation_e :
 * Describes where the literal buffer is stored.
 * This is the type of the litBufferLocation field of ZSTD_DCtx_s;
 * litExtraBuffer is the scratch array declared in that same struct.
 */
typedef enum {
ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */
ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */
ZSTD_split = 2 /* Split between litExtraBuffer and dst */
} ZSTD_litLocation_e;

struct ZSTD_DCtx_s
{
const ZSTD_seqSymbol* LLTptr;
Expand Down Expand Up @@ -171,7 +181,10 @@ struct ZSTD_DCtx_s
ZSTD_outBuffer expectedOutBuffer;

/* workspace */
BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
BYTE* litBuffer;
const BYTE* litBufferEnd;
ZSTD_litLocation_e litBufferLocation;
BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */
BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];

size_t oversizedDuration;
Expand Down
6 changes: 3 additions & 3 deletions tests/fullbench.c
Expand Up @@ -123,11 +123,11 @@ static size_t local_ZSTD_decompress(const void* src, size_t srcSize,
static ZSTD_DCtx* g_zdc = NULL;

#ifndef ZSTD_DLL_IMPORT
extern size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
extern size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* ctx, const void* src, size_t srcSize, void* dst, size_t dstCapacity);
static size_t local_ZSTD_decodeLiteralsBlock(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
{
(void)src; (void)srcSize; (void)dst; (void)dstSize;
return ZSTD_decodeLiteralsBlock(g_zdc, buff2, g_cSize);
return ZSTD_decodeLiteralsBlock(g_zdc, buff2, g_cSize, dst, dstSize);
}

static size_t local_ZSTD_decodeSeqHeaders(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
Expand Down Expand Up @@ -577,7 +577,7 @@ static int benchMem(unsigned benchNb,
ip += ZSTD_blockHeaderSize; /* skip block header */
ZSTD_decompressBegin(g_zdc);
CONTROL(iend > ip);
ip += ZSTD_decodeLiteralsBlock(g_zdc, ip, (size_t)(iend-ip)); /* skip literal segment */
ip += ZSTD_decodeLiteralsBlock(g_zdc, ip, (size_t)(iend-ip), dstBuff, dstBuffSize); /* skip literal segment */
g_cSize = (size_t)(iend-ip);
memcpy(dstBuff2, ip, g_cSize); /* copy rest of block (it starts by SeqHeader) */
srcSize = srcSize > 128 KB ? 128 KB : srcSize; /* speed relative to block */
Expand Down

0 comments on commit 6a7ede3

Please sign in to comment.