Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix & fuzz ZSTD_generateSequences #3981

Merged
merged 1 commit into from Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG
Expand Up @@ -33,6 +33,7 @@ port: sparc64 support validation in CI, by @Cyan4973
port: AIX compatibility, by @likema
port: HP-UX compatibility, by @likema
doc: Improved specification accuracy, by @elasota
bug: Fix and deprecate ZSTD_generateSequences (#3981)

v1.5.5 (Apr 2023)
fix: fix rare corruption bug affecting the high compression mode, reported by @danlark1 (#3517, @terrelln)
Expand Down
127 changes: 85 additions & 42 deletions lib/compress/zstd_compress.c
Expand Up @@ -3361,29 +3361,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
return ZSTDbss_compress;
}

static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
{
const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
const seqDef* seqStoreSeqs = seqStore->sequencesStart;
size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
size_t literalsRead = 0;
size_t lastLLSize;
const seqDef* inSeqs = seqStore->sequencesStart;
const size_t nbInSequences = seqStore->sequences - inSeqs;
const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);

ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
const size_t nbOutSequences = nbInSequences + 1;
size_t nbOutLiterals = 0;
repcodes_t repcodes;
size_t i;
repcodes_t updatedRepcodes;

assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
/* Ensure we have enough space for last literals "sequence" */
assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
for (i = 0; i < seqStoreSeqSize; ++i) {
U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
outSeqs[i].litLength = seqStoreSeqs[i].litLength;
outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
/* Bounds check that we have enough space for every input sequence
* and the block delimiter
*/
assert(seqCollector->seqIndex <= seqCollector->maxSequences);
RETURN_ERROR_IF(
nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
dstSize_tooSmall,
"Not enough space to copy sequences");

ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
for (i = 0; i < nbInSequences; ++i) {
U32 rawOffset;
outSeqs[i].litLength = inSeqs[i].litLength;
outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
outSeqs[i].rep = 0;

/* Handle the possible single length >= 64K
* There can only be one because we add MINMATCH to every match length,
* and blocks are at most 128K.
*/
if (i == seqStore->longLengthPos) {
if (seqStore->longLengthType == ZSTD_llt_literalLength) {
outSeqs[i].litLength += 0x10000;
Expand All @@ -3392,41 +3401,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
}
}

if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
/* Derive the correct offset corresponding to a repcode */
outSeqs[i].rep = seqStoreSeqs[i].offBase;
/* Determine the raw offset given the offBase, which may be a repcode. */
if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
assert(repcode > 0);
outSeqs[i].rep = repcode;
if (outSeqs[i].litLength != 0) {
rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
rawOffset = repcodes.rep[repcode - 1];
} else {
if (outSeqs[i].rep == 3) {
rawOffset = updatedRepcodes.rep[0] - 1;
if (repcode == 3) {
assert(repcodes.rep[0] > 1);
rawOffset = repcodes.rep[0] - 1;
} else {
rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
rawOffset = repcodes.rep[repcode];
}
}
} else {
rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
}
outSeqs[i].offset = rawOffset;
/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
so we provide seqStoreSeqs[i].offset - 1 */
ZSTD_updateRep(updatedRepcodes.rep,
seqStoreSeqs[i].offBase,
seqStoreSeqs[i].litLength == 0);
literalsRead += outSeqs[i].litLength;

/* Update repcode history for the sequence */
ZSTD_updateRep(repcodes.rep,
inSeqs[i].offBase,
inSeqs[i].litLength == 0);

nbOutLiterals += outSeqs[i].litLength;
}
/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
* If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
* for the block boundary, according to the API.
*/
assert(seqStoreLiteralsSize >= literalsRead);
lastLLSize = seqStoreLiteralsSize - literalsRead;
outSeqs[i].litLength = (U32)lastLLSize;
outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
seqStoreSeqSize++;
zc->seqCollector.seqIndex += seqStoreSeqSize;
assert(nbInLiterals >= nbOutLiterals);
{
const size_t lastLLSize = nbInLiterals - nbOutLiterals;
outSeqs[nbInSequences].litLength = (U32)lastLLSize;
outSeqs[nbInSequences].matchLength = 0;
outSeqs[nbInSequences].offset = 0;
assert(nbOutSequences == nbInSequences + 1);
}
seqCollector->seqIndex += nbOutSequences;
assert(seqCollector->seqIndex <= seqCollector->maxSequences);

return 0;
}

size_t ZSTD_sequenceBound(size_t srcSize) {
return (srcSize / ZSTD_MINMATCH_MIN) + 1;
const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
return maxNbSeq + maxNbDelims;
}

size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
Expand All @@ -3435,6 +3458,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
const size_t dstCapacity = ZSTD_compressBound(srcSize);
void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
SeqCollector seqCollector;
{
int targetCBlockSize;
FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
}
{
int nbWorkers;
FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
}

RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");

Expand All @@ -3444,8 +3477,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
seqCollector.maxSequences = outSeqsSize;
zc->seqCollector = seqCollector;

ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
ZSTD_customFree(dst, ZSTD_defaultCMem);
{
const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
ZSTD_customFree(dst, ZSTD_defaultCMem);
FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
}
assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
return zc->seqCollector.seqIndex;
}

Expand Down Expand Up @@ -4038,8 +4075,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
cSeqsSize = 1;
}

/* Sequence collection not supported when block splitting */
if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}
Expand Down Expand Up @@ -4261,6 +4299,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
if (bss == ZSTDbss_noCompress) {
if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
Expand Down Expand Up @@ -4293,11 +4332,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,

{ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
if (bss == ZSTDbss_noCompress) {
RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
cSize = 0;
goto out;
}
}

if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions lib/compress/zstdmt_compress.c
Expand Up @@ -121,7 +121,7 @@ static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)

static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
{
ZSTDMT_bufferPool* const bufPool =
ZSTDMT_bufferPool* const bufPool =
(ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem);
if (bufPool==NULL) return NULL;
if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
Expand Down Expand Up @@ -380,7 +380,7 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
ZSTD_customMem cMem)
{
ZSTDMT_CCtxPool* const cctxPool =
ZSTDMT_CCtxPool* const cctxPool =
(ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem);
assert(nbWorkers > 0);
if (!cctxPool) return NULL;
Expand Down
33 changes: 23 additions & 10 deletions lib/zstd.h
Expand Up @@ -1538,25 +1538,38 @@ typedef enum {
ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);

/*! ZSTD_generateSequences() :
* WARNING: This function is meant for debugging and informational purposes ONLY!
* Its implementation is flawed, and it will be deleted in a future version.
* It is not guaranteed to succeed, as there are several cases where it will give
* up and fail. You should NOT use this function in production code.
*
* This function is deprecated, and will be removed in a future version.
*
* Generate sequences using ZSTD_compress2(), given a source buffer.
*
* @param zc The compression context to be used for ZSTD_compress2(). Set any
* compression parameters you need on this context.
* @param outSeqs The output sequences buffer of size @p outSeqsSize
* @param outSeqsSize The size of the output sequences buffer.
* ZSTD_sequenceBound(srcSize) is an upper bound on the number
* of sequences that can be generated.
* @param src The source buffer to generate sequences from of size @p srcSize.
* @param srcSize The size of the source buffer.
*
* Each block will end with a dummy sequence
* with offset == 0, matchLength == 0, and litLength == length of last literals.
* litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
* simply acts as a block delimiter.
*
* @zc can be used to insert custom compression params.
* This function invokes ZSTD_compress2().
*
* The output of this function can be fed into ZSTD_compressSequences() with CCtx
* setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
* @return : number of sequences generated
* @returns The number of sequences generated, necessarily less than
* ZSTD_sequenceBound(srcSize), or an error code that can be checked
* with ZSTD_isError().
*/

ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
ZSTDLIB_STATIC_API size_t
ZSTD_generateSequences( ZSTD_CCtx* zc,
ZSTD_Sequence* outSeqs, size_t outSeqsSize,
const void* src, size_t srcSize);
ZSTD_generateSequences(ZSTD_CCtx* zc,
ZSTD_Sequence* outSeqs, size_t outSeqsSize,
const void* src, size_t srcSize);

/*! ZSTD_mergeBlockDelimiters() :
* Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
Expand Down
8 changes: 6 additions & 2 deletions tests/fuzz/Makefile
Expand Up @@ -125,7 +125,8 @@ FUZZ_TARGETS := \
seekable_roundtrip \
huf_round_trip \
huf_decompress \
decompress_cross_format
decompress_cross_format \
generate_sequences

all: libregression.a $(FUZZ_TARGETS)

Expand Down Expand Up @@ -239,10 +240,13 @@ huf_round_trip: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_huf_round_trip.o

huf_decompress: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o $(LIB_FUZZING_ENGINE) -o $@

decompress_cross_format: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o $(LIB_FUZZING_ENGINE) -o $@

generate_sequences: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o $(LIB_FUZZING_ENGINE) -o $@

libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c d_fuzz_regression_driver.o
$(AR) $(FUZZ_ARFLAGS) $@ d_fuzz_regression_driver.o

Expand Down
1 change: 1 addition & 0 deletions tests/fuzz/fuzz.py
Expand Up @@ -66,6 +66,7 @@ def __init__(self, input_type, frame_type=FrameType.ZSTD):
'huf_round_trip': TargetInfo(InputType.RAW_DATA),
'huf_decompress': TargetInfo(InputType.RAW_DATA),
'decompress_cross_format': TargetInfo(InputType.RAW_DATA),
'generate_sequences': TargetInfo(InputType.RAW_DATA),
}
TARGETS = list(TARGET_INFO.keys())
ALL_TARGETS = TARGETS + ['all']
Expand Down
88 changes: 88 additions & 0 deletions tests/fuzz/generate_sequences.c
@@ -0,0 +1,88 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#define ZSTD_STATIC_LINKING_ONLY

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#include "fuzz_data_producer.h"
#include "fuzz_helpers.h"
#include "zstd_helpers.h"

/**
* This fuzz target ensures that ZSTD_generateSequences() does not crash and
* if it succeeds that ZSTD_compressSequences() round trips.
*/

static void testRoundTrip(ZSTD_CCtx* cctx, ZSTD_Sequence const* seqs, size_t nbSeqs, const void* src, size_t srcSize) {
/* Compress the sequences with block delimiters */
const size_t compressBound = ZSTD_compressBound(srcSize);
void* dst = FUZZ_malloc(compressBound);
FUZZ_ASSERT(dst);

size_t compressedSize = ZSTD_compressSequences(cctx, dst, compressBound, seqs, nbSeqs, src, srcSize);
FUZZ_ZASSERT(compressedSize);

void* decompressed = FUZZ_malloc(srcSize);
FUZZ_ASSERT(srcSize == 0 || decompressed);
size_t decompressedSize = ZSTD_decompress(decompressed, srcSize, dst, compressedSize);
FUZZ_ZASSERT(decompressedSize);
FUZZ_ASSERT(decompressedSize == srcSize);
if (srcSize != 0) {
FUZZ_ASSERT(!memcmp(src, decompressed, srcSize));
}

free(decompressed);
free(dst);
}

int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {

FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(data, size);
size = FUZZ_dataProducer_reserveDataPrefix(producer);

ZSTD_CCtx* cctx = ZSTD_createCCtx();
FUZZ_ASSERT(cctx);

const size_t seqsCapacity = FUZZ_dataProducer_uint32Range(producer, 0, 2 * ZSTD_sequenceBound(size));
ZSTD_Sequence* seqs = (ZSTD_Sequence*)FUZZ_malloc(sizeof(ZSTD_Sequence) * seqsCapacity);
FUZZ_ASSERT(seqsCapacity == 0 || seqs);

FUZZ_setRandomParameters(cctx, size, producer);
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 0));
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0));

const size_t nbSeqs = ZSTD_generateSequences(cctx, seqs, seqsCapacity, data, size);
if (ZSTD_isError(nbSeqs)) {
/* Allowed to error if the destination is too small */
if (ZSTD_getErrorCode(nbSeqs) == ZSTD_error_dstSize_tooSmall) {
FUZZ_ASSERT(seqsCapacity < ZSTD_sequenceBound(size));
}
} else {
/* Ensure we round trip with and without block delimiters*/

FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters));
testRoundTrip(cctx, seqs, nbSeqs, data, size);

const size_t nbMergedSeqs = ZSTD_mergeBlockDelimiters(seqs, nbSeqs);
FUZZ_ASSERT(nbMergedSeqs <= nbSeqs);
FUZZ_ZASSERT(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only));
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_noBlockDelimiters));
testRoundTrip(cctx, seqs, nbMergedSeqs, data, size);
}

free(seqs);
ZSTD_freeCCtx(cctx);
FUZZ_dataProducer_free(producer);
return 0;
}