diff --git a/CHANGELOG b/CHANGELOG
index 23c0128203..668b45a190 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -32,6 +32,7 @@
 port: sparc64 support validation in CI, by @Cyan4973
 port: AIX compatibility, by @likema
 port: HP-UX compatibility, by @likema
 doc: Improved specification accuracy, by @elasota
+bug: Fix and deprecate ZSTD_generateSequences (#3981)
 v1.5.5 (Apr 2023)
 fix: fix rare corruption bug affecting the high compression mode, reported by @danlark1 (#3517, @terrelln)
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 451f2f91e6..f6a84e6b72 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -3361,29 +3361,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
     return ZSTDbss_compress;
 }
 
-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
 {
-    const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
-    const seqDef* seqStoreSeqs = seqStore->sequencesStart;
-    size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
-    size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
-    size_t literalsRead = 0;
-    size_t lastLLSize;
+    const seqDef* inSeqs = seqStore->sequencesStart;
+    const size_t nbInSequences = seqStore->sequences - inSeqs;
+    const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);
 
-    ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
+    ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
+    const size_t nbOutSequences = nbInSequences + 1;
+    size_t nbOutLiterals = 0;
+    repcodes_t repcodes;
     size_t i;
-    repcodes_t updatedRepcodes;
 
-    assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
-    /* Ensure we have enough space for last literals "sequence" */
-    assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
-    ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (i = 0; i < seqStoreSeqSize; ++i) {
-        U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
-        outSeqs[i].litLength = seqStoreSeqs[i].litLength;
-        outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
+    /* Bounds check that we have enough space for every input sequence
+     * and the block delimiter
+     */
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+    RETURN_ERROR_IF(
+        nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
+        dstSize_tooSmall,
+        "Not enough space to copy sequences");
+
+    ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
+    for (i = 0; i < nbInSequences; ++i) {
+        U32 rawOffset;
+        outSeqs[i].litLength = inSeqs[i].litLength;
+        outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
         outSeqs[i].rep = 0;
+
+        /* Handle the possible single length >= 64K
+         * There can only be one because we add MINMATCH to every match length,
+         * and blocks are at most 128K.
+         */
         if (i == seqStore->longLengthPos) {
             if (seqStore->longLengthType == ZSTD_llt_literalLength) {
                 outSeqs[i].litLength += 0x10000;
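Reviewer note: the next hunk replaces the old "offBase - ZSTD_REP_NUM" arithmetic
with the explicit OFFBASE_IS_REPCODE()/OFFBASE_TO_REPCODE()/OFFBASE_TO_OFFSET()
macros. As a reading aid, here is a self-contained sketch of the repcode
resolution rule the new code implements; the function name and the plain
"history" array are illustrative stand-ins for the patch's repcodes.rep, not
part of the library:

    #include <stdio.h>

    /* Sketch: map a repcode (1..3) plus the 3-entry repcode history to a raw
     * offset, following the rule in the hunk below. */
    static unsigned resolveRepcode(unsigned repcode, unsigned litLength,
                                   const unsigned history[3])
    {
        if (litLength != 0)
            return history[repcode - 1];  /* rep N is the Nth most recent offset */
        /* When litLength == 0, repcode meanings shift by one ... */
        if (repcode == 3)
            return history[0] - 1;        /* ... and rep 3 means history[0] minus one */
        return history[repcode];
    }

    int main(void)
    {
        const unsigned history[3] = { 100, 200, 300 };
        printf("%u %u\n", resolveRepcode(1, 5, history),    /* -> 100 */
                          resolveRepcode(3, 0, history));   /* -> 99  */
        return 0;
    }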
@@ -3392,41 +3401,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
             }
         }
 
-        if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
-            /* Derive the correct offset corresponding to a repcode */
-            outSeqs[i].rep = seqStoreSeqs[i].offBase;
+        /* Determine the raw offset given the offBase, which may be a repcode. */
+        if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
+            const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
+            assert(repcode > 0);
+            outSeqs[i].rep = repcode;
             if (outSeqs[i].litLength != 0) {
-                rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
+                rawOffset = repcodes.rep[repcode - 1];
             } else {
-                if (outSeqs[i].rep == 3) {
-                    rawOffset = updatedRepcodes.rep[0] - 1;
+                if (repcode == 3) {
+                    assert(repcodes.rep[0] > 1);
+                    rawOffset = repcodes.rep[0] - 1;
                 } else {
-                    rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
+                    rawOffset = repcodes.rep[repcode];
                 }
             }
+        } else {
+            rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
         }
         outSeqs[i].offset = rawOffset;
-        /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
-           so we provide seqStoreSeqs[i].offset - 1 */
-        ZSTD_updateRep(updatedRepcodes.rep,
-                       seqStoreSeqs[i].offBase,
-                       seqStoreSeqs[i].litLength == 0);
-        literalsRead += outSeqs[i].litLength;
+
+        /* Update repcode history for the sequence */
+        ZSTD_updateRep(repcodes.rep,
+                       inSeqs[i].offBase,
+                       inSeqs[i].litLength == 0);
+
+        nbOutLiterals += outSeqs[i].litLength;
     }
+
     /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
      * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
      * for the block boundary, according to the API.
      */
-    assert(seqStoreLiteralsSize >= literalsRead);
-    lastLLSize = seqStoreLiteralsSize - literalsRead;
-    outSeqs[i].litLength = (U32)lastLLSize;
-    outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
-    seqStoreSeqSize++;
-    zc->seqCollector.seqIndex += seqStoreSeqSize;
+    assert(nbInLiterals >= nbOutLiterals);
+    {
+        const size_t lastLLSize = nbInLiterals - nbOutLiterals;
+        outSeqs[nbInSequences].litLength = (U32)lastLLSize;
+        outSeqs[nbInSequences].matchLength = 0;
+        outSeqs[nbInSequences].offset = 0;
+        assert(nbOutSequences == nbInSequences + 1);
+    }
+    seqCollector->seqIndex += nbOutSequences;
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+
+    return 0;
 }
 
 size_t ZSTD_sequenceBound(size_t srcSize) {
-    return (srcSize / ZSTD_MINMATCH_MIN) + 1;
+    const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
+    const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
+    return maxNbSeq + maxNbDelims;
 }
 
 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
@@ -3435,6 +3458,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
     const size_t dstCapacity = ZSTD_compressBound(srcSize);
     void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
     SeqCollector seqCollector;
+    {
+        int targetCBlockSize;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
+        RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
+    }
+    {
+        int nbWorkers;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
+        RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
+    }
 
     RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
 
@@ -3444,8 +3477,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
     seqCollector.maxSequences = outSeqsSize;
     zc->seqCollector = seqCollector;
 
-    ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
-    ZSTD_customFree(dst, ZSTD_defaultCMem);
+    {
+        const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+        ZSTD_customFree(dst, ZSTD_defaultCMem);
+        FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
+    }
+    assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
     return zc->seqCollector.seqIndex;
 }
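Reviewer note: ZSTD_generateSequences() now surfaces failures through the usual
size_t error convention (the old code silently discarded the result of
ZSTD_compress2()). A minimal caller sketch, assuming the static-linking setup
the function requires; the helper name collectSequences and its error handling
are mine, not the library's, and the call now triggers a deprecation warning by
design:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <stdio.h>
    #include <stdlib.h>
    #include <zstd.h>

    /* Sketch: generate sequences for src, sized by ZSTD_sequenceBound() so the
     * output buffer is always large enough, then check the error convention. */
    static size_t collectSequences(ZSTD_CCtx* cctx, const void* src, size_t srcSize,
                                   ZSTD_Sequence** seqsOut)
    {
        const size_t capacity = ZSTD_sequenceBound(srcSize);
        ZSTD_Sequence* seqs = (ZSTD_Sequence*)malloc(capacity * sizeof(ZSTD_Sequence));
        if (seqs == NULL) return 0;
        {
            const size_t nbSeqs = ZSTD_generateSequences(cctx, seqs, capacity, src, srcSize);
            if (ZSTD_isError(nbSeqs)) {  /* now possible, e.g. parameter_unsupported above */
                fprintf(stderr, "generateSequences: %s\n", ZSTD_getErrorName(nbSeqs));
                free(seqs);
                return 0;
            }
            *seqsOut = seqs;
            return nbSeqs;
        }
    }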
@@ -4038,8 +4075,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
         cSeqsSize = 1;
     }
 
+    /* Sequence collection not supported when block splitting */
     if (zc->seqCollector.collectSequences) {
-        ZSTD_copyBlockSequences(zc);
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
@@ -4261,6 +4299,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
     if (bss == ZSTDbss_noCompress) {
         if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
             zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+        RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
         cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
         FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
         DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
@@ -4293,11 +4332,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
     {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
         FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
-        if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
+        if (bss == ZSTDbss_noCompress) {
+            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
+            cSize = 0;
+            goto out;
+        }
     }
 
     if (zc->seqCollector.collectSequences) {
-        ZSTD_copyBlockSequences(zc);
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index e86fdb2baf..86ccce3184 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -121,7 +121,7 @@ static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
 static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
 {
-    ZSTDMT_bufferPool* const bufPool = 
+    ZSTDMT_bufferPool* const bufPool =
         (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem);
     if (bufPool==NULL) return NULL;
     if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
@@ -380,7 +380,7 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
 static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_CCtxPool* const cctxPool = 
+    ZSTDMT_CCtxPool* const cctxPool =
         (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem);
     assert(nbWorkers > 0);
     if (!cctxPool) return NULL;
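Reviewer note: the header changes below document the deprecation and the new
error-reporting contract. The deleted doc text still described the main use
case: feeding the output into ZSTD_compressSequences() with
ZSTD_c_blockDelimiters set to ZSTD_sf_explicitBlockDelimiters. A hedged sketch
of that pipeline, reusing cctx/seqs/nbSeqs from the sketch above (the wrapper
name recompress is mine):

    /* Sketch: recompress from generated sequences. The (of: 0, ml: 0) entries
     * emitted per block are explicit delimiters, so announce them. */
    static size_t recompress(ZSTD_CCtx* cctx,
                             const ZSTD_Sequence* seqs, size_t nbSeqs,
                             const void* src, size_t srcSize,
                             void* dst, size_t dstCapacity)
    {
        const size_t err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                                  ZSTD_sf_explicitBlockDelimiters);
        if (ZSTD_isError(err)) return err;
        return ZSTD_compressSequences(cctx, dst, dstCapacity,
                                      seqs, nbSeqs, src, srcSize);
    }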
diff --git a/lib/zstd.h b/lib/zstd.h
index ba611656ff..9b3876126c 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -1538,25 +1538,38 @@ typedef enum {
 ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
 
 /*! ZSTD_generateSequences() :
+ * WARNING: This function is meant for debugging and informational purposes ONLY!
+ * Its implementation is flawed, and it will be deleted in a future version.
+ * It is not guaranteed to succeed, as there are several cases where it will give
+ * up and fail. You should NOT use this function in production code.
+ *
+ * This function is deprecated, and will be removed in a future version.
+ *
 * Generate sequences using ZSTD_compress2(), given a source buffer.
 *
+ * @param zc The compression context to be used for ZSTD_compress2(). Set any
+ *           compression parameters you need on this context.
+ * @param outSeqs The output sequences buffer of size @p outSeqsSize.
+ * @param outSeqsSize The size of the output sequences buffer.
+ *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
+ *                    of sequences that can be generated.
+ * @param src The source buffer, of size @p srcSize, to generate sequences from.
+ * @param srcSize The size of the source buffer.
+ *
 * Each block will end with a dummy sequence
 * with offset == 0, matchLength == 0, and litLength == length of last literals.
 * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
 * simply acts as a block delimiter.
 *
- * @zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2().
- *
- * The output of this function can be fed into ZSTD_compressSequences() with CCtx
- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
- * @return : number of sequences generated
+ * @returns The number of sequences generated, necessarily less than
+ *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
+ *          with ZSTD_isError().
 */
-
+ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
 ZSTDLIB_STATIC_API size_t
-ZSTD_generateSequences( ZSTD_CCtx* zc,
-                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
-                        const void* src, size_t srcSize);
+ZSTD_generateSequences(ZSTD_CCtx* zc,
+                       ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+                       const void* src, size_t srcSize);
 
 /*! ZSTD_mergeBlockDelimiters() :
  * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile
index f96adcfaea..9279afa5c5 100644
--- a/tests/fuzz/Makefile
+++ b/tests/fuzz/Makefile
@@ -124,7 +124,8 @@ FUZZ_TARGETS := \
 	sequence_compression_api \
 	seekable_roundtrip \
 	huf_round_trip \
-	huf_decompress
+	huf_decompress \
+	generate_sequences
 
 all: libregression.a $(FUZZ_TARGETS)
 
@@ -239,6 +240,9 @@ huf_round_trip: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_huf_round_trip.o
 huf_decompress: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o
 	$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o $(LIB_FUZZING_ENGINE) -o $@
 
+generate_sequences: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o
+	$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o $(LIB_FUZZING_ENGINE) -o $@
+
 libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c d_fuzz_regression_driver.o
 	$(AR) $(FUZZ_ARFLAGS) $@ d_fuzz_regression_driver.o
diff --git a/tests/fuzz/fuzz.py b/tests/fuzz/fuzz.py
index c489b8fa64..a41fe77c04 100755
--- a/tests/fuzz/fuzz.py
+++ b/tests/fuzz/fuzz.py
@@ -65,6 +65,7 @@ def __init__(self, input_type, frame_type=FrameType.ZSTD):
     'seekable_roundtrip': TargetInfo(InputType.RAW_DATA),
     'huf_round_trip': TargetInfo(InputType.RAW_DATA),
     'huf_decompress': TargetInfo(InputType.RAW_DATA),
+    'generate_sequences': TargetInfo(InputType.RAW_DATA),
 }
 TARGETS = list(TARGET_INFO.keys())
 ALL_TARGETS = TARGETS + ['all']
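Reviewer note: the new fuzz target below round-trips the generated sequences
twice, once with explicit delimiters and once after ZSTD_mergeBlockDelimiters(),
which folds each (of: 0, ml: 0) delimiter's litLength into the next sequence
and drops the trailing last-literals entry. A small illustration with invented
values (offsets and match lengths are arbitrary, chosen only to be plausible):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <stdio.h>
    #include <zstd.h>

    int main(void)
    {
        /* ZSTD_Sequence fields are: offset, litLength, matchLength, rep */
        ZSTD_Sequence seqs[4] = {
            { 8, 4, 5, 0 },
            { 0, 7, 0, 0 },  /* block delimiter carrying 7 last literals */
            { 3, 2, 6, 0 },
            { 0, 9, 0, 0 },  /* final last-literals "sequence" */
        };
        const size_t nbSeqs = ZSTD_mergeBlockDelimiters(seqs, 4);
        /* Expected: 2 sequences remain; the 7 literals fold into the third
         * entry (litLength 2 -> 9), and the trailing 9 last literals become
         * implicit, matching ZSTD_sf_noBlockDelimiters mode. */
        printf("%zu sequences, litLength[1] = %u\n", nbSeqs, seqs[1].litLength);
        return 0;
    }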
diff --git a/tests/fuzz/generate_sequences.c b/tests/fuzz/generate_sequences.c
new file mode 100644
index 0000000000..812b659186
--- /dev/null
+++ b/tests/fuzz/generate_sequences.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#define ZSTD_STATIC_LINKING_ONLY
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzz_data_producer.h"
+#include "fuzz_helpers.h"
+#include "zstd_helpers.h"
+
+/**
+ * This fuzz target ensures that ZSTD_generateSequences() does not crash, and
+ * that, when it succeeds, ZSTD_compressSequences() round trips.
+ */
+
+static void testRoundTrip(ZSTD_CCtx* cctx, ZSTD_Sequence const* seqs, size_t nbSeqs, const void* src, size_t srcSize) {
+    /* Compress the sequences with block delimiters */
+    const size_t compressBound = ZSTD_compressBound(srcSize);
+    void* dst = FUZZ_malloc(compressBound);
+    FUZZ_ASSERT(dst);
+
+    size_t compressedSize = ZSTD_compressSequences(cctx, dst, compressBound, seqs, nbSeqs, src, srcSize);
+    FUZZ_ZASSERT(compressedSize);
+
+    void* decompressed = FUZZ_malloc(srcSize);
+    FUZZ_ASSERT(srcSize == 0 || decompressed);
+    size_t decompressedSize = ZSTD_decompress(decompressed, srcSize, dst, compressedSize);
+    FUZZ_ZASSERT(decompressedSize);
+    FUZZ_ASSERT(decompressedSize == srcSize);
+    FUZZ_ASSERT(!memcmp(src, decompressed, srcSize));
+
+    free(decompressed);
+    free(dst);
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+
+    FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(data, size);
+    size = FUZZ_dataProducer_reserveDataPrefix(producer);
+
+    ZSTD_CCtx* cctx = ZSTD_createCCtx();
+    FUZZ_ASSERT(cctx);
+
+    const size_t seqsCapacity = FUZZ_dataProducer_uint32Range(producer, 0, 2 * ZSTD_sequenceBound(size));
+    ZSTD_Sequence* seqs = (ZSTD_Sequence*)FUZZ_malloc(sizeof(ZSTD_Sequence) * seqsCapacity);
+    FUZZ_ASSERT(seqsCapacity == 0 || seqs);
+
+    FUZZ_setRandomParameters(cctx, size, producer);
+    FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 0));
+    FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0));
+
+    const size_t nbSeqs = ZSTD_generateSequences(cctx, seqs, seqsCapacity, data, size);
+    if (ZSTD_isError(nbSeqs)) {
+        /* Allowed to error if the destination is too small */
+        if (ZSTD_getErrorCode(nbSeqs) == ZSTD_error_dstSize_tooSmall) {
+            FUZZ_ASSERT(seqsCapacity < ZSTD_sequenceBound(size));
+        }
+    } else {
+        /* Ensure we round trip with and without block delimiters */
+
+        FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters));
+        testRoundTrip(cctx, seqs, nbSeqs, data, size);
+
+        const size_t nbMergedSeqs = ZSTD_mergeBlockDelimiters(seqs, nbSeqs);
+        FUZZ_ASSERT(nbMergedSeqs <= nbSeqs);
+        FUZZ_ZASSERT(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only));
+        FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_noBlockDelimiters));
+        testRoundTrip(cctx, seqs, nbMergedSeqs, data, size);
+    }
+
+    free(seqs);
+    ZSTD_freeCCtx(cctx);
+    FUZZ_dataProducer_free(producer);
+    return 0;
+}
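Reviewer note: the unit test below pins down the new failure mode: with room
for only 10 sequences against 150 KB of input, ZSTD_generateSequences() must
fail cleanly with dstSize_tooSmall instead of writing past the buffer. A caller
that wants to distinguish this recoverable case could use something like the
following sketch (the helper name is illustrative):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <zstd_errors.h>  /* ZSTD_getErrorCode(), ZSTD_error_dstSize_tooSmall */

    /* Sketch: report whether a failed return means "grow the sequence buffer,
     * e.g. to ZSTD_sequenceBound(srcSize), and retry". */
    static int isCapacityFailure(size_t ret)
    {
        return ZSTD_isError(ret)
            && ZSTD_getErrorCode(ret) == ZSTD_error_dstSize_tooSmall;
    }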
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index 0934921811..f7bdae90e9 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -3701,6 +3701,31 @@ static int basicUnitTests(U32 const seed, double compressibility)
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : ZSTD_generateSequences too small output buffer : ", testNb++);
+    {
+        const size_t seqsCapacity = 10;
+        const size_t srcSize = 150 KB;
+        const BYTE* src = (BYTE*)CNBuffer;
+
+        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        ZSTD_Sequence* const seqs = (ZSTD_Sequence*)malloc(seqsCapacity * sizeof(ZSTD_Sequence));
+
+        if (seqs == NULL) goto _output_error;
+        if (cctx == NULL) goto _output_error;
+
+        /* Populate src with random data */
+        RDG_genBuffer(CNBuffer, srcSize, compressibility, 0.5, seed);
+
+        /* Expect an error: 10 sequence slots cannot hold the sequences of 150 KB */
+        {
+            size_t const seqsSize = ZSTD_generateSequences(cctx, seqs, seqsCapacity, src, srcSize);
+            if (!ZSTD_isError(seqsSize)) goto _output_error;
+        }
+
+        ZSTD_freeCCtx(cctx);
+        free(seqs);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences followed by ZSTD_compressSequences : ", testNb++);
     {
         const size_t srcSize = 500 KB;