Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a lorem ipsum generator #3890

Merged
merged 5 commits into from Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions build/VS2008/zstd/zstd.vcproj
Expand Up @@ -356,6 +356,10 @@
RelativePath="..\..\..\programs\dibio.c"
>
</File>
<File
RelativePath="..\..\..\programs\lorem.c"
>
</File>
<File
RelativePath="..\..\..\lib\dictBuilder\cover.c"
>
Expand Down
1 change: 1 addition & 0 deletions build/VS2010/zstd/zstd.vcxproj
Expand Up @@ -63,6 +63,7 @@
<ClCompile Include="..\..\..\programs\dibio.c" />
<ClCompile Include="..\..\..\programs\fileio.c" />
<ClCompile Include="..\..\..\programs\fileio_asyncio.c" />
<ClCompile Include="..\..\..\programs\lorem.c" />
<ClCompile Include="..\..\..\programs\zstdcli.c" />
<ClCompile Include="..\..\..\programs\zstdcli_trace.c" />
</ItemGroup>
Expand Down
7 changes: 1 addition & 6 deletions build/cmake/programs/CMakeLists.txt
Expand Up @@ -32,12 +32,7 @@ if (MSVC)
set(PlatformDependResources ${MSVC_RESOURCE_DIR}/zstd.rc)
endif ()

set(ZSTD_PROGRAM_SRCS ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/util.c
${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/fileio.c
${PROGRAMS_DIR}/fileio_asyncio.c ${PROGRAMS_DIR}/benchfn.c
${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c
${PROGRAMS_DIR}/dibio.c ${PROGRAMS_DIR}/zstdcli_trace.c
${PlatformDependResources})
file(GLOB ZSTD_PROGRAM_SRCS "${PROGRAMS_DIR}/*.c")
if (MSVC AND ZSTD_PROGRAMS_LINK_SHARED)
list(APPEND ZSTD_PROGRAM_SRCS ${LIBRARY_DIR}/common/pool.c ${LIBRARY_DIR}/common/threading.c)
endif ()
Expand Down
4 changes: 2 additions & 2 deletions build/cmake/tests/CMakeLists.txt
Expand Up @@ -56,7 +56,7 @@ target_link_libraries(datagen libzstd_static)
#
# fullbench
#
add_executable(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${TESTS_DIR}/fullbench.c)
add_executable(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/lorem.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${TESTS_DIR}/fullbench.c)
if (NOT MSVC)
target_compile_options(fullbench PRIVATE "-Wno-deprecated-declarations")
endif()
Expand Down Expand Up @@ -110,7 +110,7 @@ endif()
# Label the "Medium" set of tests (see TESTING.md)
set_property(TEST fuzzer zstreamtest playTests APPEND PROPERTY LABELS Medium)

add_executable(paramgrill ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/paramgrill.c)
add_executable(paramgrill ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/lorem.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/paramgrill.c)
if (UNIX)
target_link_libraries(paramgrill libzstd_static m) #m is math library
else()
Expand Down
1 change: 1 addition & 0 deletions build/meson/programs/meson.build
Expand Up @@ -18,6 +18,7 @@ zstd_programs_sources = [join_paths(zstd_rootdir, 'programs/zstdcli.c'),
join_paths(zstd_rootdir, 'programs/benchfn.c'),
join_paths(zstd_rootdir, 'programs/benchzstd.c'),
join_paths(zstd_rootdir, 'programs/datagen.c'),
join_paths(zstd_rootdir, 'programs/lorem.c'),
join_paths(zstd_rootdir, 'programs/dibio.c'),
join_paths(zstd_rootdir, 'programs/zstdcli_trace.c')]

Expand Down
1 change: 1 addition & 0 deletions build/meson/tests/meson.build
Expand Up @@ -29,6 +29,7 @@ DECODECORPUS_TESTTIME = '-T30'
test_includes = [ include_directories(join_paths(zstd_rootdir, 'programs')) ]

testcommon_sources = [join_paths(zstd_rootdir, 'programs/datagen.c'),
join_paths(zstd_rootdir, 'programs/lorem.c'),
join_paths(zstd_rootdir, 'programs/util.c'),
join_paths(zstd_rootdir, 'programs/timefn.c'),
join_paths(zstd_rootdir, 'programs/benchfn.c'),
Expand Down
17 changes: 12 additions & 5 deletions programs/benchzstd.c
Expand Up @@ -32,12 +32,13 @@
#include "benchfn.h"
#include "../lib/common/mem.h"
#ifndef ZSTD_STATIC_LINKING_ONLY
#define ZSTD_STATIC_LINKING_ONLY
# define ZSTD_STATIC_LINKING_ONLY
#endif
#include "../lib/zstd.h"
#include "datagen.h" /* RDG_genBuffer */
#include "lorem.h" /* LOREM_genBuffer */
#ifndef XXH_INLINE_ALL
#define XXH_INLINE_ALL
# define XXH_INLINE_ALL
#endif
#include "../lib/common/xxhash.h"
#include "benchzstd.h"
Expand Down Expand Up @@ -701,7 +702,8 @@ int BMK_syntheticTest(int cLevel, double compressibility,
const ZSTD_compressionParameters* compressionParams,
int displayLevel, const BMK_advancedParams_t* adv)
{
char name[20] = {0};
char nameBuff[20] = {0};
const char* name = nameBuff;
size_t const benchedSize = 10000000;
void* srcBuffer;
BMK_benchOutcome_t res;
Expand All @@ -719,10 +721,15 @@ int BMK_syntheticTest(int cLevel, double compressibility,
}

/* Fill input buffer */
RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
if (compressibility < 0.0) {
LOREM_genBuffer(srcBuffer, benchedSize, 0);
name = "Lorem ipsum";
} else {
RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
snprintf (nameBuff, sizeof(nameBuff), "Synthetic %2u%%", (unsigned)(compressibility*100));
}

/* Bench */
snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
res = BMK_benchCLevel(srcBuffer, benchedSize,
&benchedSize /* ? */, 1 /* ? */,
cLevel, compressionParams,
Expand Down
11 changes: 6 additions & 5 deletions programs/benchzstd.h
Expand Up @@ -126,11 +126,12 @@ int BMK_benchFilesAdvanced(

/*! BMK_syntheticTest() -- called from zstdcli */
/* Generates a sample with datagen, using compressibility argument */
/* cLevel - compression level to benchmark, errors if invalid
* compressibility - determines compressibility of sample
* compressionParams - basic compression Parameters
* displayLevel - see benchFiles
* adv - see advanced_Params_t
/* @cLevel - compression level to benchmark, errors if invalid
* @compressibility - determines compressibility of sample, range [0.0 - 1.0]
* if @compressibility < 0.0, uses the lorem ipsum generator
* @compressionParams - basic compression Parameters
* @displayLevel - see benchFiles
* @adv - see advanced_Params_t
* @return: 0 on success, !0 on error
*/
int BMK_syntheticTest(int cLevel, double compressibility,
Expand Down
207 changes: 207 additions & 0 deletions programs/lorem.c
@@ -0,0 +1,207 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/


/* Implementation notes:
*
* This is a very simple lorem ipsum generator
* which features a static list of words
* and prints them one after another randomly
* with a fake sentence / paragraph structure.
*
* The goal is to generate a printable text
* that can be used to fake a text compression scenario.
* The resulting compression / ratio curve of the lorem ipsum generator
* is more satisfying than the previous statistical generator,
* which was initially designed for entropy compression,
* and lacks a regularity more representative of text.
*
* The compression ratio achievable on the generated lorem ipsum
* is still a bit too good, presumably because the dictionary is too small.
* It would be possible to create some more complex scheme,
* notably by enlarging the dictionary with a word generator,
* and adding grammatical rules (composition) and syntax rules.
* But that's probably overkill for the intended goal.
*/

#include "lorem.h"
#include <string.h> /* memcpy */
#include <limits.h> /* INT_MAX */
#include <assert.h>

/* NOTE(review): WORD_MAX_SIZE is not referenced in the visible part of this
 * file; presumably an upper bound on word length - confirm before relying on it. */
#define WORD_MAX_SIZE 20

/* Word pool: the static dictionary from which every generated word is taken.
 * - Entries are addressed by index, both by distrib[] and by
 *   generateFirstSentence(), which emits words[0] .. words[18] in order,
 *   so the order of this table must not be changed casually.
 * - All entries are lowercase; generateWord() capitalizes a word
 *   by shifting its first character by ('A' - 'a'). */
static const char *words[] = {
"lorem", "ipsum", "dolor", "sit", "amet",
"consectetur", "adipiscing", "elit", "sed", "do",
"eiusmod", "tempor", "incididunt", "ut", "labore",
"et", "dolore", "magna", "aliqua", "dis",
"lectus", "vestibulum", "mattis", "ullamcorper", "velit",
"commodo", "a", "lacus", "arcu", "magnis",
"parturient", "montes", "nascetur", "ridiculus", "mus",
"mauris", "nulla", "malesuada", "pellentesque", "eget",
"gravida", "in", "dictum", "non", "erat",
"nam", "voluptat", "maecenas", "blandit", "aliquam",
"etiam", "enim", "lobortis", "scelerisque", "fermentum",
"dui", "faucibus", "ornare", "at", "elementum",
"eu", "facilisis", "odio", "morbi", "quis",
"eros", "donec", "ac", "orci", "purus",
"turpis", "cursus", "leo", "vel", "porta"};

/* Simple distribution that favors small words.
 * Each entry is an index into words[]; an index is repeated as many times
 * as the weight of the word it designates :
 * 1 letter : weight 3
 * 2-3 letters : weight 2
 * 4+ letters : weight 1
 * generateSentence() draws a uniformly random slot of this table,
 * so a word is selected proportionally to its weight.
 * This is expected to be a bit more difficult to compress */
static const int distrib[] = {
0, 1, 2, 3, 3, 4, 5, 6, 7, 8,
8,9, 9, 10, 11, 12, 13, 13, 14, 15,
15, 16, 17, 18, 19, 19, 20, 21, 22, 23,
24, 25, 26, 26, 26, 27, 28, 29, 30, 31,
32, 33, 34, 34, 35, 36, 37, 38, 39, 40,
41, 41, 42, 43, 43, 44, 45, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 55, 56,
57, 58, 58, 59, 60, 60, 61, 62, 63, 64,
65, 66, 67, 67, 68, 69, 70, 71, 72, 72,
73, 73, 74 };
/* nb of slots in distrib[] : the range passed to LOREM_rand() when picking a word */
static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);

/* Generator state, shared by all functions of this unit.
 * Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
/* destination buffer : set on entry of LOREM_genBlock(), reset to NULL before it returns */
static char *g_ptr = NULL;
/* nb of bytes already written into g_ptr */
static size_t g_nbChars = 0;
/* capacity of g_ptr, in bytes; overwritten with @size by LOREM_genBlock() */
static size_t g_maxChars = 10000000;
/* state of the pseudo-random generator : seeded by LOREM_genBlock(), advanced by LOREM_rand() */
static unsigned g_randRoot = 0;

/* 32-bit left rotation of @x by @r bits.
 * Both parameters are fully parenthesized, so that compound arguments
 * (ex: `a ^ b`, `n + 1`) expand with the intended precedence. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* LOREM_rand():
 * Advances the unit-wide generator state (g_randRoot) by one step,
 * then scales the new state into the requested range.
 * @range : exclusive upper bound of the result
 * @return : a value in [0, @range) (always 0 when @range <= 1)
 * Note : the produced sequence only depends on the value of g_randRoot,
 *        hence a given seed always reproduces the same text.
 * Note 2 : the arithmetic assumes `unsigned` is 32-bit wide.
 */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32 = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32 = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* multiply-shift range reduction : (rand32 * range) / 2^32 */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}

/* writeLastCharacters():
 * Closes the buffer when the next word no longer fits.
 * The remaining space is filled with : one '.', then spaces,
 * then a final '\n' in the very last byte (when there are at least 2 bytes left).
 * On return, the buffer is marked as full (g_nbChars == g_maxChars).
 * Does nothing when the buffer is already full.
 */
static void writeLastCharacters(void)
{
    size_t remaining;
    assert(g_maxChars >= g_nbChars);
    remaining = g_maxChars - g_nbChars;

    if (remaining == 0) {
        return;
    }

    /* final period */
    g_ptr[g_nbChars] = '.';
    g_nbChars++;

    /* padding between the period and the closing line feed */
    if (remaining >= 3) {
        memset(g_ptr + g_nbChars, ' ', remaining - 2);
    }

    /* closing line feed, in the last byte of the buffer */
    if (remaining >= 2) {
        g_ptr[g_maxChars - 1] = '\n';
    }

    g_nbChars = g_maxChars;
}

/* generateWord():
 * Appends @word, immediately followed by @separator,
 * at the current write position of the buffer.
 * When both do not fit into the remaining space, nothing is appended :
 * the end of the buffer is closed by writeLastCharacters() instead.
 * @upCase : when non-zero, the first character of @word is shifted by
 *           ('A' - 'a'), which capitalizes a lowercase ASCII letter.
 */
static void generateWord(const char *word, const char *separator, int upCase)
{
    size_t const wordLen = strlen(word);
    size_t const sepLen = strlen(separator);
    char* dst;

    if (g_nbChars + wordLen + sepLen > g_maxChars) {
        writeLastCharacters();
        return;
    }

    dst = g_ptr + g_nbChars;
    memcpy(dst, word, wordLen);
    if (upCase) {
        dst[0] = (char)(dst[0] + ('A' - 'a'));
    }
    memcpy(dst + wordLen, separator, sepLen);
    g_nbChars += wordLen + sepLen;
}

/* about():
 * Sum of two successive draws in [0, @target), plus 1.
 * @return : a value in [1, 2*@target - 1] when @target >= 1,
 *           which is @target on average.
 * Consumes exactly 2 values of the random sequence.
 */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}

/* generateSentence():
 * Emits @nbWords words randomly selected from the word pool.
 * - the first word is capitalized
 * - the last word is followed by ". "
 * - up to 2 other words, at randomly selected positions, are followed by ", "
 * - all other words are followed by a single space
 * The 2 comma positions are drawn first, then one random value per word.
 */
static void generateSentence(int nbWords)
{
    int const firstComma = about(9);
    int const secondComma = firstComma + about(7);
    int pos;
    for (pos = 0; pos < nbWords; pos++) {
        const char* const word = words[distrib[LOREM_rand(distribCount)]];
        const char* sep;
        if (pos == nbWords - 1) {
            sep = ". ";   /* end of sentence takes precedence over commas */
        } else if ((pos == firstComma) || (pos == secondComma)) {
            sep = ", ";
        } else {
            sep = " ";
        }
        generateWord(word, sep, pos == 0);
    }
}

/* generateParagraph():
 * Emits @nbSentences sentences, each featuring a random nb of words,
 * then terminates the paragraph with up to 2 line feeds,
 * depending on the space remaining in the buffer.
 */
static void generateParagraph(int nbSentences)
{
    int sentence;
    int lineFeed;

    for (sentence = 0; sentence < nbSentences; sentence++) {
        generateSentence(about(8));
    }

    /* paragraph separator : never written beyond the end of the buffer */
    for (lineFeed = 0; (lineFeed < 2) && (g_nbChars < g_maxChars); lineFeed++) {
        g_ptr[g_nbChars++] = '\n';
    }
}

/* It's "common" for lorem ipsum generators to start with the same first
* pre-defined sentence */
static void generateFirstSentence(void) {
int i;
for (i = 0; i < 18; i++) {
const char *word = words[i];
const char *separator = " ";
if (i == 4)
separator = ", ";
if (i == 7)
separator = ", ";
generateWord(word, separator, i==0);
}
generateWord(words[18], ". ", 0);
}

/* LOREM_genBlock():
 * Generates lorem ipsum text into @buffer, writing at most @size bytes.
 * @size : capacity of @buffer, must be < INT_MAX (checked by assert())
 * @seed : initial state of the random generator;
 *         the same @seed always produces the same content
 * @first : when non-zero, starts with the pre-defined first sentence
 * @fill : when non-zero, generates paragraphs until @buffer is full;
 *         otherwise, generates one paragraph at most
 * @return : nb of bytes written into @buffer
 * Note : relies on unit-wide static state, must not be invoked concurrently.
 */
size_t LOREM_genBlock(void* buffer, size_t size,
                      unsigned seed,
                      int first, int fill)
{
    /* (re)initialize the generator state for this invocation */
    g_ptr = (char*)buffer;
    assert(size < INT_MAX);
    g_maxChars = size;
    g_nbChars = 0;
    g_randRoot = seed;

    if (first) {
        generateFirstSentence();
    }

    if (g_nbChars < g_maxChars) {
        do {
            generateParagraph(about(7));
            /* without @fill, stop after a single paragraph */
        } while (fill && (g_nbChars < g_maxChars));
    }

    /* do not keep a reference to the caller's buffer */
    g_ptr = NULL;
    return g_nbChars;
}

/* LOREM_genBuffer():
 * Fills the whole @buffer (@size bytes) with lorem ipsum text,
 * starting with the pre-defined first sentence.
 * Thin wrapper over LOREM_genBlock(), the nb of bytes written is not reported.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillEntireBuffer = 1;
    (void)LOREM_genBlock(buffer, size, seed, withFirstSentence, fillEntireBuffer);
}

32 changes: 32 additions & 0 deletions programs/lorem.h
@@ -0,0 +1,32 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

/* lorem ipsum generator */

#ifndef LOREM_H
#define LOREM_H

#include <stddef.h> /* size_t */

/*
 * LOREM_genBuffer():
 * Generate @size bytes of compressible data using lorem ipsum generator
 * into provided @buffer.
 * Same as LOREM_genBlock() with @first==1 and @fill==1.
 * @seed : selects the generated content; same @seed, same content.
 * Note : the generator relies on static state,
 *        concurrent invocations are not allowed.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed);

/*
 * LOREM_genBlock():
 * Similar to LOREM_genBuffer, with additional controls :
 * - @first : generate the first sentence
 * - @fill : fill the entire @buffer,
 *           if ==0: generate one paragraph at most.
 * @size must be < INT_MAX.
 * @return : nb of bytes generated into @buffer.
 */
size_t LOREM_genBlock(void* buffer, size_t size,
                      unsigned seed,
                      int first, int fill);

#endif /* LOREM_H */
4 changes: 2 additions & 2 deletions programs/zstdcli.c
Expand Up @@ -856,7 +856,7 @@ int main(int argCount, const char* argv[])
ZSTD_paramSwitch_e useRowMatchFinder = ZSTD_ps_auto;
FIO_compressionType_t cType = FIO_zstdCompression;
unsigned nbWorkers = 0;
double compressibility = 0.5;
double compressibility = -1.0; /* lorem ipsum generator */
unsigned bench_nbSeconds = 3; /* would be better if this value was synchronized from bench */
size_t blockSize = 0;

Expand Down Expand Up @@ -1280,7 +1280,7 @@ int main(int argCount, const char* argv[])
break;

/* unknown command */
default :
default :
sprintf(shortArgument, "-%c", argument[0]);
badUsage(programName, shortArgument);
CLEAN_RETURN(1);
Expand Down
2 changes: 1 addition & 1 deletion tests/Makefile
Expand Up @@ -203,7 +203,7 @@ zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
CLEAN += paramgrill
paramgrill : DEBUGFLAGS = # turn off debug for speed measurements
paramgrill : LDLIBS += -lm
paramgrill : $(ZSTD_FILES) $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PRGDIR)/benchfn.c $(PRGDIR)/benchzstd.c $(PRGDIR)/datagen.c paramgrill.c
paramgrill : $(ZSTD_FILES) $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PRGDIR)/benchfn.c $(PRGDIR)/benchzstd.c $(PRGDIR)/datagen.c $(PRGDIR)/lorem.c paramgrill.c

CLEAN += datagen
datagen : $(PRGDIR)/datagen.c datagencli.c
Expand Down