Simplify test mode with small document sets (#1792)
Modifies test mode to use the original document set instead of the `-1k`
variant if it has 1000 or fewer documents. This makes the creation of `-1k`
document set files unnecessary for small corpora.
gbanasiak committed Jan 17, 2024
1 parent 3833d53 commit 39fa2ad
Showing 3 changed files with 64 additions and 23 deletions.
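
The core of the change is a size threshold on test-mode document selection. A minimal standalone sketch of that rule, assuming hypothetical names (this is not Rally's actual API, just an illustration of the behavior described above):

import os

TEST_MODE_DOC_LIMIT = 1000  # threshold used by this commit; the constant name is illustrative

def pick_test_mode_file(document_file: str, number_of_documents: int) -> str:
    # Small corpora are used unmodified; no "-1k" variant is required.
    if number_of_documents <= TEST_MODE_DOC_LIMIT:
        return document_file
    # Larger corpora fall back to the pre-built "-1k" variant, e.g. documents-1k.json.
    root, ext = os.path.splitext(document_file)
    return f"{root}-1k{ext}"

# pick_test_mode_file("documents.json", 10)   -> "documents.json"
# pick_test_mode_file("documents.json", 5000) -> "documents-1k.json"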
9 changes: 7 additions & 2 deletions docs/adding_tracks.rst
@@ -299,12 +299,17 @@ Congratulations, you have created your first track! You can test it with ``esral
 Adding support for test mode
 ----------------------------
 
-You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``. Rally postprocesses its internal track representation as follows:
+You can check your track very quickly for syntax errors when you invoke Rally with ``--test-mode``.
+
+In test mode Rally postprocesses its internal track representation as follows:
 
 * Iteration-based tasks run at most one warmup iteration and one measurement iteration.
 * Time-period-based tasks run at most for 10 seconds without warmup.
 
-Rally also postprocesses all data file names. Instead of ``documents.json``, Rally expects ``documents-1k.json`` and assumes the file contains 1.000 documents. You need to prepare these data files though. Pick 1.000 documents for every data file in your track and store them in a file with the suffix ``-1k``. We choose the first 1.000 with ``head -n 1000 documents.json > documents-1k.json``.
+In test mode Rally also post-processes all data file names:
+
+* If ``documents.json`` has 1000 documents or fewer, Rally uses it (no modifications).
+* If ``documents.json`` has more than 1000 documents, Rally assumes an additional ``documents-1k.json`` file is present and uses it. You need to prepare these additional files manually. Pick 1000 documents for every data file in your track and store them in a file with the ``-1k`` suffix. On Linux you can do it as follows: ``head -n 1000 documents.json > documents-1k.json``.
 
 Challenges
 ----------
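
The "head -n 1000" command in the documentation above assumes a Linux shell. A rough Python equivalent for preparing the truncated data file, with placeholder file names, could look like this:

# Copy the first 1000 lines of a newline-delimited JSON corpus into a "-1k" variant.
# Only needed for corpora with more than 1000 documents; file names are placeholders.
with open("documents.json", "r", encoding="utf-8") as src, \
        open("documents-1k.json", "w", encoding="utf-8") as dst:
    for line_number, line in enumerate(src):
        if line_number >= 1000:
            break
        dst.write(line)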
50 changes: 32 additions & 18 deletions esrally/track/loader.py
@@ -965,30 +965,44 @@ def on_after_load_track(self, track):
             return track
         self.logger.info("Preparing track [%s] for test mode.", str(track))
         for corpus in track.corpora:
-            if self.logger.isEnabledFor(logging.DEBUG):
-                self.logger.debug("Reducing corpus size to 1000 documents for [%s]", corpus.name)
             for document_set in corpus.documents:
                 # TODO #341: Should we allow this for snapshots too?
                 if document_set.is_bulk:
-                    document_set.number_of_documents = 1000
+                    if document_set.number_of_documents > 1000:
+                        if self.logger.isEnabledFor(logging.DEBUG):
+                            self.logger.debug(
+                                "Reducing corpus size to 1000 documents in corpus [%s], uncompressed source file [%s]",
+                                corpus.name,
+                                document_set.document_file,
+                            )
 
-                    if document_set.has_compressed_corpus():
-                        path, ext = io.splitext(document_set.document_archive)
-                        path_2, ext_2 = io.splitext(path)
+                        document_set.number_of_documents = 1000
 
-                        document_set.document_archive = f"{path_2}-1k{ext_2}{ext}"
-                        document_set.document_file = f"{path_2}-1k{ext_2}"
-                    elif document_set.has_uncompressed_corpus():
-                        path, ext = io.splitext(document_set.document_file)
-                        document_set.document_file = f"{path}-1k{ext}"
-                    else:
-                        raise exceptions.RallyAssertionError(
-                            f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus."
-                        )
+                        if document_set.has_compressed_corpus():
+                            path, ext = io.splitext(document_set.document_archive)
+                            path_2, ext_2 = io.splitext(path)
 
-                    # we don't want to check sizes
-                    document_set.compressed_size_in_bytes = None
-                    document_set.uncompressed_size_in_bytes = None
+                            document_set.document_archive = f"{path_2}-1k{ext_2}{ext}"
+                            document_set.document_file = f"{path_2}-1k{ext_2}"
+                        elif document_set.has_uncompressed_corpus():
+                            path, ext = io.splitext(document_set.document_file)
+                            document_set.document_file = f"{path}-1k{ext}"
+                        else:
+                            raise exceptions.RallyAssertionError(
+                                f"Document corpus [{corpus.name}] has neither compressed nor uncompressed corpus."
+                            )
+
+                        # we don't want to check sizes
+                        document_set.compressed_size_in_bytes = None
+                        document_set.uncompressed_size_in_bytes = None
+                    else:
+                        if self.logger.isEnabledFor(logging.DEBUG):
+                            self.logger.debug(
+                                "Maintaining existing size of %d documents in corpus [%s], uncompressed source file [%s]",
+                                document_set.number_of_documents,
+                                corpus.name,
+                                document_set.document_file,
+                            )
 
         for challenge in track.challenges:
             for task in challenge.schedule:
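
For compressed corpora the loader derives the "-1k" names by splitting the extension twice. A standalone sketch of that naming step, using os.path.splitext as a stand-in for Rally's own io.splitext helper:

import os

def test_mode_file_names(document_archive: str) -> tuple[str, str]:
    # Stand-in sketch; the loader above uses Rally's io.splitext, not os.path.splitext.
    path, ext = os.path.splitext(document_archive)  # ("documents.json", ".bz2")
    path_2, ext_2 = os.path.splitext(path)          # ("documents", ".json")
    archive = f"{path_2}-1k{ext_2}{ext}"            # "documents-1k.json.bz2"
    document_file = f"{path_2}-1k{ext_2}"           # "documents-1k.json"
    return archive, document_file

# test_mode_file_names("documents.json.bz2") -> ("documents-1k.json.bz2", "documents-1k.json")

Setting compressed_size_in_bytes and uncompressed_size_in_bytes to None skips size verification, presumably because the "-1k" files do not match the sizes declared for the full corpus.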
28 changes: 25 additions & 3 deletions tests/track/loader_test.py
@@ -1259,7 +1259,18 @@ def test_post_processes_track_spec(self):
             ],
             "corpora": [
                 {
-                    "name": "unittest",
+                    "name": "unittest-reduce-to-1k-documents",
+                    "documents": [
+                        {
+                            "source-file": "documents.json.bz2",
+                            "document-count": 1001,
+                            "compressed-bytes": 100,
+                            "uncompressed-bytes": 10000,
+                        }
+                    ],
+                },
+                {
+                    "name": "unittest-keep-less-than-1k-documents",
                     "documents": [
                         {
                             "source-file": "documents.json.bz2",
@@ -1268,7 +1279,7 @@ def test_post_processes_track_spec(self):
                             "uncompressed-bytes": 10000,
                         }
                     ],
-                }
+                },
             ],
             "operations": [
                 {
@@ -1335,11 +1346,22 @@ def test_post_processes_track_spec(self):
             ],
             "corpora": [
                 {
-                    "name": "unittest",
+                    "name": "unittest-reduce-to-1k-documents",
                     "documents": [
                         {"source-file": "documents-1k.json.bz2", "document-count": 1000},
                     ],
                 },
+                {
+                    "name": "unittest-keep-less-than-1k-documents",
+                    "documents": [
+                        {
+                            "source-file": "documents.json.bz2",
+                            "document-count": 10,
+                            "compressed-bytes": 100,
+                            "uncompressed-bytes": 10000,
+                        },
+                    ],
+                },
             ],
             "operations": [
                 {
