From 93c397f45b999653272413526e6e999d3366d69a Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 28 Apr 2026 11:11:20 -0400 Subject: [PATCH] brought back "source" information as part of workflowId --- mmif/utils/workflow_helper.py | 10 +++++++++- tests/test_utils.py | 6 +++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index bdde664a..512a6cd8 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -115,7 +115,11 @@ def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], Generate a workflow identifier string from a MMIF file or object. The identifier follows the storage directory structure format: - app_name/version/param_hash/app_name2/version2/param_hash2/... + source_composition/app_name/version/param_hash/app_name2/version2/param_hash2/... + + The leading ``source_composition`` segment encodes the top-level + document mix as ``Type-N`` pairs joined by ``-`` and sorted by type + name (e.g. ``TextDocument-1-VideoDocument-1``). Uses view.metadata.parameters (raw user-passed values) for hashing to ensure reproducibility. Views with errors or warnings are excluded @@ -128,6 +132,10 @@ def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], data = _read_mmif_from_path(mmif_input) segments = [] + # First prefix is source information, sorted by document type + sources = Counter(doc.at_type.shortname for doc in data.documents) + segments.append('-'.join([f'{k}-{sources[k]}' for k in sorted(sources.keys())])) + # Group views into runs grouped_apps = group_views_by_app(data.views) diff --git a/tests/test_utils.py b/tests/test_utils.py index fff35331..cf573f86 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -471,9 +471,9 @@ def test_generate_workflow_identifier_grouped(self): try: workflow_id = wfh.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') - self.assertEqual(len(segments), 6) - self.assertIn('app1', segments[0]) - self.assertIn('app2', segments[3]) + self.assertEqual(len(segments), 7) + self.assertIn('app1', segments[1]) + self.assertIn('app2', segments[4]) finally: os.unlink(tmp_file)