2 changes: 2 additions & 0 deletions docling_eval/benchmarks/constants.py
@@ -2,6 +2,8 @@


class BenchMarkColumns(str, Enum):
DOCLING_VERSION = "docling_version"
AZURE_VERSION = "azure_version"
CONVERTER_TYPE = "converter_type"
CONVERTER_VERSION = "converter_version"
DOCLING_PIPELINE = "docling_pipeline"
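For context, a hypothetical snippet (not part of this PR) showing how the columns listed in the hunk above can be used as dataset keys; BenchMarkColumns subclasses str, and the version strings below are placeholders:

from docling_eval.benchmarks.constants import BenchMarkColumns

row = {
    BenchMarkColumns.DOCLING_VERSION.value: "0.0.0",    # placeholder version string
    BenchMarkColumns.AZURE_VERSION.value: "0.0.0",      # placeholder service version
    BenchMarkColumns.CONVERTER_TYPE.value: "docling",   # placeholder converter label
}
print(row)  # plain string keys, because the enum members are also strings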
4 changes: 2 additions & 2 deletions docling_eval/benchmarks/utils.py
@@ -390,8 +390,6 @@ def save_shard_to_disk(
batch = Dataset.from_list(items) # , features=features)

output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}"
logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents")

if shard_format == "json":
batch.to_json(output_file)

@@ -401,6 +399,8 @@
else:
raise ValueError(f"Unsupported shard_format: {shard_format}")

logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents")

shard_id += 1

return shard_id, [], 0
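The hunk above moves the "Saved shard" log message below the format branch, so it is emitted only after the shard file has actually been written. A minimal stand-alone sketch of that ordering (the writer below is a stand-in, not the project's API):

import logging
from pathlib import Path

def save_items(items: list, output_file: Path) -> None:
    # Write the shard first ...
    output_file.write_text("\n".join(map(str, items)))
    # ... then report success, so the log never precedes the actual write.
    logging.info("Saved shard to %s with %d documents", output_file, len(items))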
3 changes: 3 additions & 0 deletions docling_eval/cli/main.py
@@ -261,6 +261,9 @@ def evaluate(
if not os.path.exists(idir):
_log.error(f"Benchmark directory not found: {idir}")

    # create the output directory if it does not exist
os.makedirs(odir, exist_ok=True)

# Save the evaluation
save_fn = odir / f"evaluation_{benchmark.value}_{modality.value}.json"

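The hunk above creates the output directory before the evaluation JSON is written. A minimal sketch of the same guard in isolation (paths and file name are illustrative only):

import json
import os
from pathlib import Path

odir = Path("./benchmarks/out")                    # hypothetical output directory
os.makedirs(odir, exist_ok=True)                   # no-op if the directory already exists
save_fn = odir / "evaluation_example_layout.json"  # illustrative file name
save_fn.write_text(json.dumps({"status": "ok"}))   # placeholder payload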
88 changes: 83 additions & 5 deletions docling_eval_next/dataset_builders/dataset_builder.py
@@ -2,8 +2,9 @@
from abc import abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Any, Iterable, Optional, Union

import ibm_boto3
from docling.utils.utils import chunkify
from docling_core.types.io import DocumentStream
from huggingface_hub import snapshot_download
@@ -23,9 +24,80 @@ class HFSource(BaseModel):


class S3Source(BaseModel):
# TBD
pass
endpoint: str
access_key: str
secret_key: str
cos_bucket: str # Bucket of interest inside of COS.
cos_dir: str # Path to dataset "directory" of interest in COS.
cos_resource: Optional[Any] = None
cos_client: Optional[Any] = None
    overwrite_downloads: Optional[bool] = True

def __init__(self, **data):
super().__init__(**data)
self.cos_resource = self.initialize_s3_resource()
self.cos_client = self.initialize_s3_client()

def initialize_s3_client(self):
"""Initializes boto3 resource - s3 instance
Returns the s3 instance
"""
return ibm_boto3.client(
"s3",
endpoint_url=self.endpoint,
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
)

def initialize_s3_resource(self):
"""Initializes boto3 resource - s3 instance
Returns the s3 instance
"""

return ibm_boto3.resource(
"s3",
endpoint_url=self.endpoint,
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
)

def download_objects(self, download_dir):
"""Downloads the objects from the bucket to the given download directory."""
print(f"Download objects from {self.cos_bucket}/{self.cos_dir} to {download_dir}")
paginator = self.cos_client.get_paginator("list_objects_v2")
pagination_params = {
"Bucket": self.cos_bucket,
"Prefix": self.cos_dir,
"MaxKeys": 100,
}
page_iterator = paginator.paginate(**pagination_params)
for page in page_iterator:
            for file_meta in page.get("Contents", []):  # a result page may contain no objects
# print(file_meta)
relative_path = file_meta["Key"][len(self.cos_dir) + 1:]
                if len(relative_path) == 0:
continue
if file_meta["Size"] == 0:
continue

# Identify the path to the file on disk.
local_file_path = os.path.join(download_dir, relative_path)
print(f"Download {file_meta['Key']} to {local_file_path}")

                # If overwriting downloads is disabled and the file already exists, skip it.
                if not self.overwrite_downloads and os.path.exists(local_file_path):
                    print(f"File {local_file_path} already exists. Skipping.")
continue

# Create the directories as required
local_dir = os.path.dirname(local_file_path)
                os.makedirs(local_dir, exist_ok=True)

self.cos_resource.Bucket(self.cos_bucket).download_file(file_meta["Key"], local_file_path)

return download_dir

class BaseEvaluationDatasetBuilder:
def __init__(
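A hypothetical usage sketch of the new S3Source; the endpoint, credentials, bucket, and prefix are placeholders, not values from this PR. A builder configured with such a source is routed through download_objects() by retrieve_input_dataset() in the hunk below:

from docling_eval_next.dataset_builders.dataset_builder import S3Source

source = S3Source(
    endpoint="https://s3.example-region.cloud-object-storage.appdomain.cloud",  # placeholder endpoint
    access_key="<ACCESS_KEY>",      # placeholder credential
    secret_key="<SECRET_KEY>",      # placeholder credential
    cos_bucket="eval-datasets",     # placeholder bucket name
    cos_dir="benchmarks/example",   # placeholder prefix inside the bucket
    overwrite_downloads=False,      # keep files that are already on disk
)
local_dir = source.download_objects("./downloads/example")
print(f"Dataset mirrored to {local_dir}")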
@@ -65,6 +137,11 @@ def retrieve_input_dataset(self) -> Path:
path = Path(path_str)
elif isinstance(self.dataset_source, Path):
path = self.dataset_source
elif isinstance(self.dataset_source, S3Source):
# Download the data from S3 bucket to the target folder
self.dataset_source.download_objects(self.target)
path = Path(self.target)
self.dataset_local_path = path
else:
raise RuntimeError(
f"Unknown dataset_source type {type(self.dataset_source)}"
@@ -103,13 +180,14 @@ def save_to_disk(self):
"You must first retrieve the source dataset. Call retrieve_input_dataset()."
)

test_dir = self.target / "test"
test_dir = self.target / "intermediate_files"
os.makedirs(test_dir, exist_ok=True)

count = 0
for record_chunk in chunkify(self.iterate(), 80):
record_chunk = [r.as_record_dict() for r in record_chunk]
save_shard_to_disk(items=record_chunk, dataset_path=test_dir)
            shard_id = count  # derive the shard id from the number of records processed so far
save_shard_to_disk(items=record_chunk, dataset_path=test_dir, shard_id=shard_id)
count += len(record_chunk)

write_datasets_info(
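For reference, a stand-alone sketch of the sharding loop in save_to_disk() with a toy chunk size; the shard file-name pattern is taken from save_shard_to_disk() in benchmarks/utils.py, and the records are dummies:

from docling.utils.utils import chunkify

records = [{"id": i} for i in range(10)]   # dummies standing in for self.iterate()
count = 0
for record_chunk in chunkify(records, 4):  # the builder uses chunks of 80
    record_chunk = list(record_chunk)
    shard_id = count                       # shard id derived from records seen so far, as in the hunk above
    print(f"shard_{0:06}_{shard_id:06}.parquet holds {len(record_chunk)} records")
    count += len(record_chunk)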