2 changes: 2 additions & 0 deletions docling_eval/benchmarks/constants.py
@@ -2,6 +2,8 @@


class BenchMarkColumns(str, Enum):
DOCLING_VERSION = "docling_version"
AZURE_VERSION = "azure_version"
CONVERTER_TYPE = "converter_type"
CONVERTER_VERSION = "converter_version"
DOCLING_PIPELINE = "docling_pipeline"
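For context, a hypothetical snippet (not part of this PR) showing how the columns listed in the hunk above can be used as dataset keys; BenchMarkColumns subclasses str, and the version strings below are placeholders:

from docling_eval.benchmarks.constants import BenchMarkColumns

row = {
    BenchMarkColumns.DOCLING_VERSION.value: "0.0.0",    # placeholder version string
    BenchMarkColumns.AZURE_VERSION.value: "0.0.0",      # placeholder service version
    BenchMarkColumns.CONVERTER_TYPE.value: "docling",   # placeholder converter label
}
print(row)  # plain string keys, because the enum members are also strings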
4 changes: 2 additions & 2 deletions docling_eval/benchmarks/utils.py
@@ -390,8 +390,6 @@ def save_shard_to_disk(
batch = Dataset.from_list(items) # , features=features)

output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}"
logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents")

if shard_format == "json":
batch.to_json(output_file)

@@ -401,6 +399,8 @@
else:
raise ValueError(f"Unsupported shard_format: {shard_format}")

logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents")

shard_id += 1

return shard_id, [], 0
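The hunk above moves the "Saved shard" log message below the format branch, so it is emitted only after the shard file has actually been written. A minimal stand-alone sketch of that ordering (the writer below is a stand-in, not the project's API):

import logging
from pathlib import Path

def save_items(items: list, output_file: Path) -> None:
    # Write the shard first ...
    output_file.write_text("\n".join(map(str, items)))
    # ... then report success, so the log never precedes the actual write.
    logging.info("Saved shard to %s with %d documents", output_file, len(items))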
3 changes: 3 additions & 0 deletions docling_eval/cli/main.py
@@ -261,6 +261,9 @@ def evaluate(
if not os.path.exists(idir):
_log.error(f"Benchmark directory not found: {idir}")

    # create the output directory if it does not exist
os.makedirs(odir, exist_ok=True)

# Save the evaluation
save_fn = odir / f"evaluation_{benchmark.value}_{modality.value}.json"

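The hunk above creates the output directory before the evaluation JSON is written. A minimal sketch of the same guard in isolation (paths and file name are illustrative only):

import json
import os
from pathlib import Path

odir = Path("./benchmarks/out")                    # hypothetical output directory
os.makedirs(odir, exist_ok=True)                   # no-op if the directory already exists
save_fn = odir / "evaluation_example_layout.json"  # illustrative file name
save_fn.write_text(json.dumps({"status": "ok"}))   # placeholder payload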
88 changes: 83 additions & 5 deletions docling_eval_next/dataset_builders/dataset_builder.py
@@ -2,8 +2,9 @@
from abc import abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Any, Iterable, Optional, Union

import ibm_boto3
from docling.utils.utils import chunkify
from docling_core.types.io import DocumentStream
from huggingface_hub import snapshot_download
@@ -23,9 +24,80 @@ class HFSource(BaseModel):


class S3Source(BaseModel):
# TBD
pass
endpoint: str
access_key: str
secret_key: str
cos_bucket: str # Bucket of interest inside of COS.
cos_dir: str # Path to dataset "directory" of interest in COS.
cos_resource: Optional[Any] = None
cos_client: Optional[Any] = None
    overwrite_downloads: Optional[bool] = True

def __init__(self, **data):
super().__init__(**data)
self.cos_resource = self.initialize_s3_resource()
self.cos_client = self.initialize_s3_client()

def initialize_s3_client(self):
"""Initializes boto3 resource - s3 instance
Returns the s3 instance
"""
return ibm_boto3.client(
"s3",
endpoint_url=self.endpoint,
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
)

def initialize_s3_resource(self):
"""Initializes boto3 resource - s3 instance
Returns the s3 instance
"""

return ibm_boto3.resource(
"s3",
endpoint_url=self.endpoint,
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
)

def download_objects(self, download_dir):
"""Downloads the objects from the bucket to the given download directory."""
print(f"Download objects from {self.cos_bucket}/{self.cos_dir} to {download_dir}")
paginator = self.cos_client.get_paginator("list_objects_v2")
pagination_params = {
"Bucket": self.cos_bucket,
"Prefix": self.cos_dir,
"MaxKeys": 100,
}
page_iterator = paginator.paginate(**pagination_params)
for page in page_iterator:
            for file_meta in page.get("Contents", []):  # a result page may contain no objects
# print(file_meta)
relative_path = file_meta["Key"][len(self.cos_dir) + 1:]
                if len(relative_path) == 0:
continue
if file_meta["Size"] == 0:
continue

# Identify the path to the file on disk.
local_file_path = os.path.join(download_dir, relative_path)
print(f"Download {file_meta['Key']} to {local_file_path}")

                # If overwriting downloads is disabled and the file already exists, skip it.
                if not self.overwrite_downloads and os.path.exists(local_file_path):
                    print(f"File {local_file_path} already exists. Skipping.")
continue

# Create the directories as required
local_dir = os.path.dirname(local_file_path)
                os.makedirs(local_dir, exist_ok=True)

self.cos_resource.Bucket(self.cos_bucket).download_file(file_meta["Key"], local_file_path)

return download_dir

class BaseEvaluationDatasetBuilder:
def __init__(
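A hypothetical usage sketch of the new S3Source; the endpoint, credentials, bucket, and prefix are placeholders, not values from this PR. A builder configured with such a source is routed through download_objects() by retrieve_input_dataset() in the hunk below:

from docling_eval_next.dataset_builders.dataset_builder import S3Source

source = S3Source(
    endpoint="https://s3.example-region.cloud-object-storage.appdomain.cloud",  # placeholder endpoint
    access_key="<ACCESS_KEY>",      # placeholder credential
    secret_key="<SECRET_KEY>",      # placeholder credential
    cos_bucket="eval-datasets",     # placeholder bucket name
    cos_dir="benchmarks/example",   # placeholder prefix inside the bucket
    overwrite_downloads=False,      # keep files that are already on disk
)
local_dir = source.download_objects("./downloads/example")
print(f"Dataset mirrored to {local_dir}")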
@@ -65,6 +137,11 @@ def retrieve_input_dataset(self) -> Path:
path = Path(path_str)
elif isinstance(self.dataset_source, Path):
path = self.dataset_source
elif isinstance(self.dataset_source, S3Source):
# Download the data from S3 bucket to the target folder
self.dataset_source.download_objects(self.target)
path = Path(self.target)
self.dataset_local_path = path
else:
raise RuntimeError(
f"Unknown dataset_source type {type(self.dataset_source)}"
@@ -103,13 +180,14 @@ def save_to_disk(self):
"You must first retrieve the source dataset. Call retrieve_input_dataset()."
)

test_dir = self.target / "test"
test_dir = self.target / "intermediate_files"
os.makedirs(test_dir, exist_ok=True)

count = 0
for record_chunk in chunkify(self.iterate(), 80):
record_chunk = [r.as_record_dict() for r in record_chunk]
save_shard_to_disk(items=record_chunk, dataset_path=test_dir)
            shard_id = count  # derive the shard id from the number of records processed so far
save_shard_to_disk(items=record_chunk, dataset_path=test_dir, shard_id=shard_id)
count += len(record_chunk)

write_datasets_info(
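For reference, a stand-alone sketch of the sharding loop in save_to_disk() with a toy chunk size; the shard file-name pattern is taken from save_shard_to_disk() in benchmarks/utils.py, and the records are dummies:

from docling.utils.utils import chunkify

records = [{"id": i} for i in range(10)]   # dummies standing in for self.iterate()
count = 0
for record_chunk in chunkify(records, 4):  # the builder uses chunks of 80
    record_chunk = list(record_chunk)
    shard_id = count                       # shard id derived from records seen so far, as in the hunk above
    print(f"shard_{0:06}_{shard_id:06}.parquet holds {len(record_chunk)} records")
    count += len(record_chunk)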