<a href="https://colab.research.google.com/github/dantetemplar/pdf-extraction-agenda/blob/main/repack_olmOCR_mix_0225.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This script extracts the eval split from the [olmOCR dataset](https://huggingface.co/datasets/allenai/olmOCR-mix-0225) and repacks it to reduce download time

In [7]:
!pip install -q datasets huggingface-hub[hf-transfer]

In [8]:
import json
import os

from datasets import load_dataset
from huggingface_hub import snapshot_download


def download_files():
    """Download the ``*.tar.gz`` files of the olmOCR-mix-0225 dataset repository.

    Sets ``HF_HUB_ENABLE_HF_TRANSFER=1`` in the process environment and then
    requests a snapshot of the ``allenai/olmOCR-mix-0225`` dataset repo that is
    restricted to files matching ``*.tar.gz``.

    Returns:
        Whatever ``snapshot_download`` returns; callers use it as the local
        snapshot directory path.
    """
    # NOTE(review): huggingface_hub is already imported when this assignment
    # runs -- confirm the library still honours the variable when it is set
    # after import; otherwise it has to be exported before the import.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
    return snapshot_download(
        repo_id="allenai/olmOCR-mix-0225",
        repo_type="dataset",
        allow_patterns=["*.tar.gz"],
    )

In [9]:
# Load the "eval_s2pdf" split; its `id` column decides which PDFs are kept later on.
dataset = load_dataset(
    "allenai/olmOCR-mix-0225",
    "00_documents",
    split="eval_s2pdf",
)

In [10]:
# Fetch the source tarballs; the returned snapshot path is used below to locate them.
path_to_snaphot = download_files()

Fetching 52 files:   0%|          | 0/52 [00:00<?, ?it/s]

In [12]:
import tarfile

from collections import Counter
from tqdm import tqdm

# Pull out of every downloaded tarball only the PDFs whose id appears in `dataset`.
ids = dataset.select_columns(["id"]).to_list()
id_set = {id_dict["id"] + ".pdf" for id_dict in ids}  # archive member names to look for
pdf_tarballs_dir = os.path.join(path_to_snaphot, "pdf_tarballs")
# os.listdir() returns entries in arbitrary order; sorting keeps the extraction
# order (and therefore the chunk layout built from it) identical across runs.
tar_files = sorted(
    os.path.join(pdf_tarballs_dir, f) for f in os.listdir(pdf_tarballs_dir) if f.endswith(".tar.gz")
)

counter = Counter()  # tarball path -> number of relevant PDFs found in it
extracted_files = {}  # archive member name -> path of the extracted file
temp_extract_dir = "temp_extracted"

os.makedirs(temp_extract_dir, exist_ok=True)

# Opt in to the "data" extraction filter when this Python version provides it
# (blocks unsafe members and avoids the DeprecationWarning for unfiltered
# extraction); older interpreters fall back to the previous behaviour.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

for tarball_path in tqdm(tar_files):
    with tarfile.open(tarball_path, "r:gz") as tar:
        # Iterate lazily instead of calling getmembers(): getmembers() reads the
        # whole compressed archive first, after which extract() has to seek back.
        for member in tar:
            if member.name in id_set:  # Only extract relevant PDFs
                tar.extract(member, path=temp_extract_dir, **extract_kwargs)
                extracted_files[member.name] = os.path.join(temp_extract_dir, member.name)
                counter[tarball_path] += 1

print(f"Extracted {len(extracted_files)} PDFs.")

100%|██████████| 52/52 [13:32<00:00, 15.62s/it]

Extracted 1166 PDFs.





In [13]:
# Every requested PDF name must have an extraction path recorded for it.
missing_pdfs = [pdf_name for pdf_name in id_set if not extracted_files.get(pdf_name)]
assert not missing_pdfs, "All files should be extracted"

# `counter` has one key per tarball that contained at least one relevant PDF.
print(f"Relevant pdfs was distributed into {len(counter)} tarballs originally")

Relevant pdfs was distributed into 49 tarballs originally


In [14]:
# Greedily group the extracted PDFs into chunks of at most `target_chunk_size`
# bytes each, following the iteration order of `extracted_files`. A file that is
# larger than the limit on its own still gets a chunk to itself.
target_chunk_size = 1 * 1024**3  # 1GB in bytes
chunks = []
current_chunk, current_size = [], 0
total_size = 0

for pdf_name, pdf_path in extracted_files.items():
    file_size = os.path.getsize(pdf_path)

    would_overflow = current_size + file_size > target_chunk_size
    if current_chunk and would_overflow:
        # Close the running chunk before this file pushes it past the limit.
        chunks.append(current_chunk)
        current_chunk, current_size = [], 0

    current_chunk.append((pdf_name, pdf_path))
    current_size += file_size
    total_size += file_size

# Flush the trailing, partially filled chunk (if any).
if current_chunk:
    chunks.append(current_chunk)
print(f"Will be {len(chunks)} chunks. Total {total_size / (1 * 1024 ** 3):.2f} GB")

Will be 1 chunks. Total 0.20 GB


In [15]:
# Write each chunk into its own gzip-compressed tarball under `output_dir`
# and remember the resulting paths for the upload step.
output_dir = "new_tarballs"
os.makedirs(output_dir, exist_ok=True)
new_tarballs = []

for i, pdf_entries in enumerate(chunks):
    tarball_path = os.path.join(output_dir, f"pdf_chunk_{i:04d}.tar.gz")
    with tarfile.open(tarball_path, "w:gz") as archive:
        for arcname, source_path in pdf_entries:
            # Store the PDF under its original member name rather than its
            # path inside the temporary extraction directory.
            archive.add(source_path, arcname=arcname)
    new_tarballs.append(tarball_path)

print("Tarballs packed, ready for upload.")

Tarballs packed, ready for upload.


In [16]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; run this before the upload cell.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from huggingface_hub import HfApi
import shutil

# Upload every repacked tarball to the target dataset repository.
api = HfApi()
repo_id = "dantetemplar/pdf-extraction-agenda"

for tarball in new_tarballs:
    api.upload_file(
        path_or_fileobj=tarball,
        path_in_repo=f"data/olmOCR-mix-0225/{os.path.basename(tarball)}",
        repo_id=repo_id,
        repo_type="dataset",
    )

# Remove the extracted PDFs only once the upload loop has finished. The
# existence check keeps this cell re-runnable: without it a second run fails
# with FileNotFoundError because the directory is already gone.
if os.path.isdir(temp_extract_dir):
    shutil.rmtree(temp_extract_dir)
print("Uploaded all new tarballs to Hugging Face!")

pdf_chunk_0000.tar.gz:   0%|          | 0.00/200M [00:00<?, ?B/s]

Uploaded all new tarballs to Hugging Face!
