# Parse PDFs with Docling

In [27]:
from utils import get_companies_dict

# Load the company metadata from the round2 subset file.
# NOTE(review): judging by the cell outputs further down, this appears to map
# company name -> {'name', 'sha1', 'id'}; confirm against utils.get_companies_dict.
companies_dict = get_companies_dict(r"../data/round2/subset.json")

In [29]:
# Development shortcut: restrict the run to the companies listed here.
companies = ['RWE AG']
companies_dict = {
    name: companies_dict[name]
    for name in companies_dict
    if name in companies
}

In [30]:
# PDFs are converted in parallel below (one task per company file).
import os
from docling.document_converter import DocumentConverter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


# Single converter instance, created once up front because setup is slow.
converter = DocumentConverter()  # 20 sec for setting up

# Run log: "failed" collects {"company_id", "error"} entries and "skipped"
# collects company names; both are filled by the executor loop below.
preproc_logs = {"failed": [], "skipped": []}


def process_company(company, data):
    """Convert one company's PDF to Markdown with Docling, unless already done.

    Args:
        company: Company name; used in progress messages and echoed back in
            the return value.
        data: Company record; only ``data['sha1']`` is read, to build the
            input PDF path and the output Markdown path.

    Returns:
        A ``(company, outcome)`` tuple where ``outcome`` is ``None`` on
        success, the string ``"skipped"`` when the Markdown file already
        exists, or the caught exception when conversion or writing failed.
    """
    pdf_path = f"../data/round2/pdfs/{data['sha1']}_150_letter.pdf"
    docling_md_path = f"../data/docling_md/round2/docling_{data['sha1']}.md"

    # An existing output file means this company was handled in an earlier run.
    if os.path.exists(docling_md_path):
        print(f"Skipping {company}")
        return (company, "skipped")

    print(f"Processing {company}")
    try:
        result = converter.convert(pdf_path)
        # Render the Markdown BEFORE opening the output file. Opening with
        # "w" creates the file immediately, so an export failure would leave
        # an empty file behind and the next run would wrongly skip the company.
        markdown = result.document.export_to_markdown()
        os.makedirs(os.path.dirname(docling_md_path), exist_ok=True)
        with open(docling_md_path, "w", encoding="utf-8") as file:
            file.write(markdown)
    except Exception as e:
        # Best-effort batch job: report the failure to the caller instead of
        # aborting the whole run.
        print(f"Error processing {company}: {e}")
        return (company, e)

    # Return None error to indicate success.
    return (company, None)

# Fan the per-company conversions out over a small thread pool.
with ThreadPoolExecutor(max_workers=2) as executor:
    # One future per company, mapped back to the company name.
    futures = {
        executor.submit(process_company, name, record): name
        for name, record in companies_dict.items()
    }

    # Consume results in completion order, with a progress bar.
    for future in tqdm(as_completed(futures), total=len(futures)):
        company, result = future.result()
        if result is None:
            # Converted successfully; nothing to record.
            continue
        if result == "skipped":
            preproc_logs["skipped"].append(company)
            continue
        # Anything else is the exception returned by process_company.
        preproc_logs["failed"].append({"company_id": company, "error": result})

if len(preproc_logs["failed"]) > 0:
    print(f"Nr of failed companies: {len(preproc_logs['failed'])}")


# RUNTIMES:
# REIT: execution for 7.4MB with 97 pages -> 5 min
# Yellow Pages: execution for 1.4MB with 77 pages -> 4 min
# Calyxt: execution for 0.7MB with 88 pages -> 3min

# NOTES:
# brackets in tables indicate negative values
# parsing issues (see current system prompt)

Processing RWE AG


100%|██████████| 1/1 [07:36<00:00, 456.29s/it]


In [8]:
 preproc_logs["failed"]

[{'company_id': 'RWE AG',
  'error': AssertionError('Merged element must be of same type as element.')},
 {'company_id': 'Playtech plc',
  'error': AssertionError('Merged element must be of same type as element.')}]

In [28]:
companies_dict["Datalogic"]

{'name': 'Datalogic',
 'sha1': '980742aa08ea64d552c153bcefbd7e8243fb9efd',
 'id': None}

In [24]:
companies_dict[preproc_logs["failed"][0]["company_id"]]

{'name': 'RWE AG',
 'sha1': 'cc0fc5888b99758100a7ff024863fc4337b6b3c5',
 'id': None}