# Parse PDFs with Docling

In [4]:
from utils import get_companies_dict

# Load the round-2 subset. The cells below iterate the result as
# {company name: data} and read data['sha1'] to locate each company's PDF.
companies_dict = get_companies_dict(r"../data/round2/subset.json")

In [2]:
# Dev shortcut: narrow companies_dict down to the hand-picked names below.
# NOTE(review): the empty string looks like a leftover placeholder slot -- confirm.
companies = ["Ziff Davis, Inc.", ""]
companies_dict = {
    name: record
    for name, record in companies_dict.items()
    if name in companies
}

In [None]:
# Potential for parallelization for files?
import os
from docling.document_converter import DocumentConverter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


# Single Docling converter instance, created once at module level and used by
# process_company for every PDF.
converter = DocumentConverter()  # 20 sec for setting up

# Run log filled by the processing loop: "skipped" collects company names,
# "failed" collects {"company_id": ..., "error": ...} records.
preproc_logs = {"failed": [], "skipped": []}


def process_company(company, data):
    """Convert one company's PDF to Markdown with Docling, unless already done.

    The PDF is read from ``../data/round2/pdfs/<sha1>.pdf`` and the Markdown is
    written to ``../data/docling_md/round2/docling_<sha1>.md``. An existing
    Markdown file is taken as proof that the company was converted earlier.

    Args:
        company: Company name; used for log messages and echoed in the result.
        data: Mapping for this company; only ``data['sha1']`` is read.

    Returns:
        A ``(company, status)`` tuple where ``status`` is ``None`` on success,
        the string ``"skipped"`` when the Markdown file already exists, or the
        caught exception when conversion or writing failed.

    Raises:
        KeyError: If ``data`` has no ``'sha1'`` entry (the paths are built
            before the error-handling section).
    """
    pdf_path = f"../data/round2/pdfs/{data['sha1']}.pdf"
    docling_md_path = f"../data/docling_md/round2/docling_{data['sha1']}.md"

    if os.path.exists(docling_md_path):
        print(f"Skipping {company}")
        return (company, "skipped")

    print(f"Processing {company}")
    # Write to a sibling temp file and rename it into place at the very end.
    # The final path doubles as the "already processed" marker, so it must
    # never exist in an empty or half-written state (failed export,
    # interrupted run); otherwise later runs would skip this company forever.
    tmp_path = f"{docling_md_path}.tmp"
    try:
        result = converter.convert(pdf_path)
        # Export before touching the filesystem so an export error cannot
        # leave an empty output file behind.
        markdown = result.document.export_to_markdown()
        os.makedirs(os.path.dirname(docling_md_path), exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as file:
            file.write(markdown)
        os.replace(tmp_path, docling_md_path)
    except Exception as e:
        print(f"Error processing {company}: {e}")
        return (company, e)

    # Return None error to indicate success.
    return (company, None)

# Use ThreadPoolExecutor to process companies concurrently.
# NOTE: with max_workers=1 the conversions still run one after another; the
# executor only provides the submit/as_completed bookkeeping.
with ThreadPoolExecutor(max_workers=1) as executor:
    # Submit all tasks, remembering which company each future belongs to so a
    # failure can be attributed even when the worker raised instead of
    # returning its (company, status) tuple.
    futures = {executor.submit(process_company, company, data): company
               for company, data in companies_dict.items()}

    # Process tasks as they complete.
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            company, result = future.result()
        except Exception as e:
            # process_company builds its file paths before its own try block,
            # so e.g. a missing 'sha1' key surfaces here. Record it as a
            # failure instead of aborting the loop and losing the remaining
            # results.
            company, result = futures[future], e

        if result is None:
            continue  # Successful processing.
        elif isinstance(result, str) and result == "skipped":
            preproc_logs["skipped"].append(company)
        else:
            # Anything else is the exception describing why conversion failed.
            preproc_logs["failed"].append({"company_id": company, "error": result})

if preproc_logs["failed"]:
    print(f"Nr of failed companies: {len(preproc_logs['failed'])}")


# RUNTIMES:
# REIT: execution for 7.4MB with 97 pages -> 5 min
# Yellow Pages: execution for 1.4MB with 77 pages -> 4 min
# Calyxt: execution for 0.7MB with 88 pages -> 3min
# TODO check on google colab

# NOTES:
# brackets in tables indicate negative values
# parsing issues (see current system prompt)

Skipping ACRES Commercial Realty Corp.
Skipping Aptevo Therapeutics Inc.


  0%|          | 0/100 [00:00<?, ?it/s]

Skipping Downer EDI Limited
Skipping Odyssey Gold Limited
Skipping NextNav Inc.
Skipping Peako Limited
Skipping Mosaic Brands Limited
Skipping Aurora Innovation, Inc.
Skipping Crombie REIT
Skipping Medallion Financial Corp.
Skipping Enact Holdings, Inc.
Skipping BetMakers Technology Group Ltd
Skipping OFX Group Limited
Skipping FNCB Bancorp, Inc.
Skipping Celldex Therapeutics, Inc.
Skipping SIG plc
Skipping Motability Operations Group plc
Skipping BCB Bancorp, Inc.
Skipping 1-800-FLOWERS.COM, INC.
Skipping Weis Markets, Inc.
Skipping Odyssey Group Holdings, Inc.
Skipping Blue Apron Holdings, Inc.
Skipping Ocugen, Inc.
Skipping Rectifier Technologies Ltd
Skipping Structural Monitoring Systems Plc
Skipping Origin Bancorp, Inc.
Skipping Liberty Broadband Corporation
Skipping Incyte Corporation
Skipping Guaranty Bancshares, Inc.
Skipping Infinera Corporation
Skipping MainStreet Bancshares, Inc.
Skipping Starvest plc
Skipping INMUNE BIO INC.
Skipping Bionano Genomics, Inc.
Skipping AstraZen

 41%|████      | 41/100 [1:18:11<1:52:31, 114.44s/it]

Skipping Sonic Automotive, Inc.
Processing Commerzbank


 43%|████▎     | 43/100 [1:41:21<2:24:38, 152.25s/it]

Skipping Seiko Epson Corporation
Processing DiaMedica Therapeutics Inc.


 45%|████▌     | 45/100 [1:48:58<2:25:31, 158.75s/it]

Processing James Halstead plc


 46%|████▌     | 46/100 [1:53:11<2:27:44, 164.16s/it]

Processing Kiniksa Pharmaceuticals, Ltd.


 47%|████▋     | 47/100 [1:59:40<2:40:01, 181.16s/it]

Processing MongoDB, Inc.


 48%|████▊     | 48/100 [2:08:48<3:08:01, 216.94s/it]

Processing Ritchie Bros. Auctioneers Incorporated


 49%|████▉     | 49/100 [2:15:37<3:24:20, 240.41s/it]

Processing Terns Pharmaceuticals, Inc.


 50%|█████     | 50/100 [2:22:50<3:44:12, 269.05s/it]

Processing Insperity, Inc.


 51%|█████     | 51/100 [2:26:43<3:34:34, 262.74s/it]

Processing Bridgewater Bancshares, Inc.


 52%|█████▏    | 52/100 [2:34:34<4:03:32, 304.43s/it]

Processing Empire Company Limited


 53%|█████▎    | 53/100 [2:39:28<3:56:42, 302.19s/it]

Processing Nordic American Tankers Limited


 54%|█████▍    | 54/100 [2:43:33<3:41:04, 288.37s/it]

Processing Microsoft Corporation


 55%|█████▌    | 55/100 [2:48:09<3:33:51, 285.15s/it]

Processing Westwater Resources, Inc.


 56%|█████▌    | 56/100 [2:51:10<3:08:36, 257.19s/it]

Processing Aptiv PLC


 57%|█████▋    | 57/100 [3:00:25<4:03:29, 339.76s/it]

Processing Datalogic


 58%|█████▊    | 58/100 [3:20:51<6:53:35, 590.85s/it]

Processing Rapid7


 59%|█████▉    | 59/100 [3:27:52<6:10:21, 541.98s/it]

Processing Alcoa Corporation


 60%|██████    | 60/100 [3:37:37<6:09:43, 554.60s/it]

Processing NuCana plc


 61%|██████    | 61/100 [3:47:07<6:03:19, 558.96s/it]

Processing Pintec Technology Holdings Limited


 62%|██████▏   | 62/100 [3:59:36<6:29:40, 615.29s/it]

Processing ARCA Biopharma, Inc.


 63%|██████▎   | 63/100 [4:02:39<5:00:16, 486.94s/it]

Processing HCA Healthcare, Inc.


 64%|██████▍   | 64/100 [4:08:37<4:29:07, 448.54s/it]

Processing Armadale Capital Plc


 65%|██████▌   | 65/100 [4:10:27<3:22:36, 347.34s/it]

Processing AA Limited


 66%|██████▌   | 66/100 [4:20:06<3:56:03, 416.56s/it]

Processing CSG Systems International, Inc.


 67%|██████▋   | 67/100 [4:25:39<3:35:22, 391.58s/it]

Processing Arcadia Minerals Limited


 68%|██████▊   | 68/100 [4:29:38<3:04:28, 345.89s/it]

Processing Tellurian Inc.


 69%|██████▉   | 69/100 [4:37:21<3:16:47, 380.88s/it]

Processing Wheeler Real Estate Investment Trust, Inc.


 70%|███████   | 70/100 [4:43:42<3:10:27, 380.90s/it]

Processing archTIS Limited


 71%|███████   | 71/100 [4:53:35<3:34:50, 444.50s/it]

Processing Beazley plc


 72%|███████▏  | 72/100 [5:22:22<6:26:55, 829.11s/it]

Processing Poste Italiane
