# Set up notebook


In [48]:
# Warning control: silence every Python warning for the rest of the session.
# This runs before the heavier imports below so their import-time warnings are hidden too.
import warnings

warnings.filterwarnings("ignore")
import logging

# Optional: uncomment to raise the root logger's level to CRITICAL as well.
# logger = logging.getLogger()
# logger.setLevel(logging.CRITICAL)

import nltk

# Download the NLTK data packages used in this notebook; nltk reports
# "already up-to-date" when they are present locally.
# NOTE(review): presumably required by unstructured's partitioning -- confirm.
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
import collections
from io import StringIO
from lxml import etree

[nltk_data] Downloading package punkt_tab to /Users/ducdo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/ducdo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [47]:
import json
from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title

from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements
from unstructured_client.models import operations
import chromadb

from pathlib import Path
import pickle
import os
from dotenv import load_dotenv
from IPython.core.display import HTML


In [11]:
# Load variables from a local .env file into the environment, then read the
# Unstructured API key from it (None when the variable is not set).
load_dotenv()
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY")

In [12]:
import unstructured_client

# Module-level API client; partition_pdf_dld() sends its requests through it.
client = unstructured_client.UnstructuredClient(
    server_url="https://api.unstructuredapp.io",
    api_key_auth=UNSTRUCTURED_API_KEY,
)


# Utility functions


In [None]:
def partition_pdf_dld(
    filename,
    skip_infer_table_types=None,
    pdf_infer_table_structure=True,
):
    """
    Partitions a PDF file into its constituent elements using the Unstructured API.

    The file is read from disk and sent to the API through the module-level
    ``client`` with the "hi_res" strategy and the "yolox" layout model. The
    response is converted with ``dict_to_elements`` before being returned.

    Args:
        filename (str | os.PathLike): The path to the PDF file to be partitioned.
        skip_infer_table_types (list[str], optional): A list of table types to skip
            during table structure inference. Defaults to None, which is treated as [].
        pdf_infer_table_structure (bool, optional): A flag indicating whether to infer
            table structures within the PDF. Defaults to True.

    Returns:
        list: The elements produced by ``dict_to_elements`` from the API response.
        An empty list is returned when the API call fails with ``SDKError``
        (the error is printed).
    """
    # Avoid a shared mutable default: build a fresh list for every call.
    if skip_infer_table_types is None:
        skip_infer_table_types = []

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            # str() so a pathlib.Path can be passed as well as a plain string.
            file_name=str(filename),
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="hi_res",
        hi_res_model_name="yolox",
        pdf_infer_table_structure=pdf_infer_table_structure,
        skip_infer_table_types=skip_infer_table_types,
    )

    # Bind the result up front so a failed request cannot leave it undefined
    # (previously the final return raised UnboundLocalError after an SDKError).
    dld_elements = []
    try:
        resp = client.general.partition(req)
        dld_elements = dict_to_elements(resp.elements)
    except SDKError as e:
        print(e)
    return dld_elements


In [57]:
def parse_and_display_table_html(
    table_html: str, print_html: bool = False, display_html: bool = True
) -> None:
    """
    Parse an HTML table string and optionally print and/or render it.

    The markup is always parsed with lxml's XML parser (blank text removed),
    regardless of the two flags.

    Args:
        table_html (str): The HTML string representation of the table.
        print_html (bool, optional): When True, print the pretty-printed
            serialization of the parsed tree. Defaults to False.
        display_html (bool, optional): When True, render the original string
            as HTML in the notebook output. Defaults to True.

    Returns:
        None
    """
    xml_parser = etree.XMLParser(remove_blank_text=True)
    parsed_table = etree.parse(StringIO(table_html), xml_parser)

    if print_html:
        pretty_markup = etree.tostring(parsed_table, pretty_print=True)
        print(pretty_markup.decode())

    if display_html:
        # `display` is the notebook's built-in rich-output function.
        display(HTML(table_html))

# File paths


In [6]:
# PDFs live in <two levels above the working directory>/data/pdf.
file_dir = Path.cwd().parents[1].joinpath("data", "pdf")
doc1 = file_dir.joinpath("DOA-vs-VKA-blood-clots.pdf")
doc2 = file_dir.joinpath("fimmu-14-1145145.pdf")


# Extract elements using different strategies


## 1) Fast strategy -- only text extraction from a text-based PDF, then content normalization


In [7]:
# Partition the first document locally with the "fast" strategy.
doc1_pdf_elements = partition_pdf(strategy="fast", filename=doc1)


In [34]:
# Count how many elements of each category the fast strategy produced.
doc1_pdf_elements_categories = [
    element.category for element in doc1_pdf_elements
]
collections.Counter(doc1_pdf_elements_categories)


Counter({'NarrativeText': 143,
         'UncategorizedText': 121,
         'Title': 114,
         'ListItem': 52})

## 2) Document-layout-detection (DLD) strategy -- extract text and layout from a mixed-format PDF


In [49]:
# Show the string form of the path; this is the value passed to partition_pdf_dld below.
str(doc1)


'/Users/ducdo/Repo/Demo/data/pdf/DOA-vs-VKA-blood-clots.pdf'

In [17]:
# Partition the first document through the API helper, using its default options.
doc1_dld_elements = partition_pdf_dld(str(doc1))


In [46]:
# NOTE(review): these keyword values equal partition_pdf_dld's defaults, so this
# call repeats the request already made for `doc1_dld_elements` -- confirm
# whether one of the two calls was meant to use different options.
doc1_dld_elements_with_infer_table_structure = partition_pdf_dld(
    filename=str(doc1),
    pdf_infer_table_structure=True,
    skip_infer_table_types=[],
)

# Persist the partitioned elements in the current working directory.
with open("doc1_dld_elements.pkl", "wb") as f:
    pickle.dump(doc1_dld_elements_with_infer_table_structure, f)

In [53]:
# Tally element categories for both DLD runs and print them one after the other.
doc1_dld_elements_categories = [
    element.category for element in doc1_dld_elements
]
doc1_dld_elements_with_infer_table_structure_categories = [
    element.category
    for element in doc1_dld_elements_with_infer_table_structure
]
print(collections.Counter(doc1_dld_elements_categories))
print(collections.Counter(doc1_dld_elements_with_infer_table_structure_categories))


Counter({'Title': 437, 'UncategorizedText': 125, 'ListItem': 45, 'NarrativeText': 44, 'Header': 14, 'Footer': 4, 'FigureCaption': 1, 'Table': 1})
Counter({'Title': 437, 'UncategorizedText': 125, 'ListItem': 45, 'NarrativeText': 44, 'Header': 14, 'Footer': 4, 'FigureCaption': 1, 'Table': 1})


In [58]:
# Keep only the elements categorised as tables, then render the first one's HTML.
doc1_tables = [
    element for element in doc1_dld_elements if element.category == "Table"
]
parse_and_display_table_html(table_html=doc1_tables[0].metadata.text_as_html)

Clinical Guidelines,Recommendation
"European Alliance of Associations for Rheumatology (EULEAR), 2019 (5)","For patients with venous thrombosis, indefinite anticoagulation is recommended. DOACs may be an alternative in patients unable to achieve target INR with VKAs, or intolerant to VKAs. Rivaroxaban should not be used in triple positive patients due to the increased risk of recurrent thrombosis."
"European Alliance of Associations for Rheumatology (EULEAR), 2019 (5)","For arterial events, indefinite anticoagulation is also recommended, avoiding the use of DOACs"
"European Society of Cardiology (ESC), 2019 (35)",Indefinite treatment with VK As is recommended. DOACs are not recommended.
"American Society of Hematology (ASH), 2020 (36)",Indefinite anticoagulation with VK As is recommended. The use of DOACs is discouraged
"British Society for Haematology (BSH), 2020 (37)","For patients with venous thrombosis, indefinite anticoagulation is recommended. DOACs should not be used in triple-positive patients. Evidence i insufficient to establish recommendations in single or double positive patients. In general, it is suggested to avoid them; however, if patients are already being treated with DOACs, they may be continued depending on the clinical profile and patient preferences."
"British Society for Haematology (BSH), 2020 (37)","In patients with arterial thrombosis, indefinite treatment with VKAs is recommended. DOACs are not recommended."
"National Institute for Health and Care Excellence (NICE), 2020 (38)","'VKAs are recommended in triple positive patients,"
"International Society on Thrombosis and Haemostasis (ISTH), 2020 (39)","In patients with high-risk thrombotic APS*, VKAs are recommended. In patients with APS without high-risk criteria who are already on DOACs therapy, it may be maintained depending on the clinical profile and patient preference."
"American College of Cardiology (ACC), 2024 (40)",DOAC:s are not considered standard treatment in patients with APS.


In [59]:
# Partition the second document through the API helper with table-structure inference on.
doc2_dld_elements_with_infer_table_structure = partition_pdf_dld(
    filename=str(doc2),
    pdf_infer_table_structure=True,
    skip_infer_table_types=[],
)

# Keep only the elements categorised as tables, then render the first one's HTML.
doc2_tables = [
    element
    for element in doc2_dld_elements_with_infer_table_structure
    if element.category == "Table"
]
parse_and_display_table_html(table_html=doc2_tables[0].metadata.text_as_html)
