In [2]:
import unstructured_client
from unstructured_client.models import operations, shared
from dotenv import load_dotenv
import os

# Partition a local PDF through the Unstructured API and print a short preview
# of the returned elements.
#
# Credentials and the server URL are read from a .env file so they never appear
# in the notebook itself.
load_dotenv()

# os.getenv returns None for an unset variable; both values are handed to the
# client unchanged.
unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
url_unstructured = os.getenv("UNSTRUCTURED_URL")

client = unstructured_client.UnstructuredClient(
    api_key_auth=unstructured_api_key,
    server_url=url_unstructured,
)

# Read the whole document into memory; the raw bytes are sent in the request.
filename = "data/attention-is-all-you-need.pdf"
with open(filename, "rb") as f:
    data = f.read()

req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=data,
            file_name=filename,
        ),
        # --- Other partition parameters ---
        strategy=shared.Strategy.AUTO,
        languages=['eng'],
    ),
)

# Number of leading elements printed as a preview of the result.
preview_count = 3

try:
    # Only the API call is guarded, so a problem while displaying the result
    # is never misreported as a request failure.
    res = client.general.partition(request=req)
except Exception as e:
    # Best-effort demo cell: report the failure instead of raising. The
    # exception type is included because the bare message alone is often
    # ambiguous.
    print(f"{type(e).__name__}: {e}")
else:
    # Slice instead of indexing [0], [1], [2]: a document that yields fewer
    # than three elements (or no element list at all) no longer raises.
    for element in (res.elements or [])[:preview_count]:
        print(element)


INFO: Preparing to split document for partition.
INFO: Starting page number set to 1


INFO: Concurrency level set to 5
INFO: Determined optimal split size of 3 pages.
INFO: Document split into 4, 3-paged sets.
INFO: Partitioning 4, 3-paged sets.
INFO: Partitioning set #1 (pages 1-4).
INFO: Partitioning set #2 (pages 4-7).
INFO: Partitioning set #3 (pages 7-10).
INFO: Partitioning set #4 (pages 10-11).
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: Successfully partitioned set #1, elements added to the final result.
INFO: Successfully partitioned set #2, elements added to the final result.
INFO: Successfully partitioned set #3, elements added to the final result.
INFO: Successfully partitioned set #4, elements added to the final result.
INFO: Successfully partitioned the document.


{'type': 'Title', 'element_id': '237531ab021355e47d29794149af6a94', 'text': 'Attention Is All You Need', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'attention-is-all-you-need.pdf'}}
{'type': 'NarrativeText', 'element_id': 'd4e57c05e627d81797276d0bb0a03faa', 'text': 'Ashish Vaswani∗ Google Brain avaswani@google.com', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '237531ab021355e47d29794149af6a94', 'filename': 'attention-is-all-you-need.pdf'}}
{'type': 'Title', 'element_id': '4cf8b15c81cb51aa0ae3135b22eb265b', 'text': 'Llion Jones∗ Google Research llion@google.com', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'attention-is-all-you-need.pdf'}}


In [None]:
# Reference copy of the first three elements printed by the partition cell
# above (Title, NarrativeText, Title), each a dict with 'type', 'element_id',
# 'text' and 'metadata' keys.
# NOTE(review): this looks like pasted output kept for reference rather than
# code meant to be run; the literals are not assigned to anything. Consider
# moving it to a markdown cell.
{'type': 'Title', 'element_id': '237531ab021355e47d29794149af6a94', 'text': 'Attention Is All You Need', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'attention-is-all-you-need.pdf'}}

{'type': 'NarrativeText', 'element_id': 'd4e57c05e627d81797276d0bb0a03faa', 'text': 'Ashish Vaswani∗ Google Brain avaswani@google.com', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'parent_id': '237531ab021355e47d29794149af6a94', 'filename': 'attention-is-all-you-need.pdf'}}

{'type': 'Title', 'element_id': '4cf8b15c81cb51aa0ae3135b22eb265b', 'text': 'Llion Jones∗ Google Research llion@google.com', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'attention-is-all-you-need.pdf'}}