The unstructured library aims to simplify and streamline the preprocessing of structured and unstructured documents for downstream tasks. And what that means is no matter where your data is and no matter what format that data is in, Unstructured’s toolkit will transform and preprocess that data into an easily digestable and usable format.

In [85]:
# %pip install "unstructured[all-docs]" unstructured-client watermark

In [86]:
# pip install watermark

In [87]:
# Load the watermark extension
%load_ext watermark 

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [88]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [89]:
# Import JSON display for Jupyter Notebook
from IPython.display import JSON

# Import json module for working with JSON data
import json

# Import UnstructuredClient class and shared models from the unstructured_client module
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared

# Import SDKError class from the unstructured_client.models.errors module
from unstructured_client.models.errors import SDKError

# Import partition_html and partition_pdf functions for partitioning HTML and PDF documents
from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf

# Import dict_to_elements and elements_to_json functions from the unstructured.staging.base module
from unstructured.staging.base import dict_to_elements, elements_to_json

In [90]:
# Display the versions of installed packages
%watermark --iversions 

unstructured       : 0.13.6
unstructured_client: 0.22.0
json               : 2.0.9



In [91]:
# Import the `unstructured.partition` module
import unstructured.partition

# Print the help documentation for the `unstructured.partition` module
help(unstructured.partition)

Help on package unstructured.partition in unstructured:

NAME
    unstructured.partition

PACKAGE CONTENTS
    api
    auto
    common
    csv
    doc
    docx
    email
    epub
    html
    image
    json
    lang
    md
    model_init
    msg
    odt
    org
    pdf
    pdf_image (package)
    ppt
    pptx
    rst
    rtf
    strategies
    text
    text_type
    tsv
    utils (package)
    xlsx
    xml

FILE
    c:\users\divak\anaconda3\envs\myenv\lib\site-packages\unstructured\partition\__init__.py




In [92]:
# Import the `partition_pdf` function from the `unstructured.partition.pdf` module
from unstructured.partition.pdf import partition_pdf

# Specify the path to your PDF file
filename = "data\MINIGPT_5.pdf"

# Call the partition_pdf function
# Returns a List[Element] present in the pages of the parsed pdf document
elements = partition_pdf(filename)

# Now, elements is a list of all elements present in the pages of the parsed pdf document

In [93]:
elements

[<unstructured.documents.elements.Text at 0x14d158a6790>,
 <unstructured.documents.elements.Title at 0x14d15639be0>,
 <unstructured.documents.elements.Text at 0x14d15639cd0>,
 <unstructured.documents.elements.Text at 0x14d15639d90>,
 <unstructured.documents.elements.Title at 0x14d156f85e0>,
 <unstructured.documents.elements.Text at 0x14d15639f70>,
 <unstructured.documents.elements.Title at 0x14d15639b80>,
 <unstructured.documents.elements.Title at 0x14d156f8c70>,
 <unstructured.documents.elements.Title at 0x14d1562f160>,
 <unstructured.documents.elements.Title at 0x14d1562f340>,
 <unstructured.documents.elements.NarrativeText at 0x14d1562f430>,
 <unstructured.documents.elements.Text at 0x14d1562f4c0>,
 <unstructured.documents.elements.Title at 0x14d1562f610>,
 <unstructured.documents.elements.NarrativeText at 0x14d1562f700>,
 <unstructured.documents.elements.NarrativeText at 0x14d1562f7f0>,
 <unstructured.documents.elements.NarrativeText at 0x14d1562f8e0>,
 <unstructured.documents.elem

In [94]:
len(elements)

417

In [95]:
# Convert each element to a dictionary
element_dict = [el.to_dict() for el in elements]
# Dump the list of dictionaries into a JSON string
output = json.dumps(element_dict, indent=2)
# Print the resulting JSON string
print(output)

[
  {
    "type": "UncategorizedText",
    "element_id": "f2aea0911b6c158a5bd75b1fba564c40",
    "text": "4 2 0 2",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.34,
            207.81999999999994
          ],
          [
            16.34,
            247.81999999999994
          ],
          [
            36.34,
            247.81999999999994
          ],
          [
            36.34,
            207.81999999999994
          ]
        ],
        "system": "PixelSpace",
        "layout_width": 612,
        "layout_height": 792
      },
      "file_directory": "data",
      "filename": "MINIGPT_5.pdf",
      "languages": [
        "eng"
      ],
      "last_modified": "2024-05-05T10:57:29",
      "page_number": 1,
      "filetype": "application/pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "71c8fbf6ab2f71ab6175c659f493093f",
    "text": "r a",
    "metadata": {
      "coordinates": {
        "points": [
          [
            16.

In [96]:
# lets investigate a bit more
unstructured.documents.elements

<module 'unstructured.documents.elements' from 'c:\\Users\\divak\\anaconda3\\envs\\myenv\\lib\\site-packages\\unstructured\\documents\\elements.py'>

##### Let's use python sdk 

In [97]:
#pip install python-dotenv

In [98]:
# import os module for operating system dependent functionality
# import load_dotenv and find_dotenv functions from dotenv module
# to specify environment variables in a .env file
import os
from dotenv import load_dotenv, find_dotenv

# Load the .env file
load_dotenv(find_dotenv())

True

In [99]:
# Import os module to access environment variables
import os

# Get the value of saas_api_key_auth environment variable
saas_api_key_auth = os.environ.get('saas_api_key_auth')

# Get the value of saas_server_url environment variable
saas_server_url = os.environ.get('saas_server_url')

In [100]:
# Create an instance of the UnstructuredClient class
# and pass in the API key authentication and server URL
client = UnstructuredClient(
    api_key_auth=saas_api_key_auth,  # Replace this with your actual API key
    server_url=saas_server_url  # Replace this with your actual server URL
)

In [101]:
# Specify the PDF filename
filename = "data\MINIGPT_5.pdf"

# Open the file and read its content
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),  # Read the content of the file
        file_name=filename,  # Set the file name
    )

# This block of code is redundant, as it does the same thing as the previous block
# with open(filename, "rb") as f:
#     files = shared.Files(
#         content=f.read(),
#         file_name=filename,
#     )

# Define the partition parameters using the file
req = shared.PartitionParameters(files=files)

# Try to partition the file, catch and print any SDKError
try:
    resp = client.general.partition(req)
except SDKError as e:
    print(e)

In [102]:
print(resp)

PartitionResponse(content_type='application/json', status_code=200, raw_response=<Response [200]>, elements=[{'type': 'Title', 'element_id': 'd99ad376d3c673278a6c8b90e4facb15', 'text': 'MINIGPT-5: INTERLEAVED VISION-AND-LANGUAGE GENERATION VIA GENERATIVE VOKENS', 'metadata': {'languages': ['eng'], 'page_number': 1, 'filename': 'data\\MINIGPT_5.pdf', 'filetype': 'application/pdf'}}, {'type': 'Title', 'element_id': 'd5d4430ca05ac3791e755e87c1c256d8', 'text': 'Kaizhi Zheng∗, Xuehai He∗ , and Xin Eric Wang', 'metadata': {'languages': ['eng'], 'page_number': 1, 'filename': 'data\\MINIGPT_5.pdf', 'filetype': 'application/pdf'}}, {'type': 'Title', 'element_id': '9d1274315ba7eae03960265c08d5edaa', 'text': 'University of California, Santa Cruz https://github.com/eric-ai-lab/MiniGPT-5', 'metadata': {'languages': ['eng'], 'page_number': 1, 'filename': 'data\\MINIGPT_5.pdf', 'filetype': 'application/pdf'}}, {'type': 'UncategorizedText', 'element_id': '07cd30f6f89754a2c217419880a91514', 'text': '4 

In [103]:
resp.elements

[{'type': 'Title',
  'element_id': 'd99ad376d3c673278a6c8b90e4facb15',
  'text': 'MINIGPT-5: INTERLEAVED VISION-AND-LANGUAGE GENERATION VIA GENERATIVE VOKENS',
  'metadata': {'languages': ['eng'],
   'page_number': 1,
   'filename': 'data\\MINIGPT_5.pdf',
   'filetype': 'application/pdf'}},
 {'type': 'Title',
  'element_id': 'd5d4430ca05ac3791e755e87c1c256d8',
  'text': 'Kaizhi Zheng∗, Xuehai He∗ , and Xin Eric Wang',
  'metadata': {'languages': ['eng'],
   'page_number': 1,
   'filename': 'data\\MINIGPT_5.pdf',
   'filetype': 'application/pdf'}},
 {'type': 'Title',
  'element_id': '9d1274315ba7eae03960265c08d5edaa',
  'text': 'University of California, Santa Cruz https://github.com/eric-ai-lab/MiniGPT-5',
  'metadata': {'languages': ['eng'],
   'page_number': 1,
   'filename': 'data\\MINIGPT_5.pdf',
   'filetype': 'application/pdf'}},
 {'type': 'UncategorizedText',
  'element_id': '07cd30f6f89754a2c217419880a91514',
  'text': '4 2 0 2',
  'metadata': {'languages': ['eng'],
   'page_nu

In [104]:
len(resp.elements)

417

In [105]:
resp.elements[0]

{'type': 'Title',
 'element_id': 'd99ad376d3c673278a6c8b90e4facb15',
 'text': 'MINIGPT-5: INTERLEAVED VISION-AND-LANGUAGE GENERATION VIA GENERATIVE VOKENS',
 'metadata': {'languages': ['eng'],
  'page_number': 1,
  'filename': 'data\\MINIGPT_5.pdf',
  'filetype': 'application/pdf'}}

In [106]:
unique_types = set()

for item in resp.elements:
    unique_types.add(item['type'])

print(unique_types)

{'UncategorizedText', 'ListItem', 'NarrativeText', 'Title', 'Footer'}


In [107]:
# Calling the Unstructured API from the Unstructured open source library
import os

# Load base URL from .env file
base_url = saas_server_url

# Remaining part of the API URL
remaining_url = "/general/v0/general"

# Concatenate base URL with the remaining URL
api_url = base_url + remaining_url

In [108]:
from unstructured.partition.api import partition_via_api

filename = "data\MINIGPT_5.pdf"

elements = partition_via_api(
  filename=filename,
  api_key=saas_api_key_auth,
  api_url=api_url
)

In [109]:
elements

[<unstructured.documents.elements.Title at 0x14d1567c250>,
 <unstructured.documents.elements.Title at 0x14d1567cca0>,
 <unstructured.documents.elements.Title at 0x14d1567cc10>,
 <unstructured.documents.elements.Text at 0x14d1567ce80>,
 <unstructured.documents.elements.Title at 0x14d1567c610>,
 <unstructured.documents.elements.Text at 0x14d156eac40>,
 <unstructured.documents.elements.Text at 0x14d156eaa30>,
 <unstructured.documents.elements.Title at 0x14d156ea700>,
 <unstructured.documents.elements.Text at 0x14d156ea7c0>,
 <unstructured.documents.elements.Title at 0x14d156ea580>,
 <unstructured.documents.elements.NarrativeText at 0x14d156ea4c0>,
 <unstructured.documents.elements.Text at 0x14d156ea6d0>,
 <unstructured.documents.elements.Title at 0x14d156ea970>,
 <unstructured.documents.elements.NarrativeText at 0x14d156eaf10>,
 <unstructured.documents.elements.NarrativeText at 0x14d156ea400>,
 <unstructured.documents.elements.NarrativeText at 0x14d156ea820>,
 <unstructured.documents.elem

In [110]:
len(elements)

417

In [111]:
from unstructured.partition.html import partition_html
import json

# Define the filename
filename = "data/robust_rag.html"

# Partition the HTML file into elements
elements = partition_html(filename=filename)

# Convert each element into a dictionary
element_dict = [el.to_dict() for el in elements]

# Extract the 12th to 15th elements and convert them to a JSON string
example_output = json.dumps(element_dict[11:15], indent=2)

# Print the JSON string
print(example_output)

INFO: Reading document from string ...
INFO: Reading document ...
[
  {
    "type": "Title",
    "element_id": "30087720a972284a598a826227c001b9",
    "text": "\ud83d\udc68\ud83c\udffe\u200d\ud83d\udcbb GitHub \u2b50\ufe0f| \ud83d\udc26 Twitter | \ud83d\udcf9 YouTube | \ud83d\udc54LinkedIn | \u2615\ufe0fKo-fi",
    "metadata": {
      "category_depth": 0,
      "last_modified": "2024-05-06T10:45:09",
      "link_texts": [
        "GitHub",
        "Twitter",
        "YouTube",
        "LinkedIn",
        "Ko-fi"
      ],
      "link_urls": [
        "https://github.com/sudarshan-koirala",
        "https://twitter.com/mesudarshan",
        "https://www.youtube.com/@datasciencebasics",
        "https://www.linkedin.com/in/sudarshan-koirala/",
        "http://ko-fi.com/datasciencebasics"
      ],
      "link_start_indexes": [
        5,
        18,
        30,
        41,
        54
      ],
      "page_number": 1,
      "languages": [
        "eng"
      ],
      "file_directory": "data"

In [112]:
elements

[<unstructured.documents.html.HTMLTitle at 0x14d158ef280>,
 <unstructured.documents.html.HTMLTitle at 0x14d158ef0d0>,
 <unstructured.documents.html.HTMLTitle at 0x14d158efee0>,
 <unstructured.documents.html.HTMLTitle at 0x14d15450280>,
 <unstructured.documents.html.HTMLTitle at 0x14d0e65acd0>,
 <unstructured.documents.html.HTMLNarrativeText at 0x14d0e65ab50>,
 <unstructured.documents.html.HTMLText at 0x14d0e65a130>,
 <unstructured.documents.html.HTMLTitle at 0x14d15638e20>,
 <unstructured.documents.html.HTMLTitle at 0x14d15638580>,
 <unstructured.documents.html.HTMLTitle at 0x14d15638bb0>,
 <unstructured.documents.html.HTMLTitle at 0x14d15638970>,
 <unstructured.documents.html.HTMLTitle at 0x14d15638910>,
 <unstructured.documents.html.HTMLNarrativeText at 0x14d14f7a5b0>,
 <unstructured.documents.html.HTMLNarrativeText at 0x14d154d4610>,
 <unstructured.documents.html.HTMLNarrativeText at 0x14d15638520>,
 <unstructured.documents.html.HTMLListItem at 0x14d15386310>,
 <unstructured.documen

In [113]:
len(elements)

57