In [None]:
! pip install "unstructured[all-docs]"

# HTML

Test loading HTML files w/ `Unstructured` `partition_html`.

## Load 

Extract elements using `Unstructured`.

In [None]:
import pandas as pd
from lxml import html
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.html import partition_html

class Element(BaseModel):
    type: str
    text: Any

In [35]:
# Get elements
url = "https://en.wikipedia.org/wiki/List_of_Academy_Awards_ceremonies"
raw_elements = partition_html(url=url)

# Categorize by type
categorized_elements = []
for element in raw_elements:
    if "unstructured.documents.html.HTMLTable" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    else:
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]

# Table elements are nicely extracted 
print(len(table_elements))
print(len(text_elements))

6
101


In [36]:
text_elements[20]

Element(type='text', text='Read')

In [27]:
table_elements[2]

Element(type='table', text='Host\n \n Number of ceremonies\n \n Bob Hope \n 19\n \n Billy Crystal \n 9\n \n Johnny Carson \n 5\n \n Whoopi Goldberg \n 4\n \n Jack Lemmon \n \n Jimmy Kimmel \n 3\n \n Jerry Lewis \n \n Steve Martin \n \n Conrad Nagel \n \n David Niven \n \n Jack Benny \n 2\n \n Chevy Chase \n \n Sammy Davis Jr. \n \n Ellen DeGeneres \n \n Jane Fonda \n \n Goldie Hawn \n \n Walter Matthau \n \n Richard Pryor \n \n Chris Rock \n \n Frank Sinatra \n \n James Stewart \n \n Jon Stewart')

In [37]:
# Get elements
raw_elements = partition_html(url=url,
                             chunking_strategy="by_title")

# Categorize by type
categorized_elements = []
for element in raw_elements:
    if "unstructured.documents.html.HTMLTable" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    else:
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]

# Table elements are nicely extracted 
print(len(table_elements))
print(len(text_elements))

0
265


### Pain Points 

* We want text splits by section of document
* `chunking_strategy="by_title"` does not appear to work as expected

### Summarize

Here, we use [multi-vector retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector).

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [81]:
# Function call to summarize tables 
functions = [
    {
      "name": "parse_table",
      "description": "Summarize table and determine whether to keep",
      "parameters": {
        "type": "object",
        "properties": {
          "summary": {
            "type": "string",
            "description": "A summary of the table"
          },
          "keep": {
            "type": "boolean",
            "description": "Whether to keep the table"
          }
        },
        "required": ["summary", "keep"]
      }
    }
  ]

# Prompt 
prompt_text="""You are an assistant tasked with summarizing tables and filtering out "tables" that are not useful to keep. \
For instance, the table does contain any useful information. Give a concise summary of the table and output whether or not 
the table should be kept. Table: {table} """

# Summary chain 
model = ChatOpenAI(temperature=0,model="gpt-4")
prompt = ChatPromptTemplate.from_template(prompt_text) 
table_summarize_chain = {"table": lambda x:x} | prompt | model.bind(function_call= {"name": "parse_table"}, functions= functions) | JsonOutputFunctionsParser()

# Apply to our tables
tables = [str(i.element) for i in table_elements[0:10]]
table_summaries = table_summarize_chain.batch(tables, {"max_concurrency": 5})

In [None]:
# Summarize sections

# Prompt 
prompt_text="""You are an assistant tasked with summarizing sections of a document. Section: {section} """

# Summary chain 
model = ChatOpenAI(temperature=0,model="gpt-4")
prompt = ChatPromptTemplate.from_template(prompt_text) 
section_summarize_chain = {"section": lambda x:x} | prompt | model 

# *** Ideally document sections can be extracted w/ Unstructured ***
section_summarize_chain = xxx
section_summaries = section_summarize_chain.batch(sections, {"max_concurrency": 5})

In [None]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.embeddings import OpenAIEmbeddings

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings()
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

# Add tables 
doc_ids = [str(uuid.uuid4()) for _ in tables]
retriever.vectorstore.add_documents(table_summaries)
retriever.docstore.mset(list(zip(doc_ids, tables)))

# Add section summaries 
### split and add 

# PDF 

Paper (https://arxiv.org/pdf/2307.09288.pdf)

In [None]:
! brew install python-poppler tesseract

In [129]:
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf

class Element(BaseModel):
    type: str
    text: Any

# Get elements
raw_pdf_elements = partition_pdf(filename="/Users/rlm/Desktop/2307.09288.pdf",
                                 chunking_strategy="by_title",
                                 infer_table_structure=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [131]:
unique_categories = {str(type(element)) for element in raw_pdf_elements}
unique_categories

{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Footer'>",
 "<class 'unstructured.documents.elements.Formula'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [132]:
# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    else:
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]

# Table elements are nicely extracted 
len(table_elements)

48

## LangChain integration

We are using `partition`.

```
from unstructured.partition.auto import partition
return partition(filename=self.file_path, **self.unstructured_kwargs)
```

In [29]:
from langchain.document_loaders import UnstructuredFileLoader
loader = UnstructuredFileLoader(url=url,
                                mode="elements",
                                skip_infer_table_types=[])

TypeError: __init__() missing 1 required positional argument: 'file_path'

In [112]:
loader = UnstructuredFileLoader("/Users/rlm/Desktop/2307.09288.pdf",
                                mode="elements",
                                skip_infer_table_types=[])
docs = loader.load()

In [114]:
docs[60]

Document(page_content='. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .', metadata={'source': '/Users/rlm/Desktop/2307.09288.pdf', 'coordinates': {'points': ((253.81755, 265.77340000000015), (253.81755, 275.7360000000001), (517.8164873999997, 275.7360000000001), (517.8164873999997, 265.77340000000015)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': '2307.09288.pdf', 'file_directory': '/Users/rlm/Desktop', 'last_modified': '2023-10-09T12:51:46', 'filetype': 'application/pdf', 'parent_id': '611a920068bf97e03736f9e31960eed0', 'page_number': 2, 'links': [], 'category': 'UncategorizedText'})

In [117]:
# Collect unique categories in a set
unique_categories = {doc.metadata['category'] for doc in docs if 'category' in doc.metadata}
# Print out the unique categories
print(unique_categories)

{'Title', 'ListItem', 'UncategorizedText', 'NarrativeText'}


In [118]:
loader = UnstructuredFileLoader("tesla_2021_10k.htm", mode="elements")
docs_html = loader.load()

In [119]:
# Collect unique categories in a set
unique_categories = {doc.metadata['category'] for doc in docs_html if 'category' in doc.metadata}
# Print out the unique categories
print(unique_categories)

{'ListItem', 'UncategorizedText', 'Title', 'Table', 'NarrativeText'}
