In [1]:
from redhat_documentation import RedHatDocumentationLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Documentation page(s) to ingest. The URL is split across adjacent string
# literals purely for readability; Python joins them into one string.
urls = [
    "https://access.redhat.com/documentation/en-us/"
    "red_hat_openshift_ai_self-managed/2-latest/html-single/"
    "installing_and_uninstalling_openshift_ai_self-managed/index",
]

In [2]:
# Load, parse, and transform to Markdown
# Fetch the pages listed in `urls` through the project-local
# RedHatDocumentationLoader, then pass the loaded documents through
# LangChain's Html2TextTransformer.
# NOTE(review): the loader's output is not visible from here; presumably each
# document carries HTML content plus `source`/`title` metadata — confirm
# against the `redhat_documentation` module.
loader = RedHatDocumentationLoader(urls)
docs = loader.load()
html2text = Html2TextTransformer()
# `md_docs` is the input to the header-based split in the next cell.
md_docs = html2text.transform_documents(docs)

In [3]:
# Header-based splitter setup.
# Each tuple pairs a Markdown heading marker ("#", "##", "###") with the
# metadata key ("Header 1" .. "Header 3") under which the heading text is
# recorded on the resulting chunks.
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

# strip_headers=True is passed explicitly; the headings then show up in the
# chunk metadata rather than in the chunk text.
markdown_splitter = MarkdownHeaderTextSplitter(
    strip_headers=True,
    headers_to_split_on=headers_to_split_on,
)

# First pass: cut every document at its Markdown headings and collect all the
# pieces into one flat list.
new_splits = []
for doc in md_docs:
    md_header_splits = markdown_splitter.split_text(doc.page_content)
    for split in md_header_splits:
        # Combine heading metadata with the parent document's metadata; on a
        # key collision the document-level value takes precedence.
        split.metadata = {**split.metadata, **doc.metadata}

    new_splits += md_header_splits

# Second pass: further divide the heading-level sections with a recursive
# character splitter, using the size and overlap configured here.
chunk_size, chunk_overlap = 2048, 256
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Final chunk list; metadata set during the heading pass is read from these
# chunks further down in this cell.
splits = text_splitter.split_documents(new_splits)

# Prefix every chunk with a "Section: <title> / <Header 1> / ..." breadcrumb
# followed by a "Content:" marker, so each chunk states where it came from.
#
# Only metadata keys that are actually present contribute to the breadcrumb.
# A document whose metadata has no 'title' therefore no longer raises
# KeyError; its breadcrumb simply starts at the first available heading.
# When 'title' is present the resulting text is unchanged.
for split in splits:
    present = [
        str(split.metadata[name])
        for name in ("title", "Header 1", "Header 2", "Header 3")
        if name in split.metadata
    ]
    content_header = "Section: " + " / ".join(present) + "\n\nContent:\n"
    # Note: the prefix is added after the character-level split, so the final
    # page_content can be longer than chunk_size.
    split.page_content = content_header + split.page_content

# Show every chunk: its metadata, then its text, then blank lines as a
# separator. A single print call with sep="\n" emits the same text as three
# separate print calls would.
for split in splits:
    print(
        f"Metadata: {split.metadata}",
        f"Page Content:\n-------------\n{split.page_content}",
        "\n",
        sep="\n",
    )

Metadata: {'Header 1': 'Preface', 'source': 'https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2-latest/html-single/installing_and_uninstalling_openshift_ai_self-managed/index', 'title': 'Installing and uninstalling OpenShift AI Self-Managed'}
Page Content:
-------------
Section: Installing and uninstalling OpenShift AI Self-Managed / Preface

Content:
Learn how to use both the OpenShift command-line interface and web console to
install Red Hat OpenShift AI Self-Managed on your OpenShift Container Platform
cluster. To uninstall the product, learn how to use the recommended command-
line interface (CLI) method.  
Note  
Red Hat recommends that you install only one instance of OpenShift AI on your
cluster.  
Installing the Red Hat OpenShift AI Operator on the same cluster as the Red
Hat OpenShift AI Add-on is not recommended or supported.


Metadata: {'Header 1': 'Chapter 1. Architecture of OpenShift AI Self-Managed', 'source': 'https://access.redhat.com/doc