In [8]:
from redhat_documentation import RedHatDocumentationLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Documentation pages to ingest. Only one entry for now: the "html-single"
# build of the OpenShift AI Self-Managed install/uninstall guide.
urls = [
    "https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/"
    "2-latest/html-single/installing_and_uninstalling_openshift_ai_self-managed/index",
]

In [9]:
# Load, parse, and transform to Markdown
#
# Pipeline for this cell:
#   urls -> RedHatDocumentationLoader.load() -> docs
#        -> Html2TextTransformer.transform_documents() -> md_docs
# Later cells iterate over `md_docs` and read `.page_content` and `.metadata`
# on each item (the metadata carries at least 'source' and 'title', as the
# printed splits further down show).
# NOTE(review): RedHatDocumentationLoader comes from the project-local
# `redhat_documentation` module; presumably it fetches each URL over the
# network -- confirm before re-running this cell offline.
loader = RedHatDocumentationLoader(urls)
docs = loader.load()
# Convert the loaded documents' content to Markdown-style text.
html2text = Html2TextTransformer()
md_docs = html2text.transform_documents(docs)

<pre class="programlisting language-yaml">```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: redhat-ods-operator 
```</pre>
<pre class="programlisting language-yaml">```yaml
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: rhods-operator
  namespace: redhat-ods-operator 
```</pre>
<pre class="programlisting language-yaml">```yaml
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: rhods-operator
  namespace: redhat-ods-operator 
spec:
  name: rhods-operator
  channel: stable 
  source: redhat-operators
  sourceNamespace: openshift-marketplace
```</pre>
<pre class="programlisting language-yaml">```yaml
apiVersion: datasciencecluster.opendatahub.io/v1
kind: DataScienceCluster
metadata:
  name: default-dsc
spec:
  components:
    codeflare:
      managementState: Removed
    dashboard:
      managementState: Removed
    datasciencepipelines:
      managementState: Removed
    kserve:
      managementState: Removed
    modelmeshserv

In [10]:
# Print the full converted text of the first transformed document so the
# heading structure ("# Preface", "# Chapter 1. ...") can be inspected before
# splitting on headers.
print(md_docs[0].page_content)

# Preface

Learn how to use both the OpenShift command-line interface and web console to
install Red Hat OpenShift AI Self-Managed on your OpenShift Container Platform
cluster. To uninstall the product, learn how to use the recommended command-
line interface (CLI) method.

Note

Red Hat recommends that you install only one instance of OpenShift AI on your
cluster.

Installing the Red Hat OpenShift AI Operator on the same cluster as the
OpenShift Data Science Add-on is not recommended or supported.

# Chapter 1. Architecture of OpenShift AI Self-Managed

Red Hat OpenShift AI Self-Managed is an Operator that is available on a self-
managed environment, such as Red Hat OpenShift Container Platform.

OpenShift AI integrates the following components and services:

  * At the service layer: 

-> OpenShift AI dashboard
     A customer-facing dashboard that shows available and installed applications for the OpenShift AI environment as well as learning resources such as tutorials, quick starts

In [18]:
# Split the Markdown into sections based on headers
#
# Each (marker, label) pair tells the splitter which Markdown heading prefix
# to split on and under which metadata key to record the heading text.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
# Metadata keys in heading order, derived from the list above so the
# breadcrumb built below cannot drift out of sync with the splitter config.
header_names = [label for (_marker, label) in headers_to_split_on]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
)

# all_splits: one entry per header-delimited section, across all documents.
all_splits = []
for doc in md_docs:
    md_header_splits = markdown_splitter.split_text(doc.page_content)
    for split in md_header_splits:
        # Merge document-level metadata into the header metadata produced by
        # the splitter; on a key clash the document's value wins (right-hand
        # operand of `|`).
        split.metadata = split.metadata | doc.metadata
        # Build a breadcrumb such as "Section: <title> \ <Header 1> \ <Header 2>"
        # and prepend it to the section text.
        content_header = f"Section: {split.metadata['title']}"
        for header_name in header_names:
            if header_name in split.metadata:
                # "\\" is a literal backslash; the previous " \ " relied on an
                # invalid escape sequence, which Python warns about.
                content_header += f" \\ {split.metadata[header_name]}"
        content_header += "\n\nContent:\n"
        split.page_content = content_header + split.page_content
    all_splits.extend(md_header_splits)

In [19]:
# Preview each header-level section: its metadata followed by the first
# 500 characters of its text, with blank lines separating the entries.
for split in all_splits:
    preview = split.page_content[:500]
    print(
        f"Metadata: {split.metadata}",
        f"Page Content:\n-------------\n{preview}",
        "\n",
        sep="\n",
    )

Metadata: {'Header 1': 'Preface', 'source': 'https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2-latest/html-single/installing_and_uninstalling_openshift_ai_self-managed/index', 'title': 'Installing and uninstalling OpenShift AI Self-Managed'}
Page Content:
-------------
Section: Installing and uninstalling OpenShift AI Self-Managed \ Preface

Content:
Learn how to use both the OpenShift command-line interface and web console to
install Red Hat OpenShift AI Self-Managed on your OpenShift Container Platform
cluster. To uninstall the product, learn how to use the recommended command-
line interface (CLI) method.  
Note  
Red Hat recommends that you install only one instance of OpenShift AI on your
cluster.  
Installing the Red Hat OpenShift AI Operator on the same


Metadata: {'Header 1': 'Chapter 1. Architecture of OpenShift AI Self-Managed', 'source': 'https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2-latest/html-single/inst

In [24]:
# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Redo the header split from the Markdown documents instead of reusing
# `all_splits`: those sections already have the "Section: ..." prefix
# prepended to their text, and the prefix should not be chunked here.
new_splits = []
for doc in md_docs:
    md_header_splits = markdown_splitter.split_text(doc.page_content)
    for split in md_header_splits:
        # Merge document-level metadata into the header metadata; on a key
        # clash the document's value wins.
        split.metadata = split.metadata | doc.metadata
    new_splits.extend(md_header_splits)

chunk_size = 1000
chunk_overlap = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
splits = text_splitter.split_documents(new_splits)

# Prepend the breadcrumb prefix to every chunk. This happens after the
# character-level split, so the final page_content can be longer than
# `chunk_size` by the length of the prefix.
for split in splits:
    content_header = f"Section: {split.metadata['title']}"
    for header_name in ["Header 1", "Header 2", "Header 3"]:
        if header_name in split.metadata:
            # "\\" is a literal backslash; the previous " \ " relied on an
            # invalid escape sequence, which Python warns about.
            content_header += f" \\ {split.metadata[header_name]}"
    content_header += "\n\nContent:\n"
    split.page_content = content_header + split.page_content

# Preview each chunk: metadata plus up to the first 2000 characters of text.
for split in splits:
    print(f"Metadata: {split.metadata}")
    print(f"Page Content:\n-------------\n{split.page_content[:2000]}")
    print("\n")

Metadata: {'Header 1': 'Preface', 'source': 'https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2-latest/html-single/installing_and_uninstalling_openshift_ai_self-managed/index', 'title': 'Installing and uninstalling OpenShift AI Self-Managed'}
Page Content:
-------------
Section: Installing and uninstalling OpenShift AI Self-Managed \ Preface

Content:
Learn how to use both the OpenShift command-line interface and web console to
install Red Hat OpenShift AI Self-Managed on your OpenShift Container Platform
cluster. To uninstall the product, learn how to use the recommended command-
line interface (CLI) method.  
Note  
Red Hat recommends that you install only one instance of OpenShift AI on your
cluster.  
Installing the Red Hat OpenShift AI Operator on the same cluster as the
OpenShift Data Science Add-on is not recommended or supported.


Metadata: {'Header 1': 'Chapter 1. Architecture of OpenShift AI Self-Managed', 'source': 'https://access.redhat.com/d