In [1]:
# !pip install -r requirements.txt

In [2]:
import os
import re
from typing import Iterable, Literal

import requests
from bs4 import BeautifulSoup, Comment, NavigableString, PageElement, Tag
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.utils import convert_files_to_docs

# html2text

A helper function that can convert a page element from Beautiful Soup 4 to text.

In [3]:
# Rendering modes accepted by the converters below: "block" (the default),
# "line" (newline delimiters are flattened to spaces) and "pre" (string nodes
# are returned verbatim, without whitespace collapsing).
Mode = Literal["block", "line", "pre"]
# Every separator a node may request before or after its text.
Delimiter = Literal["", " ", " - ", "\n", "\n\n"]

# Matches any run of whitespace; used to collapse such runs to a single space.
spaces_re = re.compile(r"\s+")


def max_delimiter(a: Delimiter, b: Delimiter) -> Delimiter:
    """Return whichever of the two delimiters is the stronger one.

    Strength order, strongest first: blank line, newline, " - ", single space.
    If neither argument is one of those, the empty delimiter is returned.
    """
    by_strength = ("\n\n", "\n", " - ", " ")
    return next((candidate for candidate in by_strength if candidate in (a, b)), "")


def _element_to_text(content: PageElement, *, mode: Mode = "block") -> tuple[Delimiter, str, Delimiter]:
    """Convert a single Beautiful Soup node to ``(prefix, text, suffix)``.

    ``prefix`` and ``suffix`` are the delimiters this node asks for before and
    after its text; the caller is expected to merge them with the delimiters of
    neighbouring nodes.

    Args:
        content: The node to convert (a string node or a tag).
        mode: ``"pre"`` returns string nodes verbatim; any other mode collapses
            whitespace runs in string nodes to single spaces. The mode is also
            forwarded to ``_elements_to_text`` when descending into children
            (headings, list items and table rows/cells always descend in
            ``"line"`` mode).

    Returns:
        The ``(prefix, text, suffix)`` triple for the node.

    Raises:
        NotImplementedError: If ``content`` is neither a string node nor a tag.
    """
    # Comment is a NavigableString subclass, so it has to be filtered out
    # before the generic string branch or its body would leak into the text.
    if isinstance(content, Comment):
        return "", "", ""

    if isinstance(content, NavigableString):
        s = str(content)
        if mode == "pre":
            return "", s, ""
        s = spaces_re.sub(" ", s)
        # Leading/trailing whitespace is reported as a delimiter rather than
        # kept in the text, so the caller can merge it with adjacent ones.
        prefix: Delimiter = ""
        if s.startswith(" "):
            prefix = " "
            s = s[1:]
        suffix: Delimiter = ""
        if s.endswith(" "):
            suffix = " "
            s = s[:-1]
        return prefix, s, suffix

    if isinstance(content, Tag):
        name = content.name

        # Script and stylesheet bodies are not document text.
        if name in ("script", "style"):
            return "", "", ""

        # A line break has no children; without an explicit delimiter the
        # words on either side of it would be glued together.
        if name == "br":
            return "\n", "", "\n"

        # Plain block containers: keep the current mode, separate by a blank line.
        if name in ("div", "p", "table", "tbody", "ul", "ol"):
            _, text, _ = _elements_to_text(content.children, mode=mode)
            return "\n\n", text, "\n\n"

        if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            _, text, _ = _elements_to_text(content.children, mode="line")
            # The digit in the tag name becomes the number of leading '#'.
            text = "#" * int(name[1]) + " " + text
            return "\n\n", text, "\n\n"

        if name == "pre":
            # Inside a single-line context the preformatted block stays on that line.
            _, text, _ = _elements_to_text(content.children, mode="pre" if mode != "line" else "line")
            return "\n\n", text, "\n\n"

        if name == "li":
            _, text, _ = _elements_to_text(content.children, mode="line")
            return "\n", " *  " + text, "\n"

        if name == "tr":
            _, text, _ = _elements_to_text(content.children, mode="line")
            return "\n\n", text, "\n\n"

        if name in ("td", "th"):
            _, text, _ = _elements_to_text(content.children, mode="line")
            return " - ", text, " - "

        # Any other tag is transparent: its children decide the delimiters.
        return _elements_to_text(content.children, mode=mode)

    raise NotImplementedError(f"Cannot convert {type(content)} to text")


def _elements_to_text(children: Iterable[PageElement], *, mode: Mode) -> tuple[Delimiter, str, Delimiter]:
    """Convert a sequence of sibling nodes to one ``(prefix, text, suffix)`` triple.

    Each child is converted with ``_element_to_text``. Between two children
    that produced text, the stronger of the adjacent delimiters (as chosen by
    ``max_delimiter``) is inserted. Children that produce no text only
    contribute their delimiters. In ``"line"`` mode any child delimiter
    containing a newline is downgraded to a single space.

    Returns:
        The delimiter pending before the first text, the joined text, and the
        delimiter still pending after the last child.
    """
    single_line = mode == "line"
    leading: Delimiter = ""
    pending: Delimiter = ""
    pieces: list[str] = []

    for node in children:
        before, body, after = _element_to_text(node, mode=mode)
        if single_line:
            if "\n" in before:
                before = " "
            if "\n" in after:
                after = " "

        pending = max_delimiter(pending, before)
        if body:
            if pieces:
                pieces.append(pending)
            else:
                # Nothing emitted yet: the accumulated delimiter is the outer prefix.
                leading = pending
            pieces.append(body)
            pending = after
        else:
            # Empty child: fold its trailing delimiter into the pending one.
            pending = max_delimiter(pending, after)

    return leading, "".join(pieces), pending


def html2text(content: PageElement) -> str:
    """Return the text rendering of ``content``.

    The node is converted with ``_element_to_text`` in its default mode; the
    outer prefix and suffix delimiters are discarded.
    """
    return _element_to_text(content)[1]

# Download docs and create .txt files in ./data

In [4]:
# Fetch each Project Quay guide and store the text of its #content element
# as ./data/<name>.txt.
os.makedirs("./data", exist_ok=True)

for name in (
    "deploy_quay_on_openshift_op_tng",
    "deploy_quay",
    "deploy_quay_ha",
    "config_quay",
    "manage_quay",
    "upgrade_quay",
    "use_quay",
    "api_quay",
):
    # requests has no default timeout; without one a stalled connection
    # would block this cell forever.
    response = requests.get(f"https://docs.projectquay.io/{name}.html", timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")
    content = soup.find(id="content")
    # Fail with a message naming the page if its layout no longer has #content.
    assert content is not None, f"{name}.html has no element with id='content'"
    text = html2text(content)
    with open(f"./data/{name}.txt", "w") as out_file:
        out_file.write(text)

# Document Store

In [5]:
# Load the .txt files written to ./data as Haystack documents.
# NOTE(review): split_paragraphs=True presumably yields one document per
# paragraph rather than one per file — confirm against the Haystack docs.
documents = convert_files_to_docs(dir_path="./data", split_paragraphs=True)

# Keep all documents in an in-memory store for this notebook session.
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)

# Retriever backed by a sentence-transformers embedding model.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)
# Embed the stored documents with the retriever's model so they can be searched.
document_store.update_embeddings(retriever)

  return self.fget.__get__(instance, owner)()


Updating Embedding:   0%|          | 0/3273 [00:00<?, ? docs/s]

Batches:   0%|          | 0/103 [00:00<?, ?it/s]

# Test it

In [6]:
# Show the ten best-matching documents for a sample question, each framed by
# a 78-character rule and preceded by its match score.
docs = retriever.retrieve(query="What values are supported for AUTHENTICATION_TYPE?", top_k=10)
for doc in docs:
    rule = "#" * 78
    print("", rule, f"# Match score: {doc.score}", doc.content, rule, sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


##############################################################################
# Match score: 0.5689518921587217
AUTHENTICATION_TYPE (Required) - String - The authentication engine to use for credential authentication. Values: One of Database, LDAP, JWT, Keystone, OIDC Default: Database
##############################################################################

##############################################################################
# Match score: 0.5685617067657014
 *  auth_kind: The type of auth used, including:  *  basic  *  oauth  *  credentials
 *  success: true or false
##############################################################################

##############################################################################
# Match score: 0.5671045362909856
AUTHENTICATION_TYPE: LDAP
##############################################################################

##############################################################################
# Match score: 0.55874825732

In [7]:
from haystack.nodes import FARMReader

# Reader loaded from the deepset/roberta-base-squad2 checkpoint; it is
# combined with the retriever in the question-answering pipeline below.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

  return self.fget.__get__(instance, owner)()


In [8]:
from haystack.pipelines import ExtractiveQAPipeline

# Pipeline built from the reader and the retriever created in earlier cells.
pipe = ExtractiveQAPipeline(reader, retriever)

In [9]:
from haystack.utils import print_answers

# Ask a sample question: the retriever node is limited to 10 results and the
# reader node to 5.
prediction = pipe.run(
    query="What authentication engines can be used?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

print_answers(
    prediction,
    details="minimum"  # choose from `minimum`, `medium`, and `all`
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]


Query: What authentication engines can be used?
Answers:
[   {   'answer': 'Database, LDAP, JWT, Keystone, OIDC',
        'context': 'uired) - String - The authentication engine to use for '
                   'credential authentication. Values: One of Database, LDAP, '
                   'JWT, Keystone, OIDC Default: Database'},
    {'answer': 'Google', 'context': '#### Google authentication'},
    {   'answer': 'Hardware and KMS Signing',
        'context': ' *  Hardware and KMS Signing\n'
                   ' *  Bring-your-own PKI\n'
                   ' *  OIDC PKI\n'
                   ' *  Built-in binary transparency and timestamping service'},
    {'answer': 'Registry', 'context': '#### Registry authentication'},
    {   'answer': 'External',
        'context': '### External authentication (OAUTH) configuration'}]
