In [None]:
from bs4 import BeautifulSoup
import regex as re
import xml.etree.ElementTree as ET

Usually the hardest part of extracting information from a document is to get the text out of it in a way that is useful.

The snippets below give a starting point (an Ansatz) for extracting information from different types of documents.

In [None]:
"""
Extracting text from a PDF

This uses tika (Java required) to extract the text from a PDF.
Usually it's smart to manually check the output before feeding it into the RAG preprocessor.

Known problems:
- PDFs with double columns
- Tables
- Formulas (e.g. x^2 often becomes something like x2)
"""
import tika
# Must run before the parser import below; presumably starts the Java VM
# that tika needs (see "java required" in the cell description) — confirm.
tika.initVM()
from tika import parser
# NOTE(review): xmlContent=True presumably asks tika for XHTML-structured
# output instead of plain text — confirm against the tika-python docs.
parsed = parser.from_file("/home/finn/Downloads/PEER_final.pdf", xmlContent=True)
# The extracted document body is read from the "content" key of the result.
content = parsed["content"]
# Print the raw extraction so it can be inspected by hand.
print(content)

In [None]:
"""
Extracting text from HTML

There is no general way to extract text from HTML.
The function below works if the HTML has a clear header-based structure.
With it you can directly get the JSON for input into the RAG database.

Usually this won't work; in that case, use the RAG preprocessor and do some filtering beforehand.
"""


def html_to_entities(html_content, source="NOT SPECIFIED"):
    """Split an HTML document into one entry per heading.

    Every ``<p>`` is attached to the closest preceding ``<h1>``-``<h6>`` in
    document order. Paragraphs that appear before the first heading, or
    under a heading whose text is empty, are dropped. Headings that end up
    without any paragraph text produce no entry. If the same heading text
    occurs more than once, the paragraphs of all occurrences are merged
    into a single entry.

    Args:
        html_content: The HTML markup itself (not a path to a file).
        source: Value stored under the "source" key of every entry.

    Returns:
        A list of dicts with the keys "document_title", "subtitle",
        "content" and "source", ordered by first appearance of each heading.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # ``.string`` is None for an empty <title> or one containing nested
    # tags; fall back to the same placeholder used for a missing <title>.
    title_text = soup.title.string if soup.title else None
    page_title = title_text if title_text is not None else "NONE"

    # Exact tag names are used on purpose: BeautifulSoup applies a regex
    # filter with search(), so an unanchored pattern such as "h[1-6]|p"
    # would also select tags like <span> or <hgroup>.
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    paragraphs_by_heading = {}
    current_heading = ""
    for element in soup.find_all(heading_tags + ["p"]):
        if element.name in heading_tags:
            current_heading = element.get_text().strip()
            # setdefault keeps paragraphs already collected under an
            # identical heading text instead of resetting them.
            paragraphs_by_heading.setdefault(current_heading, [])
        elif current_heading:
            paragraphs_by_heading[current_heading].append(element.get_text())

    entries = []
    for heading, paragraphs in paragraphs_by_heading.items():
        # Collapse every run of whitespace (including newlines) to one space.
        content = ' '.join(' '.join(paragraphs).split())
        if not content:
            continue
        entries.append({"document_title": page_title, "subtitle": heading,
                        "content": content, "source": source})
    return entries
# html_to_entities expects the HTML markup itself, not a file path, so the
# file has to be loaded first. Reading bytes lets BeautifulSoup work out the
# document encoding on its own.
with open("/home/finn/Downloads/hugging.html", "rb") as html_file:
    html_markup = html_file.read()
html_to_entities(html_markup)

In [None]:
"""
Extracting text from (Wikipedia)XML

Working with XML is the easiest.
The below directly creates the JSON for the RAG database. No need for the RAG preprocessor.
"""

def get_entities_from_wiki_xml(path, tags, doc_id):
    """Build RAG entities from a MediaWiki XML export, one per page section.

    From knowledge_db_creation.py

    Each page's wikitext is split at ``== heading ==`` markers, a fixed set
    of boilerplate sections is dropped, and the remaining text is stripped of
    references, links and templates with a sequence of regex substitutions.

    Args:
        path: File name or file object of the XML export (passed to
            ``ET.parse``).
        tags: Stored unchanged under the "tags" key of every entity.
        doc_id: Stored unchanged under the "doc_id" key of every entity.

    Returns:
        A list of dicts with the keys "header", "content", "url",
        "subheader", "tags" and "doc_id". Pages whose title contains
        "Category:" and pages whose revision has no text are skipped.
    """
    # Recursive pattern matching balanced {{...}} templates. ``(?R)`` is a
    # feature of the third-party ``regex`` module this file imports as ``re``.
    template_pattern = r'{{((?:[^{}]|(?R))*)}}'

    def repl(matchobj):
        """Turn one {{...}} template into a plain-text approximation."""
        hit = matchobj.groups()[0]
        full = matchobj.group()
        if "|" not in full or "efn|" in full:
            # Parameterless templates and explanatory footnotes are dropped.
            return ""
        elif "math| " in full:
            # Strip the leading "math| " and resolve nested templates.
            inner = re.sub(template_pattern, repl, hit[6:])
            return f"${inner}$"
        elif "|" in hit:
            hit = re.sub(r"\|link=y", r"", full)
            if "10^|" in hit:
                # "{{10^|n}}" -> "10^n"
                return f"10^{hit[6:-2]}"
            # Keep only what follows the first "|" of the template.
            hit = re.sub(r"{{(.*?)\|(.*?)}}", r"\2", hit)
            return hit
        else:
            return full

    root = ET.parse(path).getroot()
    # ElementTree tags look like "{namespace-uri}name". Reusing the
    # namespace of the root element makes the lookups work for any export
    # schema version instead of only export-0.10.
    ns = root.tag[:root.tag.index("}") + 1] if root.tag.startswith("{") else ""

    entries_to_remove = (
        'See also', 'Footnotes', "References", "Sources", "History", "External links", "Bibliography")
    entities = []
    # root[0] is skipped; presumably the <siteinfo> header of the export.
    for page in root[1:]:
        title = page.find(f"{ns}title").text
        if "Category:" in title:
            continue
        text = page.find(f"{ns}revision").find(f"{ns}text").text
        if text is None:
            # An empty <text/> element has nothing to index.
            continue
        page_id = page.find(f"{ns}id").text

        # One capture group: odd indices are headings, even indices are the
        # text before/between them. The lead section is filed under the title.
        parts = re.split(r'={2,5}\s*(.*?)\s*={2,5}', text)
        sections = dict(zip([title] + parts[1::2], parts[0::2]))
        for unwanted in entries_to_remove:
            sections.pop(unwanted, None)

        for subheader, body in sections.items():
            body = body.replace("&lt;", "<")
            body = body.replace("&gt;", ">")
            body = re.sub(r'\[\[(.*?)(?:\|.*?)?\]\]', r'\1', body)
            # Self-closing refs go first; otherwise the paired-ref pattern
            # below would swallow the prose up to the next </ref>.
            body = re.sub(r"<ref [^>]*?/>", '', body)
            body = re.sub(r"<ref (.*?)>(.*?)</ref>", '', body)
            body = re.sub(r"<ref>(.*?)</ref>", '', body)
            body = re.sub(r"<ref (.*?)>", '', body)
            body = re.sub(r"<math(.*?)>(.*?)</math>", r'$\2$', body)
            body = re.sub(r"<sub>(.*?)</sub>", r'$\1$', body)
            body = re.sub(r"<sup>(.*?)</sup>", r'^{\1}', body)
            body = re.sub("&nbsp;", " ", body)
            # NOTE(review): removes a tab followed by ";" — confirm this is
            # intended rather than stripping plain tabs.
            body = re.sub("\t;", "", body)
            body = re.sub(r" {2,20}", "", body)
            body = re.sub(template_pattern, repl, body)
            body = re.sub("\n", "", body)
            # Second pass: refs that spanned several lines only match now.
            body = re.sub(r"<ref>(.*?)</ref>", '', body)
            # Bold and italic wiki markup both become single quotes.
            body = re.sub(r"\'\'\'(.*?)\'\'\'", r"'\1'", body)
            body = re.sub(r"\'\'(.*?)\'\'", r"'\1'", body)
            entities.append({
                "header": title,
                "content": subheader + ":\n" + body,
                "url": f"https://en.wikipedia.org/?curid={page_id}#" + subheader.replace(" ", "_"),
                "subheader": subheader,
                "tags": tags,
                "doc_id": doc_id,
            })
    return entities