In [63]:
import sys

sys.path.append("../")

import json
import os
import re
import mistletoe
from bs4 import BeautifulSoup

from config import Paths


In [81]:
def _collect_section_elements(current_h2, next_h2) -> list:
    """Serialize the siblings that follow ``current_h2`` until ``next_h2``.

    Args:
        current_h2: The heading whose following siblings are collected.
        next_h2: The heading that ends the section, or ``None`` to walk to the
            last sibling.

    Returns:
        ``str()`` of every visited sibling whose ``name`` is not ``"h2"``.
    """
    elements = []
    node = current_h2.next_sibling
    # Test against None and by identity rather than truthiness/equality: an
    # empty text node is falsy, and bs4 tags compare equal when their markup
    # matches, so either shortcut could end the section at the wrong node.
    while node is not None and node is not next_h2:
        # `next_h2` comes from a document-wide search and need not be a
        # sibling; other h2 siblings met on the way are skipped, not collected.
        if node.name != "h2":
            elements.append(str(node))
        node = node.next_sibling
    return elements


def parse_page(soup: BeautifulSoup, path: str) -> dict:
    """Split a rendered page into sections keyed by its ``<h2>`` headings.

    Note: the first ``<style>`` tag, if any, is removed from ``soup`` in place.

    Args:
        soup: Parsed HTML of the page.
        path: Stored unchanged under the ``"path"`` key of the result.

    Returns:
        A dict with:
          * ``"title"``: the text captured by ``title: (.*)`` in the first h2's
            text, or ``None`` when there is no h2 or no match.
          * ``"path"``: the ``path`` argument.
          * ``"elements"``: one ``{"section_name", "section_elements"}`` dict
            per h2, excluding headings whose text contains ``"On this page"``
            or ``"\\ntitle:"``.
    """
    style_tag = soup.style
    if style_tag is not None:
        style_tag.decompose()

    h2_tags = soup.find_all("h2")

    # The title is optional: a page without an h2, or whose first h2 carries
    # no "title: ..." text, simply yields None.
    title = None
    if h2_tags:
        title_match = re.search(r"title: (.*)", h2_tags[0].get_text())
        if title_match is not None:
            title = title_match.group(1)

    # Pair each heading with its successor; the last one is paired with None.
    following_h2_tags = list(h2_tags[1:]) + [None]
    parsed_elements = [
        {
            "section_name": current_h2.get_text(),
            "section_elements": _collect_section_elements(current_h2, next_h2),
        }
        for current_h2, next_h2 in zip(h2_tags, following_h2_tags)
    ]

    excluded_list = ["On this page", "\ntitle:"]

    return {
        "title": title,
        "path": path,
        "elements": [
            element
            for element in parsed_elements
            if not any(
                excluded in element["section_name"] for excluded in excluded_list
            )
        ],
    }

In [83]:
handbook_path = Paths.data / "handbook"
handbook_parsed_path = Paths.data / "handbook_parsed"

# Start from an empty output directory so JSON from a previous run never
# lingers. Create it first so a fresh checkout works, and leave
# sub-directories alone because Path.unlink() raises on them.
handbook_parsed_path.mkdir(parents=True, exist_ok=True)
for stale_file in handbook_parsed_path.iterdir():
    if stale_file.is_file() or stale_file.is_symlink():
        stale_file.unlink()

# Render every Markdown source under the handbook (files starting with "_"
# are skipped), parse it into sections and dump one JSON file per page.
for root, dirs, files in os.walk(handbook_path):
    for file in files:
        if not file.endswith((".md", ".md.erb")) or file.startswith("_"):
            continue

        # Directory of the page relative to the data root; stored in the JSON
        # and reused as the prefix of the output file name.
        relative_path = root.replace(str(Paths.data), "")

        with open(os.path.join(root, file), "r", encoding="utf-8") as f:
            rendered = mistletoe.markdown(f)
        soup = BeautifulSoup(rendered, "html.parser")
        content_parsed = parse_page(soup, relative_path)

        # Flatten "<dir>-<file>.json" into a single file name. ".md.erb" must
        # be stripped before ".md"; in the opposite order the ".md" replace
        # leaves a stray ".erb" in the name.
        output_file = (
            f"{relative_path}-{file}.json".replace("/", "_")
            .replace(".html", "")
            .replace(".md.erb", "")
            .replace(".md", "")
        )
        # Drop the leading "_" produced by the leading "/" of relative_path.
        output_file = handbook_parsed_path / output_file[1:]

        with open(output_file, "w", encoding="utf-8") as fw:
            json.dump(content_parsed, fw, indent=4)
