In [1]:
import requests
from config.settings import settings

In [3]:
def crawl_page(url: str, priority: int = 10, timeout: float = 30.0) -> str:
    """Submit ``url`` to the crawler API and return the id of the created task.

    Args:
        url: Page to crawl; sent as the ``urls`` field of the request body.
        priority: Value sent as the ``priority`` field of the request body.
        timeout: Seconds to wait for the crawler API before giving up, so a
            stalled service cannot hang the notebook indefinitely.

    Returns:
        The ``task_id`` value from the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within ``timeout``.
    """
    token = settings.crawler.api_key.get_secret_value()
    headers = {"Authorization": f"Bearer {token}"}

    # Basic crawl with authentication
    response = requests.post(
        f"{settings.crawler.api_host}/crawl",
        headers=headers,
        json={
            "urls": url,
            "priority": priority,
        },
        timeout=timeout,
    )
    # Surface auth/validation failures as HTTP errors instead of letting them
    # show up later as a missing "task_id" key or a JSON decoding error.
    response.raise_for_status()
    return response.json()["task_id"]


In [4]:
# Queue a crawl of the crawl4ai Docker deployment guide; the returned id is
# used by the later cells to look up the task.
task_id = crawl_page(url="https://docs.crawl4ai.com/core/docker-deployment/")

In [5]:
def get_task_result(task_id: str, timeout: float = 30.0) -> "requests.Response":
    """Fetch the crawler API's record for ``task_id``.

    The raw HTTP response is returned without checking its status or decoding
    its body; callers decide how to handle errors and call ``.json()``
    themselves.

    Args:
        task_id: Identifier of the task, as returned by ``crawl_page``.
        timeout: Seconds to wait for the crawler API before giving up, so a
            stalled service cannot hang the notebook indefinitely.

    Returns:
        The ``requests.Response`` of ``GET {api_host}/task/{task_id}``.

    Raises:
        requests.Timeout: If the API does not answer within ``timeout``.
    """
    token = settings.crawler.api_key.get_secret_value()
    headers = {"Authorization": f"Bearer {token}"}
    return requests.get(
        f"{settings.crawler.api_host}/task/{task_id}",
        headers=headers,
        timeout=timeout,
    )

In [12]:
# Fetch the crawler's response for the queued task.
# NOTE(review): presumably this cell is re-run until the task has finished — confirm.
result = get_task_result(task_id=task_id)

In [None]:
# The decoded task response nests the crawl output under "result"; keep only
# its cleaned HTML and report the size as a quick sanity check.
task_payload = result.json()
html_content = task_payload["result"]["cleaned_html"]
print(len(html_content))

In [15]:
from bs4 import BeautifulSoup
import html2text
from IPython.display import display, HTML, Markdown

In [16]:
def clean_html(soup: BeautifulSoup) -> None:
    """Strip non-content markup from ``soup`` in place.

    Steps, in order:
      1. Unwrap ``<a>`` tags, keeping their inner content.
      2. Delete boilerplate/interactive tags together with their content.
      3. Replace every ``<img>`` with its ``alt`` text, or delete the image
         when it has none.
      4. Delete ``<ul>``/``<ol>`` lists that contain no ``<li>`` with text.

    Images are substituted before lists are pruned so that a list whose items
    hold only images with alt text keeps that text instead of being discarded
    as empty.

    Args:
        soup: Parsed document to clean. It is mutated; nothing is returned.
    """
    # Remove tags but keep their content
    for anchor in soup.find_all(["a"]):
        anchor.unwrap()

    # Remove whole tags and their content
    for tag_name in [
        "script",
        "style",
        "noscript",
        "iframe",
        "canvas",
        "svg",
        "object",
        "embed",
        "form",
        "input",
        "button",
        "nav",
        "footer",
        "header",
        "aside",
        "video",
        "audio",
    ]:
        for tag in soup.find_all(tag_name):
            # A same-named ancestor earlier in this result list may already
            # have destroyed this node; skip it rather than touch a detached tag.
            if getattr(tag, "decomposed", False):
                continue
            tag.decompose()

    # Swap images for their alt text (or drop them) before judging list
    # emptiness, so alt text counts as list content.
    for img in soup.find_all("img"):
        alt_text = img.get("alt")
        if alt_text:
            img.replace_with(alt_text)
        else:
            img.decompose()

    # Remove empty lists
    for list_tag in soup.find_all(["ul", "ol"]):
        # Nested lists vanish together with an empty outer list.
        if getattr(list_tag, "decomposed", False):
            continue
        # any() over zero <li> items is False, which also covers item-less lists.
        if not any(li.get_text(strip=True) for li in list_tag.find_all("li")):
            list_tag.decompose()
    

In [17]:
def html_to_markdown(html: str) -> str:
    """Render an HTML string as Markdown with ``html2text``.

    The converter runs with ``ignore_links`` and ``ignore_images`` enabled and
    ``body_width`` set to 0 (html2text's setting for no line wrapping). The
    converted text is returned with surrounding whitespace stripped.
    """
    renderer = html2text.HTML2Text()
    options = {"ignore_links": True, "ignore_images": True, "body_width": 0}
    for option_name, option_value in options.items():
        setattr(renderer, option_name, option_value)

    markdown = renderer.handle(html)
    return markdown.strip()


In [19]:
def write_markdown_to_file(markdown_text: str, file_path: str) -> None:
    """Write ``markdown_text`` to ``file_path`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.
    """
    destination = open(file_path, mode="w", encoding="utf-8")
    with destination:
        destination.write(markdown_text)

In [None]:
# Parse the crawled HTML, clean the tree in place, then convert the cleaned
# markup to Markdown and preview it.
soup = BeautifulSoup(html_content, features="html.parser")
clean_html(soup)
cleaned_markup = str(soup)
markdown_content = html_to_markdown(cleaned_markup)
# To inspect the cleaned HTML instead: display(HTML(cleaned_markup))
display(Markdown(markdown_content))

In [20]:
# Save the rendered Markdown to the docs directory one level above the notebook.
write_markdown_to_file(
    markdown_text=markdown_content,
    file_path="../docs/crawl4ai_docker_deployment.md",
)