In [1]:
import asyncio
from pathlib import Path

In [2]:
from scraper.interface import ScraperInterface
from scraper.factory import ScraperFactory
from scraper.utils import html_to_clean_markdown
from scraper.discovery import get_site_urls
from config.logger import setup_logging

In [3]:
# Initialise logging through the project's config.logger helper before any
# scraping code runs. Called with no arguments; what it configures is
# defined in config.logger, not in this notebook.
setup_logging()

In [4]:
def _url_to_filename(url):
    """Turn a URL into a flat, filesystem-safe Markdown filename.

    Any scheme prefix (``https://``, ``http://``, ...) is dropped, and every
    character other than letters, digits, ``-`` and ``.`` becomes ``_`` so
    that query strings, ports or fragments cannot produce names that are
    invalid on common filesystems. Leading/trailing ``_`` are trimmed.
    """
    without_scheme = url.split("://", 1)[-1]
    safe = "".join(
        ch if ch.isalnum() or ch in "-." else "_" for ch in without_scheme
    )
    # Fall back to a fixed stem so an empty result never yields just ".md".
    return (safe.strip("_") or "index") + ".md"


async def scrape_and_write(scraper_client, url, output_dir):
    """Scrape one URL and save its content as a Markdown file in ``output_dir``.

    Args:
        scraper_client: Object exposing an awaitable
            ``scrape_and_wait(url, config=...)`` whose result can be indexed
            with ``"cleaned_html"``.
        url: Page address to scrape; also used to derive the output filename.
        output_dir: Directory (str or Path) the ``.md`` file is written into.
            It is not created here.

    Returns:
        None. Any ``Exception`` is reported on stdout instead of being
        raised, so one failing page does not abort the other scrapes.
    """
    try:
        result = await scraper_client.scrape_and_wait(url, config={})
        markdown = html_to_clean_markdown(result["cleaned_html"])

        filepath = Path(output_dir) / _url_to_filename(url)
        # Explicit UTF-8: the platform's default encoding may be unable to
        # represent characters found in scraped pages.
        filepath.write_text(markdown, encoding="utf-8")
        print(f"✅ Done: {url}")
    except Exception as e:
        print(f"❌ Failed to scrape {url}: {e}")

In [5]:
async def scrape_all(scraper_client, urls, output_dir="output"):
    """Scrape every URL concurrently, writing the results under ``output_dir``.

    The output directory (including missing parents) is created first. The
    client is then entered as an async context manager, one task running
    ``scrape_and_write`` is started per URL, and the function returns once
    every task has been awaited.

    Args:
        scraper_client: Async context manager that is also passed to
            ``scrape_and_write`` for each URL.
        urls: Iterable of page addresses to scrape.
        output_dir: Destination directory; defaults to ``"output"``.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    async with scraper_client:
        pending = []
        for page_url in urls:
            job = scrape_and_write(scraper_client, page_url, output_dir)
            pending.append(asyncio.create_task(job))

        # Await the tasks in completion order so the client stays open until
        # all of them have finished.
        for finished in asyncio.as_completed(pending):
            await finished


In [None]:
# Build the list of pages that is later passed to scrape_all() as `urls`,
# starting from the LangGraph documentation root.
# NOTE(review): the second argument (2) is presumably a crawl depth or limit —
# confirm against scraper.discovery.get_site_urls and consider naming it.
sitemap = await get_site_urls("https://langchain-ai.github.io/langgraph/", 2)

In [8]:
# Obtain the scraper client from the factory. It is annotated with the
# ScraperInterface type; which concrete implementation is returned is decided
# inside ScraperFactory and is not visible in this notebook.
scraper_client: ScraperInterface = ScraperFactory.create_scraper()

In [None]:
async with scraper_client:
    await scrape_all(scraper_client, sitemap)