In [1]:
# %pip install -q langchain-openai langchain langchain-community playwright beautifulsoup4 html2text python-dotenv
# !playwright install

In [31]:
import dotenv
import os

# Load variables from a .env file (when one is found) before reading settings.
dotenv.load_dotenv()

# User-Agent string used for requests; the USER_AGENT environment variable
# overrides the desktop-Chrome default below.
USER_AGENT = os.getenv(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36",
)

In [32]:
import asyncio

from langchain_community.document_loaders import AsyncChromiumLoader
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

class RobustChromiumLoader(AsyncChromiumLoader):
    """
    Chromium loader that trims page weight, waits for late content and
    retries pages that time out.

    For each URL it:
    - sends the configured ``USER_AGENT`` header,
    - aborts image, font, stylesheet and media requests,
    - waits for ``domcontentloaded``, then network idle, then a short settle
      delay so late JavaScript updates land in the DOM,
    - retries with increasing delays when a step times out.

    NOTE(review): this assumes the installed langchain-community drives
    scraping by calling ``ascrape_playwright(url)`` once per URL and never
    calls ``_fetch`` itself; the override below exists to connect the two.
    Confirm against the installed version.
    """

    # Resource types that slow the page down without contributing any text.
    BLOCKED_RESOURCE_TYPES = frozenset({"image", "font", "stylesheet", "media"})
    # Seconds to sleep before each retry; the last entry is reused when more
    # retries are requested than there are entries.
    RETRY_DELAYS = (3, 5, 8)
    NAVIGATION_TIMEOUT_MS = 25000
    SELECTOR_TIMEOUT_MS = 10000
    # Pause after network idle so asynchronously rendered content can finish.
    SETTLE_SECONDS = 2

    async def ascrape_playwright(self, url):
        """
        Scrape ``url`` in a fresh Chromium instance by delegating to ``_fetch``.

        Returns the page HTML, or an empty string when the page could not be
        loaded, so callers always receive a ``str``.
        """
        # Local import keeps this override independent of other notebook cells.
        from playwright.async_api import async_playwright

        html = None
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=getattr(self, "headless", True)
            )
            try:
                page = await browser.new_page(
                    user_agent=getattr(self, "user_agent", None) or USER_AGENT
                )
                html = await self._fetch(page, url)
            except Exception as e:
                # Best effort: report the failure and fall through to "".
                print(f"❌ Error loading {url}: {e}")
            finally:
                # Always release the browser, even when page setup failed.
                await browser.close()
        return html or ""

    async def _fetch(self, page, url):
        """
        Load ``url`` in ``page`` and return its rendered HTML.

        Returns ``None`` when the page could not be loaded, including after
        the retries triggered by a timeout.
        """
        try:
            # Set User-Agent to mimic a real browser
            await page.set_extra_http_headers({"User-Agent": USER_AGENT})
            await self._block_heavy_resources(page)
            return await self._load_and_extract(page, url)

        except PlaywrightTimeoutError:
            print(f"⏳ Timeout while loading {url}. Retrying...")
            return await self._retry_fetch(page, url)

        except Exception as e:
            print(f"❌ Error loading {url}: {e}")
            return None

    async def _block_heavy_resources(self, page):
        """Install one route handler that aborts requests for blocked resource types."""
        async def filter_request(route):
            # Route handlers receive a Route object; the Request hangs off it.
            if route.request.resource_type in self.BLOCKED_RESOURCE_TYPES:
                await route.abort()
            else:
                await route.continue_()

        # A single handler resolves every request exactly once. Navigation
        # requests are deliberately not aborted: page.goto() is itself a
        # navigation, so blocking them would stop the page from ever loading.
        await page.route("**/*", filter_request)

    async def _load_and_extract(self, page, url):
        """Navigate to ``url``, wait for the page to settle and return its HTML."""
        await page.goto(
            url, wait_until="domcontentloaded", timeout=self.NAVIGATION_TIMEOUT_MS
        )
        await page.wait_for_load_state("networkidle")

        # Ensure the document body exists before extracting
        await page.wait_for_selector("body", timeout=self.SELECTOR_TIMEOUT_MS)

        # Give asynchronously loaded content a moment to finish rendering
        await asyncio.sleep(self.SETTLE_SECONDS)

        # evaluate() hands back the serialized DOM as a string directly, so
        # no JSHandle is left behind to dispose of.
        return await page.evaluate("document.documentElement.outerHTML")

    async def _retry_fetch(self, page, url, retries=3):
        """
        Retry fetching the page with incremental wait times.

        Makes up to ``retries`` attempts, sleeping ``RETRY_DELAYS`` seconds
        before each one. Returns the HTML of the first successful attempt, or
        ``None`` when every attempt timed out or an attempt failed with a
        non-timeout error.
        """
        delays = self.RETRY_DELAYS
        for attempt in range(1, retries + 1):
            wait_time = delays[min(attempt, len(delays)) - 1]
            try:
                print(f"🔄 Retrying ({attempt}/{retries}) for {url}, waiting {wait_time}s...")
                await asyncio.sleep(wait_time)
                return await self._load_and_extract(page, url)
            except PlaywrightTimeoutError:
                continue
            except Exception as e:
                # Same policy as the first attempt: report and give up.
                print(f"❌ Error loading {url}: {e}")
                return None
        print(f"❌ Final retry failed for {url}")
        return None

In [33]:
from langchain.document_transformers import Html2TextTransformer

async def fetch_and_extract_text(urls):
    """
    Load every URL with RobustChromiumLoader and run the resulting documents
    through Html2TextTransformer.

    Returns the transformed documents, or None when the loader produced none.
    """
    html_docs = await RobustChromiumLoader(urls=urls).aload()

    if html_docs:
        return Html2TextTransformer().transform_documents(html_docs)
    return None

In [34]:
import os

def save_documents_to_files(documents, directory="./output/"):
    """
    Write the text of every document into its own numbered .txt file.

    `directory` is created when missing. Files are named file_1.txt,
    file_2.txt, ... in iteration order and written as UTF-8.
    """
    os.makedirs(directory, exist_ok=True)

    for number, document in enumerate(documents, start=1):
        text = document.page_content
        target = os.path.join(directory, f"file_{number}.txt")

        with open(target, "w", encoding="utf-8") as handle:
            handle.write(text)

In [37]:
urls = [
    "https://www.vensure.com/",
]

DIR_Html2TextTransformer = "./html2text_output/"

documents = await fetch_and_extract_text(urls)

if documents:
    save_documents_to_files(documents, DIR_Html2TextTransformer)
else:
    print("No content extracted!")