In [None]:
from langchain_openai import ChatOpenAI
# OpenAI chat model used for extraction ("gpt-3.5-turbo-0613" was tried earlier).
model = "gpt-4o"

# Shared chat client: temperature 0, up to two retries per request;
# max_tokens and timeout are passed as None (no explicit value set here).
llm = ChatOpenAI(model=model, temperature=0, max_tokens=None, timeout=None, max_retries=2)

In [None]:
# Extraction schema handed to the LLM chain: plain-string fields first,
# then list-of-string fields. Only the title and main content are required.
schema = {
    "properties": {
        **{
            field_name: {"type": "string", "description": field_help}
            for field_name, field_help in (
                ("page_title", "The title of the webpage."),
                ("meta_description", "Meta description of the webpage."),
                ("author", "The author or source of the content."),
                ("publish_date", "The publish date if available."),
                ("main_content", "The main text content of the webpage."),
            )
        },
        **{
            field_name: {
                "type": "array",
                "items": {"type": "string"},
                "description": field_help,
            }
            for field_name, field_help in (
                ("sections", "Different sections or paragraphs of the content."),
                ("links", "List of URLs found in the page content."),
                ("images", "List of image URLs found in the page."),
            )
        },
    },
    "required": ["page_title", "main_content"],
}

In [None]:
import asyncio
import dotenv
import os

from langchain_community.document_loaders import AsyncChromiumLoader
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

# Read settings (API keys etc.) from a .env file into the environment.
dotenv.load_dotenv()

# User-Agent header sent with page requests; the USER_AGENT environment
# variable overrides the desktop-Chrome default given here.
USER_AGENT = os.environ.get(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
)

class RobustChromiumLoader(AsyncChromiumLoader):
    """
    Chromium loader with a hardened page-fetch routine.

    Handles:
    - Late JavaScript navigation: once the requested URL has loaded, further
      main-frame navigations are aborted so the page cannot redirect away.
    - Heavy assets: images, fonts, stylesheets and media are never downloaded.
    - Slow pages: a Playwright timeout triggers retries with growing delays.

    NOTE(review): the base class is expected to obtain HTML through
    ``ascrape_playwright(url)`` and never to call ``_fetch`` itself, which is
    why ``ascrape_playwright`` is overridden below. Confirm against the
    installed ``langchain_community`` version.
    """

    # Resource types whose requests are aborted instead of fetched.
    BLOCKED_RESOURCE_TYPES = frozenset({"image", "font", "stylesheet", "media"})
    # Timeout for page.goto(), in milliseconds.
    NAVIGATION_TIMEOUT_MS = 25000
    # Timeout for the <body> selector wait, in milliseconds.
    SELECTOR_TIMEOUT_MS = 10000
    # Pause after network idle so late async rendering can finish, in seconds.
    SETTLE_SECONDS = 2
    # Delay before retry 1, 2, 3...; the last value is reused for extra retries.
    RETRY_WAIT_SECONDS = (3, 5, 8)

    async def ascrape_playwright(self, url):
        """
        Launch Chromium, fetch ``url`` through ``_fetch`` and return its HTML.

        Returns:
            The page HTML, or an empty string when the page could not be
            loaded, so callers always receive a ``str``.
        """
        # Imported lazily, at call time, so importing this module stays cheap.
        from playwright.async_api import async_playwright

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                # Older base-class versions have no ``headless`` attribute.
                headless=getattr(self, "headless", True)
            )
            try:
                page = await browser.new_page()
                html = await self._fetch(page, url)
            finally:
                # Always release the browser, even when the fetch raised.
                await browser.close()
        return html or ""

    async def _fetch(self, page, url):
        """
        Load ``url`` in ``page`` and return the rendered HTML.

        Returns:
            The document's outer HTML, or ``None`` when loading failed
            (after retries, for timeouts).
        """
        try:
            return await self._load_html(page, url)

        except PlaywrightTimeoutError:
            print(f"⏳ Timeout while loading {url}. Retrying...")
            return await self._retry_fetch(page, url)

        except Exception as e:
            print(f"❌ Error loading {url}: {e}")
            return None

    async def _retry_fetch(self, page, url, retries=3):
        """
        Retry fetching the page with incremental wait times.

        Args:
            page: Playwright page to reuse for every attempt.
            url: Address to load.
            retries: Maximum number of attempts.

        Returns:
            The document's outer HTML, or ``None`` when every attempt timed
            out or an attempt failed with a non-timeout error.
        """
        waits = self.RETRY_WAIT_SECONDS
        for attempt in range(1, retries + 1):
            # Clamp so that retries beyond the table reuse the longest delay.
            wait_time = waits[min(attempt, len(waits)) - 1]
            print(f"🔄 Retrying ({attempt}/{retries}) for {url}, waiting {wait_time}s...")
            await asyncio.sleep(wait_time)
            try:
                return await self._load_html(page, url)
            except PlaywrightTimeoutError:
                continue
            except Exception as e:
                # Same contract as _fetch: report and return None, never raise.
                print(f"❌ Error loading {url}: {e}")
                return None
        print(f"❌ Final retry failed for {url}")
        return None

    async def _load_html(self, page, url):
        """Perform one load attempt of ``url`` and return the document's outer HTML."""
        # Flipped to True once the requested URL itself has been loaded.
        navigation_locked = False

        async def handle_route(route):
            # Single handler for every request: a route handler receives a
            # Route, and the request being made is available as route.request.
            request = route.request
            if request.resource_type in self.BLOCKED_RESOURCE_TYPES:
                await route.abort()
            elif navigation_locked and self._is_main_frame_navigation(page, request):
                # A navigation after the initial load is a late redirect.
                await route.abort()
            else:
                await route.continue_()

        # Set User-Agent to mimic a real browser.
        await page.set_extra_http_headers({"User-Agent": USER_AGENT})
        await page.route("**/*", handle_route)
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.NAVIGATION_TIMEOUT_MS
            )
            # From here on the page must stay on the document it just loaded.
            navigation_locked = True
            await page.wait_for_load_state("networkidle")

            # Ensure key elements are present before extracting.
            await page.wait_for_selector("body", timeout=self.SELECTOR_TIMEOUT_MS)

            # Give async-loaded content a moment to finish rendering.
            await asyncio.sleep(self.SETTLE_SECONDS)

            return await page.evaluate("document.documentElement.outerHTML")
        finally:
            # Remove this attempt's handler so a later attempt on the same
            # page starts unlocked and handlers do not pile up.
            try:
                await page.unroute("**/*", handle_route)
            except Exception:
                # The page is already gone; there is nothing left to clean up.
                pass

    @staticmethod
    def _is_main_frame_navigation(page, request):
        """Return True when ``request`` navigates the page's main frame."""
        if not request.is_navigation_request():
            return False
        try:
            return request.frame == page.main_frame
        except Exception:
            # No frame is associated with this request, so it cannot be a
            # navigation of the already-existing main frame.
            return False

In [None]:
from langchain.document_transformers import Html2TextTransformer
from langchain.chains import create_extraction_chain


async def scrape_and_extract(urls):
    """
    Load each URL in a headless browser, convert the HTML to text and run
    schema-based LLM extraction on it.

    Args:
        urls: List of page addresses to scrape.

    Returns:
        A list with one extraction result per loaded document, in the order
        the loader returned them; a page that yielded no text gets an empty
        list. Returns ``None`` when no documents were produced at all.
    """
    loader = RobustChromiumLoader(urls=urls)
    docs = await loader.aload()

    text_docs = Html2TextTransformer().transform_documents(docs)
    if not text_docs:
        return None

    # The chain depends only on the schema and the model, so build it once
    # instead of once per document.
    extraction_chain = create_extraction_chain(schema=schema, llm=llm)

    extracted_data = []
    for doc in text_docs:
        content = doc.page_content
        if not content or not content.strip():
            # Nothing to extract from a blank page; skip the LLM call but
            # keep the result list aligned with the documents.
            extracted_data.append([])
            continue
        # Async variant of .run(): does not block the event loop while the
        # model request is in flight.
        extracted_data.append(await extraction_chain.arun(content))

    return extracted_data

In [None]:
import json
import os

def save_documents_to_files(documents, directory="./output/", filename="file.json"):
    """
    Write the extracted documents to a single JSON file.

    Args:
        documents: JSON-serializable data to store.
        directory: Folder to write into; created if it does not exist.
        filename: Name of the JSON file inside ``directory``.
    """
    os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, filename)

    # ensure_ascii=False keeps non-ASCII text readable in the file, which is
    # why the file is opened explicitly as UTF-8.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(documents, f, indent=4, ensure_ascii=False)

In [None]:
async def main():
    """Scrape the configured pages and save whatever was extracted."""
    target_urls = [
        "https://www.vensure.com/",
        "https://www.vensure.com/about-us/overview/",
    ]

    results = await scrape_and_extract(target_urls)

    # Guard clause: report and stop when the scrape produced nothing.
    if not results:
        print("❌ No data extracted!")
        return

    save_documents_to_files(results)

# asyncio.run() raises RuntimeError when an event loop is already running in
# this thread, which is the case inside a notebook kernel. Detect that and
# schedule main() on the existing loop instead.
try:
    running_loop = asyncio.get_running_loop()
except RuntimeError:
    # No loop yet (plain script): let asyncio.run() create and own one.
    asyncio.run(main())
else:
    # Keep a reference so the task is not garbage-collected mid-run;
    # `await main_task` in a later cell to wait for it and surface errors.
    main_task = running_loop.create_task(main())