# Explore LLM Input - What the LLM Sees

In [1]:
import sys
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import wraps

from dotenv import load_dotenv
from crawl4ai import AsyncWebCrawler, BrowserConfig

from playground.browser_tools_v13.crawler import (
    _crawler_config,
    _create_extraction_strategy,
    SESSION_ID,
)

load_dotenv()

# Setup for Windows Jupyter - set policy BEFORE nest_asyncio
# On Windows, switch asyncio to the Proactor event loop policy so that loops
# created later via asyncio.new_event_loop() are Proactor loops.
# NOTE(review): presumably needed because the crawler launches a browser
# subprocess, which the Windows selector loop does not support - confirm.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Imported here rather than at the top of the cell so the policy above is
# already in place when nest_asyncio patches asyncio.
import nest_asyncio
# Patch asyncio so the notebook cells can await/run coroutines even though
# Jupyter already has an event loop running.
nest_asyncio.apply()

# Wrapper to run async functions with ProactorEventLoop in separate thread
def _shutdown_loop(loop):
    """Best-effort teardown of *loop*: cancel tasks, finalize async generators, close."""
    import logging

    log = logging.getLogger(__name__)
    try:
        # Cancel whatever the coroutine left running and wait for the
        # cancellations to be delivered before the loop goes away.
        pending = [task for task in asyncio.all_tasks(loop) if not task.done()]
        for task in pending:
            task.cancel()
        if pending:
            loop.run_until_complete(
                asyncio.gather(*pending, return_exceptions=True)
            )
        # Give async generators a chance to run their cleanup code.
        loop.run_until_complete(loop.shutdown_asyncgens())
    except Exception:
        # Teardown is deliberately best-effort: a failure here must not mask
        # the result (or exception) of the wrapped coroutine.
        log.debug("Event loop cleanup failed", exc_info=True)
    finally:
        try:
            loop.close()
        except Exception:
            log.debug("Event loop close failed", exc_info=True)


def run_in_proactor_loop(coro_func):
    """Decorate an async function so it runs on its own event loop in a worker thread.

    Each call of the returned coroutine function starts a single-worker
    thread, creates a fresh event loop there with ``asyncio.new_event_loop()``
    (a Proactor loop when the Windows Proactor policy has been installed),
    runs ``coro_func(*args, **kwargs)`` to completion on it, and tears the
    loop down again. The caller awaits the outcome from its own loop.

    Args:
        coro_func: The async function to wrap.

    Returns:
        An async function with the same call signature as ``coro_func``.
        Awaiting it yields the wrapped coroutine's return value; exceptions
        raised by the wrapped coroutine propagate to the awaiting caller.
    """
    @wraps(coro_func)
    async def wrapper(*args, **kwargs):
        def run_in_thread():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(coro_func(*args, **kwargs))
            finally:
                _shutdown_loop(loop)

        executor = ThreadPoolExecutor(max_workers=1)
        try:
            # We are inside a coroutine, so a loop is guaranteed to be running.
            caller_loop = asyncio.get_running_loop()
            return await caller_loop.run_in_executor(executor, run_in_thread)
        finally:
            # Release the worker thread without blocking the caller's loop.
            executor.shutdown(wait=False)

    return wrapper

## Configuration

Set the URL to explore (matching the terminal output)

In [2]:
# URL from the terminal output
url = "https://rollcall.com/factbase/trump/search/"

## Run one observe (initial load)

Calls into the crawler: `_create_extraction_strategy()`, then `_crawler_config(...)`, then `arun`. This is the same as the initial page load in the agent (no action). Returns the raw `result` and the `strategy` for inspection below.

In [3]:
@run_in_proactor_loop
async def explore_llm_input():
    """Crawl ``url`` once with no action and hand back the unprocessed output.

    Returns:
        A ``(result, strategy)`` tuple: the object returned by ``arun`` and
        the extraction strategy that was passed to it, so both can be
        inspected in later cells.
    """
    chromium_flags = ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"]
    browser_config = BrowserConfig(headless=False, extra_args=chromium_flags)

    # Build the strategy first: the run config needs it, and it is returned too.
    strategy = _create_extraction_strategy()
    run_config = _crawler_config(
        action=None,
        reuse_session=False,
        extraction_strategy=strategy,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawl_result = await crawler.arun(
            url,
            config=run_config,
            session_id=SESSION_ID,
        )
    return crawl_result, strategy

# Execute the exploration
result, strategy = await explore_llm_input()

## Inspect Result Object

Check what properties are available on the result object

In [4]:
print(result.extracted_content)

[
    {
        "articles": [
            {
                "title": "Remarks: JD Vance Addresses the National March for Life Rally in Washington - January 23, 2026",
                "url": "https://rollcall.com/factbase/trump/transcript/donald-trump-remarks-jd-vance-national-march-for-life-rally-january-23-2026/",
                "publication_date": "2026-01-23",
                "date_confidence": "HIGH",
                "date_source": "datetime_attr"
            },
            {
                "title": "Donald Trump Vlog: March for Life Message - January 23, 2026",
                "url": "https://rollcall.com/factbase/trump/transcript/donald-trump-vlog-march-for-life-january-23-2026/",
                "publication_date": "2026-01-23",
                "date_confidence": "HIGH",
                "date_source": "datetime_attr"
            },
            {
                "title": "Remarks: Donald Trump Attends the First Board of Peace Meeting in Davos - January 22, 2026",
              

In [5]:
strategy.total_usage

TokenUsage(completion_tokens=12376, prompt_tokens=17154, total_tokens=29530, completion_tokens_details=None, prompt_tokens_details=None)

In [7]:
len(result.markdown)

53062

## Full Markdown Content

Display the complete markdown object (may be very long)

## LLM Extraction Result

What the LLM returned after processing the markdown

In [None]:
# Print the LLM's extraction output, or a notice when there is none
extracted = result.extracted_content
if not extracted:
    print("No extraction content returned")
else:
    print("LLM Extraction Result:")
    print("=" * 80)
    print(extracted)

## Summary

This shows:
1. The markdown content that crawl4ai generates from the HTML
2. This markdown is what gets sent to the LLM along with the extraction prompt
3. The LLM processes this markdown and returns structured JSON