# Explore LLM Input - What the LLM Sees

In [7]:
import sys
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import wraps
import numpy as np

from dotenv import load_dotenv
from crawl4ai import AsyncWebCrawler, BrowserConfig

from playground.browser_tools_v13.crawler import (
    _crawler_config,
    _create_extraction_strategy,
    SESSION_ID,
)
from playground.browser_tools_v13.models import NavigationAction, PageExtraction

# Load environment variables via python-dotenv before anything reads them.
load_dotenv()

# Setup for Windows Jupyter - set policy BEFORE nest_asyncio
# With this policy installed, loops created later through
# asyncio.new_event_loop() are ProactorEventLoops on Windows.
# NOTE(review): presumably required because the browser automation spawns
# subprocesses, which on Windows only the Proactor loop supports - confirm.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Deliberately imported here, after the policy has been set, rather than with
# the other imports at the top of the cell.
import nest_asyncio
# Patch asyncio so the event loop can be re-entered from inside the notebook's
# already-running loop.
nest_asyncio.apply()

# Wrapper to run async functions with ProactorEventLoop in separate thread
def run_in_proactor_loop(coro_func):
    """Wrap an async function so every call runs on a private loop in a worker thread.

    The worker thread builds its loop with ``asyncio.new_event_loop()``, so the
    loop type is whatever the active event loop policy produces (a
    ``ProactorEventLoop`` once ``WindowsProactorEventLoopPolicy`` is installed).

    Args:
        coro_func: Coroutine function to wrap.

    Returns:
        A coroutine function with the same call signature. Awaiting it yields
        ``coro_func``'s return value and re-raises any exception it raised.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)

    def _dispose(loop):
        """Best-effort teardown: cancel leftover tasks, finalise async generators, close."""
        try:
            # all_tasks() only reports unfinished tasks, so no done() filter is needed.
            leftovers = asyncio.all_tasks(loop)
            for task in leftovers:
                task.cancel()
            if leftovers:
                # Let the cancellations actually run; return_exceptions keeps
                # the resulting CancelledErrors from propagating.
                loop.run_until_complete(
                    asyncio.gather(*leftovers, return_exceptions=True)
                )
            loop.run_until_complete(loop.shutdown_asyncgens())
        except Exception:
            # Cleanup must never mask the wrapped call's own result or error.
            log.debug("Event loop cleanup failed", exc_info=True)
        finally:
            try:
                loop.close()
            except Exception:
                log.debug("Closing event loop failed", exc_info=True)

    @wraps(coro_func)
    async def wrapper(*args, **kwargs):
        def run_in_thread():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(coro_func(*args, **kwargs))
            finally:
                _dispose(loop)

        executor = ThreadPoolExecutor(max_workers=1)
        try:
            # We are inside a coroutine, so a running loop is guaranteed.
            caller_loop = asyncio.get_running_loop()
            return await caller_loop.run_in_executor(executor, run_in_thread)
        finally:
            # Release the worker thread deterministically instead of relying on
            # garbage collection; wait=False so a cancelled await does not
            # block the caller's loop while the thread finishes.
            executor.shutdown(wait=False)

    return wrapper

## Configuration

Set the URL to explore (matching the terminal output)

In [2]:
# URL from the terminal output
# Read as a module-level global by explore_llm_input(), which uses it for both
# crawl calls and for the extraction strategy call.
url = "https://rollcall.com/factbase/trump/search/"

## Run one observe (initial load)

This mirrors v13's first observe: run a scroll action, crawl without an `extraction_strategy`, and then call `strategy.arun(url, [markdown])` separately (the same sequence as `PageCrawler.observe`). The cell returns `result`, `strategy`, and `extraction` for inspection below.

In [9]:
@run_in_proactor_loop
async def explore_llm_input():
    """Perform a single initial-load observe and hand back its artifacts.

    The page at the global ``url`` is crawled twice with a scroll action under
    ``SESSION_ID``: first with ``reuse_session=False``, then with
    ``reuse_session=True``. Only the second crawl result is kept. Once the
    crawler has been closed, the raw markdown is passed to a non-delta
    extraction strategy and the first returned item is validated as a
    ``PageExtraction``.

    Returns:
        A ``(result, strategy, extraction)`` tuple: the second crawl result,
        the extraction strategy, and the validated ``PageExtraction`` (an
        empty one when validation raises ``TypeError`` or ``ValueError``).
    """
    chrome_settings = BrowserConfig(
        headless=False,
        extra_args=["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"]
    )
    scroll = NavigationAction(type="scroll")

    async with AsyncWebCrawler(config=chrome_settings) as crawler:
        # First pass opens the session; its result is not used.
        await crawler.arun(
            url,
            config=_crawler_config(scroll, reuse_session=False),
            session_id=SESSION_ID,
        )
        # Second pass reuses that session and provides the result we keep.
        result = await crawler.arun(
            url,
            config=_crawler_config(scroll, reuse_session=True),
            session_id=SESSION_ID,
        )

    if result.success:
        page_markdown = result.markdown.raw_markdown
    else:
        page_markdown = ""

    strategy = _create_extraction_strategy(delta_mode=False)

    # Skip the LLM call entirely when there is no markdown to send.
    if page_markdown:
        llm_output = await strategy.arun(url, [page_markdown])
    else:
        llm_output = []
    first_item = llm_output[0] if llm_output else {}

    try:
        extraction = PageExtraction.model_validate(first_item)
    except (TypeError, ValueError):
        extraction = PageExtraction()

    return result, strategy, extraction

# Execute the exploration
result, strategy, extraction = await explore_llm_input()

In [10]:
# Print the raw markdown of the crawl result, i.e. the text that
# explore_llm_input() passed to the extraction strategy.
print(result.markdown.raw_markdown)

  * [Politics](https://rollcall.com/factbase/trump/search/)
    * [Campaigns](https://rollcall.com/section/campaigns/)
    * [Congress](https://rollcall.com/section/congress/)
    * [White House](https://rollcall.com/section/white-house/)
  * [Policy](https://rollcall.com/factbase/trump/search/)
    * [Defense](https://rollcall.com/category/defense/)
    * [Energy/Environment](https://rollcall.com/category/energy/)
    * [Fintech](https://rollcall.com/category/fintech/)
    * [Health Care](https://rollcall.com/category/health-care/)
    * [Technology](https://rollcall.com/category/technology/)
    * [Transportation](https://rollcall.com/category/transportation/)
    * [All Policy](https://rollcall.com/section/policy/)
  * [Heard on the Hill](https://rollcall.com/section/heard-on-the-hill/)
  * Podcasts
    * [CQ Budget](https://www.rollcall.com/podcast/cq-budget-podcast/)
    * [Equal Time](https://rollcall.com/podcast/equal-time/)
    * [Fintech Beat](https://www.rollcall.com/podcast/

In [11]:
# Display the articles held by the validated PageExtraction.
extraction.articles

[ArticleExtraction(title='Remarks: Donald Trump Discusses His Plan for Savings Accounts for Children - January 28, 2026', url='https://rollcall.com/factbase/trump/transcript/donald-trump-remarks-trump-accounts-washington-january-28-2026/', date_candidates=[DateCandidate(date='2026-01-28', source=<DateSource.url_path: 'url_path'>), DateCandidate(date='2026-01-28', source=<DateSource.near_title: 'near_title'>)]),
 ArticleExtraction(title='Speech: Donald Trump Discusses the Economy and Energy in Clive, Iowa - January 27, 2026', url='https://rollcall.com/factbase/trump/transcript/donald-trump-speech-economy-energy-clive-iowa-january-27-2026/', date_candidates=[DateCandidate(date='2026-01-27', source=<DateSource.url_path: 'url_path'>), DateCandidate(date='2026-01-27', source=<DateSource.near_title: 'near_title'>)]),
 ArticleExtraction(title='Interview: No Transcript - Rachel Scott Interviews Donald Trump for ABC News - January 27, 2026', url='https://rollcall.com/factbase/trump/transcript/d