In [None]:
import json
import asyncio
import pandas as pd
from crawl4ai import AsyncWebCrawler

# Base URL for listing requests. old.reddit.com serves simpler HTML and
# still supports the `.json` listing endpoints.
REDDIT = "https://old.reddit.com"


async def fetch_subreddit(
    sub: str, pages: int = 3, concurrency: int = 10
) -> pd.DataFrame:
    """
    Pull up to `pages` * 100 posts from a subreddit via the .json listing, using Crawl4AI.

    Parameters
    ----------
    sub : subreddit name, interpolated into the listing URL as-is
        (``{REDDIT}/r/{sub}.json``).
    pages : maximum number of listing pages to request; each request asks
        for 100 posts. Values <= 0 return an empty frame without starting
        the crawler.
    concurrency : forwarded unchanged to ``AsyncWebCrawler``.

    Returns
    -------
    DataFrame with columns: id, title, score, author, created_utc, permalink.
    The columns are present even when no posts were collected.

    Raises
    ------
    json.JSONDecodeError
        If a response body is not valid JSON.
    KeyError
        If the decoded payload lacks the expected listing/post keys.
    """
    columns = ["id", "title", "score", "author", "created_utc", "permalink"]

    # Nothing to fetch: skip crawler start-up and still return a frame with
    # the documented columns so callers can index it safely.
    if pages <= 0:
        return pd.DataFrame(columns=columns)

    # NOTE(review): the `concurrency` / `default_headers` constructor kwargs
    # and the `resp.text` attribute read below should be confirmed against
    # the installed Crawl4AI version.
    crawler = AsyncWebCrawler(
        concurrency=concurrency,
        default_headers={"User-Agent": "Mozilla/5.0 Crawl4AI demo"},
    )
    after = ""  # pagination cursor; empty on the first request
    rows = []

    async with crawler:
        for _ in range(pages):
            url = f"{REDDIT}/r/{sub}.json?limit=100&after={after}"
            resp = await crawler.arun(url)
            listing = json.loads(resp.text)["data"]

            rows.extend(
                {
                    "id": d["id"],
                    "title": d["title"],
                    "score": d["score"],
                    "author": d["author"],
                    "created_utc": d["created_utc"],
                    "permalink": f'https://reddit.com{d["permalink"]}',
                }
                for d in (post["data"] for post in listing["children"])
            )

            # The listing reports a null `after` once there are no more
            # pages; stop instead of re-requesting the first page.
            after = listing["after"] or ""
            if not after:
                break

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)