In [1]:
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai import LLMExtractionStrategy
from pydantic import BaseModel, Field
import csv
import json

In [None]:
# Shape of a single extracted news item: the article link and its headline.
# NOTE(review): kept as `#` comments on purpose - pydantic copies a class
# docstring into the generated JSON schema's "description", which would change
# the schema handed to the LLM extraction strategy.
class GhanaWebModel(BaseModel):
    url: str = Field(
        ...,
        description="url of the article",
    )
    title: str = Field(
        ...,
        description="title of the article",
    )

async def main(url: str, csv_file: str = "ghanaweb_extracted_news.csv"):
    """Crawl ``url``, LLM-extract article titles/urls, and save them as CSV.

    The page is fetched with ``AsyncWebCrawler`` (cache bypassed) and passed
    through an ``LLMExtractionStrategy`` driven by ``GhanaWebModel``'s JSON
    schema. The raw extraction output is printed, decoded as JSON, and the
    resulting records are written to ``csv_file`` with the columns
    ``title``, ``url`` and ``error``.

    Args:
        url: Address of the page to crawl.
        csv_file: Destination CSV path. Defaults to the file name this
            notebook has always used, so existing calls are unaffected.

    Returns:
        None. On any failure (crawl reported as unsuccessful, no extracted
        content, undecodable JSON, unexpected JSON shape) a message is
        printed and no CSV is written.
    """
    browser_config = BrowserConfig(verbose=True)
    run_config = CrawlerRunConfig(
        word_count_threshold=1,
        extraction_strategy=LLMExtractionStrategy(
            # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
            # provider="ollama/qwen2", api_token="no-token",
            llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
            schema=GhanaWebModel.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned titles of the articles along with their urls. 
            Do not miss any title and url in the entire content. One extracted model JSON format should look like this: 
             {'title': "Dr Ayew Afriye raises alarm over government's failure to prevent Zipline shutdown",
             'url': 'https://www.ghanaweb.com/GhanaHomePage/NewsArchive/Dr-Ayew-Afriye-raises-alarm-over-government-s-failure-to-prevent-Zipline-shutdown-2010997'}.
             """
        ),
        cache_mode=CacheMode.BYPASS,
    )

    # Only the crawl itself needs the browser; release it before post-processing.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )

    # getattr keeps this tolerant of result objects that lack these attributes.
    if not getattr(result, "success", True):
        print("CRAWL FAILED:", getattr(result, "error_message", None))
        return

    raw_content = result.extracted_content
    print(raw_content)

    # A failed extraction can leave extracted_content as None or an empty string.
    if not raw_content:
        print("NO EXTRACTED CONTENT: nothing to save")
        return

    # Parse JSON string to Python object
    try:
        data = json.loads(raw_content)
    except (json.JSONDecodeError, TypeError) as e:
        print("JSON DECODE ERROR:", e)
        return

    # Handle cases where the extraction returns a single dict
    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list):
        print("UNEXPECTED JSON SHAPE:", type(data).__name__)
        return

    # DictWriter can only serialise mappings; drop anything else the LLM emitted.
    rows = [item for item in data if isinstance(item, dict)]
    skipped = len(data) - len(rows)
    if skipped:
        print(f"Skipped {skipped} non-object entries")

    # Save to CSV
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore": records may carry keys beyond the three
        # columns; the default ("raise") would abort with ValueError after the
        # file has already been truncated.
        writer = csv.DictWriter(
            f,
            fieldnames=["title", "url", "error"],
            extrasaction="ignore",
        )
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nSaved {len(rows)} records to: {csv_file}\n")

In [3]:
# Alternative pages to try - uncomment one (and comment out the active url below):
# url = "https://platform.openai.com/docs/pricing"
# url = "https://3news.com/news/politics"

# Page to crawl: the GhanaWeb news archive listing.
url = "https://www.ghanaweb.com/GhanaHomePage/NewsArchive"

# Top-level `await` relies on the notebook kernel's support for awaiting in cells;
# in a plain script this call would need asyncio.run(main(url=url)) instead.
await main(url=url)

/tmp/ipykernel_3280603/3799885890.py:13: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  schema=GhanaWebModel.schema(),


[
    {
        "title": "NACOC nabs 3 in connection with 1,158kg suspected cocaine shipment to Belgium",
        "url": "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/NACOC-nabs-3-in-connection-with-1-158kg-suspected-cocaine-shipment-to-Belgium-2012807",
        "error": false
    },
    {
        "title": "'Let’s choose unity over division' - NPP running mate NAPO urges Bono executives",
        "url": "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/Let-s-choose-unity-over-division-NPP-running-mate-NAPO-urges-Bono-executives-2012818",
        "error": false
    },
    {
        "title": "Why Randy Abbey didn’t attend Farmers Day Celebrations - COCOBOD PRO explains",
        "url": "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/Why-Randy-Abbey-didn-t-attend-Farmers-Day-Celebrations-COCOBOD-PRO-explains-2012800",
        "error": false
    },
    {
        "title": "Alhaji Talal Fattal stable after heart procedure",
        "url": "https://www.ghanaweb.com/GhanaHomePage/N