In [1]:
import os
import yaml
import dotenv
import logging
import json
import yaml
import time
import random

from pathlib import Path

from datetime import datetime

import asyncio
import nest_asyncio

import pydantic
from pydantic import BaseModel, Field, RootModel
from typing import Dict, TypedDict, Type, List, Optional, Any
from dataclasses import dataclass, field
from enum import Enum

import numpy as np
import pandas as pd

import openai
from openai import AsyncOpenAI

import agents
from agents.exceptions import InputGuardrailTripwireTriggered
from agents import (Agent, Runner, Tool, ModelSettings, FunctionTool, InputGuardrail, GuardrailFunctionOutput,
                    SQLiteSession, set_default_openai_api, set_default_openai_client
                   )

from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

from IPython.display import HTML, Image, Markdown, display

from prompt_loader import PromptLoader
from log_handler import SQLiteLogHandler, setup_sqlite_logging, sanitize_error_for_logging
from utilities import (StepStatus, WorkflowStatus,
                       get_workflow_status_report, print_workflow_summary, 
                      )
# from scrape import gather_urls


In [2]:
# Report the installed versions of the libraries this notebook depends on,
# so a reader can tell which SDK releases produced the recorded outputs.
print("\n".join([
    f"OpenAI:            {openai.__version__}",
    f"OpenAI Agents SDK  {agents.__version__}",
    f"Pydantic           {pydantic.__version__}",
]))


OpenAI:            1.107.0
OpenAI Agents SDK  0.2.11
Pydantic           2.11.7


# Basic usage
- Run a prompt using agents
- Sessions
- Route through Portkey for observability
- Save logs
- Link to openai for traces and evals


In [3]:
# Load environment variables (including OPENAI_API_KEY) from a .env file.
#
# Important when routing through Portkey: the following must be set, e.g.
#   OPENAI_BASE_URL="http://localhost:8787/v1"
#   OPENAI_DEFAULT_HEADERS='{"x-portkey-provider": "openai"}'
# Launch the local proxy service
# (https://portkey.ai/docs/product/enterprise-offering/components):
#   npx @portkey-ai/gateway
# The gateway could also log to a database via a portkey_config.yaml such as:
#   logging:
#     sink: sql
#     database_url: postgres://user:password@localhost:5432/portkey
#   npx @portkey-ai/gateway --portkey_config.yaml

dotenv.load_dotenv()

# Allow nested event loops so `await` / asyncio code can run inside the
# Jupyter kernel's already-running loop.
nest_asyncio.apply()

# Verbose console logging, to enable if something doesn't work:
# logging.basicConfig(level=logging.DEBUG)
# openai_logger = logging.getLogger("openai")
# openai_logger.setLevel(logging.DEBUG)

# Needed for Portkey: the Responses API is persistent-connection-oriented and
# seems not to work through the gateway, so use Chat Completions instead.
set_default_openai_api("chat_completions")

# Echo the routing configuration actually picked up from the environment.
# NOTE(review): if the headers ever carry credentials, this output would leak
# them into the saved notebook — confirm before sharing.
print("OPENAI_BASE_URL =", os.getenv("OPENAI_BASE_URL"))
print("OPENAI_DEFAULT_HEADERS =", os.getenv("OPENAI_DEFAULT_HEADERS"))


OPENAI_BASE_URL = http://localhost:8787/v1
OPENAI_DEFAULT_HEADERS = {"x-portkey-provider": "openai"}


In [4]:
def setup_logging(session_id: str = "default", db_path: str = "agent_logs.db") -> logging.Logger:
    """Set up logging to console and SQLite database.

    Builds (or reconfigures) the logger named ``NewsletterAgent.<session_id>``
    with two INFO-level handlers: a console ``StreamHandler`` with a
    timestamped format, and a ``SQLiteLogHandler`` writing to ``db_path``.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers installed by a previous call are removed *and closed*
    before the new ones are attached, so old handlers do not accumulate or
    keep their underlying resources open.

    Args:
        session_id: Suffix used to build the logger name.
        db_path: Path handed to ``SQLiteLogHandler``.

    Returns:
        The configured ``logging.Logger``. Propagation to the root logger is
        disabled so records are not emitted twice.
    """
    # Ensure the root logger has a default configuration (no-op if it is
    # already configured).
    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(f"NewsletterAgent.{session_id}")
    logger.setLevel(logging.INFO)

    # Detach and close any handlers left over from an earlier call. Merely
    # clearing the list would drop the references without releasing whatever
    # the handlers hold open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_formatter = logging.Formatter(
        '%(asctime)s | %(name)s | %(levelname)s | %(message)s',
        datefmt='%H:%M:%S'
    )
    console_handler.setFormatter(console_formatter)

    # SQLite handler (stores the bare message; presumably the handler records
    # level/timestamp in its own columns — TODO confirm in log_handler)
    sqlite_handler = SQLiteLogHandler(db_path)
    sqlite_handler.setLevel(logging.INFO)
    sqlite_formatter = logging.Formatter('%(message)s')
    sqlite_handler.setFormatter(sqlite_formatter)

    # Add handlers to logger
    logger.addHandler(console_handler)
    logger.addHandler(sqlite_handler)

    # Prevent propagation to root logger (avoids duplicate console lines)
    logger.propagate = False

    return logger

# Create the notebook-wide logger, writing to the console and to test_logs.db.
logger = setup_logging("newsletter_agent", "test_logs.db")

# Emit one record at each severity to exercise both handlers. The `extra`
# fields are attached to the log record for the SQLite handler to consume.
logger.log(
    logging.INFO,
    "Test info message",
    extra=dict(step_name='test_step', agent_session='demo_session'),
)
logger.log(
    logging.WARNING,
    "Test warning message",
    extra=dict(step_name='test_step', agent_session='demo_session'),
)
logger.log(
    logging.ERROR,
    "Test error message",
    extra=dict(step_name='error_step', agent_session='demo_session'),
)

# Demonstrate the secret filter: the API-key-shaped token in the input should
# come back redacted. Left as the cell's last expression so the sanitized
# string is displayed.
sanitize_error_for_logging("log with some bad stuff for the filter: sk-proj-123456789012345678901234567890123456789012345678")

12:28:09 | NewsletterAgent.newsletter_agent | INFO | Test info message
12:28:09 | NewsletterAgent.newsletter_agent | ERROR | Test error message


'log with some bad stuff for the filter: [API_KEY_REDACTED]'

In [5]:
# Build the async OpenAI client from environment configuration.
# OPENAI_DEFAULT_HEADERS is only needed when routing through a gateway; when
# it is unset (or empty) fall back to no extra headers instead of crashing in
# json.loads(None).
client = AsyncOpenAI(
    base_url=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
    default_headers=json.loads(os.getenv("OPENAI_DEFAULT_HEADERS") or "{}"),
)

# set the client globally
set_default_openai_client(client)


In [None]:
# run a simple query through portkey
# can see traces in openai https://platform.openai.com/logs?api=traces
# potentially set up evals - https://platform.openai.com/evaluations
!
myagent = Agent(
    name="Swallow Expert",
    instructions="You are an expert on airspeed velocities of swallows. Answer questions about swallow flight speeds with authority and humor when appropriate.",
    model="gpt-5-mini",
    # these below seem to be being deprecated, you probably have to use old chat API directly on eg gpt-4o for logprobs
    # model_settings=ModelSettings(temperature=0.0, logprobs=1, top_logprobs=1)
)

# 1) Create (or reuse) a session. Use a durable DB path if you want persistence.
session = SQLiteSession("test_swallow_chat", "swallow.db")

# 2) First turn
myresult = await Runner.run(myagent, "What is the airspeed velocity of an unladen swallow?", session=session)
display(Markdown(myresult.final_output))

In [None]:
# 3) Next turns — just keep reusing the same session.
# The follow-up prompt does not restate the question; it passes the same
# `session` object as the first turn so the earlier exchange is available.
myresult = await Runner.run(myagent, "ok, go ahead and explain how that number comes about", session=session)

# Render the model's reply as Markdown rather than plain text.
display(Markdown(myresult.final_output))

# More advanced usage
- Prompt Management
- Structured JSON outputs, enables validation and safe passing downstream over long pipelines
- Map prompts to larger data sets asynchronously (e.g. send parallel batches of 50)


In [None]:
# Get prompts from the prompt repository (the promptfoo YAML files).
# Langfuse is probably a better enterprise option.
# A prompt repository lets us run evals and version prompts, improving
# performance over time.

logger.info("Show available prompts")
my_prompt_loader = PromptLoader()
my_prompt_loader.list_available_prompts()

prompt_name = 'headline_classifier_v1'
prompt_dict = my_prompt_loader.load_prompt_by_name(prompt_name)
# NOTE(review): the sleeps presumably keep logger output and print output
# from interleaving out of order in the cell output — confirm.
time.sleep(1)

logger.info("Load a prompt")
# prompt_dict is read with the keys 'system' and 'user' (the two templates).
print(prompt_dict.get('system'), "")
print(prompt_dict.get('user'), "")
time.sleep(1)

logger.info("Show prompt metadata")
prompt_metadata = my_prompt_loader.get_prompt_metadata(prompt_name)
print(prompt_metadata)
time.sleep(1)

logger.info("Format a prompt with input")
# Fill the template's `input_str` placeholder with a sample headline.
print(my_prompt_loader.format_prompt(prompt_name, input_str="AI Is Replacing Online Moderators, But It's Bad at the Job"))


In [None]:
# Output class for classifying headlines.
# NOTE(review): the docstring and Field descriptions are presumably emitted
# into the JSON schema given to the model as the structured-output contract,
# so treat their wording as part of the prompt — confirm before editing.
class ClassificationResult(BaseModel):
    """A single headline classification result"""
    # Echo of the classified text; lets callers match results back to inputs.
    input_str: str = Field(description="The original headline text")
    # The boolean verdict for this headline.
    output: bool = Field(description="Whether the headline is AI-related")

# Batch wrapper: ClassifierAgent.classify_batch requires its output type to
# expose a `results_list` attribute, which this model provides.
class ClassificationResultList(BaseModel):
    """List of ClassificationResult for batch processing"""
    # One entry per input headline.
    results_list: list[ClassificationResult] = Field(description="List of classification results")


In [None]:
# Re-create the client (same configuration as the earlier client cell) so this
# cell can be run on its own. OPENAI_DEFAULT_HEADERS is optional: when it is
# unset or empty, use no extra headers instead of crashing in json.loads(None).
client = AsyncOpenAI(
    base_url=os.getenv("OPENAI_BASE_URL"),
    api_key=os.getenv("OPENAI_API_KEY"),
    default_headers=json.loads(os.getenv("OPENAI_DEFAULT_HEADERS") or "{}"),
)

# set the client globally
set_default_openai_client(client)

class ClassifierAgent(Agent):
    """Agent for classifying headlines as AI-related or not
    or more generally apply a prompt to a string for a classification according to an output type
    """

    def __init__(self,
                 system_prompt: str,
                 user_prompt: str,
                 output_type: Type[BaseModel],
                 model: str,
                 verbose: bool = False):
        """
        Initialize the ClassifierAgent

        Args:
            system_prompt: The system prompt template to use
            user_prompt: The user prompt template to use; must contain an
                ``{input_str}`` placeholder (filled via ``str.format``)
            output_type: Pydantic model class for structured output
            model: Name of the model to run the agent with
            verbose: Enable verbose logging
        """
        super().__init__(
            name="ClassifierAgent",
            model=model,
            instructions=system_prompt,
            output_type=output_type
        )
        self.system_prompt = system_prompt
        self.user_prompt = user_prompt
        self.verbose = verbose

        if self.verbose:
            logger.info(f"""Initialized ClassifierAgent:
system_prompt:
{self.system_prompt}
user_prompt:
{self.user_prompt}
output_type:         {output_type.__name__}
model:               {self.model}
schema:              {json.dumps(output_type.model_json_schema(), indent=2)}
""")

    @retry(
        retry=retry_if_exception_type((openai.APIConnectionError,
                                       openai.APITimeoutError,
                                       openai.InternalServerError)),
        stop=stop_after_attempt(5),  # 5 attempts sufficient for classification
        # Log each failed attempt through the notebook logger; the text is
        # sanitized first because exception messages may embed credentials.
        after=lambda retry_state: logger.warning(sanitize_error_for_logging(
            f"Attempt {retry_state.attempt_number}: {retry_state.outcome.exception()}")),
        wait=wait_exponential(multiplier=1, min=1, max=30),
    )
    async def classify(self, input_str: str) -> Any:
        """
        Classify a single input or a string with multiple inputs to the specified type

        Transient OpenAI connection/timeout/server errors are retried with
        exponential backoff by the ``@retry`` decorator.

        Args:
            input_str: The input text to classify

        Returns:
            The run result returned by ``Runner.run``; the structured output
            (an instance of the configured output type) is available on its
            ``final_output`` attribute.
        """
        user_message = self.user_prompt.format(input_str=input_str)
        if self.verbose:
            logger.info(f"User message: {user_message}")

        run_result = await Runner.run(self, user_message)
        if self.verbose:
            logger.info(f"Result: {run_result}")
        return run_result

    async def classify_batch(self, input_list: List[str], batch_size: int = 25,
                             *, max_concurrency: int = 16, retries: int = 3
                            ) -> Any:

        """
        Classify a list using paged, parallel calls to `self.classify()`,
        preserving the original input order and validating page sizes.

        Args:
            input_list: Items to classify.
            batch_size: Number of items sent per request (page). Must be >= 1.
            max_concurrency: Maximum number of pages in flight at once.
                Must be >= 1.
            retries: Maximum attempts per page (values below 1 are treated
                as a single attempt).

        Returns:
            An instance of the agent's output type whose ``results_list``
            holds one result per input item, in input order.

        Raises:
            ValueError: If ``batch_size`` or ``max_concurrency`` is < 1, or
                if the assembled result count does not match the input count.
            Exception: The last error of the first page that still failed
                after all attempts.
        """
        # Type must have a 'results_list' element
        null_return = self.output_type(results_list=[])
        if not input_list:
            return null_return
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if max_concurrency < 1:
            # Semaphore(0) would block every page forever.
            raise ValueError(f"max_concurrency must be >= 1, got {max_concurrency}")
        attempts = max(1, retries)

        pages = [input_list[i:i+batch_size]
                 for i in range(0, len(input_list), batch_size)]
        sem = asyncio.Semaphore(max_concurrency)
        logger.info(f"Sending {len(pages)} batches with concurrency {max_concurrency}")

        async def _guarded_classify(page_idx: int, items: List[str]) -> tuple:
            """Classify one page; return (page_idx, result) or (page_idx, exception)."""
            input_str = "\n".join(items)
            last_exc = None
            for attempt in range(attempts):
                try:
                    # Hold the semaphore only for the request itself, not
                    # for validation or the backoff sleep.
                    async with sem:
                        result = await self.classify(input_str)
                    res = result.final_output
                    if not hasattr(res, "results_list"):
                        raise ValueError("Bad structured output or missing 'results_list'.")
                    if not isinstance(res.results_list, list):
                        raise ValueError("Structured output invalid 'results_list'.")
                    if len(res.results_list) != len(items):
                        raise ValueError(
                            f"Page {page_idx}: count mismatch (got {len(res.results_list)} vs expected {len(items)})."
                        )
                    return page_idx, res
                except Exception as e:
                    last_exc = e
                    logger.info(sanitize_error_for_logging(
                        f"[page {page_idx}] attempt {attempt+1}/{attempts} failed: {e}"))
                    # Back off only when another attempt will follow.
                    if attempt < attempts - 1:
                        await asyncio.sleep(2 ** attempt)  # 1s, 2s, 4s backoff

            # All attempts failed: hand the error back so the caller decides.
            return page_idx, last_exc if last_exc else RuntimeError(f"Unknown error on page {page_idx}")

        tasks = [
            asyncio.create_task(_guarded_classify(i, page))
            for i, page in enumerate(pages)
        ]
        # gather() returns results in task order, i.e. original page order.
        page_results = await asyncio.gather(*tasks)

        # Reassemble in original order
        flattened_results = []
        for idx, res_or_exc in page_results:
            if isinstance(res_or_exc, Exception):
                raise res_or_exc
            elif res_or_exc:
                flattened_results.extend(res_or_exc.results_list)
            else:
                logger.info(f"no results for page {idx}")

        final = self.output_type(results_list=flattened_results)

        # Final sanity check
        if len(final.results_list) != len(input_list):
            raise ValueError(f"Final count mismatch: expected {len(input_list)} results, got {len(final.results_list)}.")

        return final



In [None]:
# send singly
prompt_name = 'headline_classifier_v1'
prompt_dict = PromptLoader().load_prompt_by_name(prompt_name)

classifier = ClassifierAgent(prompt_dict.get('system'),
                             prompt_dict.get('user'),
                             ClassificationResult,
                             "gpt-5-mini",
                             verbose=True)

test_headlines = [
    "AI Is Replacing Online Moderators, But It's Bad at the Job",
    "Baby Trapped in Refrigerator Eats Own Foot",
    "Machine Learning Breakthrough in Medical Diagnosis",
    "Local Restaurant Opens New Location",
    "ChatGPT Usage Soars in Educational Settings"
]

prompt_name = 'headline_classifier_v1'
prompt_dict = PromptLoader().load_prompt_by_name(prompt_name)

classifier = ClassifierAgent(prompt_dict.get('system'),
                             prompt_dict.get('user'),
                             ClassificationResult,
                             "gpt-5-mini",
                             verbose=True)

result = await classifier.classify(test_headlines[0])
print(result.final_output)
result = await classifier.classify(test_headlines[1])
print(result.final_output)


In [None]:
# Send a single batch with verbose logging.
# Uses the list-shaped output type so one request can return a result per
# headline.
prompt_name = 'headline_classifier_v1'
prompt_dict = PromptLoader().load_prompt_by_name(prompt_name)

classifier = ClassifierAgent(prompt_dict.get('system'),
                             prompt_dict.get('user'),
                             ClassificationResultList,
                             "gpt-5-mini",
                             verbose=True)

# NOTE(review): this sends the Python list repr (brackets and quotes), while
# classify_batch joins items with newlines — confirm the prompt handles both.
result = await classifier.classify(str(test_headlines))
print(result.final_output)


In [None]:
# Make batches and send multiple in parallel.
# Load the test headlines; the later batch cell reads the 'title' column.
headlines_df = pd.read_csv("test_headlines.csv")
# Last expression: display the loaded frame.
headlines_df


In [None]:
logger.info("classify headlines as AI-related or not")
prompt_name = 'headline_classifier_v1'
prompt_dict = PromptLoader().load_prompt_by_name(prompt_name)

classifier = ClassifierAgent(prompt_dict.get('system'),
                             prompt_dict.get('user'),
                             ClassificationResultList,
                             "gpt-5-mini",
                             verbose=False)

classification_result = await classifier.classify_batch(list(headlines_df['title'].to_list()))
classification_result

In [None]:
# Tabulate the results, then show the AI-related rows followed by the rest.
zdf = pd.DataFrame(
    {
        "input": [item.input_str for item in classification_result.results_list],
        "output": [item.output for item in classification_result.results_list],
    }
)
display(zdf[zdf["output"]])
zdf[~zdf["output"]]


# Run Agent Workflow

In [6]:
# class to store agent state from step to step

class NewsletterAgentState(BaseModel):
    """
    Persistent state for the newsletter agent workflow.

    Manages the complete newsletter generation process with serializable storage
    of headlines, processing results, and workflow progress. Supports resumable execution
    and DataFrame integration for data manipulation.

    """
    # Serializable data storage (DataFrame as list of dicts)
    headline_data: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="List of headline dictionaries with columns: title, url, source, timestamp, is_ai, summary,etc."
    )

    # Configuration
    max_edits: int = Field(default=2, description="Maximum editing iterations")
    concurrency: int = Field(default=16, description="Number of concurrent browsers")
    current_step: int = Field(default=0, description="Current workflow step (0-9)")
    workflow_complete: bool = Field(default=False, description="Whether the entire workflow is complete")

    # Source config
    sources_file: str = Field(
        default="sources.yaml",
        description="YAML filename containing source configurations"
    )
    sources: Dict[str, Any] = Field(
        default_factory=dict,
        description="Dictionary of source configurations loaded from YAML"
    )

    # Topics and clustering
    cluster_names: List[str] = Field(
        default_factory=list,
        description="List of topic names for categorization"
    )
    clusters: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Topic name -> list of article IDs and related info"
    )

    # Newsletter content
    newsletter_sections: Dict[str, str] = Field(
        default_factory=dict,
        description="Section name -> section content"
    )
    final_newsletter: str = Field(default="", description="Final newsletter content")

    # Helper methods
    @classmethod
    def create_headline_df(cls) -> pd.DataFrame:
        """Create an empty DataFrame with proper columns for news headlines."""
        return pd.DataFrame(columns=[
            'id',
            'source',
            'title',
            'orig_url',
            'url',
            'text_path',
            'site_name',
            'published',
            'is_ai',
            'topic_list',
            'cluster',
            'rating',
            'summary'
        ])

    @property
    def headline_dict_to_df(self) -> 'pd.DataFrame':
        """Convert headline data to DataFrame.

        This is a property: read it as ``state.headline_dict_to_df`` (no
        call parentheses). A new DataFrame is built on every access.
        """
        return pd.DataFrame(self.headline_data)

    def headline_df_to_dict(self, df: 'pd.DataFrame'):
        """Update headline data from DataFrame (one dict per row)."""
        self.headline_data = df.to_dict(orient='records')

    def add_headlines(self, new_headlines: List[Dict[str, Any]]) -> None:
        """
        Append new headlines to the stored headline data.

        NOTE: no deduplication is performed; rows are appended as given.
        Records whose keys differ from the existing ones get missing values
        for the columns they lack.

        Args:
            new_headlines: List of dictionaries with headline data
                          Expected keys: title, url, source, timestamp, etc.
        """
        if not new_headlines:
            print("⚠️  No new headlines to add")
            return

        new_df = pd.DataFrame(new_headlines)

        if self.headline_data:
            # headline_dict_to_df is a property; calling it would try to
            # call the returned DataFrame and raise TypeError.
            existing_df = self.headline_dict_to_df
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
            self.headline_df_to_dict(combined_df)
        else:
            self.headline_df_to_dict(new_df)

        print(f"📰 Added headlines - updated count: {len(self.headline_data)}")

    def get_status_summary(self) -> Dict[str, Any]:
        """
        Get a summary of the current session state.

        Returns:
            Dictionary with key metrics and status information, grouped under
            the keys "headlines", "sources", "topics", "workflow" and
            "processing".
        """
        total_headlines = len(self.headline_data)

        return {
            "headlines": {
                "total": total_headlines,
                # TODO: additional breakdowns not yet implemented:
                # group by source
                # ai vs non-ai
                # downloaded: have text path
                # have summary
                # have topics
                # have cluster assignment
                # have rating
            },
            "sources": {
                "config_file": self.sources_file,
                "loaded_sources": len(self.sources),
            },
            "topics": {
                "cluster_topics": len(self.cluster_names),
                "topics": self.cluster_names
            },
            "workflow": {
                "current_step": self.current_step,
                "workflow_complete": self.workflow_complete,
                "max_edits": self.max_edits,
                "concurrency": self.concurrency
            },
            "processing": {
                "topic_clusters": len(self.clusters),
                "newsletter_sections": len(self.newsletter_sections),
                "final_newsletter_length": len(self.final_newsletter)
            }
        }

    def print_status(self) -> None:
        """Print a formatted summary of the current session state."""
        status = self.get_status_summary()

        print("\n" + "="*50)
        print("📊 NEWSLETTER AGENT STATE SUMMARY")
        print("="*50)

        print(f"📰 Headlines: {status['headlines']['total']} total")
        # print(f"   🤖 AI-related: {status['headlines']['is_ai']} ({status['headlines']['ai_percentage']})")
        # print(f"   📄 Non-AI: {status['headlines']['non_ai']}")

        print(f"\n📡 Sources: {status['sources']['loaded_sources']} loaded")
        print(f"   📁 Config: {status['sources']['config_file']}")

        print(f"\n🏷️  Topics: {status['topics']['cluster_topics']} cluster topics")
        if status['topics']['topics']:
            print(f"   📋 Topics: {', '.join(status['topics']['topics'])}")

        print(f"\n⚙️  Workflow:")
        print(f"   📍 Current step: {status['workflow']['current_step']}/9")
        print(f"   ✅ Complete: {status['workflow']['workflow_complete']}")
        print(f"   ✏️ Max edits: {status['workflow']['max_edits']}")
        print(f"   🌐 Concurrency: {status['workflow']['concurrency']}")

        print(f"\n🔄 Processing:")
        # print(f"   📝 Article summaries: {status['processing']['article_summaries']}")
        print(f"   🏷️  Topic clusters: {status['processing']['topic_clusters']}")
        print(f"   📑 Newsletter sections: {status['processing']['newsletter_sections']}")
        if status['processing']['final_newsletter_length'] > 0:
            print(f"   📰 Final newsletter: {status['processing']['final_newsletter_length']} chars")

        print("="*50 + "\n")


    def get_unique_sources(self) -> List[Dict[str, Any]]:
        """Get per-source headline counts from headline data.

        Returns:
            One record per distinct source, each a dict with the keys
            ``'source'`` and ``'count'``. Returns an empty list when there
            are no headlines or no ``'source'`` column to group on.
        """
        df = self.headline_dict_to_df
        if df.empty or 'source' not in df.columns:
            return []

        source_counts = (
            df.assign(count=1)[['source', 'count']]
            .groupby(['source'])
            .sum()
            .reset_index()
        )
        print(source_counts)
        return source_counts.to_dict(orient='records')


In [7]:
import sys

# Force a fresh import of the local `fetch` module so edits to fetch.py are
# picked up without restarting the kernel. Both removals tolerate a missing
# entry: `fetch` may be cached in sys.modules (e.g. imported by another
# module) without `Fetcher` ever having been bound in this notebook, in which
# case a bare `del Fetcher` would raise NameError.
sys.modules.pop('fetch', None)
globals().pop('Fetcher', None)
from fetch import Fetcher


In [8]:
async with Fetcher() as f:
      z=await f.fetch_rss('Ars Technica')
z


2025-09-12 12:28:24,470 - fetcher_4738149776 - INFO - [fetcher_init] Loading sources from sources.yaml
2025-09-12 12:28:24,490 - fetcher_4738149776 - INFO - [fetcher_init] Loaded 17 sources: 7 RSS, 9 HTML, 1 API
2025-09-12 12:28:24,491 - fetcher_4738149776 - INFO - [fetcher_init] Fetcher initialized with max_concurrent=8
2025-09-12 12:28:24,492 - fetcher_4738149776 - INFO - [fetch_rss] Fetching RSS from Ars Technica: https://arstechnica.com/ai/feed/
2025-09-12 12:28:24,786 - fetcher_4738149776 - INFO - [fetch_rss] RSS fetch successful for Ars Technica: 20 articles


{'source': 'Ars Technica',
 'results': [{'source': 'Ars Technica',
   'title': 'Education report calling for ethical AI use contains over 15 fake sources',
   'url': 'https://arstechnica.com/ai/2025/09/education-report-calling-for-ethical-ai-use-contains-over-15-fake-sources/',
   'published': 'Fri, 12 Sep 2025 16:01:27 +0000',
   'rss_summary': 'Experts find fake sources in Canadian government report that took 18 months to complete.'},
  {'source': 'Ars Technica',
   'title': 'OpenAI and Microsoft sign preliminary deal to revise partnership terms',
   'url': 'https://arstechnica.com/ai/2025/09/openai-and-microsoft-sign-preliminary-deal-to-revise-partnership-terms/',
   'published': 'Thu, 11 Sep 2025 22:27:53 +0000',
   'rss_summary': 'Companies work to finalize terms as OpenAI pursues for-profit restructuring.'},
  {'source': 'Ars Technica',
   'title': 'Ted Cruz AI bill could let firms bribe Trump to avoid safety laws, critics warn',
   'url': 'https://arstechnica.com/tech-policy/202

In [9]:
async with Fetcher() as f:
      z = await f.fetch_html('The Verge')
        

2025-09-12 12:28:25,734 - fetcher_4742280016 - INFO - [fetcher_init] Loading sources from sources.yaml
2025-09-12 12:28:25,751 - fetcher_4742280016 - INFO - [fetcher_init] Loaded 17 sources: 7 RSS, 9 HTML, 1 API
2025-09-12 12:28:25,752 - fetcher_4742280016 - INFO - [fetcher_init] Fetcher initialized with max_concurrent=8
2025-09-12 12:28:25,752 - fetcher_4742280016 - INFO - [fetch_html] Fetching HTML from The Verge: https://www.theverge.com/ai-artificial-intelligence
2025-09-12 12:28:27,101 - fetcher_4742280016 - INFO - [fetch_html] Source dict for The Verge: {'url': 'https://www.theverge.com/ai-artificial-intelligence', 'title': 'The Verge', 'sourcename': 'The Verge', 'click': '', 'scroll': 0, 'scroll_div': '', 'initial_sleep': None, 'include': ['^https://www.theverge.com/news'], 'exclude': None, 'minlength': None}


Starting fetch_source https://www.theverge.com/ai-artificial-intelligence, The Verge
fetch_url(https://www.theverge.com/ai-artificial-intelligence)
Fetching https://www.theverge.com/ai-artificial-intelligence to download/sources
Response: 200
10
10


2025-09-12 12:28:46,487 - fetcher_4742280016 - ERROR - [fetch_html] HTML fetch failed for The Verge: '<=' not supported between instances of 'int' and 'NoneType'


performed human like actions
Found last updated time from document.lastModified: 09/12/2025 09:28:46
Saving HTML to download/sources/The_Verge.html


In [10]:
# List the HTML files saved under download/sources by the fetch cells.
!ls download/sources 




HTTPBin_HTML_Test.html The_Verge.html


In [None]:
# Display whatever the most recently executed fetch cell stored in `z`.
z


In [12]:
async with Fetcher() as f:
     z = await f.fetch_api('NewsAPI')
z 


2025-09-12 12:29:15,293 - fetcher_4779257104 - INFO - [fetcher_init] Loading sources from sources.yaml
2025-09-12 12:29:15,309 - fetcher_4779257104 - INFO - [fetcher_init] Loaded 17 sources: 7 RSS, 9 HTML, 1 API
2025-09-12 12:29:15,310 - fetcher_4779257104 - INFO - [fetcher_init] Fetcher initialized with max_concurrent=8
2025-09-12 12:29:15,311 - fetcher_4779257104 - INFO - [newsapi] Fetching top 100 stories matching artificial intelligence since 2025-09-11T12:29:15 from NewsAPI


{'source': 'NewsAPI',
 'results': [{'source': 'NewsAPI',
   'title': "Venezuela military, militias deploy to 'battlefronts', Maduro says",
   'url': 'https://www.yahoo.com/news/articles/venezuela-military-militias-deploy-battlefronts-131832857.html',
   'published': '2025-09-11T13:18:32Z'},
  {'source': 'NewsAPI',
   'title': 'French Voice Actress For Lara Croft Accuses Developer Of Using AI To Alter Her Voice',
   'url': 'https://kotaku.com/lara-croft-tomb-raider-francoise-cadol-aspyr-remaster-2000624934',
   'published': '2025-09-11T15:21:33Z'},
  {'source': 'NewsAPI',
   'title': 'Four talks and events hosted by Dezeen during London Design Festival 2025',
   'url': 'https://www.dezeen.com/2025/09/11/dezeen-events-london-design-festival-2025/',
   'published': '2025-09-11T15:00:27Z'},
  {'source': 'NewsAPI',
   'title': 'Google Pixel 10 Adds C2PA Support to Verify AI-Generated Media Authenticity',
   'url': 'https://thehackernews.com/2025/09/google-pixel-10-adds-c2pa-support-to.html'

In [None]:
# Look up one source's configuration on the Fetcher left bound to `f` by the
# last `async with` cell.
# NOTE(review): `f`'s context has already exited here; this assumes `sources`
# stays readable after exit — confirm.
f.sources.get('Ars Technica')


In [None]:
# tools

class WorkflowStatusTool:
    """Tool to check current workflow status"""

    def __init__(self, workflow_status: WorkflowStatus, logger: logging.Logger):
        """Store the workflow status object and the logger used for tracing.

        Args:
            workflow_status: Workflow status object (kept on the instance; the
                report itself is built from the persistent state in the
                tool context).
            logger: Logger for start/finish/error messages; may be falsy to
                disable logging.
        """
        self.workflow_status = workflow_status
        self.logger = logger

    async def _check_workflow_status(self, ctx, args: str) -> str:
        """Get current workflow status report based on persistent state.

        Args:
            ctx: Tool context; ``ctx.context`` must be the persistent
                ``NewsletterAgentState``.
            args: Tool arguments as passed by the SDK (unused; the tool
                takes no parameters).

        Returns:
            A multi-line, human-readable status report. The report is
            returned whether or not any headline data exists; the data
            summary section is included only when headlines are present.

        Raises:
            Exception: Any error while building the report is logged and
                re-raised.
        """
        if self.logger:
            self.logger.info("Starting check_workflow_status")

        try:
            # Access the persistent state
            state: NewsletterAgentState = ctx.context

            # Create a status report based on persistent state
            step_names = [
                "step_01_gather_urls", "step_02_filter_urls", "step_03_download_articles",
                "step_04_extract_summaries", "step_05_cluster_by_topic", "step_06_rate_articles",
                "step_07_select_sections", "step_08_draft_sections", "step_09_finalize_newsletter"
            ]

            lines = [
                "WORKFLOW STATUS (FROM PERSISTENT STATE)",
                f"Current Step: {state.current_step}/9",
                f"Workflow Complete: {state.workflow_complete}",
                f"Progress: {(state.current_step/9)*100:.1f}%",
                "",
                "Step Details:"
            ]

            for i, step_name in enumerate(step_names, 1):
                if i <= state.current_step:
                    status = "✅ completed"
                elif i == state.current_step + 1:
                    status = "➡️ next to execute"
                else:
                    status = "⭕ not started"

                # "step_01_gather_urls" -> "Step 1 Gather Urls"
                formatted_name = step_name.replace('step_', 'Step ').replace('_', ' ').title()
                formatted_name = formatted_name.replace('0', '').replace('  ', ' ')  # Clean up numbering
                lines.append(f"  {formatted_name}: {status}")

            if state.headline_data:
                # The headline schema names the flag 'is_ai'; the legacy
                # 'ai_related' key is still honoured if present.
                ai_count = sum(
                    1 for a in state.headline_data
                    if a.get('is_ai') is True or a.get('ai_related') is True
                )

                # Prefer dedicated state attributes when the state model
                # defines them; otherwise derive the numbers from fields the
                # state is known to have (missing attributes would raise
                # AttributeError on the model).
                summaries = getattr(state, 'article_summaries', None)
                if summaries is None:
                    summary_count = sum(1 for a in state.headline_data if a.get('summary'))
                else:
                    summary_count = len(summaries)

                clusters = getattr(state, 'topic_clusters', None)
                if clusters is None:
                    clusters = state.clusters

                lines.extend([
                    "",
                    "Data Summary:",
                    f"  Total articles: {len(state.headline_data)}",
                    f"  AI-related: {ai_count}",
                    f"  Summaries: {summary_count}",
                    f"  Clusters: {len(clusters)}",
                    f"  Sections: {len(state.newsletter_sections)}",
                ])

            # Always return the report, even when there is no headline data
            # yet (otherwise the tool would hand None back to the agent).
            result = "\n".join(lines)
            if self.logger:
                self.logger.info("Completed check_workflow_status")
            return result

        except Exception as e:
            if self.logger:
                self.logger.error(f"check_workflow_status failed: {str(e)}")
            raise

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions.

        The tool takes no parameters (empty object schema) and dispatches to
        ``_check_workflow_status``.
        """
        return FunctionTool(
            name="check_workflow_status",
            description="Check the current status of the newsletter workflow and see which steps are completed, in progress, or pending",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._check_workflow_status
        )


class StateInspectionTool:
    """Agent tool that renders a detailed, read-only text report of the persistent state."""

    def __init__(self, verbose: bool = False, logger: logging.Logger = None):
        self.verbose = verbose
        self.logger = logger

    async def _inspect_state(self, ctx, args: str) -> str:
        """Assemble a multi-section plain-text report from the state in ``ctx.context``.

        Args:
            ctx: Tool invocation context; its ``context`` attribute is the workflow state.
            args: Raw tool arguments; unused because the tool declares no parameters.

        Returns:
            The report lines joined with newlines.
        """
        state: NewsletterAgentState = ctx.context

        # Header block: always present, regardless of how far the workflow has progressed.
        out = ["DETAILED STATE INSPECTION", "=" * 50]
        out.append(f"Current Step: {state.current_step}/9")
        out.append(f"Workflow Complete: {state.workflow_complete}")
        out.append(f"Sources File: {state.sources_file}")
        out.append("")
        out.append("HEADLINE DATA:")
        articles = state.headline_data
        out.append(f"  Total articles: {len(articles)}")

        if articles:
            def tally(predicate) -> int:
                """Count the articles for which ``predicate`` is truthy."""
                return len([entry for entry in articles if predicate(entry)])

            out += [
                f"  AI-related: {tally(lambda entry: entry.get('ai_related') is True)}",
                f"  With content: {tally(lambda entry: entry.get('content'))}",
                f"  With ratings: {tally(lambda entry: entry.get('quality_rating'))}",
                f"  With clusters: {tally(lambda entry: entry.get('cluster_topic'))}",
                f"  Sources: {len({entry.get('source', 'Unknown') for entry in articles})}",
            ]

        out += [
            "",
            "PROCESSING RESULTS:",
            f"  Article summaries: {len(state.article_summaries)} articles",
            f"  Topic clusters: {len(state.topic_clusters)} topics",
            f"  Newsletter sections: {len(state.newsletter_sections)} sections",
        ]
        newsletter_state = "Generated" if state.final_newsletter else "Not created"
        out.append(f"  Final newsletter: {newsletter_state}")

        if state.topic_clusters:
            out += ["", "TOPIC CLUSTERS:"]
            out += [
                f"  {topic}: {len(urls)} articles"
                for topic, urls in state.topic_clusters.items()
            ]

        if state.newsletter_sections:
            out += ["", "NEWSLETTER SECTIONS:"]
            for section_name, section_data in state.newsletter_sections.items():
                status = section_data.get('section_status', 'unknown')
                word_count = section_data.get('word_count', 0)
                article_count = section_data.get('article_count', 0)
                out.append(f"  {section_name}: {status}, {article_count} articles, {word_count} words")

        newsletter = state.final_newsletter
        if newsletter:
            word_total = len(newsletter.split())
            # Long newsletters are truncated to a 200-character preview; short ones are shown whole.
            if len(newsletter) > 200:
                body_line = f"  Preview: {newsletter[:200]}..."
            else:
                body_line = f"  Content: {newsletter}"
            out += ["", "FINAL NEWSLETTER:", f"  Length: {word_total} words", body_line]

        return "\n".join(out)

    def create_tool(self) -> FunctionTool:
        """Build the ``inspect_state`` FunctionTool bound to this instance (no parameters)."""
        empty_params_schema = {
            "type": "object",
            "properties": {},
            "required": [],
        }
        tool_description = (
            "Inspect detailed persistent state data including article counts, processing results, and content status. "
            "Useful for debugging and monitoring workflow progress."
        )
        return FunctionTool(
            name="inspect_state",
            description=tool_description,
            params_json_schema=empty_params_schema,
            on_invoke_tool=self._inspect_state,
        )


class GatherUrlsTool:
    """Tool for Step 1: Gather URLs from various news sources.

    Results are written to the persistent state carried in ``ctx.context``;
    ``workflow_status`` is updated alongside for UI progress tracking.
    """

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        self.workflow_status = workflow_status  # Keep for UI progress tracking
        self.verbose = verbose
        self.logger = logger

    async def _gather_urls(self, ctx, args: str) -> str:
        """Execute Step 1: fetch articles from all sources and store them in persistent state.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 1: Gather URLs")

        step_name = "step_01_gather_urls"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 1:
            total_articles = len(state.headline_data)
            if self.logger:
                self.logger.info(f"Step 1 already completed with {total_articles} articles")
            return f"Step 1 already completed! Found {total_articles} articles in persistent state."

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Use real fetching from sources.yaml. The raw payload is deliberately
            # not printed: it can contain every fetched article and would flood the output.
            fetcher = Fetcher()
            sources_results = await fetcher.gather_all()

            # Process results and store in persistent state
            all_articles = []
            successful_sources = []
            failed_sources = []

            for result in sources_results:
                # .get() so that one malformed source result is recorded as a failed
                # source instead of aborting the whole step with a KeyError.
                status = result.get('status')
                source_key = result.get('source_key', 'unknown')
                articles = result.get('results')

                if status == 'success' and articles:
                    # Add source info to each article
                    for article in articles:
                        article['source_key'] = source_key
                        article['ai_related'] = None  # To be determined in step 2
                        all_articles.append(article)
                    successful_sources.append(source_key)
                elif status == 'not_implemented':
                    # Skip HTML/API sources for now
                    continue
                else:
                    # Covers explicit failures and "success" with an empty result list.
                    failed_sources.append(source_key)

            # Store results in persistent state
            state.headline_data = all_articles
            state.current_step = 1

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 1: Gathered {len(all_articles)} URLs from {len(successful_sources)} RSS sources")
                if failed_sources:
                    print(f"⚠️  Failed sources: {', '.join(map(str, failed_sources))}")

            status_msg = f"✅ Step 1 completed successfully! Gathered {len(all_articles)} articles from {len(successful_sources)} sources (RSS only)."
            if failed_sources:
                status_msg += f" {len(failed_sources)} sources failed or not implemented."

            status_msg += f"\n\n📊 Articles stored in persistent state: {len(state.headline_data)}"
            if self.logger:
                self.logger.info(f"Completed Step 1: Gathered {len(all_articles)} articles")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 1 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 1 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="gather_urls",
            description="Execute Step 1: Gather URLs and headlines from various news sources. Only use this tool if Step 1 is not already completed.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._gather_urls
        )


class FilterUrlsTool:
    """Tool for Step 2: Filter URLs to AI-related content.

    Classification is a keyword-based mock, not a model call. Each article in
    the persistent state gets an ``ai_related`` boolean.
    """

    # Longer keywords are distinctive enough to match as plain substrings
    # (so 'gpt' still catches "ChatGPT" and "GPT-4", 'chatbot' catches "chatbots").
    _PHRASE_KEYWORDS = (
        'artificial intelligence', 'machine learning', 'deep learning',
        'neural network', 'large language model', 'gpt', 'claude',
        'openai', 'anthropic', 'chatbot', 'automation', 'algorithm',
        'computer vision', 'natural language', 'robotics',
    )

    # Short acronyms must match as whole words (optionally plural). As substrings
    # they occur inside ordinary words ("said", "email", "html", "fulfillment"),
    # which would mark nearly every headline as AI-related.
    _ACRONYM_PATTERN = r"\b(?:ai|ml|llm|nlp)s?\b"

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    @classmethod
    def _is_ai_related(cls, title, description) -> bool:
        """Return True when the title or description mentions an AI keyword.

        Args:
            title: Article title; ``None`` or empty is treated as no text.
            description: Article description; ``None`` or empty is treated as no text.
        """
        import re  # local import: `re` is not among the notebook's top-level imports

        # The newline separator keeps a phrase from matching across the title/description seam.
        text = f"{title or ''}\n{description or ''}".lower()
        if any(phrase in text for phrase in cls._PHRASE_KEYWORDS):
            return True
        return re.search(cls._ACRONYM_PATTERN, text) is not None

    async def _filter_urls(self, ctx, args: str) -> str:
        """Execute Step 2: flag each gathered article as AI-related or not.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 2: Filter URLs")

        step_name = "step_02_filter_urls"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 2:
            ai_related_count = sum(1 for article in state.headline_data if article.get('ai_related') is True)
            total_count = len(state.headline_data)
            return f"Step 2 already completed! Filtered {total_count} articles, {ai_related_count} identified as AI-related."

        # Check if step 1 is completed
        if state.current_step < 1 or not state.headline_data:
            return f"❌ Cannot execute Step 2: Step 1 (Gather URLs) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Read headlines from persistent state
            total_articles = len(state.headline_data)

            # Mock AI classification - in a real implementation, this would use an AI model
            # to analyze titles and descriptions for AI relevance
            ai_related_count = 0
            for article in state.headline_data:
                is_ai_related = self._is_ai_related(article.get('title'), article.get('description'))

                # Update article with AI classification
                article['ai_related'] = is_ai_related
                if is_ai_related:
                    ai_related_count += 1

            # Update persistent state
            state.current_step = 2

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            # Share of articles flagged as AI-related (reported under the label "accuracy").
            filter_accuracy = ai_related_count / total_articles if total_articles > 0 else 0

            if self.verbose:
                print(f"✅ Completed Step 2: Filtered to {ai_related_count} AI-related headlines from {total_articles} total")

            status_msg = f"✅ Step 2 completed successfully! Filtered {total_articles} headlines to {ai_related_count} AI-related articles (accuracy: {filter_accuracy:.1%})."
            status_msg += f"\n\n📊 Results stored in persistent state. Current step: {state.current_step}"
            if self.logger:
                self.logger.info(f"Completed Step 2: Filtered to {ai_related_count} AI-related articles")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 2 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 2 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="filter_urls",
            description="Execute Step 2: Filter URLs to AI-related content only. Requires Step 1 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._filter_urls
        )


class DownloadArticlesTool:
    """Tool for Step 3: Download article content.

    The download is currently mocked: placeholder text is attached to every
    AI-related article in the persistent state.
    """

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    async def _download_articles(self, ctx, args: str) -> str:
        """Execute Step 3: attach (mock) content to every AI-related article.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 3: Download Articles")

        step_name = "step_03_download_articles"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 3:
            ai_articles = [article for article in state.headline_data if article.get('ai_related') is True]
            downloaded_count = sum(1 for article in ai_articles if article.get('content'))
            return f"Step 3 already completed! Downloaded content for {downloaded_count} AI-related articles."

        # Check if step 2 is completed
        if state.current_step < 2:
            return f"❌ Cannot execute Step 3: Step 2 (Filter URLs) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get AI-related articles from persistent state
            ai_articles = [article for article in state.headline_data if article.get('ai_related') is True]

            if not ai_articles:
                failure_msg = "❌ No AI-related articles found to download. Please run step 2 first."
                # start_step was already called above; record the failure so the
                # tracker is not left with a step that was started but never closed.
                if self.logger:
                    self.logger.warning("Step 3 aborted: no AI-related articles to download")
                self.workflow_status.error_step(step_name, failure_msg)
                return failure_msg

            # Mock content download - in a real implementation, this would fetch actual article content
            successful_downloads = 0
            total_length = 0

            # ai_articles holds the same dict objects as state.headline_data,
            # so updating them here updates the persistent state.
            for article in ai_articles:
                # Simulate downloading article content
                # In reality, this would use web scraping or API calls
                mock_content = f"Mock article content for: {article.get('title', 'Unknown title')}\n\n"
                mock_content += f"This is placeholder content that would normally be extracted from {article.get('url', 'unknown URL')}.\n"
                mock_content += "The article covers topics related to AI and technology as indicated by the title and description.\n"
                mock_content += f"Source: {article.get('source', 'Unknown source')}\n"

                # Add content to the article data
                article['content'] = mock_content
                article['download_timestamp'] = datetime.now().isoformat()
                article['content_length'] = len(mock_content)

                successful_downloads += 1
                total_length += len(mock_content)

            # Calculate stats
            download_success_rate = successful_downloads / len(ai_articles) if ai_articles else 0
            avg_article_length = total_length / successful_downloads if successful_downloads > 0 else 0

            # Update persistent state
            state.current_step = 3

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 3: Downloaded {successful_downloads} AI-related articles")

            status_msg = f"✅ Step 3 completed successfully! Downloaded {successful_downloads} AI-related articles with {download_success_rate:.0%} success rate."
            status_msg += f"\n📊 Average article length: {avg_article_length:.0f} characters"
            status_msg += f"\n🔗 Content stored in persistent state. Current step: {state.current_step}"
            if self.logger:
                self.logger.info(f"Completed Step 3: Downloaded {successful_downloads} articles")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 3 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 3 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="download_articles",
            description="Execute Step 3: Download full article content from filtered URLs. Requires Step 2 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._download_articles
        )


class ExtractSummariesTool:
    """Tool for Step 4: Extract article summaries.

    Summary generation is currently mocked: each downloaded AI-related article
    gets a fixed three-bullet summary stored in the persistent state.
    """

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    async def _extract_summaries(self, ctx, args: str) -> str:
        """Execute Step 4: write a (mock) bullet summary for each downloaded article.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 4: Extract Summaries")

        step_name = "step_04_extract_summaries"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 4:
            summary_count = len([url for url in state.article_summaries.keys() if state.article_summaries[url]])
            return f"Step 4 already completed! Generated summaries for {summary_count} articles."

        # Check if step 3 is completed
        if state.current_step < 3:
            return f"❌ Cannot execute Step 4: Step 3 (Download Articles) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get articles with content from persistent state
            articles_with_content = [
                article for article in state.headline_data
                if article.get('ai_related') is True and article.get('content')
            ]

            if not articles_with_content:
                failure_msg = "❌ No downloaded AI-related articles found to summarize. Please run step 3 first."
                # start_step was already called above; record the failure so the
                # tracker is not left with a step that was started but never closed.
                if self.logger:
                    self.logger.warning("Step 4 aborted: no downloaded articles to summarize")
                self.workflow_status.error_step(step_name, failure_msg)
                return failure_msg

            # Clear existing summaries if rerunning
            state.article_summaries = {}

            # Generate summaries for each article
            articles_summarized = 0
            total_bullets = 0

            for article in articles_with_content:
                # Articles without a URL get a positional placeholder key.
                url = article.get('url', f"article_{articles_summarized}")
                # `or` also covers a title that is present but None, which cannot be sliced.
                title = article.get('title') or 'Unknown title'

                # Mock summary generation - in a real implementation, this would use an AI model
                # to create bullet point summaries from the full article content
                mock_summary = [
                    f"Key insight from '{title[:50]}...' - Main technological development discussed",
                    "Business implications or market impact highlighted in the article",
                    "Future outlook or expert predictions mentioned in the content"
                ]

                # Store summary in persistent state
                state.article_summaries[url] = mock_summary
                articles_summarized += 1
                total_bullets += len(mock_summary)

                # Add summary reference to article data as well
                article['summary_bullets'] = len(mock_summary)
                article['summary_timestamp'] = datetime.now().isoformat()

            # Calculate stats
            avg_bullets_per_article = total_bullets / articles_summarized if articles_summarized > 0 else 0
            summary_quality_score = 0.89  # Mock quality score

            # Update persistent state
            state.current_step = 4

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 4: Created summaries for {articles_summarized} articles")

            status_msg = f"✅ Step 4 completed successfully! Generated {avg_bullets_per_article:.1f}-bullet summaries for {articles_summarized} articles."
            status_msg += f"\n📝 Quality score: {summary_quality_score:.1%}"
            status_msg += f"\n💾 Summaries stored in persistent state. Current step: {state.current_step}"
            if self.logger:
                self.logger.info(f"Completed Step 4: Summarized {articles_summarized} articles")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 4 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 4 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="extract_summaries",
            description="Execute Step 4: Create bullet point summaries of each downloaded article. Requires Step 3 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._extract_summaries
        )


class ClusterByTopicTool:
    """Tool for Step 5: Cluster articles by topic.

    Clustering is a keyword-count mock: each summarized article is assigned to
    the topic whose keywords occur most often in its title/description.
    """

    # Catch-all topic for articles that match no keyword at all.
    _FALLBACK_TOPIC = "Other AI Topics"

    # Topic -> keywords, in the order clusters are reported. On a tie the topic
    # listed first wins, because a later topic must score strictly higher.
    _TOPIC_KEYWORDS = {
        "LLM Advances": ["llm", "large language model", "gpt", "claude", "language model", "chatbot", "chat"],
        "AI Safety & Ethics": ["safety", "ethics", "bias", "fairness", "responsible", "trust", "alignment"],
        "Business AI Applications": ["business", "enterprise", "productivity", "automation", "workflow", "commercial"],
        "Research Breakthroughs": ["research", "breakthrough", "paper", "study", "academic", "university", "science"],
        "Industry News": ["company", "startup", "funding", "acquisition", "partnership", "launch", "release"],
    }

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    async def _cluster_by_topic(self, ctx, args: str) -> str:
        """Execute Step 5: group summarized articles into topic clusters.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 5: Cluster By Topic")

        step_name = "step_05_cluster_by_topic"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 5:
            cluster_count = len(state.topic_clusters)
            total_articles = sum(len(articles) for articles in state.topic_clusters.values())
            return f"Step 5 already completed! Created {cluster_count} topic clusters with {total_articles} articles."

        # Check if step 4 is completed
        if state.current_step < 4:
            return f"❌ Cannot execute Step 5: Step 4 (Extract Summaries) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get articles with summaries from persistent state
            articles_with_summaries = [
                article for article in state.headline_data
                if article.get('ai_related') is True and
                article.get('url') in state.article_summaries
            ]

            if not articles_with_summaries:
                failure_msg = "❌ No summarized articles found to cluster. Please run step 4 first."
                # start_step was already called above; record the failure so the
                # tracker is not left with a step that was started but never closed.
                if self.logger:
                    self.logger.warning("Step 5 aborted: no summarized articles to cluster")
                self.workflow_status.error_step(step_name, failure_msg)
                return failure_msg

            # Start from empty clusters (also clears results of a previous run).
            # Topic order: keyword topics first, catch-all last.
            state.topic_clusters = {topic: [] for topic in self._TOPIC_KEYWORDS}
            state.topic_clusters[self._FALLBACK_TOPIC] = []

            # Mock clustering logic - in a real implementation, this would use NLP/ML
            # to group articles by semantic similarity of their titles and summaries
            for article in articles_with_summaries:
                url = article.get('url', '')
                # `or ''` also covers fields that are present but None.
                title_lower = (article.get('title') or '').lower()
                description_lower = (article.get('description') or '').lower()

                # Find best matching topic
                best_topic = self._FALLBACK_TOPIC
                max_matches = 0

                for topic, keywords in self._TOPIC_KEYWORDS.items():
                    matches = sum(1 for keyword in keywords
                                  if keyword in title_lower or keyword in description_lower)

                    if matches > max_matches:
                        max_matches = matches
                        best_topic = topic

                # Add article URL to the appropriate cluster
                state.topic_clusters[best_topic].append(url)

                # Also update the article with cluster info
                article['cluster_topic'] = best_topic
                article['cluster_timestamp'] = datetime.now().isoformat()

            # Remove empty clusters
            state.topic_clusters = {
                topic: articles for topic, articles in state.topic_clusters.items()
                if articles
            }

            # Calculate stats
            total_clusters = len(state.topic_clusters)
            total_articles = sum(len(articles) for articles in state.topic_clusters.values())
            cluster_coherence_score = 0.84  # Mock coherence score

            # Update persistent state
            state.current_step = 5

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 5: Created {total_clusters} topic clusters")

            status_msg = f"✅ Step 5 completed successfully! Organized {total_articles} articles into {total_clusters} topic clusters."
            status_msg += f"\n📊 Cluster coherence score: {cluster_coherence_score:.1%}"
            status_msg += f"\n🏷️ Topics: {', '.join(state.topic_clusters.keys())}"
            status_msg += f"\n💾 Clusters stored in persistent state. Current step: {state.current_step}"
            if self.logger:
                self.logger.info(f"Completed Step 5: Created {total_clusters} topic clusters")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 5 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 5 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="cluster_by_topic",
            description="Execute Step 5: Group articles by thematic topics using clustering. Requires Step 4 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._cluster_by_topic
        )


class RateArticlesTool:
    """Tool for Step 6: Rate article quality and importance.

    Rating is a mock heuristic plus random jitter; each clustered AI-related
    article gets a ``quality_rating`` between 1 and 10, rounded to one decimal.
    """

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None,
                 rng: Optional[Any] = None):
        """
        Args:
            workflow_status: Tracker updated alongside the persistent state for UI progress.
            verbose: When True, print a completion line.
            logger: Optional logger for start/completion/error messages.
            rng: Optional object exposing ``uniform(a, b)`` (e.g. ``random.Random(seed)``)
                used for the rating jitter; pass one to get reproducible ratings.
                Defaults to the global ``random`` module.
        """
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger
        self.rng = rng

    async def _rate_articles(self, ctx, args: str) -> str:
        """Execute Step 6: assign a (mock) quality rating to every clustered article.

        Args:
            ctx: Tool invocation context; ``ctx.context`` is the persistent workflow state.
            args: Raw tool arguments (unused; the tool declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the returned
            message (prefixed with ❌) rather than raised.
        """
        if self.logger:
            self.logger.info("Starting Step 6: Rate Articles")

        step_name = "step_06_rate_articles"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 6:
            rated_articles = [article for article in state.headline_data if article.get('quality_rating')]
            avg_rating = sum(article.get('quality_rating', 0) for article in rated_articles) / len(rated_articles) if rated_articles else 0
            return f"Step 6 already completed! Rated {len(rated_articles)} articles with average rating {avg_rating:.1f}/10."

        # Check if step 5 is completed
        if state.current_step < 5:
            return f"❌ Cannot execute Step 6: Step 5 (Cluster By Topic) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get clustered articles from persistent state
            clustered_articles = [
                article for article in state.headline_data
                if article.get('ai_related') is True and article.get('cluster_topic')
            ]

            if not clustered_articles:
                failure_msg = "❌ No clustered articles found to rate. Please run step 5 first."
                # start_step was already called above; record the failure so the
                # tracker is not left with a step that was started but never closed.
                if self.logger:
                    self.logger.warning("Step 6 aborted: no clustered articles to rate")
                self.workflow_status.error_step(step_name, failure_msg)
                return failure_msg

            rng = self.rng if self.rng is not None else random

            # Rate each article based on mock criteria
            articles_rated = 0
            total_rating = 0
            high_quality_count = 0

            for article in clustered_articles:
                # Mock rating logic - in reality, this would use AI to evaluate:
                # - Content quality, originality, depth
                # - Source credibility
                # - Relevance to AI community
                # - Timeliness and newsworthiness

                # `or ''` also covers a title that is present but None.
                title_length = len(article.get('title') or '')
                has_description = bool(article.get('description'))
                source_quality = 8 if article.get('source') in ['Techmeme', 'Ars Technica', 'The Verge'] else 6
                cluster_bonus = 2 if article.get('cluster_topic') != 'Other AI Topics' else 0

                # Calculate mock quality rating (1-10)
                base_rating = 5
                if title_length > 50:
                    base_rating += 1
                if has_description:
                    base_rating += 1
                rating = min(10, base_rating + (source_quality - 6) + cluster_bonus)

                # Add some randomness to make it more realistic, then round once.
                # The rounded value is the one stored, so the statistics below must use
                # it too; otherwise e.g. 6.96 is stored as 7.0 but not counted as >= 7.0.
                rating = round(max(1, min(10, rating + rng.uniform(-1, 1))), 1)

                # Store rating in article data
                article['quality_rating'] = rating
                article['rating_timestamp'] = datetime.now().isoformat()

                articles_rated += 1
                total_rating += rating
                if rating >= 7.0:
                    high_quality_count += 1

            # Calculate stats
            avg_rating = total_rating / articles_rated if articles_rated > 0 else 0

            # Update persistent state
            state.current_step = 6

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 6: Rated {articles_rated} articles")

            status_msg = f"✅ Step 6 completed successfully! Rated {articles_rated} articles with average rating {avg_rating:.1f}/10."
            status_msg += f"\n⭐ High quality articles (≥7.0): {high_quality_count}"
            status_msg += f"\n💾 Ratings stored in persistent state. Current step: {state.current_step}"
            if self.logger:
                self.logger.info(f"Completed Step 6: Rated {articles_rated} articles")
            return status_msg

        except Exception as e:
            if self.logger:
                self.logger.error(f"Step 6 failed: {str(e)}")
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 6 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="rate_articles",
            description="Execute Step 6: Evaluate article quality and importance with ratings. Requires Step 5 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._rate_articles
        )


class SelectSectionsTool:
    """Tool for Step 7: Select newsletter sections.

    Groups rated, AI-related articles by their ``cluster_topic`` and stores one
    section plan per topic (highest-rated articles first) on the state object
    carried in ``ctx.context``.
    """

    # Rating thresholds deciding which articles are eligible for a section.
    HIGH_QUALITY_MIN = 7.0
    MEDIUM_QUALITY_MIN = 5.0
    # Maximum number of articles listed in a single section.
    MAX_ARTICLES_PER_SECTION = 5
    # Section used when an article has no usable cluster topic.
    DEFAULT_SECTION = 'Other AI Topics'

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        """Store the shared status tracker, verbosity flag and optional logger."""
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    def _fail(self, step_name: str, message: str) -> str:
        """Record a failure for ``step_name`` on the status tracker and return ``message``."""
        self.workflow_status.error_step(step_name, message)
        return message

    async def _select_sections(self, ctx, args: str) -> str:
        """Execute Step 7: Select Sections using persistent state.

        Args:
            ctx: Run context; ``ctx.context`` is the workflow state object.
            args: Raw tool arguments (unused; the tool schema declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the
            returned string (prefixed with ❌) rather than raised.
        """
        step_name = "step_07_select_sections"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 7:
            section_count = len(state.newsletter_sections)
            return f"Step 7 already completed! Created {section_count} newsletter sections."

        # Check if step 6 is completed
        if state.current_step < 6:
            return f"❌ Cannot execute Step 7: Step 6 (Rate Articles) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get rated articles from persistent state
            rated_articles = [
                article for article in state.headline_data
                if article.get('ai_related') is True and article.get('quality_rating')
            ]

            if not rated_articles:
                # The step was already marked as started, so close it out as an
                # error instead of leaving it dangling in the tracker.
                return self._fail(
                    step_name,
                    "❌ No rated articles found to organize into sections. Please run step 6 first."
                )

            # High-quality articles go first so their topics lead the section order.
            high_quality_articles = [
                a for a in rated_articles
                if a.get('quality_rating', 0) >= self.HIGH_QUALITY_MIN
            ]
            medium_quality_articles = [
                a for a in rated_articles
                if self.MEDIUM_QUALITY_MIN <= a.get('quality_rating', 0) < self.HIGH_QUALITY_MIN
            ]

            # Group eligible articles by cluster topic. ``or`` (rather than a
            # .get default) also covers a cluster_topic that is present but
            # None/empty, which would otherwise become a None section key.
            cluster_sections = {}
            for article in high_quality_articles + medium_quality_articles:
                cluster = article.get('cluster_topic') or self.DEFAULT_SECTION
                cluster_sections.setdefault(cluster, []).append(article)

            if not cluster_sections:
                # Completing with zero sections would advance current_step to 7
                # while leaving Step 8 with nothing to draft.
                return self._fail(
                    step_name,
                    f"❌ Cannot complete Step 7: no rated articles scored {self.MEDIUM_QUALITY_MIN} or higher, "
                    "so no sections could be created."
                )

            # Build the plan locally and commit it only once every section is
            # ready, so a mid-way failure cannot leave a partial plan in state.
            sections = {}
            articles_assigned = 0
            for cluster, articles in cluster_sections.items():
                # Sort articles by rating (highest first) and keep the top ones
                sorted_articles = sorted(articles, key=lambda x: x.get('quality_rating', 0), reverse=True)
                top_articles = sorted_articles[:self.MAX_ARTICLES_PER_SECTION]

                # Section outline (content is filled in by step 8)
                sections[cluster] = {
                    'title': cluster,
                    'article_count': len(top_articles),
                    'articles': [{
                        'url': article.get('url'),
                        'title': article.get('title'),
                        'rating': article.get('quality_rating'),
                        'source': article.get('source')
                    } for article in top_articles],
                    'section_status': 'selected',
                    'timestamp': datetime.now().isoformat()
                }
                articles_assigned += len(top_articles)

            # Update persistent state (replaces any plan from an earlier run)
            state.newsletter_sections = sections
            state.current_step = 7

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 7: Created {len(state.newsletter_sections)} newsletter sections")

            status_msg = f"✅ Step 7 completed successfully! Organized content into {len(state.newsletter_sections)} sections with {articles_assigned} articles assigned."
            status_msg += f"\n📑 Sections: {', '.join(state.newsletter_sections.keys())}"
            status_msg += f"\n💾 Section plan stored in persistent state. Current step: {state.current_step}"
            return status_msg

        except Exception as e:
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 7 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="select_sections",
            description="Execute Step 7: Organize articles into newsletter sections based on topics and ratings. Requires Step 6 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._select_sections
        )


class DraftSectionsTool:
    """Tool for Step 8: Draft section content.

    Turns each section plan with status 'selected' into Markdown text built
    from a canned intro plus per-article bullet summaries, and stores the text
    back on the section entry.
    """

    # Only this many articles from each section plan are written up.
    MAX_ARTICLES_PER_SECTION = 3

    # Canned intro sentence per known section name; other names get a generic intro.
    INTRO_TEMPLATES = {
        'LLM Advances': "The latest developments in large language models continue to push the boundaries of what's possible in AI.",
        'AI Safety & Ethics': "Important discussions around responsible AI development and deployment are shaping the future of the field.",
        'Business AI Applications': "Companies are finding innovative ways to integrate AI into their products and workflows.",
        'Research Breakthroughs': "Academic researchers are making significant strides in advancing our understanding of artificial intelligence.",
        'Industry News': "The AI industry continues to evolve with new partnerships, funding rounds, and product launches."
    }

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        """Store the shared status tracker, verbosity flag and optional logger."""
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    def _render_section(self, section_name: str, articles: list, article_summaries: dict) -> str:
        """Render one section (heading, intro, article write-ups) as Markdown text."""
        intro = self.INTRO_TEMPLATES.get(section_name, f"Here are the latest updates in {section_name.lower()}.")
        parts = [f"## {section_name}\n\n", f"{intro}\n\n"]

        # Placeholder bullets used when no summary is stored for an article.
        fallback_bullets = [
            f"Key insights from this {section_name.lower()} article",
            "Important implications for the AI community",
            "Notable developments worth following"
        ]

        for article in articles[:self.MAX_ARTICLES_PER_SECTION]:
            # Section plans may carry explicit None values for these keys, so a
            # .get() default alone would let the text "None" leak into the output.
            article_url = article.get('url') or ''
            article_title = article.get('title') or 'Unknown Title'
            article_source = article.get('source') or 'Unknown Source'

            summary_bullets = article_summaries.get(article_url) or fallback_bullets
            if isinstance(summary_bullets, str):
                # A plain-string summary would otherwise be iterated character
                # by character; treat it as a single bullet.
                summary_bullets = [summary_bullets]

            parts.append(f"### {article_title}\n")
            parts.append(f"*Source: {article_source}*\n\n")
            parts.extend(f"- {bullet}\n" for bullet in summary_bullets)
            parts.append(f"\n[Read more]({article_url})\n\n")

        return "".join(parts)

    async def _draft_sections(self, ctx, args: str) -> str:
        """Execute Step 8: Draft Sections using persistent state.

        Args:
            ctx: Run context; ``ctx.context`` is the workflow state object.
            args: Raw tool arguments (unused; the tool schema declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the
            returned string (prefixed with ❌) rather than raised.
        """
        step_name = "step_08_draft_sections"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 8:
            drafted_sections = [s for s in state.newsletter_sections.values() if s.get('content')]
            total_words = sum(len(s.get('content', '').split()) for s in drafted_sections)
            return f"Step 8 already completed! Drafted {len(drafted_sections)} sections with {total_words} total words."

        # Check if step 7 is completed
        if state.current_step < 7:
            return f"❌ Cannot execute Step 8: Step 7 (Select Sections) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get section plans from persistent state
            if not state.newsletter_sections:
                # The step was already marked as started, so close it out as an
                # error instead of leaving it dangling in the tracker.
                message = "❌ No newsletter sections found to draft. Please run step 7 first."
                self.workflow_status.error_step(step_name, message)
                return message

            # Draft content for each section
            sections_drafted = 0
            total_words = 0

            for section_name, section_data in state.newsletter_sections.items():
                if section_data.get('section_status') != 'selected':
                    continue

                articles = section_data.get('articles', [])
                if not articles:
                    continue

                # Mock section content generation - in reality, this would use AI
                # to create engaging newsletter content from article summaries
                section_content = self._render_section(section_name, articles, state.article_summaries)
                word_count = len(section_content.split())

                # Store the drafted content on the section entry
                section_data['content'] = section_content
                section_data['section_status'] = 'drafted'
                section_data['draft_timestamp'] = datetime.now().isoformat()
                section_data['word_count'] = word_count

                sections_drafted += 1
                total_words += word_count

            # Calculate average words per section
            avg_words_per_section = total_words / sections_drafted if sections_drafted > 0 else 0

            # Update persistent state
            state.current_step = 8

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 8: Drafted {sections_drafted} sections")

            status_msg = f"✅ Step 8 completed successfully! Drafted {sections_drafted} sections with {total_words} total words."
            status_msg += f"\n📝 Average words per section: {avg_words_per_section:.0f}"
            status_msg += f"\n💾 Section content stored in persistent state. Current step: {state.current_step}"
            return status_msg

        except Exception as e:
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 8 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="draft_sections",
            description="Execute Step 8: Write engaging content for each newsletter section. Requires Step 7 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._draft_sections
        )


class FinalizeNewsletterTool:
    """Tool for Step 9: Finalize complete newsletter.

    Concatenates every drafted section into one Markdown document (title,
    table of contents, sections, footer), stores it on the state object and
    marks the workflow as complete.
    """

    def __init__(self, workflow_status: WorkflowStatus, verbose: bool = False, logger: logging.Logger = None):
        """Store the shared status tracker, verbosity flag and optional logger."""
        self.workflow_status = workflow_status
        self.verbose = verbose
        self.logger = logger

    @staticmethod
    def _compose_newsletter(drafted_sections: dict, headline_data: list) -> str:
        """Assemble the Markdown newsletter from drafted sections plus article/source counts."""
        today = datetime.now().strftime("%B %d, %Y")

        parts = [
            f"# AI News Digest - {today}\n\n",
            "*Curated insights from the latest in artificial intelligence*\n\n",
            "---\n\n",
            "## Table of Contents\n\n",
        ]

        # Table of contents: numbered links whose anchors are derived from the section name.
        for position, section_name in enumerate(drafted_sections, 1):
            anchor = section_name.lower().replace(' ', '-').replace('&', 'and')
            parts.append(f"{position}. [{section_name}](#{anchor})\n")
        parts.append("\n---\n\n")

        # Section bodies, each followed by a horizontal rule.
        for section_data in drafted_sections.values():
            parts.append(section_data.get('content', ''))
            parts.append("\n---\n\n")

        # Footer with how many articles / distinct sources were considered.
        source_count = len(set(a.get('source', '') for a in headline_data))
        parts.append("## About This Newsletter\n\n")
        parts.append("This AI News Digest was automatically curated using our intelligent newsletter agent. ")
        parts.append(f"We analyzed {len(headline_data)} articles from {source_count} sources ")
        parts.append("to bring you the most relevant AI developments.\n\n")
        parts.append(f"*Generated on {today}*\n")

        return "".join(parts)

    async def _finalize_newsletter(self, ctx, args: str) -> str:
        """Execute Step 9: Finalize Newsletter using persistent state.

        Args:
            ctx: Run context; ``ctx.context`` is the workflow state object.
            args: Raw tool arguments (unused; the tool schema declares no parameters).

        Returns:
            A human-readable status message. Failures are reported in the
            returned string (prefixed with ❌) rather than raised.
        """
        step_name = "step_09_finalize_newsletter"

        # Access the persistent state
        state: NewsletterAgentState = ctx.context

        # Check if step already completed via persistent state
        if state.current_step >= 9:
            newsletter_length = len(state.final_newsletter.split()) if state.final_newsletter else 0
            sections_count = len([s for s in state.newsletter_sections.values() if s.get('content')])
            return f"Step 9 already completed! Newsletter finalized with {sections_count} sections and {newsletter_length} words."

        # Check if step 8 is completed
        if state.current_step < 8:
            return f"❌ Cannot execute Step 9: Step 8 (Draft Sections) must be completed first. Current step: {state.current_step}"

        try:
            # Update workflow status for UI tracking
            self.workflow_status.start_step(step_name)

            # Get drafted sections from persistent state
            drafted_sections = {
                name: data for name, data in state.newsletter_sections.items()
                if data.get('section_status') == 'drafted' and data.get('content')
            }

            if not drafted_sections:
                # The step was already marked as started, so close it out as an
                # error instead of leaving it dangling in the tracker.
                message = "❌ No drafted sections found to finalize. Please run step 8 first."
                self.workflow_status.error_step(step_name, message)
                return message

            # Create and store the final newsletter
            newsletter_content = self._compose_newsletter(drafted_sections, state.headline_data)
            state.final_newsletter = newsletter_content

            # Calculate final stats
            newsletter_length = len(newsletter_content.split())
            sections_included = len(drafted_sections)

            # Mock quality score based on content metrics: 7.0 base, +0.5 for
            # each size milestone reached, capped at 10.
            base_quality = 7.0
            if sections_included >= 4:
                base_quality += 0.5
            if newsletter_length >= 2000:
                base_quality += 0.5
            if newsletter_length >= 3000:
                base_quality += 0.5
            final_quality_score = min(10.0, base_quality)

            # Mark workflow as complete
            state.current_step = 9
            state.workflow_complete = True

            # Also update workflow status for UI
            self.workflow_status.complete_step(step_name)

            if self.verbose:
                print(f"✅ Completed Step 9: Finalized newsletter ({newsletter_length} words)")

            status_msg = f"🎉 Step 9 completed successfully! Newsletter finalized with {sections_included} sections and {newsletter_length} words."
            status_msg += f"\n⭐ Quality score: {final_quality_score:.1f}/10"
            status_msg += "\n📰 Complete newsletter stored in persistent state"
            status_msg += "\n✅ Workflow complete! All 9 steps finished successfully."
            return status_msg

        except Exception as e:
            self.workflow_status.error_step(step_name, str(e))
            return f"❌ Step 9 failed: {str(e)}"

    def create_tool(self) -> FunctionTool:
        """Create a FunctionTool instance following OpenAI Agents SDK conventions"""
        return FunctionTool(
            name="finalize_newsletter",
            description="Execute Step 9: Combine all sections into the final newsletter with formatting and polish. Requires Step 8 to be completed first.",
            params_json_schema={
                "type": "object",
                "properties": {},
                "required": []
            },
            on_invoke_tool=self._finalize_newsletter
        )


In [None]:
# SQLite file that setup_logging() is pointed at for this agent's log records.
LOGDB = 'newsagent_logs.db'

class NewsletterAgent(Agent[NewsletterAgentState]):
    """Newsletter agent with persistent state and workflow tools.

    Bundles a SQLiteSession, a WorkflowStatus tracker, a logger and one tool
    per workflow step, and exposes run_step() to drive them from a prompt.
    """

    def __init__(self, session_id: str = "newsletter_agent", verbose: bool = False,
                 model: str = "gpt-4o-mini", db_path: str = "newsletter_agent.db"):
        """
        Initialize the NewsletterAgent with persistent state

        Args:
            session_id: Unique identifier for the session (for persistence)
            verbose: Enable verbose logging
            model: Model name handed to the underlying Agent
            db_path: SQLite file backing the SQLiteSession
        """
        # Initialize session for persistence
        self.session = SQLiteSession(session_id, db_path)
        self.workflow_status = WorkflowStatus()  # Keep for progress tracking UI
        self.verbose = verbose

        # Initialize logger
        self.logger = setup_logging(session_id, LOGDB)

        # System prompt that guides tool selection based on workflow status
        system_prompt = """
You are an AI newsletter writing agent that executes a 9-step workflow process using tools with persistent state.

WORKFLOW OVERVIEW:
1. Step 1: Gather URLs - Collect headlines and URLs from various sources
2. Step 2: Filter URLs - Filter headlines to AI-related content only
3. Step 3: Download Articles - Fetch full article content from URLs
4. Step 4: Extract Summaries - Create bullet point summaries of each article
5. Step 5: Cluster By Topic - Group articles by thematic topics
6. Step 6: Rate Articles - Evaluate article quality and importance
7. Step 7: Select Sections - Organize articles into newsletter sections
8. Step 8: Draft Sections - Write content for each section
9. Step 9: Finalize Newsletter - Combine sections into final newsletter

WORKFLOW RESUME LOGIC:
- You maintain persistent state between runs and can resume from any step
- ALWAYS start by checking workflow status to understand current progress
- If current_step >= 1, you can resume from any completed step forward
- Steps are idempotent - if a step is already completed, tools will return cached results
- When resuming, automatically continue from the next incomplete step

INSTRUCTIONS:
- ALWAYS start by checking the current workflow status using check_workflow_status
- Use inspect_state tool to examine detailed state data when debugging
- Execute workflow steps in the correct order using the appropriate tools
- Each step has prerequisites - only execute a step if the previous step is completed
- If a user asks to "run all steps" or "create the newsletter", execute all remaining steps in sequence
- If a user asks for a specific step, execute only that step (if prerequisites are met)
- If a user asks to "resume" or "continue", start from the next incomplete step
- Always check status between steps to ensure proper sequencing
- Your state persists between sessions - you can resume work from where you left off

TOOL SELECTION STRATEGY:
1. First, always use check_workflow_status to understand current state and progress
2. If resuming, identify the next step that needs to be executed
3. Use the appropriate tool for that step
4. After each step, check status again to confirm progress
5. Continue until workflow is complete or user request is fulfilled

RESUME EXAMPLES:
- If current_step=3, next step is step 4 (Extract Summaries)
- If current_step=7, next step is step 8 (Draft Sections)
- If current_step=9, workflow is complete - no further steps needed

Remember: Your state is persistent. You can safely resume from any point. Never skip steps or execute them out of order.
"""

        # One tool per workflow step, plus status/inspection helpers; all step
        # tools share the same WorkflowStatus tracker and logger.
        super().__init__(
            name="NewsletterAgent",
            instructions=system_prompt,
            model=model,
            tools=[
                WorkflowStatusTool(self.workflow_status, self.logger).create_tool(),
                StateInspectionTool(self.verbose, self.logger).create_tool(),
                GatherUrlsTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                FilterUrlsTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                DownloadArticlesTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                ExtractSummariesTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                ClusterByTopicTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                RateArticlesTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                SelectSectionsTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                DraftSectionsTool(self.workflow_status, self.verbose, self.logger).create_tool(),
                FinalizeNewsletterTool(self.workflow_status, self.verbose, self.logger).create_tool(),
            ]
        )

        # Initialize default state (the object handed to every run as context)
        self.default_state = NewsletterAgentState()

        if self.verbose:
            print("Initialized NewsletterAgent with persistent state and 9-step workflow")
            print(f"Session ID: {session_id}")

    async def run_step(self, user_input: str, max_turns: int = 50) -> str:
        """Run a workflow step with persistent state

        Args:
            user_input: Prompt describing which step(s) to run
            max_turns: Turn limit passed to Runner.run; the default of 50
                leaves room for the complete 9-step workflow

        Returns:
            The final output of the run
        """
        result = await Runner.run(
            self,
            user_input,
            session=self.session,
            # The same in-memory state object is passed on every call, so tool
            # updates carry over between run_step() calls on this instance.
            # NOTE(review): per the Agents SDK docs a session stores conversation
            # items only; nothing here reloads this object from the session DB -
            # confirm before relying on resume across kernel restarts.
            context=self.default_state,
            max_turns=max_turns
        )
        return result.final_output


In [None]:
# Create the agent with verbose output and a session id carrying a random 8-digit suffix.
news_agent = NewsletterAgent(session_id=f"newsletter_{random.randint(10000000, 99999999)}", verbose=True)


In [None]:
user_prompt = "Run all the workflow steps in order and create the newsletter"

start_time = time.time()
result = await news_agent.run_step(user_prompt)
duration = time.time() - start_time

print("=" * 80)
print(f"⏱️  Total execution time: {duration:.2f}s")
print(f"📊 Final result:")
print(result)



In [None]:
# Create mock context
class MockContext:
    """Minimal stand-in for a run context: exposes the agent's state object as ``.context``."""
    def __init__(self):
        # Reads the module-level news_agent, so the agent cell must have been run first.
        self.context = news_agent.default_state

ctx = MockContext()
current_state = ctx.context  # From your previous run, or reload it
# NOTE(review): the workflow tools read state.headline_data; confirm the state
# class also defines headline_df, otherwise this line raises AttributeError.
df = current_state.headline_df
df



In [None]:
# Prefer state exposed by the session, if it offers a get_state() accessor;
# otherwise fall back to the agent's in-memory state object.
# NOTE(review): the Agents SDK session API documents get_items/add_items/
# pop_item/clear_session only, so the fallback is likely the path always taken.
try:
    current_state = news_agent.session.get_state()
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and say why the fallback was used instead of hiding it.
    print(f"Could not read state from session ({type(exc).__name__}: {exc}); using news_agent.default_state")
    current_state = news_agent.default_state

print(current_state)
print()

# Overall progress through the 9-step workflow
print(f"Current Step: {current_state.current_step}/9")
print(f"Workflow Complete: {current_state.workflow_complete}")
print(f"Progress: {(current_state.current_step/9)*100:.1f}%")
print(f"Total articles: {len(current_state.headline_data)}")

# Per-stage counts are only meaningful once headlines have been gathered
if current_state.headline_data:
    ai_related = sum(1 for a in current_state.headline_data if a.get('ai_related') is True)
    print(f"AI-related articles: {ai_related}")
    print(f"Summaries: {len(current_state.article_summaries)}")
    print(f"Clusters: {len(current_state.topic_clusters)}")
    print(f"Sections: {len(current_state.newsletter_sections)}")

In [None]:
# review slides

# review workflow status, move to a module
# all prints should be logs
# section writing and composition will have the critic/optimizer loop
# add batch with async


In [None]:
def create_news_dataframe():
    """
    Build the empty, typed table used for headline/article analysis.

    The schema covers:
    - URLs, source tracking and metadata
    - Topic classification and clustering
    - Content quality ratings and rankings

    Returns:
        pd.DataFrame: Zero-row DataFrame whose columns and dtypes follow the schema below
    """

    # (column name, dtype) pairs; list order is the resulting column order.
    column_specs = [
        # --- Core identifiers and source info ---
        ('article_id', 'object'),               # unique identifier for each article
        ('source', 'object'),                   # source category
        ('headline_title', 'object'),           # article headline/title
        ('original_url', 'object'),             # initial URL before redirects
        ('final_url', 'object'),                # URL after following redirects
        ('domain_name', 'category'),            # website domain
        ('site_name', 'category'),              # human-readable site name
        ('site_reputation_score', 'float32'),   # reputation/trustworthiness score for the site
        ('keep_flag', 'boolean'),               # nullable boolean flag (purpose not documented in this cell)

        # --- File paths and storage ---
        ('html_file_path', 'object'),           # path to stored HTML content
        ('text_file_path', 'object'),           # path to extracted text content

        # --- Time information ---
        ('last_updated_timestamp', 'datetime64[ns]'),  # when the article was last updated
        ('article_age_days', 'int32'),          # age of article in days
        ('recency_score', 'float32'),           # calculated recency score (higher = more recent)

        # --- Content analysis ---
        ('content_summary', 'object'),          # generated summary of article content
        ('bullet_points', 'object'),            # key points extracted as bullets
        ('article_length_chars', 'int32'),      # character count of article content

        # --- Rating flags (LLM-generated probabilities) ---
        # NOTE(review): this column is named "high quality" but was previously
        # described as the probability of low-quality content - confirm polarity.
        ('is_high_quality', 'float32'),
        ('is_off_topic', 'float32'),            # LLM probability for off-topic content
        ('is_low_importance', 'float32'),       # 1 - LLM probability for high-importance content

        # --- Other ratings ---
        ('bradley_terry_score', 'float32'),     # Bradley-Terry rating from pairwise article comparisons
        ('bradley_terry_rank', 'int32'),        # ordinal rank by Bradley-Terry score (1 = highest rated)
        ('adjusted_length_score', 'float32'),   # length-adjusted quality score
        ('final_composite_rating', 'float32'),  # final weighted rating combining multiple factors

        # --- Topic classification ---
        ('topic_string', 'object'),             # topic labels as comma-separated string
        ('topic_list', 'object'),               # same topics as a list/array structure

        # --- Organization and clustering (HDBSCAN-based) ---
        ('display_order', 'int32'),             # order for display/presentation
        ('cluster_id', 'int32'),                # HDBSCAN cluster identifier (-1 = noise/outlier)
        ('cluster_label', 'category'),          # human-readable cluster name/description
    ]

    # Create the zero-row frame with the column names first, then cast each column.
    dtype_by_column = dict(column_specs)
    empty_frame = pd.DataFrame(columns=[name for name, _ in column_specs])
    return empty_frame.astype(dtype_by_column)



In [None]:
@dataclass
class NewsletterState:
    """
    Maintains session state for the OpenAI Agents SDK workflow.

    Attributes:
        status: Workflow progress tracker (a fresh WorkflowStatus per state instance)
        headline_df: DataFrame containing headline data for processing
        sources_file: Path to YAML file containing source configurations
        sources: Dictionary of source configurations loaded from YAML
        cluster_topics: List of clean topic names for headline categorization
        max_edits: Maximum number of critic optimizer editing iterations allowed
        edit_complete: Boolean flag indicating if editing process is finished
        n_browsers: Number of concurrent Playwright browser instances for downloads
        verbose: Print a confirmation line after the sources file is loaded

    Raises:
        ValueError: If max_edits or n_browsers is out of range, or the sources
            file is not valid YAML
        FileNotFoundError: If sources_file does not exist
    """

    # default_factory gives every state its own tracker; a plain
    # ``= WorkflowStatus()`` default would be one instance shared by all states.
    status: WorkflowStatus = field(default_factory=WorkflowStatus)
    headline_df: pd.DataFrame = field(default_factory=create_news_dataframe)
    sources_file: str = field(default="sources.yaml")
    sources: Dict[str, Any] = field(default_factory=dict)
    cluster_topics: List[str] = field(default_factory=list)
    max_edits: int = field(default=3)
    edit_complete: bool = field(default=False)
    n_browsers: int = field(default=8)
    verbose: bool = field(default=True)


    def __post_init__(self):
        """
        Post-initialization validation and setup.

        Validates the numeric settings, then loads the YAML sources file into
        self.sources (replacing whatever was passed in for that field).
        """
        # Validate max_edits is reasonable
        if self.max_edits < 1 or self.max_edits > 10:
            raise ValueError(f"max_edits should be between 1-10, got {self.max_edits}")

        # Validate n_browsers is reasonable
        if self.n_browsers < 1 or self.n_browsers > 32:
            raise ValueError(f"n_browsers should be between 1-32, got {self.n_browsers}")

        # Validate sources_file exists and load sources from file automatically
        sources_path = Path(self.sources_file)
        try:
            with sources_path.open('r', encoding='utf-8') as file:
                # An empty YAML document parses to None; normalise that to {}.
                self.sources = yaml.safe_load(file) or {}
        except FileNotFoundError as e:
            # Chain the original error so the traceback keeps the OS-level detail.
            raise FileNotFoundError(f"Sources file not found: {self.sources_file}") from e
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file {self.sources_file}: {e}") from e

        if self.verbose:
            print(f"Loaded {len(self.sources)} sources from {self.sources_file}")


In [None]:
# Build a state with all defaults; __post_init__ opens sources_file
# ("sources.yaml" by default) and raises FileNotFoundError if it is missing.
state = NewsletterState()
state


In [None]:
from agents import Agent, Runner, SQLiteSession, function_tool, RunContextWrapper


In [None]:
class NewsletterAgent(Agent[NewsletterState]):
    """AI newsletter writing agent with a structured nine-step workflow.

    The workflow steps are ordinary async methods so they can be awaited
    directly (``run_complete_workflow`` does exactly that). They are exposed to
    the model by wrapping the *bound* methods with ``function_tool`` in
    ``__init__``: decorating the methods in the class body would leave ``self``
    in front of the ``RunContextWrapper`` parameter, which the SDK requires to
    be first, and would replace the methods with non-callable tool objects.

    Workflow data (headlines, article texts, ratings, ...) is stored as
    attributes on the ``NewsletterState`` context. Those attributes are not
    declared on the dataclass, so every step first makes sure they exist with
    empty defaults (see ``_ensure_workflow_fields``).
    """

    def __init__(self, session_id: str = "newsletter_agent"):
        """Create the agent, its SQLite-backed session and a fresh state.

        Args:
            session_id: Identifier of the conversation stored in ``newsletter.db``.
        """
        self.session = SQLiteSession(session_id, "newsletter.db")
        self.state = NewsletterState()
        self._ensure_workflow_fields(self.state)

        super().__init__(
            name="AINewsletterAgent",
            instructions="""
            You are an AI newsletter writing agent. Your role is to:
            1. Scrape headlines and URLs from various sources
            2. Filter the headlines to ones that are about AI
            3. Fetch the URLs and save them as plain text
            4. Summarize each article to 3 bullet points containing the key facts
            5. Extract topics from each article and cluster articles by topic
            6. Rate each article according to the provided rubric
            7. Identify 6-15 thematic sections + "Other News", assign articles to sections and deduplicate
            8. Write each section
            9. Combine sections and polish

            Use the tools available to accomplish these tasks in order.
            Always maintain context about workflow progress and data.
            Guide users through the workflow steps systematically.
            """,
            tools=self._build_tools(),
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_tools(self) -> list:
        """Wrap the bound workflow methods as tools for the model."""
        tool_methods = [
            self.step1_scrape_headlines,
            self.step2_filter_ai_content,
            self.step3_fetch_article_texts,
            self.step4_summarize_articles,
            self.step5_extract_and_cluster_topics,
            self.step6_rate_articles,
            self.step7_organize_sections,
            self.step8_write_sections,
            self.step9_finalize_newsletter,
            self.get_workflow_status,
            self.run_complete_workflow,
            self.reset_workflow,
            self.get_newsletter_preview,
        ]
        # step6 takes a free-form Dict[str, str] rubric, which cannot be
        # expressed in a strict JSON schema (arbitrary keys are not allowed),
        # so that single tool opts out of strict mode.
        non_strict = {"step6_rate_articles"}
        return [
            function_tool(method, strict_mode=method.__name__ not in non_strict)
            for method in tool_methods
        ]

    @staticmethod
    def _workflow_defaults() -> Dict[str, Any]:
        """Return fresh empty values for every workflow attribute used by the steps."""
        return {
            "raw_headlines": [],
            "scraped_urls": [],
            "ai_headlines": pd.DataFrame(),
            "article_texts": {},
            "article_summaries": {},
            "article_topics": {},
            "topic_clusters": {},
            "rating_rubric": {},
            "article_ratings": {},
            "thematic_sections": {},
            "section_names": [],
            "section_drafts": {},
            "final_newsletter": "",
            "workflow_complete": False,
            "current_step": 0,
        }

    @classmethod
    def _ensure_workflow_fields(cls, state, overwrite: bool = False):
        """Make sure ``state`` carries every workflow attribute.

        Args:
            state: The ``NewsletterState`` used as run context.
            overwrite: When True, reset the attributes even if already present.

        Returns:
            The same ``state`` object, for convenient chaining.
        """
        for attr, default in cls._workflow_defaults().items():
            if overwrite or not hasattr(state, attr):
                setattr(state, attr, default)
        return state

    # ------------------------------------------------------------------
    # Workflow steps (exposed as tools)
    # ------------------------------------------------------------------

    async def step1_scrape_headlines(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        sources: Optional[List[str]] = None,
        max_articles_per_source: int = 50
    ) -> str:
        """Step 1: Scrape headlines and URLs from various sources

        Args:
            sources: Source names to scrape; a built-in list is used when omitted.
            max_articles_per_source: Number of (mock) articles generated per source.
        """
        # Local import: the notebook's import cell only brings in `datetime`.
        from datetime import timedelta

        ctx = self._ensure_workflow_fields(wrapper.context)

        if sources is None:
            sources = ["techcrunch", "arstechnica", "theverge", "wired", "venturebeat"]

        # Mock scraping implementation (replace with real RSS/API scraping)
        now = datetime.now()
        scraped_data = [
            {
                'title': f"{source} AI Article {i+1}: Latest developments in machine learning",
                'url': f"https://{source}.com/ai-article-{i+1}",
                'source': source,
                'published_at': (now - timedelta(hours=i)).isoformat(),
                'description': f"AI-related content from {source}"
            }
            for source in sources
            for i in range(max_articles_per_source)
        ]

        ctx.raw_headlines = scraped_data
        ctx.scraped_urls = [article['url'] for article in scraped_data]
        ctx.current_step = 1

        return f"✅ Step 1 Complete: Scraped {len(scraped_data)} headlines from {len(sources)} sources"

    async def step2_filter_ai_content(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        ai_keywords: Optional[List[str]] = None
    ) -> str:
        """Step 2: Filter headlines to AI-related content only

        Args:
            ai_keywords: Lower-case keywords to look for; a built-in list is used when omitted.
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.raw_headlines:
            return "❌ No headlines to filter. Run step 1 first."

        if ai_keywords is None:
            ai_keywords = [
                'ai', 'artificial intelligence', 'machine learning', 'deep learning',
                'neural network', 'llm', 'gpt', 'transformer', 'chatbot', 'automation',
                'computer vision', 'nlp', 'natural language', 'algorithm', 'model'
            ]

        # NOTE: plain substring matching, so short keywords such as 'ai' also
        # match inside unrelated words ("said", "email"). Acceptable for the
        # mock data; tighten when real scraping is plugged in.
        ai_articles = []
        for article in ctx.raw_headlines:
            searchable = (article['title'].lower(), article['description'].lower())
            if any(keyword in text for keyword in ai_keywords for text in searchable):
                ai_articles.append(article)

        ctx.ai_headlines = pd.DataFrame(ai_articles)
        ctx.current_step = 2

        return f"✅ Step 2 Complete: Filtered to {len(ai_articles)} AI-related headlines from {len(ctx.raw_headlines)} total"

    async def step3_fetch_article_texts(
        self,
        wrapper: RunContextWrapper[NewsletterState]
    ) -> str:
        """Step 3: Fetch full article texts from URLs"""
        ctx = self._ensure_workflow_fields(wrapper.context)

        if ctx.ai_headlines.empty:
            return "❌ No AI headlines to fetch. Complete steps 1-2 first."

        # Mock article fetching (replace with actual web scraping)
        article_texts = {}

        for _, row in ctx.ai_headlines.iterrows():
            url = row['url']
            # Mock article content
            article_texts[url] = f"""
            {row['title']}

            This is a mock article about AI developments. In a real implementation,
            you would use libraries like requests + BeautifulSoup or newspaper3k
            to extract the full article text from the URL.

            Key points about this AI story:
            - Advancement in machine learning techniques
            - Impact on industry applications
            - Future implications for AI development

            This content would be much longer in practice, containing the full
            article text that needs to be summarized and analyzed.
            """

        ctx.article_texts = article_texts
        ctx.current_step = 3

        return f"✅ Step 3 Complete: Fetched full text for {len(article_texts)} articles"

    async def step4_summarize_articles(
        self,
        wrapper: RunContextWrapper[NewsletterState]
    ) -> str:
        """Step 4: Summarize each article to 3 key bullet points"""
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.article_texts:
            return "❌ No article texts to summarize. Complete steps 1-3 first."

        # Mock summarization (replace with actual LLM summarization)
        summaries = {
            url: [
                "• Key development in AI technology or research",
                "• Practical implications for businesses or developers",
                "• Future outlook or next steps in this area"
            ]
            for url in ctx.article_texts
        }

        ctx.article_summaries = summaries
        ctx.current_step = 4

        return f"✅ Step 4 Complete: Generated 3-point summaries for {len(summaries)} articles"

    async def step5_extract_and_cluster_topics(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        max_clusters: int = 8
    ) -> str:
        """Step 5: Extract topics and cluster articles

        Args:
            max_clusters: Maximum number of topic clusters to keep (most common topics first).
        """
        # Local import: `Counter` is not part of the notebook's import cell.
        from collections import Counter

        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.article_texts:
            return "❌ No articles to analyze. Complete steps 1-4 first."

        # Extract topics from each article (mock implementation)
        article_topics = {}
        all_topics = []

        for url in ctx.article_texts:
            # Mock topic extraction (replace with NLP)
            topics = ['machine learning', 'business applications', 'research', 'ethics']
            article_topics[url] = topics
            all_topics.extend(topics)

        # Cluster articles by common topics
        main_topics = [topic for topic, _ in Counter(all_topics).most_common(max_clusters)]

        topic_clusters = {
            topic: [url for url, topics in article_topics.items() if topic in topics]
            for topic in main_topics
        }

        ctx.article_topics = article_topics
        ctx.topic_clusters = topic_clusters
        ctx.current_step = 5

        return f"✅ Step 5 Complete: Extracted topics and created {len(topic_clusters)} clusters"

    async def step6_rate_articles(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        custom_rubric: Optional[Dict[str, str]] = None
    ) -> str:
        """Step 6: Rate articles according to rubric

        Args:
            custom_rubric: Optional criterion -> description entries merged into the stored rubric.
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.article_texts:
            return "❌ No articles to rate. Complete previous steps first."

        if custom_rubric:
            ctx.rating_rubric.update(custom_rubric)

        # Mock rating (replace with actual evaluation)
        ratings = {}
        for url in ctx.article_texts:
            # Mock scoring based on rubric criteria
            relevance_score = 0.8
            novelty_score = 0.7
            impact_score = 0.9
            credibility_score = 0.8

            ratings[url] = (relevance_score + novelty_score + impact_score + credibility_score) / 4

        ctx.article_ratings = ratings
        ctx.current_step = 6

        # `ratings` has one entry per article text, and that dict is non-empty here
        avg_rating = sum(ratings.values()) / len(ratings)
        return f"✅ Step 6 Complete: Rated {len(ratings)} articles. Average rating: {avg_rating:.2f}"

    async def step7_organize_sections(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        target_sections: int = 10
    ) -> str:
        """Step 7: Organize articles into thematic sections

        Args:
            target_sections: Maximum number of sections, including "Other News".
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.topic_clusters:
            return "❌ No topic clusters available. Complete steps 1-6 first."

        # Create thematic sections based on clusters and ratings
        sections = {}

        # Main thematic sections from the largest clusters; one slot is reserved
        # for "Other News". Clamp at zero so target_sections <= 0 does not turn
        # into a negative slice that silently drops only the last cluster.
        thematic_slots = max(target_sections - 1, 0)
        top_clusters = sorted(
            ctx.topic_clusters.items(),
            key=lambda item: len(item[1]),  # Sort by cluster size
            reverse=True
        )[:thematic_slots]

        for topic, urls in top_clusters:
            # Only include high-rated articles
            high_rated_urls = [
                url for url in urls
                if ctx.article_ratings.get(url, 0) >= 0.6
            ]
            if high_rated_urls:
                section_name = topic.title().replace('_', ' ')
                sections[section_name] = high_rated_urls

        # "Other News" section for remaining articles
        assigned_urls = set()
        for urls in sections.values():
            assigned_urls.update(urls)

        other_urls = [
            url for url in ctx.article_texts
            if url not in assigned_urls and ctx.article_ratings.get(url, 0) >= 0.5
        ]

        if other_urls:
            sections["Other News"] = other_urls

        ctx.thematic_sections = sections
        ctx.section_names = list(sections.keys())
        ctx.current_step = 7

        section_summary = "\n".join(
            f"• {name}: {len(urls)} articles"
            for name, urls in sections.items()
        )

        return f"✅ Step 7 Complete: Organized into {len(sections)} sections:\n{section_summary}"

    async def step8_write_sections(
        self,
        wrapper: RunContextWrapper[NewsletterState]
    ) -> str:
        """Step 8: Write content for each thematic section"""
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.thematic_sections:
            return "❌ No sections to write. Complete steps 1-7 first."

        # Build the url -> title lookup once instead of filtering the DataFrame
        # for every article. drop_duplicates keeps the first row per URL, which
        # matches taking the first matching row.
        headlines = ctx.ai_headlines
        if {'url', 'title'}.issubset(headlines.columns):
            title_by_url = (
                headlines.drop_duplicates(subset='url')
                .set_index('url')['title']
                .to_dict()
            )
        else:
            title_by_url = {}

        section_drafts = {}

        for section_name, urls in ctx.thematic_sections.items():
            # Gather content for this section
            section_articles = [
                {
                    'title': title_by_url.get(url, "Unknown Title"),
                    'url': url,
                    'summary': ctx.article_summaries.get(url, []),
                    'rating': ctx.article_ratings.get(url, 0)
                }
                for url in urls
            ]

            # Write section content (mock implementation), best-rated first
            parts = [f"## {section_name}\n\n"]
            for article in sorted(section_articles, key=lambda a: a['rating'], reverse=True):
                parts.append(f"**{article['title']}**\n")
                parts.extend(f"{bullet}\n" for bullet in article['summary'])
                parts.append(f"[Read more]({article['url']})\n\n")

            section_drafts[section_name] = "".join(parts)

        ctx.section_drafts = section_drafts
        ctx.current_step = 8

        return f"✅ Step 8 Complete: Wrote content for {len(section_drafts)} sections"

    async def step9_finalize_newsletter(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        newsletter_title: str = "AI Weekly Newsletter"
    ) -> str:
        """Step 9: Combine sections and polish final newsletter

        Args:
            newsletter_title: Heading placed at the top of the newsletter.
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.section_drafts:
            return "❌ No section drafts available. Complete steps 1-8 first."

        # Combine all sections
        newsletter_content = f"# {newsletter_title}\n"
        newsletter_content += f"*Generated on {datetime.now().strftime('%B %d, %Y')}*\n\n"

        # Add introduction
        total_articles = len(ctx.article_texts)
        newsletter_content += f"This week's AI newsletter covers {total_articles} key developments across {len(ctx.section_drafts)} areas of AI.\n\n"

        # Add each section, in the order chosen in step 7
        for section_name in ctx.section_names:
            if section_name in ctx.section_drafts:
                newsletter_content += ctx.section_drafts[section_name]
                newsletter_content += "\n---\n\n"

        # Add footer
        newsletter_content += "*Thank you for reading! This newsletter was generated using AI curation and analysis.*"

        ctx.final_newsletter = newsletter_content
        ctx.workflow_complete = True
        ctx.current_step = 9

        return f"✅ Step 9 Complete: Finalized newsletter with {len(ctx.section_drafts)} sections"

    # ------------------------------------------------------------------
    # Workflow management tools
    # ------------------------------------------------------------------

    async def get_workflow_status(
        self,
        wrapper: RunContextWrapper[NewsletterState]
    ) -> str:
        """Get detailed workflow progress status"""
        state = self._ensure_workflow_fields(wrapper.context)

        steps_completed = [
            f"1. Scraping: {len(state.raw_headlines)} headlines" if state.raw_headlines else "1. Scraping: Pending",
            f"2. AI Filtering: {len(state.ai_headlines)} AI articles" if not state.ai_headlines.empty else "2. AI Filtering: Pending",
            f"3. Text Fetching: {len(state.article_texts)} articles" if state.article_texts else "3. Text Fetching: Pending",
            f"4. Summarization: {len(state.article_summaries)} summaries" if state.article_summaries else "4. Summarization: Pending",
            f"5. Topic Clustering: {len(state.topic_clusters)} clusters" if state.topic_clusters else "5. Topic Clustering: Pending",
            f"6. Article Rating: {len(state.article_ratings)} rated" if state.article_ratings else "6. Article Rating: Pending",
            f"7. Section Organization: {len(state.thematic_sections)} sections" if state.thematic_sections else "7. Section Organization: Pending",
            f"8. Section Writing: {len(state.section_drafts)} drafts" if state.section_drafts else "8. Section Writing: Pending",
            f"9. Newsletter Finalization: {'Complete' if state.final_newsletter else 'Pending'}"
        ]

        return "Newsletter Workflow Status:\n\n" + "\n".join(steps_completed)

    async def run_complete_workflow(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        sources: Optional[List[str]] = None,
        ai_keywords: Optional[List[str]] = None
    ) -> str:
        """Run the complete 9-step workflow automatically

        Args:
            sources: Forwarded to step 1.
            ai_keywords: Forwarded to step 2.
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        # Each entry is (step coroutine function, extra positional arguments)
        pipeline = [
            (self.step1_scrape_headlines, (sources,)),
            (self.step2_filter_ai_content, (ai_keywords,)),
            (self.step3_fetch_article_texts, ()),
            (self.step4_summarize_articles, ()),
            (self.step5_extract_and_cluster_topics, ()),
            (self.step6_rate_articles, ()),
            (self.step7_organize_sections, ()),
            (self.step8_write_sections, ()),
            (self.step9_finalize_newsletter, ()),
        ]

        results = []
        for step, extra_args in pipeline:
            outcome = await step(wrapper, *extra_args)
            results.append(outcome)
            # Steps signal a missing prerequisite with a "❌" message; later
            # steps would fail the same way, so stop and report instead of
            # announcing a finished newsletter.
            if outcome.startswith("❌"):
                return "\n".join(results) + "\n\n⚠️ Workflow stopped before completion."

        newsletter_length = len(ctx.final_newsletter)

        return "\n".join(results) + f"\n\n🎉 Complete workflow finished! Newsletter ready ({newsletter_length} characters)"

    async def reset_workflow(
        self,
        wrapper: RunContextWrapper[NewsletterState]
    ) -> str:
        """Reset workflow to start fresh"""
        # Restore the declared dataclass fields from a brand-new state ...
        wrapper.context.__dict__.update(NewsletterState().__dict__)
        # ... and also clear the workflow data the steps attach dynamically,
        # which the update above does not touch.
        self._ensure_workflow_fields(wrapper.context, overwrite=True)
        return "🔄 Workflow reset. Ready to start step 1."

    async def get_newsletter_preview(
        self,
        wrapper: RunContextWrapper[NewsletterState],
        max_chars: int = 500
    ) -> str:
        """Get a preview of the current newsletter

        Args:
            max_chars: Maximum number of characters of the newsletter to return.
        """
        ctx = self._ensure_workflow_fields(wrapper.context)

        if not ctx.final_newsletter:
            return "Newsletter not ready yet. Complete the full workflow first."

        preview = ctx.final_newsletter[:max_chars]
        if len(ctx.final_newsletter) > max_chars:
            preview += "..."

        return f"Newsletter Preview:\n\n{preview}"

    # ------------------------------------------------------------------
    # Convenience entry points for notebook use
    # ------------------------------------------------------------------

    async def run_step(self, user_input: str) -> str:
        """Run a workflow step with persistent state

        Sends ``user_input`` to the agent using this instance's session (for
        conversation history) and state (as the run context), and returns the
        run's final output.
        """
        result = await Runner.run(
            self,
            user_input,
            session=self.session,
            context=self.state
        )
        return result.final_output

    def save_newsletter(self, filepath: Optional[str] = None):
        """Save the final newsletter to file

        Args:
            filepath: Destination path; defaults to a timestamped
                ``ai_newsletter_<YYYYmmdd_HHMMSS>.md`` in the working directory.
        """
        newsletter = getattr(self.state, "final_newsletter", "")
        if not newsletter:
            print("No newsletter to save. Complete workflow first.")
            return

        if filepath is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = f"ai_newsletter_{timestamp}.md"

        # Explicit UTF-8: the newsletter contains non-ASCII characters (bullets)
        # and the platform default encoding may not be able to represent them.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(newsletter)

        print(f"Newsletter saved to {filepath}")




In [None]:
# Smoke test: send one chat completion through the gateway listening on
# localhost:8787, using the standard OpenAI client. The x-portkey-provider
# header tells the gateway which upstream provider to route to; the API key is
# read from the environment. Requires the gateway to be running locally.
import openai

client = openai.OpenAI(
  base_url="http://localhost:8787/v1",
  api_key=os.getenv("OPENAI_API_KEY"),
  default_headers={"x-portkey-provider": "openai"}
)

response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[{"role": "user", "content": "Hello"}]
)
# Print only the text of the first returned choice
print(response.choices[0].message.content)

In [None]:
# Same smoke test as the previous cell, but through the Portkey SDK client
# instead of the OpenAI client. Note that this rebinds the notebook-global
# names `client` and `response`.
from portkey_ai import Portkey

client = Portkey(
    provider="openai",
    Authorization=os.getenv("OPENAI_API_KEY")
)

# Example: Send a chat completion request
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    model="gpt-4o"
)

# Print only the text of the first returned choice
print(response.choices[0].message.content)

In [None]:
# Ad-hoc inspection of the object bound to `prompt_template`.
# NOTE(review): `prompt_template` is not defined in this part of the notebook —
# confirm an earlier cell creates it, otherwise this fails on Restart & Run All.
type(prompt_template)

In [None]:
class AgentState(TypedDict, total=False):
    """
    State of the LangGraph agent.
    Each node in the graph is a function that takes the current state and returns the updated state.

    Declared with ``total=False`` because the state starts out empty and is
    filled in incrementally (see ``initialize``), so no key is required at
    construction time.
    """

    # the current working set of headlines (pandas dataframe not supported)
    AIdf: list[dict]
    # NOTE(review): an earlier comment here described a cutoff date for
    # deduplication ("ignore stories before this date"), but no such field is
    # declared — add it back if the workflow still needs it.
    model_low: str     # cheap fast model like gpt-4o-mini or flash
    model_medium: str  # medium model like gpt-4o or gemini-1.5-pro
    model_high: str    # slow expensive thinking model like o3-mini
    sources: dict  # sources to scrape
    sources_reverse: dict[str, str]  # map file names to sources

# Start from an empty state. NOTE: this rebinds the notebook-global `state`.
state = AgentState()


In [None]:
SOURCES_FILE = "sources.yaml"

def initialize(state, sources_file=SOURCES_FILE) -> Dict[str, Any]:
    """Read and parse the sources.yaml file into ``state``.

    Args:
        state: Mutable mapping (the agent state) to populate.
        sources_file: Path of the YAML file describing the sources.

    Returns:
        The same ``state`` object, with ``"sources"`` set to the parsed YAML
        mapping and ``"sources_reverse"`` mapping ``"<title>.html"`` back to
        the source key.

    Raises:
        FileNotFoundError: if ``sources_file`` does not exist.
        ValueError: if the file is not valid YAML.
    """
    try:
        with open(sources_file, 'r', encoding='utf-8') as file:
            # An empty YAML document parses to None; treat it as "no sources"
            sources = yaml.safe_load(file) or {}
    except FileNotFoundError as e:
        # This is a plain function: report the `sources_file` argument
        # (there is no `self` here).
        raise FileNotFoundError(f"Sources file '{sources_file}' not found") from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e

    # Build the reverse map before touching `state`, so a malformed entry
    # (e.g. one without "title") does not leave the state half-updated.
    sources_reverse = {v["title"] + ".html": k for k, v in sources.items()}

    state["sources"] = sources
    state["sources_reverse"] = sources_reverse
    return state


In [None]:
# Populate the AgentState created above with "sources" and "sources_reverse"
# read from SOURCES_FILE, then display the resulting state.
state = initialize(state)
state
