In [None]:
import os
from dotenv import load_dotenv
import dspy
import logging

# Root logger at INFO; the `httpx` and `LiteLLM` loggers are raised to WARNING
# so their per-request messages do not flood the cell output.
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("LiteLLM").setLevel(logging.WARNING)

# Set your API key (if needed): it is read from the environment after loading
# the .env file two directories up. os.getenv returns None when APIKEY is unset.
load_dotenv("../../.env")
APIKEY = os.getenv("APIKEY")

# Set your model (LiteLLM model strings).
# Alternative model, kept for quick switching:
#model_id = "openrouter/deepseek/deepseek-chat"
model_id = "openrouter/meta-llama/llama-3.3-70b-instruct"
# NOTE(review): cache=False is presumably there so repeated calls on the same
# article produce fresh samples for best-of-n -- confirm against the DSPy docs.
lm = dspy.LM(model_id, api_key=APIKEY, cache=False)
# Make this LM the default for every DSPy predictor created below.
dspy.configure(lm=lm)

# Signatures

Signatures are like DSPy's pydantic models. Describe the fields and docstrings as though they are prompts (they are).

They will likely reflect the data in your table schema, but could also include additional intermediate data structures in multi-hop patterns.

### Initial prototype
```python
from typing import Literal, Optional


class NewsAppSignatureExample(dspy.Signature):
    text: str = dspy.InputField(desc="Text from an article for analysis")
    category: Literal["world", "entertainment", "science", "health", "business", "sports", "politics", "tech"] = dspy.OutputField(desc="Article content category")
    title: str = dspy.OutputField(desc="Article title, when available. Otherwise create one")
    tags: list[str] = dspy.OutputField(desc="Tags for search and classification")
    notable_people: Optional[list[str]] = dspy.OutputField(desc="Names of notable people in the article")
    notable_organizations: Optional[list[str]] = dspy.OutputField(desc="Names of notable organizations in the article")


# system prompt goes in the docstring
NewsAppSignatureExample.__doc__ = """
You are provided with the text of a news article. Help provide the requested information for catalogging.
"""
```

With some good examples in hand, I refined an expanded list with ChatGPT.

In [None]:
from news_app import NewsAppSignature

# Run the program

I like the natural code style of writing a DSPy signature. A pydantic model becomes the prompt.

`Literal` type + LLM = classifier (cool!)

We can already try it out, using the ChainOfThought predictor to run the program.

In [None]:
# Sample article used to try the predictor out by hand.
text = """
Business Briefing Dec. 2, 2015
Nokia shareholders overwhelmingly approved the acquisition of the ailing French telecom Alcatel-Lucent, removing one of the last hurdles to a 15.6 billion euro ($16.5 billion) deal that will make Nokia a market leader in networks.
In October, Nokia said it would pay 4 billion to shareholders as the company raised its outlook for the year.
Rajeev Suri, Nokias chief executive, said he was delighted by shareholders recognizing the long-term value creation opportunity of the deal, which is expected to close during the first quarter of 2016.
"""

In [None]:
# Wrap the imported signature in a ChainOfThought predictor. `catalog` is
# reused by the candidate-generation helpers further down.
catalog = dspy.ChainOfThought(NewsAppSignature)
# Run it once on the sample article; the input is passed by keyword as `article_text`.
catalog_item = catalog(article_text=text)
print(catalog_item)

# Generating training data

We'll rely on "best of n" scaling to help create synthetic data for our application. Then we'll manually review ~100 examples we created for training.


## A basic test time scaling

I'll generate some training data using a simplistic best-of-n style test time scaling. Aggregating all of the types is a bit more challenging, so I've done that in the `aggregate/` folder as a module that I can work on further.

Depending on where you are running your LLM calls, you might choose the serial or parallel methods below.

In [None]:
import tqdm


def generate_candidates_serial(text, n=8):
    """Call the module-level `catalog` predictor `n` times on `text`, one after another.

    Returns the predictions as a list, in call order.
    """
    predictions = []
    for _ in range(n):
        predictions.append(catalog(article_text=text))
    return predictions


def generate_candidates_parallel(text, n=8, num_threads=2):
    """Run `n` calls of the module-level `catalog` predictor on `text` via `dspy.Parallel`.

    `num_threads` is forwarded to the `dspy.Parallel` constructor. Whatever the
    executor's `forward` returns is handed back unchanged.
    """
    executor = dspy.Parallel(num_threads=num_threads)

    # One (module, kwargs) pair per requested sample; each pair gets its own dict.
    jobs = []
    for _ in range(n):
        jobs.append((catalog, {"article_text": text}))

    return executor.forward(jobs)

### Aggregation

We need to aggregate by each field to obtain consensus results. For lists, we fuzzy deduplicate and then set a threshold of N minimum occurrences for acceptance. We are targeting aggregation from 8 outputs.

I've modularized the code and imported it here, since it's a bit long and not especially interesting.

In [None]:
import sys

sys.path.append("..")

from aggregate.aggregate import LLMOutputAggregator

## Process a bunch of data

We can load a news dataset like `ag_news` (here, `valurank/News_Articles_Categorization`) to create our synthetic training data, and process ~100 rows.

I'll save the results as I go. Quick and dirty: just restart if it fails.

In [None]:
from datasets import load_dataset

# Load the "train" split of the "valurank/News_Articles_Categorization" news
# dataset (a diverse news corpus in the same spirit as "ag_news").
dataset = load_dataset("valurank/News_Articles_Categorization", split="train")

### Utilities for tracking the dataset offset

In [None]:
import hashlib
import os

# Articles are processed up to (but not including) dataset index `num_articles`;
# `samples_per_article` candidate predictions are generated for each one.
num_articles = 100
samples_per_article = 8

# Directory that receives the consensus JSON files, the progress file and the
# error log.
output_dir = "training_data"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# File that stores the resume offset (index of the next article to process)
progress_file = os.path.join(output_dir, "progress.txt")


# Hash a JSON string with MD5. The digest is an identifier, not a security measure.
def generate_hash(json_str: str) -> str:
    """Return the hexadecimal MD5 digest of `json_str` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(json_str.encode("utf-8"))
    return hasher.hexdigest()


# Read the resume offset back from the progress file
def load_offset() -> int:
    """Return the offset stored in `progress_file`.

    Returns 0 when the file does not exist or when reading/parsing it raises
    `ValueError` (for example, when the content is not an integer).
    """
    if not os.path.exists(progress_file):
        return 0

    with open(progress_file, "r") as handle:
        # The read stays inside the try so that any ValueError raised while
        # reading is treated the same way as an unparsable value.
        try:
            return int(handle.read().strip())
        except ValueError:
            return 0


# Write the resume offset to the progress file
def save_offset(offset: int):
    """Overwrite `progress_file` with the decimal text of `offset`."""
    with open(progress_file, "w") as handle:
        serialized = str(offset)
        handle.write(serialized)

### Best-Of-N Processing Loop

In [None]:
# Resume from the offset recorded by a previous run (0 when starting fresh).
start_offset = load_offset()

# Failures are appended here, one line per article that raised.
error_log = os.path.join(output_dir, "error_log.txt")

# Iterate over the specified number of articles starting from the offset.
# NOTE: the offset is only written after a success, but a later success moves
# it past any earlier failure, so failed articles are not retried on restart;
# check the error log for the indices that were skipped.
for i in tqdm.tqdm(
    range(start_offset, num_articles),
    desc="Processing Articles",
    total=num_articles - start_offset,
):
    try:
        article = dataset[i]
        text = article["Text"]

        # Generate multiple predictions for the same article.
        # (Use generate_candidates_serial(text, n=samples_per_article) to run serially.)
        candidates = generate_candidates_parallel(text, n=samples_per_article)

        # Attach the source text to each candidate before aggregating.
        candidates_with_text = [
            {**candidate.toDict(), "article_text": text} for candidate in candidates
        ]

        # Aggregate predictions to form consensus.
        # NOTE(review): threshold=3 is presumably the minimum number of
        # occurrences for a list item to be accepted -- confirm in aggregate/.
        consensus = LLMOutputAggregator.aggregate(
            NewsAppSignature, candidates_with_text, threshold=3
        )

        # Convert consensus to JSON string
        consensus_json = consensus.model_dump_json()

        # Name the file after the hash of its content.
        filename_hash = generate_hash(consensus_json)
        filename = f"{filename_hash}.json"
        file_path = os.path.join(output_dir, filename)

        # Save the JSON string to the file
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(consensus_json)

        # Record that everything up to and including article i is done.
        save_offset(i + 1)

    except Exception as e:
        # Best effort: report, log, and move on to the next article.
        print(f"Error processing article {i}: {e}")
        with open(error_log, "a") as f:
            f.write(f"Article {i}: {e}\n")
        continue

## Review data

(This is done in the review tool.)

# Summary data

1/27/25 - feature add summary to training data

In [None]:
import glob
import json

# Load every accepted training example as a (path, parsed JSON) pair.
# The training-data files in this notebook are written as UTF-8, so read them
# as UTF-8 explicitly instead of relying on the platform default encoding.
data = []
for fn in glob.glob("./training_data/accepted/*.json"):
    with open(fn, "r", encoding="utf-8") as fh:
        data.append((fn, json.load(fh)))

In [None]:
# Signature for the selection step: takes the article text plus a list of
# candidate snippets and outputs a single chosen snippet.
# NOTE: the class docstring and the `desc` strings are prompt text for the LM,
# so changing their wording changes model behaviour.
class BestSnippet(dspy.Signature):
    """ Choose the best snippet for the article """
    # Inputs
    article_text: str = dspy.InputField(desc="Original article text")
    snippets: list[str] = dspy.InputField(desc="Generated snippets")
    # Output
    best_snippet: str = dspy.OutputField(desc="Snippet that best works for the article")


class ArticleSummary(dspy.Module):
    """Generate several candidate snippets for an article and select the best one."""

    def __init__(self):
        # Let dspy.Module set up its own state before sub-predictors are attached.
        super().__init__()
        # Produces one snippet per call from the article text.
        self.summary = dspy.Predict("article_text -> snippet")
        # Chooses among the generated snippets.
        self.best = dspy.ChainOfThought(BestSnippet)

    def forward(self, article_text: str, n: int = 8):
        """Generate `n` snippets for `article_text` and return the selection prediction.

        Args:
            article_text: Text of the article to summarise.
            n: Number of candidate snippets to generate.

        Returns:
            The prediction returned by the `BestSnippet` predictor; the chosen
            snippet is in its `best_snippet` field.
        """
        snippets = []
        for _ in range(n):
            result = self.summary(article_text=article_text)
            # Soft length constraint: snippets should stay under 500 characters.
            dspy.Suggest(
                len(result.snippet) < 500,
                "Snippet is too long."
            )
            snippets.append(result.snippet)

        selection = self.best(article_text=article_text, snippets=snippets)
        return selection

In [None]:
import tqdm
import traceback

# Make sure the destination directory exists; otherwise every write below
# would fail only after the LLM calls for that article had already run.
os.makedirs("output", exist_ok=True)

summarizer = ArticleSummary().activate_assertions()

for fn, article in tqdm.tqdm(data, desc="Processing Articles", total=len(data)):
    base_name = os.path.basename(fn)
    outpath = f"output/{base_name}"
    # Skip articles already written, so the cell can simply be re-run after a failure.
    if os.path.exists(outpath):
        continue
    try:
        result = summarizer(article["article_text"])
        output_text = result.best_snippet
        # Drop any double quotes the model wrapped around the snippet.
        article["snippet"] = output_text.strip('"')
        with open(outpath, "w") as fh:
            json.dump(article, fh)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop the loop. Report the failure and carry on.
        print(traceback.format_exc())
        continue