# Setup

In [1]:
! uv pip install agentics-py

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 273ms[0m[0m


In [2]:
import os
from pathlib import Path
import sys
from getpass import getpass

from dotenv import find_dotenv, load_dotenv

# Base directory for notebook files; stays relative ("") when not running in Colab.
CURRENT_PATH = ""

# Colab pre-imports its own package, so its presence in sys.modules identifies the runtime.
IN_COLAB = "google.colab" in sys.modules
print("In Colab:", IN_COLAB)

if IN_COLAB:
    from google.colab import drive, userdata

    CURRENT_PATH = "/content/drive/MyDrive/"
    # Mount your google drive
    drive.mount("/content/drive")

    # userdata.get() raises when the secret is missing or notebook access is denied.
    # Swallow only those two cases so the getpass prompt below can still supply the key.
    try:
        colab_key = userdata.get("GOOGLE_API_KEY")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        colab_key = None
    if colab_key:
        os.environ["GEMINI_API_KEY"] = colab_key
else:
    # Outside Colab, read GEMINI_API_KEY (and anything else) from the nearest .env file.
    load_dotenv(find_dotenv())

# Last resort: ask interactively so the key never has to be written into the notebook.
if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your GEMINI_API_KEY:")

base = Path(CURRENT_PATH)

In Colab: True
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setup Custom LLM

In [3]:
from crewai import LLM

# pick a provider (openai, anthropic, groq, etc.) - see crewai docs for details
# The "gemini/" prefix in the model string selects the Gemini provider.
gemini_llm = LLM(
    model="gemini/gemini-2.0-flash-lite",
    temperature=0.7,  # Adjust based on task
    max_tokens=4096,  # Set based on output needs
    # timeout=300,  # Uncomment to allow a longer timeout for complex tasks
)

print(gemini_llm)

<crewai.llm.LLM object at 0x7c58a34d09e0>


In [4]:
# test call
print(gemini_llm.call("where is the Eiffel Tower?"))

The Eiffel Tower is in **Paris, France**.



# Train
## Download HF Dataset

In [5]:
from datasets import load_dataset, DatasetDict, Features, Value
import warnings
warnings.filterwarnings("ignore")

# Other dataset options, currently disabled:
# DATASET_NAME = "NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed"
# SPLIT_NAME = "train"

# os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
# DATASET_NAME = "Brianferrell787/financial-news-multisource"
# SPLIT_NAME = "train"

# Active dataset: Hub repository id and the split handed to download_dataset().
DATASET_NAME = "danidanou/Bloomberg_Financial_News"
SPLIT_NAME = "train"

In [6]:
def download_dataset(dataset_name: str, split_name: str) -> "Dataset | None":
    """Download one split of a Hugging Face dataset and print a short summary.

    Args:
        dataset_name: Repository id of the dataset on the Hub.
        split_name: Name of the split to load (for example ``"train"``).

    Returns:
        The loaded split (a ``datasets.Dataset`` when a split is given), or
        ``None`` if loading failed. Failures are reported on stdout instead of
        being raised, so callers must check for ``None`` before using the result.
    """
    # The annotation is a string because only DatasetDict is imported in this
    # notebook, while load_dataset(..., split=...) returns a single Dataset.
    try:
        # The load_dataset function downloads the dataset to your local cache
        # and loads it into memory as a Dataset object.
        dataset = load_dataset(dataset_name, split=split_name)

        print("\n--- Download Successful! ---")
        print(f"Loaded dataset type: {type(dataset)}")

        # Print basic information
        print(f"\nTotal number of rows in the '{split_name}' split: {len(dataset)}")
        print("\nFeatures (columns) in the dataset:")
        print(dataset.column_names)
        return dataset
    except FileNotFoundError:
        print(f"Error: Dataset or split '{dataset_name}/{split_name}' not found on the Hub.")
        print("Please check the dataset name and split name for typos.")
    except Exception as e:
        print(f"An unexpected error occurred during dataset loading: {e}")
    # Reached only after a reported failure; make the "no dataset" result explicit.
    return None

ds = download_dataset(DATASET_NAME, SPLIT_NAME)


--- Download Successful! ---
Loaded dataset type: <class 'datasets.arrow_dataset.Dataset'>

Total number of rows in the 'train' split: 446762

Features (columns) in the dataset:
['Headline', 'Journalists', 'Date', 'Link', 'Article']


In [7]:
# Convert datetime to date string
def _date_to_string(value):
    """Format ``value`` as 'YYYY-MM-DD'; values that are already strings pass through.

    The pass-through keeps this cell idempotent: after the first run the Date
    column holds strings, and calling ``strftime`` on them would fail on a re-run.
    """
    if isinstance(value, str):
        return value
    return value.strftime("%Y-%m-%d")


# Declare the target schema up front so the mapped Date column is typed as string.
new_features = ds.features.copy()
new_features["Date"] = Value("string")
ds = ds.map(
    lambda x: {"Date": _date_to_string(x["Date"])},
    features=new_features
)

Map:   0%|          | 0/446762 [00:00<?, ? examples/s]

In [8]:
ds[101010]

{'Headline': 'Haunted Greeks Sell Real Estate EBay-Style to Evict Debt Specter',
 'Journalists': ['Maria Petrakis'],
 'Date': '2013-09-05',
 'Link': 'http://www.bloomberg.com/news/2013-09-05/haunted-greeks-sell-real-estate-ebay-style-to-evict-debt-specter.html',
 'Article': 'A legend that has swirled around the dilapidated mansion on Smolenski Street in Athens is that the ghost of the previous owner deters prospective buyers by moaning: “The house is mine.” The Greek government refuses to be spooked. The protected two-story mansion and tower, replete with palm trees in the overgrown gardens, will be sold on Sept. 17 to the highest bidder in an EBay-style Internet auction. Greece is trying to dispel criticism it’s not doing enough to sell real estate pledged as part of its 240 billion-euro ($315 billion) rescue. “It’s literally haunting the Greek budget,” Andreas Taprantzis, executive director for real estate at the Hellenic Republic Asset Development Fund, said in an interview. It “gen

## Transduction between AGs
### News -> Classify News + Key points

**Customizing Transduction**   
You can fine-tune how logical transduction works by configuring:

LLMs – choose the underlying language model to run the transduction.   
Instructions – add task-specific guidance for the LLM.   
Prompt Templates – control how inputs are rendered into prompts.   
Few-Shot Examples – provide examples to steer the model’s behavior.   
Verbose Options – enable detailed logging and debug outputs.



In [9]:
from typing import Optional, Literal, List, Dict, Any
from pydantic import BaseModel, Field, HttpUrl, ValidationError
from datasets import load_dataset, DatasetDict
from pydantic import BaseModel
from agentics import AG
from datetime import datetime
import asyncio

In [10]:
# Closed set of sector labels that an article can be classified into.
# "General Market" is the catch-all the classification prompt uses for
# macro-economic or multi-sector news.
INDUSTRIES = [
    "Information Technology",
    "Health Care",
    "Financials",
    "Consumer Discretionary",
    "Communication Services",
    "Industrials",
    "Consumer Staples",
    "Energy",
    "Utilities",
    "Real Estate",
    "Materials",
    "General Market"
]

# Literal type built from INDUSTRIES, so a field annotated with it only accepts
# one of the labels above. Subscripting Literal with a tuple unpacks its items.
IndustryType = Literal[tuple(INDUSTRIES)]

# class NewsEntry(BaseModel):
#     """
#     Pydantic model representing a single comprehensive news entry, including
#     sentiment analysis and source metadata.
#     """
#     # url: HttpUrl = Field(description="The full URL link to the original news article.")
#     # image: Optional[FlexibleImageUrl] = Field(description="Optional URL link to the main image or a relative path.")
#     publishedDate: str = Field(description="The publication date of the article (ISO 8601 string).")
#     symbol: str = Field(description="The stock ticker symbol the news is related to (e.g., 'AAPL', 'OCEA').")
#     site: str = Field(description="The source website or platform where the news was published.")
#     text: str = Field(description="A short snippet or summary of the news article content.")
#     title: str = Field(description="The headline or title of the news article.")
#     sentiment: Optional[str] = Field(description="The calculated sentiment label, usually one of 'positive', 'negative', or 'neutral'.")
#     sentimentScore: Optional[float] = Field(description="The numeric sentiment score calculated by the source.")
#     # New additions
#     industry: IndustryType = Field(None, description=f"The primary industry sector this news is relevant to. Must be one of: {INDUSTRIES}.")
#     keyPoints: str = Field(None, description="A bullet list summarizing the 5 most important points of the news article.")

class BloombergNewsEntry(BaseModel):
    """
    Pydantic model for a single Bloomberg news article.

    Headline, Date, Link and Article come from the source data. Industry and
    KeyPoints start out as None and are meant to be filled in later by the LLM
    classification step.
    """

    # Pydantic v2 reads configuration from `model_config`; the v1-style
    # `class Config: schema_extra` is ignored there, so the example was never
    # attached to the JSON schema. A plain dict is an accepted config value.
    model_config = {
        "json_schema_extra": {
            "example": {
                "Headline": "Haunted Greeks Sell Real Estate EBay-Style to Evict Debt Specter",
                "Date": "2013-09-05",
                "Link": "http://www.bloomberg.com/news/2013-09-05/haunted-greeks-sell-real-estate-ebay-style-to-evict-debt-specter.html",
                "Article": "A legend that has swirled around the dilapidated mansion on Smolenski Street in Athens..."
            }
        }
    }

    Headline: str = Field(description="Title or headline of the news article.")
    # Journalists: List[str] = Field(default_factory=list, description="List of journalists credited for the article.")
    Date: str = Field(description="Publication date (or timestamp) of the article as an ISO 8601 string.")
    Link: str = Field(description="URL link to the full article.")
    Article: str = Field( description="Full article text content.")
    # New additions
    # Both fields default to None until classified, so they are typed Optional
    # to match the values the model actually holds.
    Industry: Optional[IndustryType] = Field(None, description=f"The primary industry sector this news is relevant to. Must be one of: {INDUSTRIES}.")
    KeyPoints: Optional[str] = Field(None, description="A bullet list summarizing the 5 most important points of the news article.")


In [11]:
news_entry_instance = BloombergNewsEntry.model_validate(ds[101010])
print(news_entry_instance)

Headline='Haunted Greeks Sell Real Estate EBay-Style to Evict Debt Specter' Date='2013-09-05' Link='http://www.bloomberg.com/news/2013-09-05/haunted-greeks-sell-real-estate-ebay-style-to-evict-debt-specter.html' Article='A legend that has swirled around the dilapidated mansion on Smolenski Street in Athens is that the ghost of the previous owner deters prospective buyers by moaning: “The house is mine.” The Greek government refuses to be spooked. The protected two-story mansion and tower, replete with palm trees in the overgrown gardens, will be sold on Sept. 17 to the highest bidder in an EBay-style Internet auction. Greece is trying to dispel criticism it’s not doing enough to sell real estate pledged as part of its 240 billion-euro ($315 billion) rescue. “It’s literally haunting the Greek budget,” Andreas Taprantzis, executive director for real estate at the Hellenic Republic Asset Development Fund, said in an interview. It “generates zero income, not even taxes,” he said. Hobbled b

In [12]:
from typing import Optional, Literal, List, Dict, Any, Tuple
from pydantic import BaseModel, Field, HttpUrl, ValidationError
from datasets import load_dataset, DatasetDict
from pydantic import BaseModel
from collections import defaultdict
from agentics import AG
import asyncio

async def bloomberg_news_entry_from_dict(hf_dataset: List[Dict[str, Any]]) -> List[BloombergNewsEntry]:
    """
    Validate every record of ``hf_dataset`` against ``BloombergNewsEntry``.

    The input is iterated once, in order, and each record is passed to
    ``BloombergNewsEntry.model_validate``. Progress banners are printed before
    and after. Nothing is awaited inside; the coroutine form only lets callers
    use ``await`` on it.
    """
    entry_count = len(hf_dataset)
    print(f"\n--- Starting processing of {entry_count} entries ---")

    # Validation is synchronous and stops at the first record that fails.
    validated_entries = []
    for raw_record in hf_dataset:
        validated_entries.append(BloombergNewsEntry.model_validate(raw_record))

    print("--- Processing Complete! ---")
    return validated_entries

In [19]:
news_entry = await bloomberg_news_entry_from_dict(ds.select(range(10))) # to remove, testing on 10 now


--- Starting processing of 10 entries ---
--- Processing Complete! ---


In [20]:
# news_entry

In [21]:
# Instruction prompts keyed by task name.
# KeyPoints is requested as ONE string of bullet lines because the model field
# is declared as `str`; asking for a JSON list contradicted that schema. The
# previous `\[` / `\]` were invalid escape sequences that leaked literal
# backslashes into the prompt. `\\n` below renders as a JSON-escaped newline.
prompts = {
    "industry_class_and_keypoints": f'''You are a financial news analyst.
Read the article carefully and classify its **primary industry sector** as "Industry".

Possible industries:
{INDUSTRIES}

Guidelines:
- Choose **"General Market"** if the article covers overall economic conditions,
  government or central bank policies, currency movements, inflation, GDP,
  interest rates, IMF or World Bank decisions, or broad market sentiment that
  affects multiple sectors rather than one specific industry.
- If the article focuses on one company, classify it based on that company’s core sector.

Then, summarize the **5 most important points** of the article as "KeyPoints":
a single string with one point per line, each starting with a bullet ("-").

Output JSON format:
{{
  "Industry": "...",
  "KeyPoints": "- ...\\n- ...\\n- ...\\n- ...\\n- ..."
}}
Industry and KeyPoints should NOT be None.
'''
}

In [22]:
# Create self AG
# Wrap the validated entries in an AG typed as BloombergNewsEntry.
ag_news_entry = AG(atype=BloombergNewsEntry, states=news_entry)

# Attach the classification prompt and the Gemini LLM configured earlier.
ag_news_entry.instructions = prompts["industry_class_and_keypoints"]
ag_news_entry.llm = (gemini_llm)
# Request values for Industry and KeyPoints; the awaited result rebinds ag_news_entry.
# NOTE(review): source_fields lists every model field, so it also contains the two
# target fields. Confirm that is intended rather than only the article columns.
ag_news_entry = await ag_news_entry.self_transduction(
    source_fields=list(BloombergNewsEntry.model_fields.keys()),
    target_fields=["Industry", "KeyPoints"],
    instructions= prompts["industry_class_and_keypoints"]
)

Output()

In [23]:
ag_news_entry.pretty_print()

aType : <class '__main__.BloombergNewsEntry'>
Headline: Ivory Coast Keeps Cocoa Export Tax Below 22%, Document Shows
Date: '2011-10-06'
Link: http://www.bloomberg.com/news/2011-10-06/ivory-coast-keeps-cocoa-export-tax-below-22-document-shows.html
Article: "Export taxes on cocoa beans from Ivory Coast , the world\u2019s biggest\
  \ producer of the chocolate ingredient, won\u2019t exceed 22 percent of the international\
  \ price this season, meeting a commitment to the International Monetary Fund , according\
  \ to a finance ministry document. In the 2008-9 season taxes averaged 25.3 percent\
  \ of international prices, the IMF said in a document posted on its website in November\
  \ last year. While the country met the commitment in the season just ended, it had\
  \ a change in government earlier this year. The rate meets a demand by the International\
  \ Monetary Fund and the World Bank to reform the Ivorian cocoa and coffee industries\
  \ in order to comply with the terms of i

'aType : <class \'__main__.BloombergNewsEntry\'>\nHeadline: Ivory Coast Keeps Cocoa Export Tax Below 22%, Document Shows\nDate: \'2011-10-06\'\nLink: http://www.bloomberg.com/news/2011-10-06/ivory-coast-keeps-cocoa-export-tax-below-22-document-shows.html\nArticle: "Export taxes on cocoa beans from Ivory Coast , the world\\u2019s biggest\\\n  \\ producer of the chocolate ingredient, won\\u2019t exceed 22 percent of the international\\\n  \\ price this season, meeting a commitment to the International Monetary Fund , according\\\n  \\ to a finance ministry document. In the 2008-9 season taxes averaged 25.3 percent\\\n  \\ of international prices, the IMF said in a document posted on its website in November\\\n  \\ last year. While the country met the commitment in the season just ended, it had\\\n  \\ a change in government earlier this year. The rate meets a demand by the International\\\n  \\ Monetary Fund and the World Bank to reform the Ivorian cocoa and coffee industries\\\n  \\ in 

### Classified News + Key points -> Aggregate by (date, industry)

In [24]:
ConsolidatedNews = Dict[Tuple[str, str], List[BloombergNewsEntry]]

In [25]:
async def group_articles_by_date_and_industry(
    articles: List[BloombergNewsEntry],
) -> ConsolidatedNews:
    """
    Group articles into a dictionary keyed by (publication date, industry).

    This function acts as the 'reduce' operation, consolidating the entire list.

    Args:
        articles: Entries to group. Each must expose ``Date`` (an ISO 8601
            string, with or without a time part) and ``Industry``.

    Returns:
        A plain dict mapping ``(date, industry)`` to the list of articles for
        that pair, in input order. ``date`` is the 'YYYY-MM-DD' part of
        ``Date``; ``industry`` is 'Unknown' when the article has none.
    """
    # Use defaultdict for easy appending to lists
    grouped_result: ConsolidatedNews = defaultdict(list)

    for article in articles:
        # Keep only the date part (e.g. '2023-10-04' from '2023-10-04T21:54:28.000Z')
        # so entries carrying a time of day still land in the same daily bucket.
        date = article.Date.split("T", 1)[0]

        # Unclassified articles get an explicit label, so every key is a pair of strings.
        industry = article.Industry or "Unknown"

        grouped_result[(date, industry)].append(article)

    # Convert back to a standard dictionary before returning
    return dict(grouped_result)

In [26]:
# Run the grouping function over the AG through areduce. The returned object
# exposes the grouped dictionary as `.states`, which the next cell iterates over.
consolidated_news = await ag_news_entry.areduce(
    group_articles_by_date_and_industry
)

In [27]:
# Print each (date, industry) group followed by its articles and a separator line.
for group_key, grouped_articles in consolidated_news.states.items():
    print(group_key)
    for article in grouped_articles:
        print(article)
    print("------------------------------------------")

('2011-10-06', 'General Market')
Headline='Ivory Coast Keeps Cocoa Export Tax Below 22%, Document Shows' Date='2011-10-06' Link='http://www.bloomberg.com/news/2011-10-06/ivory-coast-keeps-cocoa-export-tax-below-22-document-shows.html' Article='Export taxes on cocoa beans from Ivory Coast , the world’s biggest producer of the chocolate ingredient, won’t exceed 22 percent of the international price this season, meeting a commitment to the International Monetary Fund , according to a finance ministry document. In the 2008-9 season taxes averaged 25.3 percent of international prices, the IMF said in a document posted on its website in November last year. While the country met the commitment in the season just ended, it had a change in government earlier this year. The rate meets a demand by the International Monetary Fund and the World Bank to reform the Ivorian cocoa and coffee industries in order to comply with the terms of its Heavily Indebted Poor Countries’ debt-relief program. Last y

# Setup Custom Tools
**TODO:** These tools are used to fetch live news for inference, not for training.

Method 1 (Deterministic)

In [28]:
!pip install feedparser



In [30]:
import feedparser
from datetime import datetime, timedelta

# Feeds polled when the caller does not pass its own list.
DEFAULT_BLOOMBERG_FEEDS = (
    "https://feeds.bloomberg.com/news/news.rss",
    "https://feeds.bloomberg.com/markets/news.rss",
    "https://feeds.bloomberg.com/business/news.rss",
    "https://feeds.bloomberg.com/technology/news.rss",
    "https://feeds.bloomberg.com/politics/news.rss",
    "https://feeds.bloomberg.com/wealth/news.rss",
    "https://feeds.bloomberg.com/economics/news.rss",
    "https://feeds.bloomberg.com/green/news.rss",
    "https://feeds.bloomberg.com/pursuits/news.rss",
    "https://feeds.bloomberg.com/opinion/news.rss",
    "https://feeds.bloomberg.com/finance/news.rss",
    "https://feeds.bloomberg.com/real-estate/news.rss",
    "https://feeds.bloomberg.com/deals/news.rss",
    "https://feeds.bloomberg.com/crypto/news.rss",
    "https://feeds.bloomberg.com/europe/news.rss",
    "https://feeds.bloomberg.com/uk/news.rss",
    "https://feeds.bloomberg.com/asia/news.rss",
    "https://feeds.bloomberg.com/commodities/news.rss",
    "https://feeds.bloomberg.com/currencies/news.rss",
    "https://feeds.bloomberg.com/fixed-income/news.rss",
    "https://feeds.bloomberg.com/equities/news.rss",
    "https://feeds.bloomberg.com/etfs/news.rss",
)


def get_bloomberg_rss_feeds(
    feeds: Optional[List[str]] = None,
    max_age: timedelta = timedelta(days=1),
) -> List[Dict[str, str]]:
    """Collect recent items from Bloomberg RSS feeds.

    Args:
        feeds: Feed URLs to poll. Defaults to ``DEFAULT_BLOOMBERG_FEEDS``.
        max_age: Only items published within this window before now are kept
            (default: the last 24 hours).

    Returns:
        A list of dicts with the keys ``Headline``, ``Link``, ``Article`` (the
        feed summary) and ``Date`` (naive ISO 8601 timestamp built from the
        feed's parsed publication time). Items with no parsed publication time
        are skipped. An item whose link was already seen in an earlier feed is
        returned only once.
    """
    # Local import: this notebook only imports datetime and timedelta at cell level.
    from datetime import timezone

    feed_urls = DEFAULT_BLOOMBERG_FEEDS if feeds is None else feeds

    # datetime.utcnow() is deprecated as of Python 3.12. Take an aware "now" and
    # drop tzinfo so it stays comparable with the naive timestamps built below.
    cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - max_age

    recent_news: List[Dict[str, str]] = []
    seen_links = set()

    for url in feed_urls:
        parsed_feed = feedparser.parse(url)
        for entry in parsed_feed.entries:
            # The key can be absent or None; either way there is nothing to compare.
            published_struct = entry.get("published_parsed")
            if not published_struct:
                continue

            published = datetime(*published_struct[:6])
            if published <= cutoff:
                continue

            # Several feeds are polled; skip a link that was already collected.
            link = entry.get("link", "")
            if link and link in seen_links:
                continue
            seen_links.add(link)

            recent_news.append({
                "Headline": entry.get("title", ""),
                "Link": link,
                "Article": entry.get("summary", ""),
                "Date": published.isoformat()
            })
    return recent_news

In [31]:
news_24h = get_bloomberg_rss_feeds()
news_24h

[{'Headline': 'Bubble Debate Drives Korean Retail Investors to Risky VIX Bets',
  'Link': 'https://www.bloomberg.com/news/articles/2025-10-19/bubble-debate-drives-korean-retail-investors-to-risky-vix-bets',
  'Article': 'Investors in South Korea looking to hedge their big US stock holdings or play their next wager are embracing a new type of trades: leveraged VIX bets.',
  'Date': '2025-10-19T00:00:00'},
 {'Headline': 'US Warns of ‘Imminent’ Attack by Hamas Against Palestinians',
  'Link': 'https://www.bloomberg.com/news/articles/2025-10-18/us-warns-of-imminent-attack-by-hamas-against-palestinians',
  'Article': 'The US State Department said it informed countries involved in the Gaza peace agreement that an attack by Hamas is being planned against Palestinians and that it would be a violation of the ceasefire deal.',
  'Date': '2025-10-18T22:09:03'},
 {'Headline': 'Protesters Oppose Trump in ‘No Kings’ Event in NYC',
  'Link': 'https://www.bloomberg.com/news/videos/2025-10-18/protester

In [32]:
BloombergNewsEntry.model_validate(news_24h[0])

BloombergNewsEntry(Headline='Bubble Debate Drives Korean Retail Investors to Risky VIX Bets', Date='2025-10-19T00:00:00', Link='https://www.bloomberg.com/news/articles/2025-10-19/bubble-debate-drives-korean-retail-investors-to-risky-vix-bets', Article='Investors in South Korea looking to hedge their big US stock holdings or play their next wager are embracing a new type of trades: leveraged VIX bets.', Industry=None, KeyPoints=None)

In [33]:
rss_feed = await bloomberg_news_entry_from_dict(news_24h) # validates every fetched RSS item (no 10-row cap here)


--- Starting processing of 80 entries ---
--- Processing Complete! ---


In [34]:
rss_feed

[BloombergNewsEntry(Headline='Bubble Debate Drives Korean Retail Investors to Risky VIX Bets', Date='2025-10-19T00:00:00', Link='https://www.bloomberg.com/news/articles/2025-10-19/bubble-debate-drives-korean-retail-investors-to-risky-vix-bets', Article='Investors in South Korea looking to hedge their big US stock holdings or play their next wager are embracing a new type of trades: leveraged VIX bets.', Industry=None, KeyPoints=None),
 BloombergNewsEntry(Headline='US Warns of ‘Imminent’ Attack by Hamas Against Palestinians', Date='2025-10-18T22:09:03', Link='https://www.bloomberg.com/news/articles/2025-10-18/us-warns-of-imminent-attack-by-hamas-against-palestinians', Article='The US State Department said it informed countries involved in the Gaza peace agreement that an attack by Hamas is being planned against Palestinians and that it would be a violation of the ceasefire deal.', Industry=None, KeyPoints=None),
 BloombergNewsEntry(Headline='Protesters Oppose Trump in ‘No Kings’ Event i