# Setup

In [None]:
! uv pip install agentics-py

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m238 packages[0m [2min 6.17s[0m[0m
[2K[2mPrepared [1m72 packages[0m [2min 8.80s[0m[0m
[2mUninstalled [1m7 packages[0m [2min 34ms[0m[0m
[2K[2mInstalled [1m72 packages[0m [2min 223ms[0m[0m
 [32m+[39m [1magentics-py[0m[2m==0.1.5[0m
 [32m+[39m [1maiosqlite[0m[2m==0.21.0[0m
 [32m+[39m [1manthropic[0m[2m==0.71.0[0m
 [32m+[39m [1mappdirs[0m[2m==1.4.4[0m
 [32m+[39m [1mbackoff[0m[2m==2.2.1[0m
 [32m+[39m [1mbcrypt[0m[2m==5.0.0[0m
 [32m+[39m [1mbrowserbase[0m[2m==1.4.0[0m
 [32m+[39m [1mchromadb[0m[2m==1.1.1[0m
 [32m+[39m [1mcoloredlogs[0m[2m==15.0.1[0m
 [32m+[39m [1mcomm[0m[2m==0.2.3[0m
 [32m+[39m [1mcrewai[0m[2m==0.203.1[0m
 [32m+[39m [1mcrewai-tools[0m[2m==0.76.0[0m
 [32m+[39m [1mddgs[0m[2m==9.6.1[0m
 [32m+[39m [1mdeprecation[0m[2m==2.1.0[0m
 [32m+[39m [1mdiskcache[0m[2m==5.6.3[0m
 [32m+[39m [1mdocker[0m[2m==

In [None]:
import os
from pathlib import Path
import sys
from getpass import getpass

from dotenv import find_dotenv, load_dotenv

# Base directory for notebook files; stays "" (the working directory) outside Colab.
CURRENT_PATH = ""

# Colab is detected by checking whether google.colab is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules
print("In Colab:", IN_COLAB)

if IN_COLAB:
    CURRENT_PATH = "/content/drive/MyDrive/"
    from google.colab import drive, userdata

    # Mount your google drive
    drive.mount("/content/drive")

    # The Colab secret is stored as GOOGLE_API_KEY and exported as GEMINI_API_KEY.
    # userdata.get raises when the secret does not exist or the notebook has not
    # been granted access, and os.environ rejects None with a TypeError. Catch
    # those cases so the getpass prompt below still acts as a fallback.
    try:
        os.environ["GEMINI_API_KEY"] = userdata.get("GOOGLE_API_KEY")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError, TypeError) as err:
        print(f"Could not read Colab secret GOOGLE_API_KEY: {err!r}")
else:
    # Outside Colab the key is expected to come from a .env file, if one is found.
    load_dotenv(find_dotenv())

# Last resort: ask interactively when no (non-empty) key has been provided yet.
if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your GEMINI_API_KEY:")

base = Path(CURRENT_PATH)

In Colab: True
Mounted at /content/drive


# Setup Custom LLM

In [None]:
from crewai import LLM

# Build the LLM used by the rest of the notebook.
# Pick a provider via the model string (openai, anthropic, groq, etc.) - see crewai docs for details.
gemini_llm = LLM(
    model="gemini/gemini-2.0-flash-lite",
    temperature=0.7,  # Adjust based on task
    max_tokens=4096,  # Set based on output needs
    # timeout=300,  # Uncomment to allow a longer timeout for complex tasks
)

# Printing only shows the object's default repr; it confirms construction succeeded.
print(gemini_llm)

<crewai.llm.LLM object at 0x79df71f08740>


In [None]:
# Smoke test: send one prompt through the configured LLM and print the reply.
print(gemini_llm.call("where is the Eiffel Tower?"))

The Eiffel Tower is located in **Paris, France**.



# Download HF Dataset

In [None]:
import warnings
from typing import Optional

from datasets import Dataset, DatasetDict, load_dataset
# NOTE(review): this suppresses every warning for the rest of the kernel session,
# not only those raised while loading the dataset.
warnings.filterwarnings("ignore")

# Dataset identifier and split name that are passed to download_dataset.
DATASET_NAME = "NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed"
SPLIT_NAME = "train"

In [None]:
def download_dataset(dataset_name: str, split_name: str) -> Optional["Dataset"]:
    """Load one split of a dataset with ``load_dataset`` and print a short summary.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split_name: Split to load; passed as ``split=`` so a single split is
            returned rather than a dictionary of splits.

    Returns:
        The loaded dataset, or ``None`` when loading failed. Failures are
        reported with ``print`` instead of being raised, so callers must be
        prepared for a ``None`` result.
    """
    try:
        # The load_dataset function downloads the dataset to your local cache
        # and loads it into memory as a Dataset object.
        dataset = load_dataset(dataset_name, split=split_name)

        print("\n--- Download Successful! ---")
        print(f"Loaded dataset type: {type(dataset)}")

        # Print basic information
        print(f"\nTotal number of rows in the '{split_name}' split: {len(dataset)}")
        print("\nFeatures (columns) in the dataset:")
        print(dataset.column_names)
        return dataset
    except FileNotFoundError:
        print(f"Error: Dataset or split '{dataset_name}/{split_name}' not found on the Hub.")
        print("Please check the dataset name and split name for typos.")
    except Exception as e:
        # Best-effort by design: report the problem and fall through to None.
        print(f"An unexpected error occurred during dataset loading: {e}")
    return None

# download_dataset prints the error and yields None when loading fails, so the
# cells that index `ds` afterwards assume this download succeeded.
ds = download_dataset(DATASET_NAME, SPLIT_NAME)

README.md:   0%|          | 0.00/787 [00:00<?, ?B/s]

data/train-00000-of-00001-ccd537eba28316(…):   0%|          | 0.00/49.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/142000 [00:00<?, ? examples/s]


--- Download Successful! ---
Loaded dataset type: <class 'datasets.arrow_dataset.Dataset'>

Total number of rows in the 'train' split: 142000

Features (columns) in the dataset:
['symbol', 'publishedDate', 'title', 'image', 'site', 'text', 'url', 'sentiment', 'sentimentScore']


In [None]:
# Peek at a single record (row index 101010) to see the raw field layout.
ds[101010]

{'symbol': 'BTC-USD',
 'publishedDate': '2022-11-29T18:40:15.000Z',
 'title': '$16K Bitcoin dropping to $12K–$14K — Can this really happen? Watch The Market Report',
 'image': 'https://i-invdn-com.investing.com/news/LYNXNPEE1P15Z_L.jpg',
 'site': 'investing',
 'text': '$16K Bitcoin dropping to $12K–$14K — Can this really happen? Watch The Market Report     \xa0      BTC/USD +1.77%      Add to/Remove from Watchlist          Add to Watchlist   Add Position     Position added successfully to:            Please name your holdings portfolio     Type:  BUY SELL    Date:           \xa0    Amount:    Price    Point Value:       Leverage:  1:1 1:10 1:25 1:50 1:100 1:200 1:400 1:500 1:1000    Commission:          \xa0  Create New Watchlist Create    Create a new holdings portfol',
 'url': 'https://www.investing.com/news/cryptocurrency-news/16k-bitcoin-dropping-to-12k14k--can-this-really-happen-watch-the-market-report-2953888',
 'sentiment': 'Positive',
 'sentimentScore': 0.9231}

# Setup Custom Tools
**TODO:** The web-search tool below is meant for fetching news at inference time; it is not used for training.

In [None]:
# from crewai.tools import tool
# from ddgs import DDGS


# ## Define a Crew AI tool to get news for a given date using the DDGS search engine
# @tool("web_search")
# def web_search(query: str) -> str:
#     """Fetch web search results for the given query using DDGS."""
#     return str(DDGS().text(query, max_results=10))


# questions_answering_ag.verbose_agent = True
# questions_answering_ag.tools = [web_search]
# dow_jones_data.filter_states(end=1)
# answers = await (questions_answering_ag << dow_jones_data)
# print(answers.pretty_print())

# Transduction between AGs
### News -> Classify News + Key points

**Customizing Transduction**   
You can fine-tune how logical transduction works by configuring:

LLMs – choose the underlying language model to run the transduction.   
Instructions – add task-specific guidance for the LLM.   
Prompt Templates – control how inputs are rendered into prompts.   
Few-Shot Examples – provide examples to steer the model’s behavior.   
Verbose Options – enable detailed logging and debug outputs.



In [None]:
from typing import Optional, Literal, List, Dict, Any
from pydantic import BaseModel, Field, HttpUrl, ValidationError
from datasets import load_dataset, DatasetDict
from pydantic import BaseModel
from agentics import AG
import asyncio

In [None]:
# Allowed values for the `industry` field. The same list is interpolated into
# the field description and the classification prompt.
INDUSTRIES = [
    "Information Technology",
    "Health Care",
    "Financials",
    "Consumer Discretionary",
    "Communication Services",
    "Industrials",
    "Consumer Staples",
    "Energy",
    "Utilities",
    "Real Estate",
    "Materials",
    "General Market"
]

# At runtime, subscripting Literal with a tuple is the same as writing
# Literal["Information Technology", "Health Care", ...], so the type is built
# from INDUSTRIES instead of repeating the labels.
# NOTE(review): static type checkers generally do not accept a dynamically built Literal.
IndustryType = Literal[tuple(INDUSTRIES)]

class NewsEntry(BaseModel):
    """
    Pydantic model representing a single comprehensive news entry, including
    sentiment analysis and source metadata.

    The first group of fields mirrors columns of the source dataset. The last
    two fields (`industry`, `keyPoints`) are not present in the dataset; they
    default to None and are meant to be filled in later.

    `industry` and `keyPoints` are declared Optional because their default is
    None. With a non-optional annotation, an entry that has not been enriched
    yet cannot be re-validated from its own dump (the explicit None is
    rejected), even though the default itself is accepted.
    """
    # The url and image columns are intentionally not modelled for now.
    # url: HttpUrl = Field(description="The full URL link to the original news article.")
    # image: Optional[FlexibleImageUrl] = Field(description="Optional URL link to the main image or a relative path.")
    publishedDate: str = Field(description="The publication date of the article (ISO 8601 string).")
    symbol: str = Field(description="The stock ticker symbol the news is related to (e.g., 'AAPL', 'OCEA').")
    site: str = Field(description="The source website or platform where the news was published.")
    text: str = Field(description="A short snippet or summary of the news article content.")
    title: str = Field(description="The headline or title of the news article.")
    # No default is given for these two: the key must be present, but its value may be None.
    sentiment: Optional[str] = Field(description="The calculated sentiment label, usually one of 'positive', 'negative', or 'neutral'.")
    sentimentScore: Optional[float] = Field(description="The numeric sentiment score calculated by the source.")
    # New additions
    industry: Optional[IndustryType] = Field(None, description=f"The primary industry sector this news is relevant to. Must be one of: {INDUSTRIES}.")
    keyPoints: Optional[str] = Field(None, description="A bullet list summarizing the 5 most important points of the news article.")

In [None]:
# Sanity check: validate one raw dataset row against NewsEntry. In the printed
# model the undeclared columns (url, image) are absent and industry/keyPoints
# are still None.
news_entry_instance = NewsEntry.model_validate(ds[101010])
print(news_entry_instance)

publishedDate='2022-11-29T18:40:15.000Z' symbol='BTC-USD' site='investing' text='$16K Bitcoin dropping to $12K–$14K — Can this really happen? Watch The Market Report     \xa0      BTC/USD +1.77%      Add to/Remove from Watchlist          Add to Watchlist   Add Position     Position added successfully to:            Please name your holdings portfolio     Type:  BUY SELL    Date:           \xa0    Amount:    Price    Point Value:       Leverage:  1:1 1:10 1:25 1:50 1:100 1:200 1:400 1:500 1:1000    Commission:          \xa0  Create New Watchlist Create    Create a new holdings portfol' title='$16K Bitcoin dropping to $12K–$14K — Can this really happen? Watch The Market Report' sentiment='Positive' sentimentScore=0.9231 industry=None keyPoints=None


In [None]:
from typing import Optional, Literal, List, Dict, Any, Tuple
from pydantic import BaseModel, Field, HttpUrl, ValidationError
from datasets import load_dataset, DatasetDict
from pydantic import BaseModel
from agentics import AG
import asyncio

async def process_hf_dataset(hf_dataset: List[Dict[str, Any]]) -> List[NewsEntry]:
    """
    Validate every record of ``hf_dataset`` against the NewsEntry model.

    Args:
        hf_dataset: Sized iterable of dict-like records (for example a
            selection of a Hugging Face dataset split).

    Returns:
        One NewsEntry per record, in input order. A validation error on any
        record propagates to the caller.
    """
    total = len(hf_dataset)
    print(f"\n--- Starting processing of {total} entries ---")

    # Validation is synchronous; the coroutine never awaits anything itself.
    validated: List[NewsEntry] = []
    for row in hf_dataset:
        validated.append(NewsEntry.model_validate(row))

    print("--- Processing Complete! ---")
    return validated

In [None]:
# TODO: remove the 5-row limit; only the first 5 rows are validated while testing.
news_entry = await process_hf_dataset(ds.select(range(5)))


--- Starting processing of 5 entries ---
--- Processing Complete! ---


In [None]:
# news_entry

In [None]:
# Instruction text for the LLM, keyed by task name. The f-string embeds the
# INDUSTRIES list so the prompt names the same labels the schema allows.
prompts = {
    "industry_class_and_keypoints": f"""Classify the primary industry sector this news is relevant to as \"industry\". Must be one of: {INDUSTRIES}.
    Next, summarize the 5 most important points of the news article in bullet point as \"keyPoints\"."""
}

In [None]:
# Create source and target AGs
ag_news_entry = AG(atype=NewsEntry, states=news_entry)
# ag_news_kp = AG(atype=NewsKeyPoints, llm=gemini_llm)

# ag_news_entry.instructions = prompts["industry_class_and_keypoints"]
ag_news_entry.llm = (gemini_llm)
ag_news_entry = await ag_news_entry.self_transduction(
    source_fields=list(NewsEntry.model_fields.keys()),
    target_fields=["industry", "keyPoints"],
    instructions= prompts["industry_class_and_keypoints"]
)

Output()

In [None]:
# pretty_print prints the states and also returns the same text, which is why
# the output appears twice (the second copy is the cell's result repr).
ag_news_entry.pretty_print()

aType : <class '__main__.NewsEntry'>
publishedDate: '2023-10-04T21:54:28.000Z'
symbol: HE
site: benzinga
text: RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) -- The law firm of Kessler Topaz
  Meltzer & Check, LLP (www.ktmc.com) informs investors that a securities class action
  lawsuit has been filed in the United States District Court for the Northern District
  of California against Hawaiian Electric Industries, Inc. ("Hawaiian Electric") (NYSE:HE).
  The action charges Hawaiian Electric with violations of the federal securities laws,
  including omissions and fraudulent misrepresentations relating to the compa...
title: 'HE REMINDER: Kessler Topaz Meltzer & Check, LLP Reminds Hawaiian Electric
  Industries, Inc. (HE) Shareholders of Securities Fraud Class Action Lawsuit and
  Encourages Investors with Substantial Losses to Contact the Firm'
sentiment: Negative
sentimentScore: -0.6417
industry: Utilities
keyPoints: '- A securities class action lawsuit has been filed against Hawaiian El

'aType : <class \'__main__.NewsEntry\'>\npublishedDate: \'2023-10-04T21:54:28.000Z\'\nsymbol: HE\nsite: benzinga\ntext: RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) -- The law firm of Kessler Topaz\n  Meltzer & Check, LLP (www.ktmc.com) informs investors that a securities class action\n  lawsuit has been filed in the United States District Court for the Northern District\n  of California against Hawaiian Electric Industries, Inc. ("Hawaiian Electric") (NYSE:HE).\n  The action charges Hawaiian Electric with violations of the federal securities laws,\n  including omissions and fraudulent misrepresentations relating to the compa...\ntitle: \'HE REMINDER: Kessler Topaz Meltzer & Check, LLP Reminds Hawaiian Electric\n  Industries, Inc. (HE) Shareholders of Securities Fraud Class Action Lawsuit and\n  Encourages Investors with Substantial Losses to Contact the Firm\'\nsentiment: Negative\nsentimentScore: -0.6417\nindustry: Utilities\nkeyPoints: \'- A securities class action lawsuit has been f

### Classified News + Key points -> Aggregate by (date, industry)

In [None]:
# Type of the grouped result: a (date, industry) pair of strings mapped to the
# list of articles that share that pair.
ConsolidatedNews = Dict[Tuple[str, str], List[NewsEntry]]

In [None]:
from collections import defaultdict

In [None]:
async def group_articles_by_date_and_industry(
    articles: List[NewsEntry],
) -> ConsolidatedNews:
    """
    Group articles into a dictionary keyed by (published date, industry).

    This function acts as the 'reduce' operation, consolidating the entire list.

    The date component is the part of ``publishedDate`` before the 'T'
    (e.g. '2023-10-04' from '2023-10-04T21:54:28.000Z'). A missing or empty
    date or industry is replaced by the string 'Unknown', so every key is a
    pair of strings as the ConsolidatedNews alias declares, and the keys stay
    mutually comparable (a None component would break sorting them).

    Args:
        articles: Entries to group.

    Returns:
        A plain dict mapping each (date, industry) key to its articles, in
        the order they appeared in ``articles``.
    """
    missing = "Unknown"

    # Use defaultdict for easy appending to lists
    grouped_result: ConsolidatedNews = defaultdict(list)

    for article in articles:
        # 1. Extract the date part, defaulting to 'Unknown' if missing
        published_date = article.publishedDate.split("T")[0] if article.publishedDate else missing

        # 2. Extract the industry, defaulting to 'Unknown' if missing
        industry = article.industry or missing

        # 3. Append the article to the list for the composite key
        grouped_result[(published_date, industry)].append(article)

    # Convert back to a standard dictionary before returning
    return dict(grouped_result)

In [None]:
# Reduce the enriched entries with the grouping function.
# NOTE(review): presumably areduce hands the AG's full list of states to the
# function in one call - confirm against the agentics documentation.
consolidated_news = await ag_news_entry.areduce(
    group_articles_by_date_and_industry
)

In [None]:
# The result's `states` holds the dictionary produced by the grouping function.
consolidated_news.states

{('2023-10-04',
  'Utilities'): [NewsEntry(publishedDate='2023-10-04T21:54:28.000Z', symbol='HE', site='benzinga', text='RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) -- The law firm of Kessler Topaz Meltzer & Check, LLP (www.ktmc.com) informs investors that a securities class action lawsuit has been filed in the United States District Court for the Northern District of California against Hawaiian Electric Industries, Inc. ("Hawaiian Electric") (NYSE:HE). The action charges Hawaiian Electric with violations of the federal securities laws, including omissions and fraudulent misrepresentations relating to the compa...', title='HE REMINDER: Kessler Topaz Meltzer & Check, LLP Reminds Hawaiian Electric Industries, Inc. (HE) Shareholders of Securities Fraud Class Action Lawsuit and Encourages Investors with Substantial Losses to Contact the Firm', sentiment='Negative', sentimentScore=-0.6417, industry='Utilities', keyPoints='- A securities class action lawsuit has been filed against Hawaiian E