In [2]:
from databonsai.categorize import MultiCategorizer, BaseCategorizer
from databonsai.transform import BaseTransformer
from databonsai.llm_providers import OpenAIProvider, AnthropicProvider, OllamaProvider
from databonsai.utils import (
    apply_to_column,
    apply_to_column_batch,
    apply_to_column_autobatch,
)
import pandas as pd

In [3]:
# LLM backend that is handed to the categorizer below.
# AnthropicProvider() can be used here instead.
provider = OpenAIProvider(
    model="gpt-3.5-turbo",
)

In [4]:
# Mapping of category label -> natural-language description, passed to the
# categorizer as its `categories` argument.
#
# This dict deliberately has its own name instead of `categories`: a later
# cell binds `categories` to the list of predicted labels, and sharing the
# name meant that re-running this cell after categorization would silently
# replace those results with this dict.
category_definitions = {
    "Weather": "Insights and remarks about weather conditions.",
    "Sports": "Observations and comments on sports events.",
    "Politics": "Political events related to governments, nations, or geopolitical issues.",
    "Celebrities": "Celebrity sightings and gossip",
    "Tech": "News and updates about technology and tech companies.",
    "Others": "Comments do not fit into any of the above categories",  # Best practice in case it can't be categorized easily
    "Anomaly": "Data that does not look like comments or natural language",  # Helps to flag unclean/problematic data
}

# Categorizer built from the label definitions, the provider created earlier,
# and a few worked examples pairing an input text with its expected label.
categorizer = BaseCategorizer(
    categories=category_definitions,
    llm_provider=provider,
    examples=[
        {"example": "Big stormy skies over city", "response": "Weather"},
        {"example": "The team won the championship", "response": "Sports"},
        {"example": "I saw a famous rapper at the mall", "response": "Celebrities"},
    ],
)

In [5]:
# Sample news headlines to categorize: one headline per line of the block
# below, split into a list of 50 strings.
headlines2 = """\
Local Fire Department Honored with National Award for Bravery
Breakthrough Research Promises New Treatment for Alzheimer’s Disease
Major Airline Announces Expansion of International Routes
Historic Peace Agreement Signed Between Rival Nations
City Council Votes to Increase Funding for Public Libraries
Renowned Chef Opens Vegan Restaurant in Downtown
Veteran Astronaut Set to Lead Next Moon Mission
Global Music Festival Raises Funds for Refugee Relief
Innovative Urban Farming Techniques Revolutionize City Life
Climate Summit Sets Ambitious Goals for Carbon Reduction
Documentary Film Exposing Corruption Premieres to Critical Acclaim
New Legislation Aims to Boost Small Businesses
Ancient Shipwreck Discovered Off the Coast of Sicily
World Health Organization Declares New Strain of Virus Contained
International Art Theft Ring Busted by Joint Task Force
Leading Economists Predict Global Recession in Next Year
Celebrity Fashion Designer Debuts Eco-Friendly Line
Major Breakthrough in Quantum Encryption Technology Announced
Wildlife Conservation Efforts Successfully Increase Tiger Population
Rare Astronomical Event Visible This Weekend
Nationwide Protests Demand Action on Climate Change
Revolutionary New Battery Design Could Transform Renewable Energy Storage
Record-Breaking Heatwave Strikes Southern Europe
Underground Water Reserves Discovered Beneath Sahara Desert
Virtual Reality Platform Takes Online Education to New Heights
Controversial New Policy Sparks Debate Over Internet Privacy
Youngest Nobel Laureate Awarded for Work in Peace Building
Sports League Implements New Rules to Protect Players from Concussions
Historic Church Undergoes Restoration to Preserve Cultural Heritage
Pioneering Surgery Gives New Hope to Heart Disease Patients
Wildfires Rage Across California, Thousands Evacuated
Tech Start-Up Revolutionizes Mobile Payment Systems
Pharmaceutical Company Faces Lawsuit Over Drug Side Effects
Renewable Energy Now Powers Entire Small Nation
Central Bank Raises Interest Rates in Surprise Move
Marine Biologists Discover New Species in the Deep Ocean
Global Conference on Women's Rights Concludes with Action Plan
Country Music Star Reveals Struggle with Mental Health in New Album
Massive Oil Spill Threatens Wildlife Along the Coast
Protests Erupt as Government Cuts Healthcare Funding
Archaeologists Uncover New Evidence of Ancient Civilization in Thailand
Fashion Week Highlights Sustainability in New Collections
New Strain of Wheat Could Increase Crop Yields Substantially
Scientists Link Air Pollution to Decline in Urban Wildlife
Innovative Community Program Cuts Urban Crime Rate
Next Generation of Smartphones Features Advanced AI Capabilities
Historical Drama Film Set to Break Box Office Records
Study Shows Increase in Cyber Attacks on Financial Institutions
New Yoga Trend Combines Traditional Practices with Modern Technology
Local Community Garden Doubles as Educational Facility for Schools
""".splitlines()

In [6]:
# Settings forwarded as keyword arguments to apply_to_column_autobatch,
# collected in one place so they are easy to tune.
# NOTE(review): the exact meaning of the ramp/reduce factors and their decays
# is defined by databonsai — confirm against its documentation before tuning.
autobatch_options = {
    "max_retries": 3,
    "batch_size": 5,
    "ramp_factor": 1.7,
    "max_batch_size": 20,
    "ramp_factor_decay": 0.98,
    "reduce_factor": 0.7,
    "reduce_factor_decay": 0.9,
    "start_idx": 0,
}

# Output list passed to the helper; it starts empty on every run of this cell
# and holds the predicted labels when printed in the following cell.
categories = []

# presumably `idx` is the position reached in `headlines2`, usable as
# `start_idx` to resume after a failure — TODO confirm against databonsai docs
idx = apply_to_column_autobatch(
    headlines2,
    categories,
    categorizer.categorize_batch,
    **autobatch_options,
)

Categorizing:   0%|          | 0/50 [00:00<?, ?row/s]

Retrying with smaller batch size: 4


Categorizing: 100%|██████████| 50/50 [00:04<00:00, 11.98row/s]


In [7]:
# Print the list of predicted labels collected by the categorization cell
# (presumably one label per headline, in the same order — verify).
print(categories)

['Politics', 'Tech', 'Others', 'Politics', 'Politics', 'Celebrities', 'Others', 'Others', 'Tech', 'Politics', 'Celebrities', 'Politics', 'Others', 'Others', 'Others', 'Others', 'Celebrities', 'Tech', 'Others', 'Weather', 'Politics', 'Tech', 'Weather', 'Tech', 'Tech', 'Politics', 'Celebrities', 'Sports', 'Others', 'Tech', 'Weather', 'Tech', 'Politics', 'Weather', 'Politics', 'Others', 'Celebrities', 'Weather', 'Politics', 'Others', 'Celebrities', 'Others', 'Others', 'Politics', 'Others', 'Tech', 'Celebrities', 'Tech', 'Others', 'Others']
