In [1]:
from typing import Optional, Dict, Any, List, Set
import time
from ollama import Client
import re
from dataclasses import dataclass

In [2]:
## Data-processing


@dataclass
class ProcessedText:
    """
    Store processed text-data
    """

    cleaned_text: str
    sentences: List[str]
    num_sentences: int


@dataclass
class NewsCategories:
    """Store available news categories"""

    CATEGORIES: Set[str] = frozenset(
        {
            "Health",
            "Science",
            "Television",
            "Travel",
            "Movies",
            "Food",
            "Well",
            "Fashion",
            "Technology",
            "Sports",
            "Your Money",
            "Theater",
            "Education",
            "Automobiles",
            "Global Business",
            "Books",
            "Art & Design",
            "Style",
            "Media",
            "Dance",
            "Real Estate",
            "Economy",
            "Opinion",
            "Music",
        }
    )


class TextPreprocessor:
    """
    Simplified version of RoBERTaPreprocessor focused on text cleaning and sentence splitting
    """

    def __init__(self) -> None:
        self.abbreviations = {"mr.", "mrs.", "dr.", "st.", "ave.", "prof."}

    def clean_text(self, text: str) -> str:
        """
        Clean text while preserving important punctuation and structure.
        """
        ## Replace multiple newlines/spaces with single space
        text = re.sub(r"\n+", " ", text)
        text = re.sub(r"\s+", " ", text)

        # Remove URLs and emails
        text = re.sub(r"http\S+|www\.\S+", "", text)
        text = re.sub(r"\b[\w-]+@[\w-]+[.][\w-]+", "", text)

        # Replace multiple white-spaces with a single space
        text = " ".join(text.split())

        # Normalize dashes to hyphen
        text = text.replace("—", "-").replace("–", "-")

        # Fix spacing around punctuation
        text = re.sub(r"\s+([.,!?;:])", r"\1", text)
        text = re.sub(r"\(\s+", "(", text)
        text = re.sub(r"\s+\)", ")", text)

        # Additional cleaning
        text = re.sub(r"[\u0080-\uFFFF]", "", text)  # Remove non-ASCII characters
        text = re.sub(r"\d+", "NUM", text)  # Replace numbers with "NUM"

        return text.strip()

    def text2sentences(self, text: str) -> List[str]:
        """
        Split text into sentences while handling common abbreviations
        """
        sentences, current = [], []
        words = text.split()

        for word in words:
            current.append(word)
            if word.lower() in self.abbreviations:
                continue
            if word.endswith((".", "!", "?")):
                sentences.append(" ".join(current))
                current = []

        if current:
            sentences.append(" ".join(current))

        return sentences

    def process_text(self, text: str) -> ProcessedText:
        """Clean & split into sentences"""
        cleaned_text = self.clean_text(text)
        sentences = self.text2sentences(cleaned_text)

        return ProcessedText(
            cleaned_text=cleaned_text,
            sentences=sentences,
            num_sentences=len(sentences),
        )

In [3]:
class NewsAnalyzer:
    def __init__(self, model_name: str = "llama2", host_ip: str = "http://localhost:11434") -> None:
        """
        Initialize the predictor with specific Ollama model.
        Args:
            model_name: Name of the Ollama model to use (default: llama2)
        """
        self.model = model_name
        self.client = Client(host=host_ip)
        self.preprocessor = TextPreprocessor()
        self.categories = NewsCategories

    def generate_prompt(
        self,
        processed_text: ProcessedText,
        abstract: Optional[str] = None,
    ) -> str:
        """
        Create a structure prompt for category generator
        Args:
            processed_text: ProcessedText object containing cleaned text and sentences
            abstract: Optional article abstract
        """
        ## If abstract is processed - use it; otherwise use the first few sentences
        if abstract:
            context = self.preprocessor.clean_text(abstract)

        else:
            ## Take first 3 sentences and upto 500 characters
            context_sentences = processed_text.sentences[:3]
            context = " ".join(context_sentences)
            if len(context) > 500:
                context = context[:500].rsplit(" ", 1)[0] + "..."
        categories_str = ", ".join(sorted(self.categories.CATEGORIES))
        prompt = f"""You are a news categorizer and a catchy headline generator. Create relevant topics that this article can be categorized. The current categories that we recognize are: {categories_str}. If you find that the article doesn't fit the category, categorize it as Novel Category.
        Article context: {context} Suggest the relevant category that captures the main point and suggest a headline that invokes reader's curiosity without being a click-bait. Response should be category and short headline within 10 words!
        Category: 
        Headline:"""
        return prompt

    def analyze_news(
        self, body: str, abstract: Optional[str] = None, temperature: float = 0.7
    ) -> Dict[Any, str]:
        """
        Predict a headline using the Ollama model.
        Args:
            body: The main text content
            abstract: Optional article abstract
            temperature: Controls randomness in generation (0.0 to 1.0)

        Returns:
            Dictionary containing the generated headline and metadata
        """
        try:
            ## Preprocessed text
            processed_text = self.preprocessor.process_text(body)

            ## Generate prompt:
            prompt = self.generate_prompt(processed_text, abstract)

            ## Generate response using Ollama client
            response = self.client.generate(
                model=self.model,
                prompt=prompt,
                # temperature=temperature,
                stream=False,
            )

            ## Parse response to extract category and headline
            response_text = response["response"].strip()
            category_line = ""
            headline_line = ""

            for line in response_text.split("\n"):
                if line.startswith("Category:"):
                    category_line = line.replace("Category:", "").strip()
                elif line.startswith("Headline:"):
                    headline_line = line.replace("Headline:", "").strip()

            return {
                "category": category_line,
                "headline": headline_line,
                "model": self.model,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "num_sentences_processed": processed_text.num_sentences,
            }

        except Exception as e:
            return {
                "error": f"Failed to generate headline: {str(e)}",
                "model": self.model,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            }

    def list_available_models(self) -> list:
        """List all available models in Ollama using direct API call."""
        import requests

        try:
            response = requests.get("http://localhost:11434/api/tags")
            print("API Response:", response.text)  # Debug print
            if response.status_code == 200:
                data = response.json()
                print("Parsed data:", data)  # Debug print
                if isinstance(data, dict) and "models" in data:
                    return [model["name"] for model in data["models"]]
                return []
            else:
                self.logger.error(
                    f"Failed to get models list. Status code: {response.status_code}"
                )
                return []
        except Exception as e:
            print(f"Error listing models: {str(e)}")
            return []


In [9]:
analyzer = NewsAnalyzer(model_name="qwen3:8b")

In [10]:
print("Available models:", analyzer.list_available_models())

API Response: {"models":[{"name":"qwen3:8b","model":"qwen3:8b","modified_at":"2025-05-04T18:07:43.79613168-05:00","size":5225387923,"digest":"e4b5fd7f8af048d3c02e0357274238a9e93da51936665599ccb957aa42bfe173","details":{"parent_model":"","format":"gguf","family":"qwen3","families":["qwen3"],"parameter_size":"8.2B","quantization_level":"Q4_K_M"}},{"name":"llama3.2:latest","model":"llama3.2:latest","modified_at":"2025-05-04T17:58:25.701054891-05:00","size":2019393189,"digest":"a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"3.2B","quantization_level":"Q4_K_M"}},{"name":"llama3.2:3b","model":"llama3.2:3b","modified_at":"2024-12-20T19:47:03.411473541-06:00","size":2019393189,"digest":"a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"3.2B","quantization_level

In [13]:
# Example article text - Takes 12.8 seconds for llama:3.2-3b and 
sample_abstract = """A killing at a Bay Area rapid-transit station has inspired Ryan Coogler's 
feature-film debut, a movie already honored at the Sundance and Cannes film festivals."""

sample_body = """OAKLAND — It had been nearly a year since Ryan Coogler last stood on the 
arrival platform on the upper-level of the Fruitvale Bay Area Rapid Transit Station, where 
22-year-old Oscar Grant III, unarmed and physically restrained, was shot in the back by a 
BART transit officer..."""

# Analyze news
result = analyzer.analyze_news(
    body=sample_body,
    abstract=sample_abstract,
)

In [16]:
result

{'category': 'Movies',
 'headline': "Bay Area Killing Sparks Ryan Coogler's Sundance-Honored Film",
 'model': 'qwen3:8b',
 'timestamp': '2025-05-04 18:13:31',
 'num_sentences_processed': 1}