# Retrieve Article Data 

In [None]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
import re

def clean_text(text):
    """Normalize scraped text to single-spaced ASCII.

    Runs of non-ASCII characters are replaced by a space, every run of
    whitespace (including newlines) is collapsed to one space, and leading
    and trailing whitespace is removed.

    Args:
        text: The raw string to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string, or ``''`` when ``text`` is empty/``None`` or
        contains nothing but whitespace and non-ASCII characters.
    """
    if not text:
        return ''
    # Replace non-ASCII first: the spaces this introduces must be collapsed
    # and stripped by the whitespace pass below, otherwise the result can
    # carry doubled or trailing spaces.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_article_data(url, output_path="article_data.json"):
    """Download a web page and extract structured article data from it.

    The page is fetched with browser-like headers, parsed with BeautifulSoup,
    and reduced to a dictionary that is both written to ``output_path`` as
    JSON and returned.

    Args:
        url: Address of the page to fetch. It is also used as the base when
            resolving relative link and image URLs.
        output_path: File the JSON result is written to. Defaults to
            ``"article_data.json"``.

    Returns:
        A dict with the keys ``url``, ``title``, ``main_content`` (list of
        text blocks), ``headings`` (list of ``{"level", "text"}``),
        ``metadata`` (``description``, ``keywords``, ``author``, ``date``),
        ``links`` (list of ``{"text", "url"}``) and ``images`` (list of
        ``{"src", "alt"}``); or ``None`` if fetching or processing failed
        (the error is printed, not raised).
    """
    # Define headers to mimic a browser and reduce blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }

    try:
        print("Fetching and analyzing webpage...")
        # Fetch the webpage with headers
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html_content = response.text

        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Initialize context dictionary for the local AI
        article_data = {
            "url": url,
            "title": "",
            "main_content": [],
            "headings": [],
            "metadata": {
                "description": "",
                "keywords": "",
                "author": "",
                "date": ""
            },
            "links": [],
            "images": []
        }

        # Extract title: prefer <title>, fall back to the first <h1>
        title_tag = soup.find('title') or soup.find('h1')
        article_data["title"] = clean_text(title_tag.get_text()) if title_tag else "No title found"

        # Extract metadata (common meta tags for description, keywords, author, date)
        metadata = article_data["metadata"]
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            content = meta.get('content', '')
            if name in ('description', 'keywords', 'author'):
                metadata[name] = clean_text(content)
            elif name in ('date', 'publish_date', 'publication_date'):
                metadata["date"] = clean_text(content)

        # Extract headings (h1, h2, h3) for structure and context
        for tag in soup.find_all(['h1', 'h2', 'h3']):
            text = clean_text(tag.get_text())
            if text:
                article_data["headings"].append({"level": tag.name, "text": text})

        # Extract main content: look for common article containers
        content_selectors = [
            'article',  # HTML5 article tag
            'div[class*="article"]', 'div[id*="article"]',
            'div[class*="content"]', 'div[id*="content"]',
            'div[class*="post"]', 'div[id*="post"]',
            'main'  # HTML5 main tag
        ]
        main_content = []
        # Matched containers are frequently nested inside each other, so the
        # same text block can be reached several times; keep the first
        # occurrence only.
        seen_texts = set()
        for selector in content_selectors:
            for element in soup.select(selector):
                for node in element.find_all(['p', 'div', 'span']):
                    text = clean_text(node.get_text())
                    # Filter short, irrelevant text
                    if len(text) > 50 and text not in seen_texts:
                        seen_texts.add(text)
                        main_content.append(text)
            if main_content:  # Stop at the first selector that yields content
                break

        # Fallback: if no content found, grab all paragraphs
        if not main_content:
            for p in soup.find_all('p'):
                text = clean_text(p.get_text())
                if len(text) > 50 and text not in seen_texts:
                    seen_texts.add(text)
                    main_content.append(text)

        article_data["main_content"] = main_content if main_content else ["No main content found"]

        # Extract links for context
        for a_tag in soup.find_all('a', href=True):
            link_text = clean_text(a_tag.get_text())
            link_url = urljoin(url, a_tag['href'])  # Resolve relative URLs
            if link_text and link_url:
                article_data["links"].append({"text": link_text, "url": link_url})

        # Extract images for context
        for img_tag in soup.find_all('img'):
            raw_src = img_tag.get('src', '')
            # Check the raw attribute: urljoin() with an empty src returns the
            # page URL itself, which would record a bogus image entry.
            if not raw_src:
                continue
            article_data["images"].append({
                "src": urljoin(url, raw_src),  # Resolve relative URLs
                "alt": clean_text(img_tag.get('alt', '')),
            })

        # Save the structured data as JSON for the local AI
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(article_data, file, indent=4, ensure_ascii=False)
        print(f"Article data extracted and saved to {output_path}")

        return article_data

    except requests.exceptions.RequestException as e:
        # Network problems, timeouts and non-2xx responses end up here
        print(f"Error fetching the URL: {e}")
        return None
    except Exception as e:
        # Best-effort scraper: report any parsing/writing failure and give up
        print(f"Error processing the page: {e}")
        return None

# Example usage
if __name__ == "__main__":
    # The Yahoo Finance front page serves as the sample page to scrape.
    url = "https://finance.yahoo.com/"
    result = extract_article_data(url)
    # extract_article_data returns None on failure, so a truthy result means
    # the data was extracted and the JSON file was written.
    if result:
        print("Extraction successful! Data saved to article_data.json")

Fetching and analyzing webpage...
Article data extracted and saved to article_data.json
Extraction successful! Data saved to article_data.json


# Analyze Results Using Local LLM

This time we'll use the `ollama` Python library rather than `smolagents`.

In [None]:
from ollama import chat
from ollama import ChatResponse
import json

def analyze_json(json_path='article_data.json', model='deepseek-r1:14b'):
    """Summarize the extracted article data with a local Ollama model.

    Loads the JSON file, embeds its contents in a summarization prompt,
    sends the prompt to the model as a single user message, and prints the
    model's reply.

    Args:
        json_path: Path of the JSON file to analyze. Defaults to
            ``'article_data.json'``.
        model: Name of the Ollama model to query. Defaults to
            ``'deepseek-r1:14b'``.

    Returns:
        None. The summary is printed to standard output.
    """
    # Load the JSON file. The extraction step writes it as UTF-8 without
    # ASCII-escaping, so read it as UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Prepare a prompt for the model with the JSON data
    prompt = f"""
    I have a JSON file containing financial news data from Yahoo Finance. Please analyze the content and provide a general summary of the key points, including:
    - Main themes or topics in the news.
    - Notable stock ticker changes.
    - Any significant events or announcements.
    - General sentiment or implications for the market.

    Here is the JSON data:
    {json.dumps(data, indent=2)}
    """

    # Send the prompt to the Ollama model
    response: ChatResponse = chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ]
    )

    # Print the model's summary
    print("Summary of the JSON Data:")
    print(response['message']['content'])

# Run the function
if __name__ == "__main__":
    # Reads article_data.json, sends it to the Ollama model and prints the
    # model's summary.
    analyze_json()