In [153]:
import requests
import json
from IPython.display import Markdown, display, update_display

In [154]:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

def fetch_website_contents(url):
    response = requests.get(url,headers = headers)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)



def fetch_website_links(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]

In [155]:
system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [156]:
def user_prompt(url):
    user_prompt = f"""
Here is the list of links on the website {url} -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

"""

    links = fetch_website_links(url)
    user_prompt = user_prompt+"\n".join(links)
    return user_prompt

In [157]:
print(user_prompt("https://edwarddonner.com"))


Here is the list of links on the website https://edwarddonner.com -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
https://edwar

In [158]:
import requests

OLLAMA_API = 'http://localhost:11434/api/chat'
HEADERS = {"content-type": "application/json"}

def select_relevant_links(url):
    payload = {
        "model": "llama3.2",
        "stream": False,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt(url)}
        ],
        "response_format": {"type": "json_object"}  # Ask model to return JSON object
    }

    response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return {"links": []}

    data = response.json()
    
    # When using 'json_object', the model's output is already parsed as a dict
    # Some implementations might put it in 'message' -> 'content'
    links = data.get('message', {}).get('content')
    if isinstance(links, dict):
        return links
    else:
        # fallback if LLM returned string instead of dict
        try:
            import json
            return json.loads(links)
        except:
            return {"links": []}



In [159]:
select_relevant_links("https://edwarddonner.com")

{'links': [{'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'company page', 'url': 'https://edwarddonner.com/'},
  {'type': 'careers/jobs page', 'url': 'https://edwarddonner.com/posts/'}]}

In [160]:
def fetch_page_and_all_relevant_links(url):
    contents = fetch_website_contents(url)
    relevant_links = select_relevant_links(url)
    result = f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"
    for link in relevant_links['links']:
        result += f"\n\n### Link: {link['type']}\n"
        result += fetch_website_contents(link["url"])
    return result

In [162]:
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

In [163]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    user_prompt += fetch_page_and_all_relevant_links(url)
    return user_prompt

In [164]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

'\nYou are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages;\nuse this information to build a short brochure of the company in markdown without code blocks.\n\n\n## Landing Page:\n\nHugging Face ‚Äì The AI community building the future.\n\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-OCR\nUpdated\nabout 11 hours ago\n‚Ä¢\n623k\n‚Ä¢\n1.8k\nPaddlePaddle/PaddleOCR-VL\nUpdated\n1 day ago\n‚Ä¢\n14.8k\n‚Ä¢\n1.06k\ntencent/HunyuanWorld-Mirror\nUpdated\nabout 6 hours ago\n‚Ä¢\n5.94k\n‚Ä¢\n328\nkrea/krea-realtime-video\nUpdated\n5 days ago\n‚Ä¢\n1.04k\n‚Ä¢\n173\nQwen/Qwen3-VL-8B-Instruct\nUpdated\n10 days ago\n‚Ä¢\n262k\n‚Ä¢\n319\nBrowse 1M+ models\nSpaces

In [165]:
import requests
import json
from IPython.display import Markdown, display

def create_brochure(company_name, url):
    stream = requests.post(
        "http://localhost:11434/api/chat",
        json={
            "model": "llama3.2",
            "stream": True,
            "messages": [
                {"role": "system", "content": brochure_system_prompt},
                {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
            ]
        },
        stream=True  # Important for reading chunks
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)

    for chunk in stream.iter_lines():
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            if "message" in data and "content" in data["message"]:
                response += data["message"]["content"]
                display_handle.update(Markdown(response))


In [166]:
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face: Empowering the AI Community

Welcome to Hugging Face, the leading platform for machine learning collaboration, innovation, and community building. Our mission is to create a seamless and accessible environment where researchers, developers, and businesses can come together to develop, deploy, and scale AI models.

## About Us

Hugging Face was founded with a simple yet ambitious goal: to democratize access to AI technology. We believe that everyone should be able to build, use, and contribute to AI models without requiring extensive expertise or resources. Our platform provides the necessary tools, frameworks, and community support to facilitate collaboration, innovation, and progress in the field.

## What We Do

Hugging Face offers a comprehensive suite of products and services designed to empower users in various aspects of machine learning:

* **Models**: Access over 1 million pre-trained models across multiple modalities (text, image, video, audio) and domains.
* **Datasets**: Discover and utilize vast collections of datasets for any ML task, including text generation, sentiment analysis, and more.
* **Spaces**: Build, deploy, and scale AI applications with our intuitive platform, optimized for GPU acceleration.
* **Compute**: Leverage Hugging Face's scalable compute infrastructure to accelerate model training, inference, and deployment.

## Our Community

The Hugging Face community is a vibrant ecosystem of researchers, developers, and businesses working together to advance the state-of-the-art in machine learning. Join our forums, attend webinars, and participate in hackathons to connect with like-minded individuals and stay up-to-date on the latest developments.

## Careers & Opportunities

Join our team of innovators and collaborate with us to shape the future of AI. Explore available job openings, internships, and research opportunities at [link].

## Partnerships & Collaborations

Hugging Face collaborates with leading organizations in the tech industry, including Meta, Amazon, Google, Intel, Microsoft, and Grammarly, to advance the adoption of machine learning across various domains.

## Our Open Source Efforts

We are committed to building the foundation of ML tooling through our open-source initiatives:

* **Transformers**: A state-of-the-art AI model library for PyTorch.
* **Diffusers**: State-of-the-art diffusion models in PyTorch.
* **Safetensors**: Safe way to store/distribute neural network weights.

Stay tuned for updates on our latest projects and contributions to the open-source community.