In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import ollama

In [2]:
load_dotenv(override=True)
MODEL = "llama3.2"


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-

In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/01/23/ll

In [9]:
import json
import logging
import pprint
#pprint.pprint(response)

import re

def extract_json_from_text(text):
    """
    Extract the first JSON object found in the text.
    """
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match:
        return match.group(0)
    return None

def get_links(url):
    website = Website(url)
    
    try:
        response = ollama.chat(
            model="llama3.2",
            messages=[
                {"role": "system", "content": link_system_prompt},
                {"role": "user", "content": get_links_user_prompt(website)}
            ]
        )

        result = response['message']['content']
       
        # Log the raw result for debugging
        logging.debug(f"Raw result: {result}")

       
        if isinstance(result, str):
            if not result.strip():
                logging.warning("Result string is empty.")
                return None

            json_text = extract_json_from_text(result)
            if not json_text:
                logging.warning("No JSON object found in the result string.")
                return None

            logging.debug(f"Extracted JSON string: {repr(json_text)}")

            try:
                return json.loads(json_text)
            except json.JSONDecodeError as e:
                logging.error(f"JSON decoding error: {e}")
                logging.debug(f"Problematic JSON string: {repr(json_text)}")
                return None
        
    except Exception as e:
        logging.exception("An unexpected error occurred in get_links.")
        return None



In [26]:
get_links("https://huggingface.co")

{'links': [{'type': 'About page', 'url': 'https://huggingface.co/'},
  {'type': 'Company page', 'url': 'https://huggingface.co/'},
  {'type': 'Careers/Jobs page',
   'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'Blog page', 'url': 'https://blog.huggingface.co/'},
  {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter handle', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn company page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [13]:
import requests

def is_url_reachable(url, timeout=5):
    try:
        response = requests.head(url, timeout=timeout)
        return response.status_code < 400
    except requests.RequestException:
        return False

In [11]:
def get_all_details(url):
    if is_url_reachable(url,5):
        result = "Landing page:\n"
        result += Website(url).get_contents()
        links = get_links(url)
        print("Found links:", links)
        for link in links["links"]:
            result += f"\n\n{link['type']}\n"
            result += Website(link["url"]).get_contents()
        return result

In [14]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'About page', 'url': 'https://huggingface.co/about'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
nvidia/parakeet-tdt-0.6b-v2
Updated
about 12 hours ago
•
167k
•
868
nari-labs/Dia-1.6B
Updated
2 days ago
•
173k
•
2.18k
Lightricks/LTX-Video
Updated
1 day ago
•
291k
•
1.49k
ACE-Step/ACE-Step-v1-3.5B
Updated
3 days ago
•
427
lodestones/Chroma
Updated
2 days ago
•
538
Browse 1M+ models
Spaces
Running
6.59k
6.59k
DeepSite
🐳
Generate any application with DeepSeek
Running
on
CPU Upgrade
618
618
Computer Agent
🖥
Interact with an AI agent to perform web tasks
Running
o

In [15]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."

In [100]:
def get_brochure_user_prompt(company_name, url):
    try:
        if is_url_reachable(url):
            web_content = get_all_details(url)[:5000] 
            user_prompt = f"You are looking at a company called: {company_name}\n"
            user_prompt += f"Use the name {company_name} clearly in the brochure.\n"
            user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
            user_prompt += f"\n\nReminder: the company name is {company_name}."
            #user_prompt += get_all_details(url)
            #user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
            user_prompt += web_content
            return user_prompt
    except requests.RequestException:
        return False

In [101]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co'}, {'type': 'Research Papers', 'url': 'https://huggingface.co/docs/transformers'}]}


False

In [102]:
import requests

def is_url_reachable1(url, timeout=5):
    try:
        response = requests.head(url, timeout=timeout)
        return response.status_code < 400
    except requests.RequestException:
        return False

In [103]:
def create_brochure(company_name, url):
    try:
        if is_url_reachable(url,5):
            response = ollama.chat(
                model="llama3.2",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
                  ]
            )
    
        result = response['message']['content']
        display(Markdown(result))
    except requests.RequestException:
        return False

In [104]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}]}


# HuggingFace: Empowering AI Innovation

[Image: A futuristic illustration of a brain with glowing neural connections]

At Hugging Face, we're building the future of Artificial Intelligence. Our platform is a collaborative space where machine learning practitioners, researchers, and developers come together to create, share, and apply AI models.

## What We Do

We provide an open-source foundation for machine learning tooling, enabling users to:

* Build and deploy models for text, image, video, audio, or 3D applications
* Host and collaborate on unlimited public models, datasets, and applications
* Accelerate their ML work with our optimized inference endpoints and computing solutions

## Our Community

With over 1 million+ models available, our community is a hub of innovation. We're proud to have partnered with leading organizations such as:

* Meta AI2
* Amazon AI
* Google AI
* Intel AI
* Microsoft AI
* Grammarly AI
* Writer AI

## Our Technologies

We've developed a range of cutting-edge technologies to support our platform, including:

* **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, and JAX
* **Diffusers**: State-of-the-art Diffusion models in PyTorch
* **Safetensors**: A safe way to store/distribute neural network weights
* **Hub Python Library**: A Python client to interact with our Hugging Face Hub

## Join Our Mission

Ready to accelerate your ML work? Explore our platform, sign up for a free account, and start building your portfolio. Our Open Source initiatives are always looking for contributors to help shape the future of AI.

### Get Started

* [Sign Up](#) for a free account
* [Explore Models](#) and datasets
* [Browse Spaces](#) for collaboration and deployment
* [Learn About Enterprise Solutions](#)

Join us in building a brighter future with AI.

In [105]:
def stream_brochure(company_name, url):
    if not is_url_reachable(url):
        print("❌ URL not reachable")
        return
    try:
        #if is_url_reachable(url,5):
         stream = ollama.chat(
            model="llama3.2",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
                ],
            stream=True
            )
    
       #result = response['message']['content']
       # display(Markdown(result))
    except requests.RequestException:
        return False
        
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    #for chunk in stream:
        #response += chunk.choices[0].delta.content or ''
        #response += chunk['message']['content'] or ''
        #response = response.replace("```","").replace("markdown", "")
        #update_display(Markdown(response), display_id=display_handle.display_id)

    for chunk in stream:
        content = chunk.get('message', {}).get('content', '')
        if content:
            response += content.replace("```", "")
            update_display(Markdown(response), display_id=display_handle.display_id)


In [106]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co/'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}]}


# HuggingFace: Building a Future of AI Collaboration

Welcome to Hugging Face, the premier platform for machine learning community collaboration. Our mission is to empower developers and researchers to build, discover, and share models, datasets, and applications that drive innovation in AI.

## About Us

At Hugging Face, we believe that AI should be accessible to everyone. That's why we've built a collaborative ecosystem where experts and enthusiasts can come together to create, learn, and grow. Our platform hosts over 1 million pre-trained models, 250k+ datasets, and thousands of applications.

## What We Offer

* **Models**: Browse our vast library of pre-trained models, including state-of-the-art Transformers, Diffusers, and more.
* **Datasets**: Access a wide range of high-quality datasets for various AI tasks, from text generation to computer vision.
* **Spaces**: Host and collaborate on unlimited public models, datasets, and applications with our powerful collaboration platform.
* **Compute**: Deploy your models on optimized inference endpoints or upgrade your spaces applications to leverage GPU computing.

## Our Community

Hugging Face has partnered with leading companies in AI research, including Meta, Google, Amazon, Intel, Microsoft, and Grammarly. Our community consists of over 50,000 organizations, with notable members like AI2 (non-profit), AI at Meta, Amazon, Google, Intel, and Microsoft.

## Our Technology

* **Transformers**: State-of-the-art machine learning toolkit for PyTorch, TensorFlow, and JAX.
* **Diffusers**: Advanced diffusion models in PyTorch.
* **Safetensors**: Safe way to store/distribute neural network weights.
* **Hub Python Library**: Python client to interact with the Hugging Face Hub.

## Join Our Community

Whether you're a researcher, developer, or enthusiast, we invite you to join our vibrant community. Share your work, learn from others, and accelerate your AI journey with Hugging Face.

### Get Started

* Sign up for free access to our platform.
* Explore our tutorials, documentation, and blog.
* Join our Discord community to connect with fellow users.

Hugging Face is more than just a platform – it's a movement. Let's build the future of AI together!