1. Get contents and links from the website
2. use llm to select the relevant links
3. get the detailed contents from the website and the links. 
5. use the content to create a brochure with llm

In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
huggingface = Website("https://huggingface.co")
huggingface.title

'Hugging Face – The AI community building the future.'

In [8]:
print(huggingface.get_contents()[:1000]) # show the first 1000 characters of the content

Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
NEW
Get started with Inference in seconds 🚀
Reachy Mini: The Open Robot for AI Builders
Welcome Cohere on the Hub 🔥
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
Qwen/Qwen3-Coder-480B-A35B-Instruct
Updated
5 days ago
•
14.4k
•
861
zai-org/GLM-4.5
Updated
about 23 hours ago
•
919
•
500
bosonai/higgs-audio-v2-generation-3B-base
Updated
about 16 hours ago
•
82.5k
•
423
tencent/HunyuanWorld-1
Updated
about 21 hours ago
•
5.43k
•
395
Qwen/Qwen3-235B-A22B-Instruct-2507
Updated
about 2 hours ago
•
18.5k
•
545
Browse 1M+ models
Spaces
Running
11.1k
11.1k
DeepSite v2
🐳
Generate any application with DeepSeek
Running
on
Zero
204
204
Higgs Audio Demo
🎤
Higgs Audio Demo
Run

In [9]:
huggingface.links[:10]  # show first 10 links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 'inference/get-started']

In [10]:
## system prompt for link selection
system_prompt = "You are an assistant that analyzes several links from a website and selects the most relevant ones \
    for creating a brochure about the company. Only select links that are likely to contain useful information \
    about the company's culture, customers, products, services, and careers.\n"
system_prompt+="Response should be in JSON format.\n "
system_prompt+="If /about, /careers are 2 links from https://www.huggingface.co then the response should be:\n"
system_prompt+="""{\
    "links":[
        {"type": "about", "url": "https://www.huggingface.co/about"},\
        {"type": "careers", "url": "https://www.huggingface.co/careers"}
    ]
}"""

In [36]:
def get_links_user_prompt(website: Website):
    user_prompt = f'here are some links from the website {website.url} -'
    user_prompt += f"You should decide which of these links are relevant for creating a brochure about the company,\
         respond with the full https URL in JSON format.\n"
    user_prompt += "Links(some might be relative links):\n"
    user_prompt += "\n".join(website.links)  # limit to first 50 links to avoid too much text
    return user_prompt

In [37]:
print(get_links_user_prompt(huggingface))

here are some links from the website https://huggingface.co -You should decide which of these links are relevant for creating a brochure about the company,         respond with the full https URL in JSON format.
Links(some might be relative links):
/
/models
/datasets
/spaces
/docs
/enterprise
/pricing
/login
/join
inference/get-started
/spaces
/models
/Qwen/Qwen3-Coder-480B-A35B-Instruct
/zai-org/GLM-4.5
/bosonai/higgs-audio-v2-generation-3B-base
/tencent/HunyuanWorld-1
/Qwen/Qwen3-235B-A22B-Instruct-2507
/models
/spaces/enzostvs/deepsite
/spaces/smola/higgs_audio_v2
/spaces/Qwen/Qwen3-Coder-WebDev
/spaces/zumjoy/Multi-Style_Video-to-Anime_Generator
/spaces/black-forest-labs/FLUX.1-Kontext-Dev
/spaces
/datasets/fka/awesome-chatgpt-prompts
/datasets/interstellarninja/hermes_reasoning_tool_use
/datasets/NousResearch/Hermes-3-Dataset
/datasets/MegaScience/MegaScience
/datasets/microsoft/rStar-Coder
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enter

In [42]:
def get_links(url):
    # user_prompt = get_links_user_prompt(website)
    website = Website(url)  # limit to first 50 links to avoid too much text
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )
    content = response.choices[0].message.content
    return json.loads(content)

In [43]:
get_links("https://www.huggingface.co")

{'links': [{'type': 'about', 'url': 'https://www.huggingface.co'},
  {'type': 'careers', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog', 'url': 'https://www.huggingface.co/blog'},
  {'type': 'enterprise', 'url': 'https://www.huggingface.co/enterprise'},
  {'type': 'pricing', 'url': 'https://www.huggingface.co/pricing'},
  {'type': 'models', 'url': 'https://www.huggingface.co/models'},
  {'type': 'datasets', 'url': 'https://www.huggingface.co/datasets'},
  {'type': 'spaces', 'url': 'https://www.huggingface.co/spaces'},
  {'type': 'docs', 'url': 'https://www.huggingface.co/docs'}]}