In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/08/06/outsmart/',
 'https://edwarddonner.com/2024/08/06/outsmart/',
 'https://edwarddonner.com/2024/06/26/choosing-the-right-llm-resources/

## First step: Have GPT-4o-mini figure out which links are relevant

### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON.  
It should decide which links are relevant, and replace relative links such as "/about" with "https://company.com/about".  
We will use "one shot prompting" in which we provide an example of how it should respond in the prompt.

This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!

Sidenote: there is a more advanced technique called "Structured Outputs" in which we require the model to respond according to a spec. We cover this technique in Week 8 during our autonomous Agentic AI project.

In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/
https://edwarddonner

In [9]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [10]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/meta-llama/Llama-3.3-70B-Instruct',
 '/tencent/HunyuanVideo',
 '/Datou1111/shou_xin',
 '/black-forest-labs/FLUX.1-dev',
 '/Qwen/QwQ-32B-Preview',
 '/models',
 '/spaces/JeffreyXiang/TRELLIS',
 '/spaces/ginipick/FLUXllama',
 '/spaces/multimodalart/flux-style-shaping',
 '/spaces/Kwai-Kolors/Kolors-Virtual-Try-On',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces',
 '/datasets/HuggingFaceFW/fineweb-2',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/CohereForAI/Global-MMLU',
 '/datasets/O1-OPEN/OpenO1-SFT',
 '/datasets/amphora/QwQ-LongCoT-130K',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs

In [11]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community forum', 'url': 'https://discuss.huggingface.co'},
  {'type': 'company profile on LinkedIn',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}

## Second step: make the brochure!

Assemble all the details into another prompt to GPT4-o

In [12]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [13]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/company'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'status page', 'url': 'https://status.huggingface.co'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Trending on
this week
Models
meta-llama/L

In [14]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [15]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [16]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'join page', 'url': 'https://huggingface.co/join'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nTrending on\nthis week\nModels\nmeta-llama/Llama-3.3-70B-Instruct\nUpdated\n3 days ago\n•\n102k\n•\n958\ntencent/HunyuanVideo\nUpdated\n6 days ago\n•\n3.73k\n•\n989\nDatou1111/shou_xin\nUpdated\n4 days ago\n•\n7.84k\n•\n313\nblack-forest-labs/FLUX.1-dev\nUpdated\nAug 16\n•\n1.38M\n•\n7.23k\nQwen/QwQ-32B-Preview\nUpdated\n14 days ago\n•\n92.8k\n•\n1.27k\nBrowse 400k+ models\nSpaces\nRunning\non\nZero\n1.12k\n🏢\nTRELLIS\nScalable and Versatile 3D Generation from ima

In [17]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [18]:
create_brochure("HuggingFace", "https://huggingface.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.com'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.com/enterprise'}, {'type': 'blog page', 'url': 'https://huggingface.com/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


```markdown
# Welcome to Hugging Face

### **About Us**
Hugging Face is where the AI community comes together to build the future. Our platform empowers collaboration on models, datasets, and applications, making it easier for machine learning professionals, researchers, and enthusiasts to create, discover, and exchange ideas.

### **Our Community**
Join a thriving community of over 50,000 organizations, including industry leaders like Meta, Amazon Web Services, Google, Microsoft, and many more. Together, we are pushing the boundaries of what's possible in the realm of Artificial Intelligence.

### **What We Offer**
- **Models**: Access to a growing library of over 400,000 models, such as the latest advancements in natural language processing, computer vision, and generative models. Recent trends include cutting-edge models like the `meta-llama/Llama-3.3-70B-Instruct` and various creative applications.
  
- **Datasets**: Browse and utilize over 100,000 datasets tailored for specific ML tasks. Our community-curated datasets support enhanced learning and development for diverse applications.

- **Spaces**: Create and share applications with ease. Our innovative Spaces platform allows users to collaboratively build and showcase applications, from image generation to virtual try-ons.

### **Enterprise Solutions**
For organizations looking to scale their machine learning efforts, we offer:
- Advanced enterprise features including dedicated support, access controls, and enhanced security.
- Flexible computing solutions to support varied workloads, starting at just $0.60/hour for GPU services.

### **Careers at Hugging Face**
At Hugging Face, we cultivate a vibrant company culture that champions collaboration, innovation, and diversity. We are always on the lookout for passionate individuals to join our mission of democratizing AI. Explore exciting career opportunities and contribute to groundbreaking work that shapes the AI landscape.

### **Join Us**
Transform the AI landscape with us. Whether you're a researcher, developer, an enterprise looking to innovate, or someone passionate about artificial intelligence, Hugging Face offers you the tools and community to thrive.

- **[Sign Up](#)** to start collaborating or [explore our careers](#) to join our team!
  
### **Stay Connected**
- Follow us on **[GitHub](#)**, **[Twitter](#)**, **[LinkedIn](#)**, and **[Discord](#)** to stay updated with the latest news and developments in AI.

---

Together, let’s build the future of AI!
```


## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from OpenAI,
with the familiar typewriter animation

In [19]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [20]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}



# Hugging Face Brochure

## Company Overview

**Hugging Face** is a vibrant AI community focused on building the future of machine learning (ML) and artificial intelligence (AI). The mission is to democratize access to quality ML tools and frameworks, fostering collaboration among researchers, developers, and organizations worldwide.

**Website:** [huggingface.co](https://huggingface.co)

---

## Our Offerings

- **Models:** Explore and collaborate on over 400K models, including state-of-the-art technology across various modalities: text, image, video, audio, and 3D.
- **Datasets:** Access and share 100K+ datasets for various tasks in ML.
- **Spaces:** Utilize over 150K applications for deploying machine learning projects.
- **Enterprise Solutions:** Tailored offerings for organizations including enhanced security, dedicated support, and compute solutions.

### Example Models
- meta-llama/Llama-3.3-70B-Instruct
- tencent/HunyuanVideo
- black-forest-labs/FLUX.1-dev

---

## Who We Serve

Hugging Face caters to a diverse audience comprising over 50,000 organizations, including giants like:
- **Meta**
- **Google**
- **Microsoft**
- **Amazon Web Services**
  
Our community ranks among the most innovative minds in AI and ML, contributing consistently to advancements in the field.

---

## Company Culture

At Hugging Face, collaboration is key. We believe in a culture of openness and inclusivity where every contribution is valued. Our remote-friendly workforce thrives in a creative environment that encourages innovation and personal growth. We value each team member's unique perspective and promote an atmosphere of continuous learning.

### Join Us
We are always looking for passionate individuals to join our mission. Check out our [careers page](https://huggingface.co/jobs) for current openings and become a part of shaping the future of AI and ML.

---

## Why Choose Hugging Face?

- **Community-Driven:** We actively engage with the AI community to refine and elevate our offerings.
- **Open Source Philosophy:** Our commitment to open-source tools empowers developers and researchers to contribute and build on our technologies easily.
- **Cutting-Edge Tools:** Our state-of-the-art ML tooling includes Transformers, Diffusers, and more, ensuring access to the best resources for modern AI development.

For more details about our company, values, or career opportunities, please visit our website or contact us directly.

---

**Contact Us:**  
For press inquiries, reach out via our website or email our team directly.

Find us on social media:  
[GitHub](https://github.com/huggingface) | [Twitter](https://twitter.com/huggingface) | [LinkedIn](https://linkedin.com/company/hugging-face) | [Discord](https://discord.gg/huggingface)  

Together, let’s build a future where AI is accessible and useful to everyone!



In [21]:
# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:

stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## Welcome to Hugging Face

**Hugging Face** is more than just an AI company; we are a vibrant community dedicated to building the future of machine learning. Our platform serves as a collaborative space where individuals and organizations can share models, datasets, and applications, making AI accessible to all.

### Our Vision
Democratizing cutting-edge machine learning through collaboration, innovation, and open-source tools. We believe that together, we can accelerate advancements in AI technology and create a more inclusive future.

---

## What We Offer

### Extensive Model Library
With over **400,000 models** to explore, users can find state-of-the-art solutions tailored to their needs. From text to 3D modeling, our diverse repository is continually updated with contributions from around the globe.

### Comprehensive Datasets
Explore our catalog of over **100,000 datasets** that empower researchers and developers alike to train and enhance their AI applications.

### Collaborative Spaces
**Spaces** on Hugging Face provide a platform to host and collaborate on public models, datasets, and applications, fostering a spirit of collective intelligence and creativity.

### Enterprise Solutions
For businesses, we offer enterprise-grade services that include dedicated support, enhanced security, and customized access controls. Join over **50,000 organizations**, including industry leaders like Google, Microsoft, and Amazon, who trust us for their machine learning needs.

---

## Company Culture

At Hugging Face, we pride ourselves on a culture of openness, collaboration, and continuous learning. Our team of **224 members** is passionate about empowering individuals through AI and fostering a supportive environment where every voice is heard. We encourage innovation and growth, whether you're new to AI or an experienced professional.

### Join Us
Are you ready to be a part of our mission? We invite you to explore exciting career opportunities within our team. Our positions offer a chance to work with cutting-edge technology in a collaborative atmosphere committed to inclusivity and employee well-being.

---

## Connect with Us

Stay up-to-date with our latest developments and community contributions:
- [GitHub](https://github.com/huggingface)
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://linkedin.com/company/huggingface)
- [Discord](https://discord.gg/huggingface)

For any inquiries, please reach out to our dedicated team through our [contact page](https://huggingface.co/contact).

---

### Join the Hugging Face Community!
Become a part of the AI revolution today. Together, let's build a future where machine learning is approachable and beneficial for everyone!