In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import display, Markdown, update_display
from openai import OpenAI
import socket
import ipaddress
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Load settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv("GROQ_API_KEY")

if not api_key:
    # Raise instead of calling exit(): exit() is a helper meant for interactive
    # shells, whereas an exception shows up in the cell output and stops
    # "Run All" before any later cell tries to use a missing key.
    raise RuntimeError("No API key found - please set GROQ_API_KEY in your environment or .env file.")
elif api_key.strip() != api_key:
    # Only a warning: the key is still used as-is by the cells below.
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [None]:
# Model identifier sent with every chat-completion request in this notebook.
MODEL = "llama3-70b-8192"

# The OpenAI client is pointed at Groq's OpenAI-compatible endpoint and
# authenticated with the Groq key loaded above.
openai = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=api_key,
)

In [4]:
def is_safe_url(url):
    """
    Return True only when `url` is an http(s) URL whose host maps exclusively
    to public addresses.

    The check is a guard against pointing the scraper at internal services.
    A URL is rejected when:
      * the scheme is not http/https, or there is no network location / host;
      * the host (an IP literal, or any address it resolves to, IPv4 or IPv6)
        is private, loopback, reserved, link-local, multicast or unspecified;
      * parsing or name resolution fails for any reason (fail closed).

    Args:
        url: The URL string to validate.

    Returns:
        bool: True if the URL passed every check, False otherwise.
    """

    def _is_public(ip):
        # An IPv4-mapped IPv6 address (::ffff:a.b.c.d) must be judged by the
        # IPv4 address it wraps, otherwise ::ffff:127.0.0.1 could slip through.
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            ip = mapped
        return not (
            ip.is_private
            or ip.is_loopback
            or ip.is_reserved
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_unspecified
        )

    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return False

        host = parsed.hostname
        if not host:
            # e.g. "http://:80/" has a netloc but no host component.
            return False

        try:
            # IP literals are checked directly, without touching the resolver.
            candidates = [ipaddress.ip_address(host)]
        except ValueError:
            # Host name: inspect every address it resolves to, not just the
            # first IPv4 record. Drop any IPv6 scope id ("%eth0") before parsing.
            candidates = [
                ipaddress.ip_address(info[4][0].split("%", 1)[0])
                for info in socket.getaddrinfo(host, None)
            ]

        return bool(candidates) and all(_is_public(ip) for ip in candidates)
    except Exception:
        # Deliberately broad: any parsing or resolution failure means "not safe".
        return False


class Website:
    """
    A utility class to represent a Website that we have scraped, using Selenium, with extracted links.

    Instance attributes:
        url:   the URL that was requested
        title: stripped <title> text, or "No title found"
        text:  visible body text, one fragment per line ("" if nothing was loaded)
        links: raw href values of every <a> tag, in document order (may be relative)
    """

    def __init__(self, url):
        """
        Load `url` in headless Chrome and extract its title, text and links.

        Args:
            url: Page to scrape; it must pass is_safe_url().

        Raises:
            ValueError: if is_safe_url() rejects `url`.

        A WebDriverException raised while loading the page is reported with
        print() and leaves the default (empty) title/text/links in place.
        """
        if not is_safe_url(url):
            raise ValueError("Invalid or unsafe URL")

        self.url = url
        self.title = "No title found"
        self.text = ""
        self.links = []

        # Setup Selenium
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-blink-features=AutomationControlled")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            driver.set_page_load_timeout(20)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Get title. `.string` is None when <title> is empty or contains
            # nested markup, so check it before calling strip(); otherwise the
            # default "No title found" set above is kept.
            title_tag = soup.title
            if title_tag is not None and title_tag.string is not None:
                self.title = title_tag.string.strip()

            # Remove irrelevant tags
            if soup.body:
                for tag in soup.body(["script", "style", "img", "input"]):
                    tag.decompose()
                self.text = soup.body.get_text(separator="\n", strip=True)

            # Extract every non-empty href (relative links are kept as-is)
            self.links = [a.get("href") for a in soup.find_all("a") if a.get("href")]

        except WebDriverException as e:
            print(f"Error loading page with Selenium: {e}")
        finally:
            # Always shut the browser down, even when loading or parsing failed.
            driver.quit()

    def get_contents(self):
        """Return the title and body text formatted as a plain-text block for prompts."""
        return f"Webpage Title:\n{self.title}\n\nWebpage Contents:\n{self.text}\n\n"


In [5]:
# Quick check of the scraper: load a sample site and print its title and text.
web_scrapper = Website("https://edwarddonner.com")
print(web_scrapper.get_contents())
# web_scrapper.links

Webpage Title:
Home - Edward Donner

Webpage Contents:
Skip to content
Home
Connect Four
Outsmart
An arena that pits LLMs against each other in a battle of diplomacy and deviousness
About
Posts
Well, hi there.
I’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (
very
amateur) and losing myself in
Hacker News
, nodding my head sagely to things I only half understand.
I’m the co-founder and CTO of
Nebula.io
. We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I’m previously the founder and CEO of AI startup untapt,
acquired in 2021
.
We work with groundbreaking, proprietary LLMs verticalized for talent, we’ve
patented
our matching model, and our award-winning platform has 

In [6]:
# System prompt for the link-selection request. The trailing example shows the
# model the exact response shape, so it must itself be valid JSON.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\nYou should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def get_links_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-relevant links.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of href strings).

    Returns:
        str: The instruction text followed by one link per line. Repeated
        links are listed once, in first-seen order, to keep the prompt short.
    """
    # dict.fromkeys drops duplicates while preserving the original order.
    unique_links = list(dict.fromkeys(website.links))
    user_prompt = f"Here is the list of links on the website of {website.url} - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n\n Links (some might be relative links):\n"
    user_prompt += "\n".join(unique_links)
    return user_prompt

In [7]:
# print(get_links_user_prompt(web_scrapper))

In [8]:
def get_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    Returns:
        The model's JSON reply decoded with json.loads.
    """
    page = Website(url)
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        # Request a JSON object so the reply can be decoded directly.
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [9]:
# get_links("https://anthropic.com")

In [10]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every link the model selected.

    Args:
        url: Landing page of the company website.

    Returns:
        str: "Landing page:" followed by the landing page contents, then one
        section per selected link (its type as a heading, then its contents).

    Links chosen by the model are untrusted: relative URLs are resolved
    against `url`, and entries that are malformed or rejected by Website
    (ValueError) are skipped with a message instead of aborting the run.
    """
    # Local import so this cell does not depend on edits to the imports cell.
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)

    # The model may omit "links" or return something that is not a list.
    link_entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(link_entries, list):
        link_entries = []

    for link in link_entries:
        if not isinstance(link, dict) or not isinstance(link.get("url"), str) or not link["url"]:
            print(f"Skipping malformed link entry: {link!r}")
            continue

        # Absolute URLs pass through urljoin unchanged; relative ones are resolved.
        link_url = urljoin(url, link["url"])
        try:
            page = Website(link_url)
        except ValueError as e:
            print(f"Skipping link {link_url}: {e}")
            continue

        result += f"\n\n{link.get('type', 'related page')}\n"
        result += page.get_contents()
    return result

In [11]:
# print(get_all_details("https://huggingface.co"))

In [12]:
# System prompt for the brochure-writing request. Each fragment ends with a
# space so that sentences are not fused when the pieces are concatenated.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure request.

    Args:
        company_name: Name of the company, inserted into the instructions.
        url: Landing page whose contents (plus selected linked pages) are appended.
        max_chars: Maximum length of the returned prompt in characters.
            Defaults to 5,000; pass None to disable truncation.

    Returns:
        str: The instructions followed by the scraped contents, cut to `max_chars`.
    """
    user_prompt = f"You are looking at a company called: {company_name}\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so None means "no limit".
    return user_prompt[:max_chars]

In [13]:
# get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [14]:
def create_brochure(company_name, url):
    """
    Generate a brochure for `company_name` from the site at `url` and render it.

    The model's markdown reply is displayed in the notebook; nothing is returned.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=chat_messages)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [15]:
# create_brochure("HuggingFace", "https://huggingface.co")

In [16]:
# Generate and render a brochure for a sample company website.
create_brochure("Gopal Info", "https://gopalinfo.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.gopalinfo.com/about-us/'}, {'type': 'portfolio', 'url': 'https://www.gopalinfo.com/portfolio/'}, {'type': 'contact', 'url': 'https://www.gopalinfo.com/contact-us/'}]}


**Gopal Info Brochure**
======================

**Your Trusted Partner in Digital Growth**
------------------------------------------

Gopal Info is a leading digital solutions company that offers expert services in graphic design, website development, digital marketing, SEO, and social media management. We help businesses thrive online with tailored strategies built for success.

**Our Mission**
--------------

To empower businesses by delivering innovative and impactful digital solutions, committed to excellence, creativity, and customer satisfaction.

**Our Vision**
-------------

To be a trusted leader in digital solutions, inspiring growth and innovation for businesses worldwide.

**Our Values**
-------------

* Client satisfaction
* Integrity
* Innovation
* Excellence in every project
* Transparency and reliability
* Strong partnerships

**Services**
------------

* Graphic Designing: Crafting eye-catching designs to elevate your brand
* UI/UX Design: Creating intuitive and engaging user experiences
* Branding Consulting: Building strong, memorable brands with expert guidance
* Web Designing: Creating stunning websites that capture your brand's essence
* Website Development: Building robust and scalable web solutions for your business
* Search Engine Optimization: Boosting your online visibility with proven SEO strategies
* Social Media Marketing: Driving engagement and growth with targeted social media strategies
* Google Ads: Maximizing reach and conversions with targeted Google Ads campaigns
* Mobile App Development: Developing innovative mobile apps that enhance user experience

**Technologies**
-------------

We utilize a diverse range of advanced technologies to deliver high-quality services, including website and mobile development to digital marketing.

**How We Work**
--------------

We follow an agile methodology to ensure your project's success. Our transparent and systematic approach prioritizes collaboration, creativity, and quality at every step, from initial consultation to final delivery.

**Our Team**
------------

Meet our Director & Project Manager, Pratik Mandaliya, who brings 4+ years of experience in project management and client communication. He ensures the company fulfills its promises of perfection, timely delivery, and creative work that stands out.

**Accomplishments**
----------------

* 4+ active clients
* 300+ projects done
* 70%+ success rate
* 0+ awards

**Get in Touch**
----------------

Address: 502 - Matrix, Makarba, Ahmedabad - 380015, India
Connect with us: [WhatsApp]( WhatsApp us)

Let's create something amazing together! Share your details, and we'll connect with you to bring your vision to life.