In [1]:
import os
import requests
import json
import socket
import ipaddress
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from IPython.display import display, Markdown, update_display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
load_dotenv(override=True)
API_KEY = os.getenv("GROQ_API_KEY")
BASE_URL = "https://api.groq.com/openai/v1"

if not API_KEY:
    print("No API_Key found, Please set the API_KEY.")
    exit(1)
elif API_KEY.strip() != API_KEY:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [3]:
MODEL = "llama3-70b-8192"
groq_client = OpenAI(api_key=API_KEY, base_url =BASE_URL)
# ollama_with_openai = OpenAI(api_key = "ollama", base_url = "http://localhost:11434/v1")

## 🌐 2. Website Content Extractor
This class retrieves the  following things from th website using `requests` or Selenium:
`- Website Title` ,
`- Website Content` ,
`- Website Images` ,
`- Website Videos` ,
`- Website Links` ,
and can be added more...

In [4]:
"""
Explanation of this code: https://chatgpt.com/share/686f8aed-a210-8007-970d-37906975fa4f
"""


def is_safe_url(url):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ["http", "https"] or parsed.netloc == "":
            return False

        host = parsed.hostname
        ip = ipaddress.ip_address(socket.gethostbyname(host))
        if ip.is_private or ip.is_loopback or ip.is_reserved or ip.is_link_local:
            return False
    except Exception:
        return False
    return True


class WebScraper:
    """
    A utility class to represent a Website that we have scraped, using Selenium, with extracted links.
    """

    def __init__(self, url):
        if not is_safe_url(url):
            raise ValueError("Invalid or unsafe URL")

        self.url = url
        self.title = "No title found"
        self.text = ""
        self.links = []
        self.images = []
        self.files = []
        self.tables = []

        # Setup Selenium
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-blink-features=AutomationControlled")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            driver.set_page_load_timeout(60)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Get title
            self.title = soup.title.string.strip() if soup.title else "No title found"

            # Remove irrelevant tags
            if soup.body:
                for tag in soup.body(["script", "style", "input"]):
                    tag.decompose()
                self.text = soup.body.get_text(strip=True)

            # Extract all Images
            all_images = [img.get("src") for img in soup.find_all("img") if img.get("src")]
            self.images = all_images

            # Extract all valid links
            all_links = [a.get("href") for a in soup.find_all("a") if a.get("href") and is_safe_url(a.get("href"))]
            self.links = all_links

            # Extract all tables
            all_tables = [table for table in soup.find_all("table")]
            self.tables = all_tables


        except WebDriverException as e:
            print(f"Error loading page with Selenium: {e}")
        finally:
            driver.quit()

    def get_contents(self):
        return f"-> Webpage Title:\n{self.title}\n\n\n-> Webpage Contents (limited text displayed up to 1000 characters):\n{self.text[:1000]}\n\n\n-> Links (limited to 20 links displayed):\n{self.links[:20]}\n\n\n-> Images:\n{self.images}\n\n\n-> Tables:\n{self.tables}\n\n\n"


In [5]:
website = WebScraper("https://www.microwebtec.com/")
print(website.get_contents())
# website.links


-> Webpage Title:
Full stack Development Company - Microweb Software Pvt Ltd


-> Webpage Contents (limited text displayed up to 1000 characters):
HomeAbout MicrowebCasesServicesTechnologyTechnology SubCloud Application Development ServicesAzure DevOps ServicesAI & ML ServicesShopify DevelopmentGolang Development ServicesDevOps Consulting ServicesWebflow DevelopmentBusiness TransformationLaravel Application DevelopmentSymfony Web DevelopmentNode.js DevelopmentAngularJs Web Development ServicesRuby on Rails Application DevelopmentMicrosoft DevelopmentMobile Application DevelopmentIoT and Embedded Systems & Smart SolutionsCloud TechnologyReactJS DevelopmentDrupal Web Development Services2D and 3D Video AnimationUI & UX DesignWeb DevelopmentEnterprise SolutionsDigital MarketingSoftware Outsource to IndiaGraphic DesigningeCommerce DevelopmentWordPress DevelopmentWooCommerce DevelopmentShopify DevelopmentPython DevelopmentGet in touchGet in touchWe Build Brilliance!WhoMicroweb software spec

In [6]:
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
)
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def link_user_prompt_for(website):
    user_prompt = (
        f"Here is the list of links on the website of {website.url} - please decide which of these are relevant web links for a brochure about the company, "
        f"respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n\n"
        "Links (some might be relative links):\n"
    )
    user_prompt += "\n".join(website.links)
    return user_prompt


In [7]:
# print(link_user_prompt_for(website))

In [8]:
def get_links(url):
    website= WebScraper(url)
    response = groq_client.chat.completions.create(
        model = MODEL,
        messages = [
            {"role": "system","content": link_system_prompt},
            {"role": "user", "content": link_user_prompt_for(website)}
        ],
        response_format = {"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [9]:
get_links("https://www.microwebtec.com")

{'links': [{'type': 'about page', 'url': 'https://www.microwebtec.com/about/'},
  {'type': 'contact page', 'url': 'https://www.microwebtec.com/contact'},
  {'type': 'services page', 'url': 'https://www.microwebtec.com/services/'},
  {'type': 'cases page', 'url': 'https://www.microwebtec.com/cases/'}]}

In [10]:
def get_all_details(url):
    result = "Landing page:\n"
    result += WebScraper(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += WebScraper(link["url"]).get_contents()
    return result

In [11]:
print(get_all_details("https://www.microwebtec.com"))

Found links: {'links': [{'type': 'about page', 'url': 'https://www.microwebtec.com/about/'}, {'type': 'contact page', 'url': 'https://www.microwebtec.com/contact'}]}
Landing page:
-> Webpage Title:
Full stack Development Company - Microweb Software Pvt Ltd


-> Webpage Contents (limited text displayed up to 1000 characters):
HomeAbout MicrowebCasesServicesTechnologyTechnology SubCloud Application Development ServicesAzure DevOps ServicesAI & ML ServicesShopify DevelopmentGolang Development ServicesDevOps Consulting ServicesWebflow DevelopmentBusiness TransformationLaravel Application DevelopmentSymfony Web DevelopmentNode.js DevelopmentAngularJs Web Development ServicesRuby on Rails Application DevelopmentMicrosoft DevelopmentMobile Application DevelopmentIoT and Embedded Systems & Smart SolutionsCloud TechnologyReactJS DevelopmentDrupal Web Development Services2D and 3D Video AnimationUI & UX DesignWeb DevelopmentEnterprise SolutionsDigital MarketingSoftware Outsource to IndiaGraphic 

In [12]:
brochure_system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short, compelling brochure about the company. "
    "Your audience includes prospective customers, investors, and potential recruits. "
    "Respond in clear, well-formatted Markdown. "
    "Include information about the company's mission, products or services, culture and values, key customers or partners, and careers/jobs if that information is available."
)

def brochure_user_prompt(company_name, url):
    content = get_all_details(url)
    content = content[:5_000]  # Truncate content to 5,000 characters

    user_prompt = (
        f"You are looking at a company called: **{company_name}**\n\n"
        f"Below is the content gathered from the company's landing page and other relevant subpages (such as About, Careers, and Press).\n"
        f"Use this content to generate a **concise, informative brochure** in **Markdown format** for prospective **customers, investors, and potential recruits**.\n\n"
        f"The brochure should aim to:\n"
        f"- Describe what the company does\n"
        f"- Highlight company culture and values (if available)\n"
        f"- Mention notable customers or partners\n"
        f"- Include a summary of career opportunities or team info if relevant\n\n"
        f"### Company Website Content:\n\n"
        f"{content}"
    )
    return user_prompt


In [13]:
# brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [14]:
def create_brochure(company_name, url):
    response = groq_client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [15]:
create_brochure("Microweb Software", "https://www.microwebtech.com")

Error loading page with Selenium: Message: unknown error: net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH
  (Session info: chrome=138.0.7204.101)
Stacktrace:
	GetHandleVerifier [0x0xb71a33+62339]
	GetHandleVerifier [0x0xb71a74+62404]
	(No symbol) [0x0x9b2123]
	(No symbol) [0x0x9af85b]
	(No symbol) [0x0x9a30d2]
	(No symbol) [0x0x9a4b05]
	(No symbol) [0x0x9a3368]
	(No symbol) [0x0x9a2ea3]
	(No symbol) [0x0x9a2bb1]
	(No symbol) [0x0x9a0b54]
	(No symbol) [0x0x9a14fb]
	(No symbol) [0x0x9b5b4e]
	(No symbol) [0x0xa41367]
	(No symbol) [0x0xa1f3bc]
	(No symbol) [0x0xa407a3]
	(No symbol) [0x0xa1f1b6]
	(No symbol) [0x0x9ee7a2]
	(No symbol) [0x0x9ef644]
	GetHandleVerifier [0x0xde65c3+2637587]
	GetHandleVerifier [0x0xde19ca+2618138]
	GetHandleVerifier [0x0xb984aa+220666]
	GetHandleVerifier [0x0xb888d8+156200]
	GetHandleVerifier [0x0xb8f06d+182717]
	GetHandleVerifier [0x0xb79978+94920]
	GetHandleVerifier [0x0xb79b02+95314]
	GetHandleVerifier [0x0xb64c4a+9626]
	BaseThreadInitThunk [0x0x76fd5d49+25]
	RtlIniti

**Microweb Software Brochure**
============================

**About Us**
------------

Microweb Software is a technology company that provides innovative software solutions to transform businesses. Our mission is to empower organizations to achieve their full potential by delivering cutting-edge technology that simplifies complex processes and improves efficiency.

**Products and Services**
-------------------------

We offer a range of products and services designed to meet the unique needs of our clients. Our solutions are tailored to help businesses streamline operations, enhance customer experience, and drive growth.

### Key Features:

* Customizable software solutions
* Expert consulting services
* Innovative technology integrations

**Company Culture and Values**
-----------------------------

At Microweb Software, we prioritize a culture of innovation, collaboration, and customer-centricity. Our values include:

* Excellence in everything we do
* Integrity and trust in all our interactions
* Innovation and creativity in our solutions
* Collaboration and teamwork across all departments

**Notable Customers and Partners**
---------------------------------

We are proud to work with a diverse range of customers and partners across various industries, including [list of notable customers or partners, if available].

**Career Opportunities**
----------------------

Join our dynamic team of innovators and thought leaders. We offer a range of career opportunities in software development, consulting, sales, and more. Our values-driven culture and commitment to excellence make Microweb Software an exciting place to grow your career.

### Current Openings:

[Insert links to current job openings or a brief summary of available positions]

**Get in Touch**
-------------

Ready to learn more about Microweb Software or explore partnership opportunities? Contact us at [insert contact information].

Note: As there was limited content available on the company website, some sections may not have been populated. This brochure provides a general outline of the company's mission, products, culture, and career opportunities.

In [16]:
create_brochure("Gopal Info", "https://www.gopalinfo.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.gopalinfo.com/about-us/'}, {'type': 'portfolio page', 'url': 'https://www.gopalinfo.com/portfolio/'}, {'type': 'contact page', 'url': 'https://www.gopalinfo.com/contact-us/'}]}


**Gopal Info: Your Trusted Partner in Digital Growth**
==============================================

### About Us

Gopal Info is a digital solutions company that provides expert services in graphic design, website development, digital marketing, SEO, and social media management. Our mission is to help businesses thrive online with tailored strategies built for success.

### Services

We offer a range of services designed to elevate your brand's online presence:

* **Graphic Designing**: Crafting eye-catching designs to elevate your brand
* **UI/UX Design**: Creating intuitive and engaging user experiences
* **Branding Consultation**: Building strong, memorable brands with expert guidance
* **Web Designing**: Creating stunning websites that capture your brand's essence
* **Web Development**: Developing robust and scalable websites
* **Search Engine Optimization**: Boosting your online visibility with proven SEO strategies
* **Social Media Marketing**: Driving engagement and growth with targeted social media strategies
* **Google Ads**: Maximizing reach and conversions with targeted Google Ads campaigns
* **Mobile App Development**: Developing innovative mobile applications

### Culture and Values

At Gopal Info, we value innovation, creativity, and collaboration. Our team of experts is dedicated to delivering exceptional results and providing outstanding customer service.

### Key Customers and Partners

We have had the pleasure of working with a diverse range of clients across various industries. Our partners and customers trust us to deliver high-quality solutions that meet their unique needs.

### Careers

Join our team of innovators and creatives! We're always looking for talented individuals to join our team. Check out our careers page for available positions and opportunities to grow with us.

Learn more about us at [www.gopalinfo.com](http://www.gopalinfo.com)