# Company Brochure

Create a tool that scrapes data from a given company's URL and creates a summarized brochure for the company.
- Extract data from the landing page and other relevant links
- Use an LLM to extract the relevant links; do not include links such as `Privacy Policy` or `Terms and Conditions`, or links that contain no relevant information.
- Combine the content from each relevant link and use an LLM to generate the company brochure.

In [None]:
#imports
import json
import requests
from IPython.display import display, Markdown # type: ignore
from requests.exceptions import RequestException
from requests.models import Response
from requests import codes
from typing import Optional, List, Dict, Any
from schemas.ollama.link import Link, LinksResponse
from pydantic import ValidationError
from bs4 import BeautifulSoup, Tag

ImportError: cannot import name 'Markdown' from 'IPython' (/opt/miniconda3/envs/llms/lib/python3.11/site-packages/IPython/__init__.py)

In [None]:
# constants
# Chat endpoint of the Ollama server that every model request is posted to.
OLLAMA_API = "http://localhost:11434/api/chat"
# Headers sent with each request to OLLAMA_API; the request body is JSON.
HEADERS = {"Content-Type": "application/json"}
# Name of the model placed in the "model" field of each request payload.
MODEL = "llama3.1:8b"

#### Step 1: Prompt the user for the company name and website URL

In [None]:
# company_name = input("Enter the company name: ")
# website_url = input("Enter the company's website URL: ")
# Hard-coded inputs; swap in the input() calls above to prompt interactively.
company_name = "Faizan Pervaiz"
website_url = "https://faizanpervaiz.com/"

# Both values are used by the later steps, so fail fast when either is empty.
if not company_name or not website_url:
    raise ValueError("Company name and website URL cannot be empty.")

# Require a real http(s) scheme: a bare "http" prefix check would also accept
# strings such as "httpfoo", which the error message below promises to reject.
if not website_url.startswith(("http://", "https://")):
    raise ValueError("Please enter a valid URL starting with http or https.")

#### Step 2: Create the web scraper class that takes the URL of a company's page and extracts the title and content

In [None]:
class WebScraper:
    """Download a web page and expose its title, text and absolute links.

    The page is fetched and parsed as soon as the instance is created.

    Attributes:
        url: Address of the page that was requested.
        timeout: Seconds to wait for the server before the request is aborted.
        raw_content: Raw response body as bytes.
        title: Text of the ``<title>`` tag, or ``"No Title Found"``.
        body: Text of ``<body>`` after the irrelevant tags were removed;
            stays ``None`` when the page has no ``<body>``.
        links: ``href`` values starting with ``"http"`` taken from the ``<a>``
            tags that remain after the irrelevant tags were removed.
    """

    # Tags that are removed from the document before its text is extracted.
    irrelevant_tags = ["script", "style", "img", "input", "button"]

    def __init__(
        self,
        url: str,
        irrelevant_tags: Optional[list[str]] = None,
        timeout: float = 30.0,
    ):
        """Store the settings and immediately fetch and parse ``url``.

        :param url: Address of the page to scrape.
        :param irrelevant_tags: Tag names to strip instead of the class default;
            ``None`` or an empty list keeps the default.
        :param timeout: Seconds to wait for the server before giving up.
        :raises ValueError: If the page cannot be fetched, answers with a
            non-OK status code, or has an empty body.
        """
        self.url = url
        self.timeout = timeout
        self.raw_content: Optional[bytes] = None
        self.title: Optional[str] = None
        self.body: Optional[str] = None
        self.links: Optional[list[str]] = None
        if irrelevant_tags:
            self.irrelevant_tags = irrelevant_tags
        self._fetch_content()

    def _fetch_content(self) -> None:
        """Request ``self.url`` and fill ``raw_content``, ``title``, ``body`` and ``links``.

        :raises ValueError: If the request fails, the status code is not OK,
            or the response body is empty.
        """
        # Only the network call can raise RequestException, so keep the try
        # minimal. Without a timeout an unresponsive server would block forever.
        try:
            response: Response = requests.get(self.url, timeout=self.timeout)
        except RequestException as e:
            # Chain the cause so the underlying network error stays visible.
            raise ValueError(f"Failed to fetch content from {self.url}: {e}") from e

        if response.status_code != codes.ok:
            raise ValueError(f"Failed to fetch content from {self.url}, status code: {response.status_code}")

        self.raw_content = response.content
        if not self.raw_content:
            raise ValueError("Website content is emtpy.")

        soup = BeautifulSoup(self.raw_content, "html.parser")
        self.title = soup.title.string if soup.title and soup.title.string else "No Title Found"

        if soup.body:
            # Drop non-content tags first so they do not end up in the text.
            for irrelevant in soup.find_all(self.irrelevant_tags):
                irrelevant.decompose()
            self.body = soup.body.get_text(separator="\n", strip=True)

        # Keep only absolute http(s) links; relative hrefs are skipped.
        self.links = [
            href for link in soup.find_all("a", href=True)
            if isinstance(link, Tag) and isinstance(href := link.get("href"), str) and href.startswith("http")
        ]

    def get_content(self) -> str:
        """
        Returns the page title and page content of the scraped url.
        :return: A string with the page title followed by the page content.
        """
        return f"\nPage Title: {self.title}\nPage Content:\n{self.body}"

    def __repr__(self) -> str:
        """
        Returns a string representation of the WebScraper instance.
        This includes the URL being scraped.
        :return: A string representation of the WebScraper instance.
        """
        return f"WebScraper(url={self.url})"

    def __str__(self) -> str:
        """
        Returns a user-friendly string representation of the WebScraper instance.
        This can be used for logging or displaying information about the scraper.
        :return: A user-friendly string representation of the WebScraper instance.
        """
        return f"WebScraper for {self.url}"

#### Step 3: Create the system prompt for the Llama model

In [None]:
# System prompt for the link-selection request: it tells the model to pick the
# brochure-worthy links and shows the JSON structure expected in the reply.
# The example must itself be valid JSON, so the two objects in "links" are
# separated by a comma.
system_prompt = (
    "You are a helpful assistant to summarize the content of a company's website."
    "\nYou'll be given a list of all of the links found on a company's website."
    "\n Extract the relevant links from the provided list that can be used to create a professional brochure about the comapany."
    "\nYou need to provide the list of all relevant link, skip links that are related to 'Privacy Policy', 'Terms and Conditions' and the ones that do not look relevant."
    "\nProvide the list in the JSON format and the structure should look as follows:\n"
    """{
    "links": [
        {"type": "About Page", "link": "https://example.com/about-us"},
        {"type": "Career Page", "link": "https://example.com/careers"}
    ]
}"""
    "\nMake sure that you include full url in the link."
)

#### Step 4: Create User Prompt For Llama Model

In [None]:
def get_user_prompt(website: WebScraper) -> str:
    """Build the user prompt that asks the model to pick the relevant links.

    The prompt names the website, lists every link collected by the scraper
    (one per line) and ends with an example of the expected JSON reply.

    :param website: Scraped page; its ``url`` and ``links`` are read.
    :return: The complete user prompt.
    """
    # A scraper whose links are None contributes an empty list.
    links_text = "\n".join(website.links or [])
    # The example is valid JSON (quoted keys) so it agrees with the
    # "JSON format" instruction that introduces it.
    return (
        f"Here is the list of links from a company website {website.url}"
        "\nPlease decide which links are relevant and skip the irrelevant links like 'Privacy Policy', 'Terms and Conditions' and so on."
        "\nResponse with full URL of links and do not add the irrelevant links"
        "\nLinks from the website are:\n"
        f"{links_text}"
        "\n\nYou should response in JSON format as the following structure:\n"
        """{
        "links": [
            {"type": "about page", "link": "https://www.example.com/about"},
            {"type": "careers page", "link": "https://www.example.com/careers"}
        ]
    }"""
    )

#### Step 5: Instantiate `WebScraper` Object

In [None]:
# Scrape the landing page; the WebScraper constructor fetches and parses the
# page immediately and raises ValueError when that fails.
website = WebScraper(website_url)

#### Step 6: Use Llama Model To Get Relevant Links

In [None]:
def get_relevant_links(website: WebScraper) -> List[Link]:
    """Ask the model which of the website's links are relevant for a brochure.

    Posts the module-level ``system_prompt`` and the user prompt built from
    ``website`` to ``OLLAMA_API``. The JSON schema of ``LinksResponse`` is
    sent in the ``format`` field of the payload, and the reply is validated
    against ``LinksResponse``.

    :param website: Scraped landing page whose links are offered to the model.
    :return: The validated links, or an empty list when the model's reply is
        not valid JSON or does not match ``LinksResponse``.
    :raises ValueError: If the API answers with a non-OK status code or the
        response has no string ``message.content``.
    """
    messages: List[Dict[str, Any]] = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_user_prompt(website=website)}
    ]
    payload: Dict[str, Any] = {
        "model": MODEL,
        "messages": messages,
        "format": LinksResponse.model_json_schema(),
        "stream": False
    }
    response: Response = requests.post(OLLAMA_API, headers=HEADERS, json=payload)
    if response.status_code != codes.ok:
        raise ValueError(f"Request to model API failed with status code {response.status_code}: {response.text}")

    # Walk the response defensively so an unexpected shape is reported as an
    # invalid response instead of surfacing as a bare KeyError/TypeError.
    data = response.json()
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None

    if not isinstance(content, str):
        raise ValueError("Invalid Response")

    # Malformed JSON and schema mismatches are both "the model answered badly",
    # so they share the same best-effort fallback of an empty list.
    try:
        return LinksResponse.model_validate(json.loads(content)).links
    except (json.JSONDecodeError, ValidationError) as e:
        print("Validation failed:", e)
        return []

#### Step 7: Retrieve Details From Each Relevant Link

In [None]:
# The brochure source material always starts with the landing page.
landing_page_content = "Landing Page: \n"+website.get_content()
links = get_relevant_links(website)

# Gather one text section per relevant page and join them once at the end.
sections = [landing_page_content]
for item in links:
    try:
        web_page = WebScraper(str(item.link))
        sections.append("\n" + item.type + "\n" + web_page.get_content() + "\n")
    except Exception as e:
        # Best effort: a page that cannot be scraped must not abort the whole
        # run, but report it so a missing brochure section can be explained.
        print(f"Skipping {item.link}: {e}")
        continue

complete_data = "".join(sections)

#### Step 8: System Prompt For Company Brochure Creation

In [None]:
# System prompt for the brochure-writing request.
# NOTE: this rebinds the module-level `system_prompt` that get_relevant_links()
# also reads, so the link-extraction prompt cell has to be run again before
# get_relevant_links() is called a second time.
system_prompt = (
    "You are a helpful assistant that helps user to create a company brochure. "
    "You will be given the content from the relevant links from the company's website. "
    "You will use the content to create a company brochure about the company perspective, services, prospective custoemrs and recruits. "
    "Response in Markdown. Include company's culture, values, careers/jobs(if you have the information) and mission statement."
)

#### Step 9: User Prompt For Company Brochure Creation

In [None]:
def get_user_prompt_for_brochure_creation(
    company_name: str,
    url: str,
    content: Optional[str] = None,
    max_chars: int = 20_000,
) -> str:
    """Generates the prompt for creating a company brochure.

    :param company_name: Name of the company the brochure is about.
    :param url: Address of the company's website, mentioned in the prompt.
    :param content: Scraped page text to append to the instructions. When
        omitted, the module-level ``complete_data`` is used.
    :param max_chars: Maximum length of the returned prompt; anything beyond
        it is cut off.
    :return: The instructions followed by the page content, truncated to
        ``max_chars`` characters.
    """
    if content is None:
        # Default to the data gathered by the aggregation cell.
        content = complete_data

    prompt = (
        f"You are looking at the content of the company called {company_name}."
        f"\n\n Here is the content for the landing page and other relevent pages of the website {url}.\n"
        "\n\nUse this information to create the short company brochure in Markdown format."
        # Blank line so the scraped text does not run into the instruction.
        "\n\n"
        f"{content}"
    )
    # The content comes last, so truncation trims page text, not instructions.
    return prompt[:max_chars]

#### Step 10: Create Company Brochure

In [None]:
def create_brochure(company_name:str, url: str) -> None:
    """Creates a company brochure using the Ollama chat API and renders it.

    Sends the module-level ``system_prompt`` together with the user prompt
    from ``get_user_prompt_for_brochure_creation`` to ``OLLAMA_API`` and
    displays the model's reply as Markdown.

    :param company_name: Name of the company the brochure is about.
    :param url: Address of the company's website, mentioned in the prompt.
    :raises RuntimeError: If building the prompt, calling the API, or
        rendering the reply fails; the original exception is chained.
    """
    try:
        messages: List[Dict[str, Any]] = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": get_user_prompt_for_brochure_creation(
                        company_name=company_name,
                        url=url
                    )
            }
        ]
        payload: Dict[str, Any] = {
            "model": MODEL,
            "messages": messages,
            "stream": False
        }

        response: Response = requests.post(OLLAMA_API, headers=HEADERS, json=payload)
        if response.status_code != codes.ok:
            raise ValueError(f"Request to model API failed with status code {response.status_code}: {response.text}")

        content = response.json()['message']['content']
        display(Markdown(content))
    except Exception as e:
        # The request goes to the Ollama endpoint, so name it correctly, and
        # chain the cause to keep the original traceback.
        raise RuntimeError(f"Error while calling Ollama API: {e}") from e
    
# Generate and render the brochure for the company configured in Step 1,
# using the URL stored on the scraped landing page.
create_brochure(company_name=company_name, url=website.url)

This is a collection of web pages with various content. I'll try to summarize the main points and identify any notable information.

**Home Page**

* The home page appears to be a news aggregator site, with articles from different categories (News, Business, Politics, Science, etc.).
* The most recent article on the homepage is "Rap group calls out publication for using their image" dated July 18, 2025.
* There are links to various social media platforms (Facebook, Pinterest) at the bottom of the page.

**Category Pages**

* I found two category pages: News and Science.
* The News category appears to be a collection of articles from June 2025, with topics ranging from technology to entertainment.
* The Science category has an article about a woman making consumer boycotts great again, dated June 17, 2025.

**Article Details**

* Most articles have the same structure:
	+ Introduction with a phrase like "Intro text we refine our methods of responsive web design..."
	+ A few sentences sum