# BrochureCraft UI

This tool generates company brochures using AI models (GPT or Claude) by analyzing a company's website content.
It produces brochures in Markdown format, including sections like company culture, customer information, and careers.
It uses two paid AI models, OpenAI's `gpt-4o-mini` and Anthropic's `claude-3-haiku-20240307`, to filter relevant website links and summarize content.
The script extracts and formats company information into user-friendly Markdown content for prospective customers, investors, and recruits.

In [1]:
# Importing necessary modules
import os               # Provides functions for interacting with the operating system
import requests         # For making HTTP requests to fetch web page data
import json             # For handling JSON data
import anthropic        # For accessing Claude API
import gradio as gr     # For creating a user interface
from typing import List # Type hinting for list of elements
from dotenv import load_dotenv  # Loads environment variables from .env file
from openai import OpenAI  # Interface for OpenAI API
from bs4 import BeautifulSoup  # For parsing HTML content

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifiers and HTTP defaults used throughout the notebook
GPT_MODEL = "gpt-4o-mini"  # Model name passed to the OpenAI client
CLAUDE_MODEL = "claude-3-haiku-20240307"  # Model name passed to the Anthropic client

# Browser-like request headers used when a caller supplies none of its own
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

In [3]:
# Setting up environment
# Load environment variables from .env file; this must run before the keys
# are read and before the API clients below are constructed.
load_dotenv()                        
# The two key values are read here but are not passed to the client
# constructors below, which are both called with no arguments.
# NOTE(review): presumably each SDK client picks its key up from the
# environment on its own — confirm, or pass the keys explicitly.
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')

# Module-level API clients shared by the functions defined in later cells
openai = OpenAI()
claude = anthropic.Anthropic()

In [4]:
# Website class handles the webpage content and link extraction
class Website:
    """
    Fetches one web page and exposes its title, visible text and hyperlinks.

    If the HTTP request fails, the failure is printed and the instance is left
    with ``title``, ``content`` and ``text`` set to None and an empty ``links``
    list, so callers can still call ``get_contents()`` without an exception.

    Attributes:
        url (str): The validated, whitespace-stripped URL of the page.
        title (str): The page title, or "No title found".
        content (bytes): The raw response body.
        links (List[str]): The ``href`` value of every anchor tag on the page.
        text (str): The body text with scripts, styles, images and inputs removed.
    """

    _url: str = None  # Private attribute for storing the URL
    title: str = None  # Title of the webpage
    content: bytes = None  # Raw response body of the webpage
    # No class-level default on purpose: a shared ``[]`` here would be the same
    # list object for every instance. Each instance gets its own in __init__.
    links: List[str]  # List of hyperlinks found on the webpage
    text: str = None  # Cleaned text content of the webpage

    @property
    def url(self) -> str:
        """Returns the URL of the website."""
        return self._url

    @url.setter
    def url(self, value: str) -> None:
        """
        Sets and validates the URL.

        Raises:
            ValueError: If the value is None, empty, or only whitespace.
        """
        if not value or value.strip() == "":
            raise ValueError("Invalid URL: URL cannot be None or empty")
        self._url = value.strip()

    def __init__(self, url, headers=None, timeout=10) -> None:
        """
        Initializes the Website class by fetching and parsing the web page.

        Args:
            url (str): The URL of the website to fetch.
            headers (dict, optional): Custom headers to be used in the request.
                Defaults to None, in which case the module-level HEADERS are used.
            timeout (float, optional): Seconds to wait for the server before
                giving up. Defaults to 10. A timeout is reported like any other
                failed request.

        Raises:
            ValueError: If the URL is None, empty, or only whitespace.
        """
        self.links = []  # Per-instance list, also the value kept when the fetch fails
        self.url = url  # Validates and strips the provided URL
        the_headers = headers if headers else HEADERS  # Use default or custom headers

        # Only the network call is guarded; parsing errors are not request errors.
        try:
            # Use the stripped URL, and never wait forever on an unresponsive host.
            response = requests.get(self.url, headers=the_headers, timeout=timeout)
            response.raise_for_status()  # Raise an error for unsuccessful requests
        except requests.RequestException as e:
            # Best effort: report the failure and leave the instance empty
            print(f"Failed to fetch {self.url}: {e}")
            return

        # The raw bytes are handed to BeautifulSoup, which detects the encoding
        # itself, so there is no need to set response.encoding first.
        self.content = response.content
        soup = BeautifulSoup(self.content, 'html.parser')

        # .string is None when <title> is empty or contains nested tags, so
        # fall back to the placeholder rather than storing None.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        # Clean up unnecessary elements (scripts, styles, images, inputs)
        if soup.body:
            for irrelevant in soup.body.find_all(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = "No body content found"  # Fallback if no body content is available

        # Extract all links from anchor tags
        self.links = [a["href"] for a in soup.find_all("a") if a.has_attr("href")]

    def get_contents(self) -> str:
        """
        Returns a formatted string containing the title and text content of the webpage.

        Returns:
            str: Formatted webpage content with title and text.
        """
        return f"- Webpage Title:\n{self.title}\n- Webpage Contents:\n{self.text}\n\n"

In [5]:
# System prompt for filtering relevant links for brochure creation.
# It asks the model to reply with JSON only, shaped like the example inside
# the prompt: a "links" list whose items carry "type" and "url" keys. Those
# are the keys get_all_details reads from the parsed reply.
links_system_prompt = """You are provided with a list of links from a webpage. 
Your task is to evaluate which links are most relevant for inclusion in a company brochure, such as links to an About page, Company page, or Careers/Jobs pages.

Your response should be in JSON format, exactly as shown in the example below, without any introduction, summary, or additional text:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Builds the user message that accompanies the link-filtering system prompt
def links_user_prompt(website):
    """
    Assemble the user prompt asking the model to pick brochure-worthy links.

    The prompt always opens with the site URL and the filtering instructions.
    It then either lists the page's links one per line, or, when the page has
    no links, asks for an empty JSON object instead.

    Args:
        website (Website): The page whose ``url`` and ``links`` are used.

    Returns:
        str: The complete user prompt.
    """
    pieces = [
        f"Here is the list of links from the website {website.url}:\n",
        "Please identify the links that are relevant for a company brochure and respond with the full HTTPS URLs in JSON format. \n",
        "Exclude links such as Terms of Service, Privacy Policies, and email links.\n",
    ]

    if website.links:
        pieces.append("Here are the links available on the website (some may be relative links):\n")
        pieces.append("\n".join(website.links))
    else:
        pieces.append(
            "Unfortunately, no relevant links were found for creating a brochure about the company. Please respond with an empty JSON object.\n"
        )

    return "".join(pieces)

In [6]:
# Ask the GPT model which of a page's links belong in a brochure
def get_links_gpt(url):
    """
    Fetch a page and have GPT select the links relevant to a brochure.

    The request sets the JSON-object response format, and the reply text is
    parsed with ``json.loads`` before being returned.

    Args:
        url (str): The website URL to fetch and analyze.

    Returns:
        dict: The parsed JSON reply containing the relevant links.
    """
    page = Website(url)  # Fetch and parse the page so its links can be listed

    chat_messages = [
        {"role": "system", "content": links_system_prompt},
        {"role": "user", "content": links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=GPT_MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )

    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

# Helper: parse a model reply that should be JSON but may carry extra text
def _parse_json_reply(text):
    """
    Parses a JSON object out of a model reply.

    The reply is first parsed as-is. If that fails, the span from the first
    "{" to the last "}" is parsed instead, which copes with replies wrapped in
    a Markdown code fence or surrounded by a sentence of explanation.

    Args:
        text (str): The raw reply text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON object can be found.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise  # Nothing that looks like an object: keep the original error
        return json.loads(text[start:end + 1])


# Function to get links using the Claude model
def get_links_claude(url):
    """
    Retrieves and processes relevant links using Claude.

    This call does not request a JSON-only response format the way the GPT
    variant does, so the reply is parsed leniently: surrounding prose or a
    code fence around the JSON object is tolerated.

    Args:
        url (str): The website URL to fetch and analyze.

    Returns:
        dict: A JSON object containing relevant links for the brochure.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    website = Website(url)  # Create a Website instance for the given URL
    response = claude.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=500,
        temperature=0.7,
        system=links_system_prompt,
        messages=[{"role": "user", "content": links_user_prompt(website)}]
    )
    result = response.content[0].text
    return _parse_json_reply(result)

In [7]:
# Function to get all details (landing page and relevant links) for a given URL
def get_all_details(url, model):
    """
    Retrieves the landing page content and the content of each relevant link.

    The model name is validated before anything is fetched, so an unknown
    model fails immediately instead of after a page download.

    The link-selection reply is treated defensively. The user prompt asks the
    model for an empty JSON object when a page has no links, so a missing
    "links" key means "no extra pages" rather than an error. Entries that are
    not objects, or that lack a "type" or "url", are skipped.

    Args:
        url (str): The website URL to fetch.
        model (str): The model to use for processing links ("GPT" or "Claude").

    Returns:
        str: Combined content of the landing page and all relevant links.

    Raises:
        ValueError: If ``model`` is neither "GPT" nor "Claude".
    """
    if model not in ("GPT", "Claude"):
        raise ValueError("--- Unknown Model ---")

    sections = ["Landing page:\n", Website(url).get_contents()]

    if model == "GPT":
        links = get_links_gpt(url)
    else:
        links = get_links_claude(url)

    # An empty or unexpected reply simply contributes no additional pages
    entries = links.get("links") if isinstance(links, dict) else None
    for link in entries or []:
        if not isinstance(link, dict) or "type" not in link or "url" not in link:
            continue  # Skip malformed entries instead of failing the whole brochure
        sections.append(f"\n\n{link['type']}\n")
        sections.append(Website(link["url"]).get_contents())  # Contents of each relevant link

    return "".join(sections)

In [8]:
# Builds the system prompt used when writing the brochure
def brochure_system_prompt(tone):
    """
    Return the brochure-writing system prompt with the requested tone filled in.

    The prompt tells the model to answer in Markdown and to cover company
    culture, customers and careers when that information is available.

    Args:
        tone (str): The desired tone for the brochure, such as "professional",
            "friendly", or "persuasive".

    Returns:
        str: The system prompt, ending with a newline.
    """
    # Trailing spaces are part of the original prompt text and are kept as-is.
    prompt_lines = [
        "You are a helpful assistant that responds in Markdown.",
        "You are tasked with analyzing the contents of several relevant pages from a company website and crafting a concise, professional brochure aimed at prospective customers, investors, and recruits. ",
        "Your Markdown response should include the following information, if available:  ",
        " - Details about the company's culture  ",
        " - Information about its customers",
        " - Careers or job opportunities  ",
        "",
        f"Focus on creating a {tone} tone that highlights the company's strengths and vision.",
        "",  # Produces the final newline when joined
    ]
    return "\n".join(prompt_lines)

# Builds the user message that carries the scraped site content
def brochure_user_prompt(company_name, url, model):
    """
    Build the user prompt that asks for a brochure about one company.

    The prompt is a short introduction followed by everything that
    ``get_all_details`` gathers for the site, cut to a fixed maximum length.

    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's website.
        model (str): The model used to select the relevant links
            ("GPT" or "Claude"); passed through to ``get_all_details``.

    Returns:
        str: User prompt in string format, limited to 5,000 characters.
    """
    max_chars = 5_000  # Hard cap on the prompt length

    intro = f"You are analyzing a company called {company_name}.\n"
    instructions = "Below is the content from its landing page and other relevant pages. Using this information, create a concise brochure about the company.\n"
    site_details = get_all_details(url, model)

    full_prompt = intro + instructions + site_details
    return full_prompt[:max_chars]

In [9]:
# Function to stream the creation of a brochure dynamically using GPT
def stream_brochure_gpt(company_name, url, tone):
    """
    Streams the creation of a brochure dynamically using GPT.

    Each yielded value is the whole brochure generated so far, so a consumer
    can simply replace what it displays with the latest value.

    Failures are reported rather than swallowed: the error message is printed
    and also yielded (after any partial brochure text), so a UI driven by this
    generator shows what went wrong instead of staying blank.

    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's website.
        tone (str): The desired tone for the brochure.

    Yields:
        str: The accumulated brochure text, or an error message on failure.
    """
    response = ""  # Defined before the try so the error path can always read it
    try:
        stream = openai.chat.completions.create(
            model=GPT_MODEL,
            messages=[{"role": "system", "content": brochure_system_prompt(tone)},
                      {"role": "user", "content": brochure_user_prompt(company_name, url, model="GPT")}],
            stream=True  # Enable streaming mode for dynamic output
        )
        for chunk in stream:
            response += chunk.choices[0].delta.content or ""
            yield response
    except Exception as e:
        # Top-level boundary for the UI: report the error, do not crash the app
        message = f"An error occurred while streaming the brochure for `{company_name}` using `{GPT_MODEL}` model: {e}"
        print(message)
        yield f"{response}\n\n{message}" if response else message
        
# Function to stream the creation of a brochure dynamically using Claude
def stream_brochure_claude(company_name, url, tone):
    """
    Streams the creation of a brochure dynamically using Claude.

    Each yielded value is the whole brochure generated so far, so a consumer
    can simply replace what it displays with the latest value.

    Failures are reported rather than swallowed: the error message is printed
    and also yielded (after any partial brochure text), so a UI driven by this
    generator shows what went wrong instead of staying blank.

    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's website.
        tone (str): The desired tone for the brochure.

    Yields:
        str: The accumulated brochure text, or an error message on failure.
    """
    response = ""  # Defined before the try so the error path can always read it
    try:
        result = claude.messages.stream(
            model=CLAUDE_MODEL,
            max_tokens=1000,
            temperature=0.7,
            system=brochure_system_prompt(tone),
            messages=[{"role": "user", "content": brochure_user_prompt(company_name, url, model="Claude")}],
        )
        # The context manager closes the stream when iteration ends or fails
        with result as stream:
            for text in stream.text_stream:
                response += text or ""
                yield response
    except Exception as e:
        # Top-level boundary for the UI: report the error, do not crash the app
        message = f"An error occurred while streaming the brochure for `{company_name}` using `{CLAUDE_MODEL}` model: {e}"
        print(message)
        yield f"{response}\n\n{message}" if response else message

In [10]:
def stream_brochure(company_name, url, model, tone):
    """
    Stream a company brochure from whichever AI model the caller selected.

    This is a generator, so nothing runs (including the model check) until
    the first value is requested.

    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's website.
        model (str): The AI model to use for brochure creation ("GPT" or "Claude").
        tone (str): The desired tone for the brochure (e.g., "professional", "friendly").

    Yields:
        str: The brochure content as the chosen model produces it.

    Raises:
        ValueError: If an unknown model is provided.
    """
    # Guard clause: reject anything that is not one of the two supported models
    if model not in ("GPT", "Claude"):
        raise ValueError("--- Unknown Model ---")

    # Delegate to the generator for the selected model
    if model == "GPT":
        brochure_stream = stream_brochure_gpt(company_name, url, tone)
    else:
        brochure_stream = stream_brochure_claude(company_name, url, tone)

    yield from brochure_stream

## Create a User Interface using Gradio

In [11]:
# Gradio front end: four inputs feed stream_brochure, and the streamed result
# is rendered as Markdown. The component order matches the function's
# parameters (company_name, url, model, tone).
brochure_inputs = [
    gr.Textbox(label="Company name:"),
    gr.Textbox(label="Landing page URL including http:// or https://"),
    gr.Dropdown(["GPT", "Claude"], label="Select Model", value="GPT"),
    gr.Textbox(label="Tone (e.g. professional, humourous, serious, friendly, persuasive, ...):"),
]

view = gr.Interface(
    fn=stream_brochure,
    inputs=brochure_inputs,
    outputs=[gr.Markdown(label="Brochure:")],
    flagging_mode="never",
)

# Start the local web server for the interface
view.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


