In [None]:
import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from scraper import Website, fetch_website_links, fetch_website_contents
from openai import OpenAI
from urllib.parse import urljoin, urlparse

In [None]:
# Load settings from the local .env file; override=True lets values in .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv('OPENROUTER_API_KEY')

# Cheap sanity check on the key's shape only - it does not contact the API.
if api_key and api_key.startswith('sk-or-') and len(api_key) > 10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")

# Model ids use the "<provider>/<model>" form; swap the two lines to change model.
# MODEL = 'openai/gpt-4o-mini'
MODEL = 'anthropic/claude-haiku-4-5'

# BASE_URL is optional. When it is unset, passing None would make the SDK fall
# back to its own default endpoint, which does not accept the OpenRouter key
# validated above - so fall back to OPENAI_BASE_URL (the SDK's own override
# variable) and finally to the OpenRouter endpoint.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
openai = OpenAI(
    base_url=os.getenv('BASE_URL') or os.getenv('OPENAI_BASE_URL') or OPENROUTER_BASE_URL,
    api_key=api_key,
)

In [None]:
# System prompt for the link-selection request. It asks the model to pick the
# brochure-worthy links and shows, by example, the JSON shape the rest of the
# notebook reads back: a "links" list whose items carry "type" and "url".
system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def get_links_user_prompt(url):
    """Build the user prompt that asks the model to pick brochure-relevant links.

    The links found on ``url`` are fetched, made absolute, filtered down to
    http(s) and de-duplicated (first occurrence wins) before being listed,
    one per line, at the end of the prompt.

    Args:
        url: Address of the page whose links should be offered to the model.

    Returns:
        The complete user prompt as a string.
    """
    absolute_links = []
    for raw_link in fetch_website_links(url) or []:
        link = (raw_link or "").strip()
        if not link:
            # Empty hrefs carry no information and would only resolve to the page itself.
            continue
        # Resolve against the page URL (not just the site origin) so that a
        # relative link such as "intro" on /docs/ becomes /docs/intro, exactly
        # as a browser would resolve it.
        absolute = urljoin(url, link)
        # Compare the parsed scheme so mailto:, tel:, javascript: etc. are dropped.
        if urlparse(absolute).scheme in ("http", "https"):
            absolute_links.append(absolute)
    unique_links = list(dict.fromkeys(absolute_links))  # deduplicate, preserve order

    user_prompt = f"""Here is the list of links on the website {url} -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links:

"""
    user_prompt += "\n".join(unique_links)
    return user_prompt


In [None]:
def _parse_links_response(content):
    """Turn the model's raw reply text into a dict that always has a "links" list."""
    text = (content or "").strip()
    if not text:
        # The API can return a message with no text content at all.
        return {"links": []}
    # Strip markdown code fences if the model wraps JSON in ```json ... ```
    if text.startswith("```"):
        text = text.split("```")[1]
        if text[:4].lower() == "json":
            text = text[4:]
        text = text.strip()
    # Extract the JSON object in case there's surrounding prose
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        return {"links": []}
    if not isinstance(parsed.get("links"), list):
        # Keep whatever else the model sent, but guarantee the key callers iterate over.
        parsed = {**parsed, "links": []}
    return parsed


def select_relevant_links(url):
    """Ask the model which links on ``url`` belong in a company brochure.

    Args:
        url: Address of the page whose links should be evaluated.

    Returns:
        A dict with a "links" key holding a list (possibly empty) of the
        entries chosen by the model.

    Raises:
        json.JSONDecodeError: If the reply contains text that is not valid JSON.
    """
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
    )
    return _parse_links_response(response.choices[0].message.content)
    

In [None]:
# Example run against the Hugging Face homepage (sends a live request to the model).
select_relevant_links("https://huggingface.co")

In [None]:
def fetch_page_and_all_relevant_links(url):
    """Collect the landing page text plus the text of every model-selected link.

    Args:
        url: Address of the company's landing page.

    Returns:
        One markdown-flavoured string: a "Landing Page" section followed by a
        "Relevant Links" section with one "### Link: <type>" entry per link.
    """
    contents = fetch_website_contents(url)
    relevant_links = select_relevant_links(url) or {}
    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    for link in relevant_links.get("links") or []:
        # The entries come from model output, so tolerate malformed ones
        # instead of letting a single bad entry abort the whole brochure.
        if not isinstance(link, dict) or not link.get("url"):
            continue
        sections.append(f"\n\n### Link: {link.get('type', 'page')}\n")
        sections.append(fetch_website_contents(link["url"]))
    return "".join(sections)

In [None]:
# System prompt for the brochure-writing requests made by create_brochure and
# stream_brochure: sets the audience and asks for markdown output.
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

In [None]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user prompt for brochure generation.

    Args:
        company_name: Name of the company, quoted in the prompt.
        url: Address of the company's landing page.
        max_chars: Upper bound on the length of the returned prompt
            (instructions included). Pass ``None`` to disable truncation.

    Returns:
        The instructions followed by the scraped page contents, cut to at
        most ``max_chars`` characters.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # Validate before scraping so a bad argument fails fast; a negative slice
    # bound would otherwise silently chop characters off the end instead.
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be None or a non-negative integer")
    user_prompt = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    user_prompt += fetch_page_and_all_relevant_links(url)
    return user_prompt[:max_chars]

In [None]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it as Markdown in the notebook.

    The prompt is assembled from the scraped landing page and relevant links,
    sent to the model in a single (non-streaming) request, and the reply is
    displayed. Nothing is returned.
    """
    messages = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=messages)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [None]:
# Example run: build and display a brochure for Hugging Face (sends live requests to the model).
create_brochure("HuggingFace", "https://huggingface.co")

In [None]:
def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally while the model streams it.

    Works like ``create_brochure`` but requests a streamed response and
    re-renders the accumulated markdown in a single notebook output area each
    time new text arrives. Nothing is returned.

    Args:
        company_name: Name of the company, quoted in the prompt.
        url: Address of the company's landing page.
    """
    stream = openai.chat.completions.create(
        # Use the notebook-wide MODEL so the id matches the configured endpoint,
        # consistent with every other request in this notebook.
        model=MODEL,
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Some chunks (e.g. a trailing usage/keep-alive chunk) carry no choices.
        if not chunk.choices:
            continue
        new_text = chunk.choices[0].delta.content or ''
        if not new_text:
            # Nothing new to show; skip the redundant re-render.
            continue
        response += new_text
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
# Example run: stream a brochure for Hugging Face into the cell output (sends live requests to the model).
stream_brochure("HuggingFace", "https://huggingface.co")

In [None]:
import re
from pathlib import Path
from fpdf import FPDF

# ── Colour palette ───────────────────────────────────────────────
# Colour triples unpacked into FPDF's set_fill_color / set_text_color /
# set_draw_color calls in create_brochure_generate_pdf.
C_NAVY   = (22,  60, 110)    # header banner
C_BLUE   = (41,  98, 200)    # H2 accent
C_TEAL   = (0,  150, 136)    # H3 + rules
C_ORANGE = (210,  90,  20)   # bullet marker
C_DARK   = (40,  44,  52)    # body text
C_WHITE  = (255, 255, 255)   # banner title text

# System prompt used only by create_brochure_generate_pdf. It is a separate
# constant from the lower-case brochure_system_prompt: this one caps the length
# and prescribes the heading/bullet structure that the PDF renderer recognises.
BROCHURE_SYSTEM_PROMPT = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a concise, precise brochure (300-400 words max) for prospective customers, investors and recruits.
Respond in markdown without code blocks. Be brief and impactful.
Use exactly one H1 (company name), H2 section headings, H3 sub-headings, bullet lists, and short paragraphs.
Cover: what the company does, company culture, key customers, and careers - only if info is available.
"""

def create_brochure_generate_pdf(company_name, url):
    """Generate a brochure with the model, show it in the notebook and save it as a styled PDF.

    Args:
        company_name: Company name; used in the prompt, in the PDF banner and
            footer, and (sanitised) as the PDF file name.
        url: Address of the company's landing page.

    Side effects:
        Displays the generated markdown, writes ``<name>.pdf`` into a
        ``brochures`` folder next to the current working directory, and prints
        the saved path. Nothing is returned.
    """
    # 1. Generate brochure markdown content via LLM and display in notebook while we build the PDF
    user_prompt = get_brochure_user_prompt(company_name, url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": BROCHURE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    )
    # A reply without text content comes back as None; treat it as an empty brochure.
    md_content = response.choices[0].message.content or ""
    display(Markdown(md_content))

    # 2. Helpers
    # Typographic characters that models emit often but Latin-1 cannot encode;
    # map them to plain equivalents so they do not turn into "?" in the PDF.
    typographic_fallbacks = str.maketrans({
        "\u2018": "'", "\u2019": "'",      # curly single quotes
        "\u201c": '"', "\u201d": '"',      # curly double quotes
        "\u2013": "-", "\u2014": "-",      # en / em dash
        "\u2026": "...",                   # ellipsis
        "\u2022": "-",                     # bullet
    })

    def safe(text):
        """Reduce text to Latin-1; anything still unsupported becomes '?'."""
        return text.translate(typographic_fallbacks).encode("latin-1", errors="replace").decode("latin-1")

    def strip_md(text):
        """Remove inline markdown (bold, italic, code, links), keeping the visible text."""
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
        text = re.sub(r'\*(.*?)\*',     r'\1', text)
        text = re.sub(r'`(.*?)`',       r'\1', text)
        text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)
        return text.strip()

    # 3. Build PDF
    pdf = FPDF()
    pdf.set_margins(20, 20, 20)
    pdf.set_auto_page_break(auto=True, margin=20)
    pdf.add_page()
    W = pdf.w - pdf.l_margin - pdf.r_margin  # usable text width between the margins

    # --- Full-width header banner ---
    pdf.set_fill_color(*C_NAVY)
    pdf.rect(0, 0, pdf.w, 40, style="F")
    pdf.set_xy(0, 9)
    pdf.set_font("Helvetica", "B", 26)
    pdf.set_text_color(*C_WHITE)
    pdf.cell(pdf.w, 13, safe(company_name.upper()), align="C")
    pdf.set_xy(0, 24)
    pdf.set_font("Helvetica", "I", 11)
    pdf.set_text_color(180, 210, 255)
    pdf.cell(pdf.w, 8, safe("Company Brochure"), align="C")
    pdf.ln(26)

    # --- Teal rule under banner ---
    pdf.set_draw_color(*C_TEAL)
    pdf.set_line_width(1.0)
    pdf.line(pdf.l_margin, pdf.get_y(), pdf.w - pdf.r_margin, pdf.get_y())
    pdf.ln(6)

    # 4. Render markdown lines
    for line in md_content.splitlines():
        stripped = strip_md(line)
        # Start every element at the left margin. Where multi_cell leaves the
        # cursor differs between FPDF versions, so do not rely on the previous
        # element (e.g. a bullet followed directly by a paragraph).
        pdf.set_x(pdf.l_margin)

        if line.startswith("# "):
            continue                        # already in banner

        elif line.startswith("## "):
            pdf.ln(4)
            y = pdf.get_y()
            # coloured left accent bar
            pdf.set_fill_color(*C_BLUE)
            pdf.rect(pdf.l_margin, y, 3, 10, style="F")
            pdf.set_xy(pdf.l_margin + 6, y)
            pdf.set_font("Helvetica", "B", 14)
            pdf.set_text_color(*C_BLUE)
            pdf.multi_cell(W - 6, 10, safe(stripped))
            # thin underline
            pdf.set_draw_color(*C_BLUE)
            pdf.set_line_width(0.3)
            pdf.line(pdf.l_margin, pdf.get_y(), pdf.l_margin + W, pdf.get_y())
            pdf.ln(3)

        elif line.startswith("### "):
            pdf.ln(2)
            pdf.set_font("Helvetica", "B", 12)
            pdf.set_text_color(*C_TEAL)
            pdf.multi_cell(W, 7, safe(stripped))
            pdf.ln(1)

        elif line.startswith(("- ", "* ", "+ ")):
            # Marker in its own narrow cell, then the item text beside it.
            pdf.set_font("Helvetica", "B", 11)
            pdf.set_text_color(*C_ORANGE)
            pdf.cell(6, 7, safe(">"))
            pdf.set_font("Helvetica", "", 11)
            pdf.set_text_color(*C_DARK)
            pdf.multi_cell(W - 6, 7, safe(strip_md(line[2:])))

        elif line.strip() == "---":
            pdf.ln(2)
            pdf.set_draw_color(*C_TEAL)
            pdf.set_line_width(0.4)
            pdf.line(pdf.l_margin, pdf.get_y(), pdf.l_margin + W, pdf.get_y())
            pdf.ln(4)

        elif line.strip():
            pdf.set_font("Helvetica", "", 11)
            pdf.set_text_color(*C_DARK)
            pdf.multi_cell(W, 7, safe(stripped))
            pdf.ln(1)

        else:
            pdf.ln(3)

    # Footer (last page only). It sits inside the bottom margin, so automatic
    # page breaks must be switched off first - otherwise cell() would start a
    # new page and the footer would land at the top of it.
    pdf.set_auto_page_break(auto=False)
    pdf.set_y(-14)
    pdf.set_font("Helvetica", "I", 8)
    pdf.set_text_color(150, 150, 150)
    pdf.cell(0, 6, safe(f"{company_name}  |  AI-generated brochure"), align="C")

    # 5. Save to brochures/ folder at project root
    # Fall back to a fixed name if sanitising leaves nothing (e.g. a name made only of symbols).
    safe_name = re.sub(r'[^\w\s-]', '', company_name).strip().replace(' ', '_') or "brochure"
    brochures_dir = Path.cwd().parent / "brochures"
    brochures_dir.mkdir(parents=True, exist_ok=True)
    output_path = brochures_dir / f"{safe_name}.pdf"
    pdf.output(str(output_path))
    print(f"PDF saved: {output_path}")

In [None]:
# Example run: generate the brochure and write its PDF (sends live requests to the model).
create_brochure_generate_pdf("Edward Donner", "https://edwarddonner.com")