In [11]:
# https://github.com/eiliaJafari

In [1]:
# Imports: Web scraping, API calls, environment loading, UI rendering, and LLM integration
import re
import os
import requests
import json
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
import gradio as gr
import ollama

In [2]:
# Load environment variables from a .env file (expected to hold the OpenAI API key).
# override=True is passed so values from .env win over variables that are already
# set in the process environment (python-dotenv's documented meaning of the flag).
load_dotenv(override=True)
# NOTE(review): api_key is not referenced again anywhere in the visible notebook;
# OpenAI() below is built without arguments, so it presumably picks up
# OPENAI_API_KEY from the environment on its own - confirm before removing.
api_key = os.getenv('OPENAI_API_KEY')

# Model name sent with every OpenAI chat-completion request in this notebook.
GPT_MODEL = 'gpt-4o-mini'
# Shared client instance used by get_links_gpt and create_brochure.
# Only the OpenAI class is imported above, so this name does not clash with an
# imported `openai` module in this notebook.
openai = OpenAI()

# Model name used for both the raw HTTP request (get_links_ollama) and the
# ollama.chat call (create_brochure).
OLLAMA_MODEL = "llama3.2"
# Chat endpoint that get_links_ollama POSTs its payload to.
OLLAMA_URL = "http://localhost:11434/api/chat"

In [3]:
# Request headers sent with every page fetch in Website.__init__.
# A desktop-browser User-Agent is used to avoid being blocked by some websites.
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# Represents a scraped website, containing cleaned text and all hyperlinks
class Website:
    """
    A single scraped web page: its title, visible body text and outgoing links.

    Attributes:
        url: The address that was requested.
        body: Raw response bytes as returned by the server.
        title: Text of the <title> tag, or "No title found" when the tag is
            missing or has no plain-string content.
        text: Body text with script/style/img/input elements removed, one text
            fragment per line; empty string when the page has no <body>.
        links: Every non-empty href found on <a> tags, exactly as written in
            the page (absolute or relative, not resolved).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch ``url`` and parse it.

        Args:
            url: Page address including the scheme (http:// or https://).
            timeout: Seconds to wait for the server before giving up. Without
                a timeout requests would wait indefinitely on a stalled host.

        Raises:
            requests.RequestException: If the page cannot be fetched (invalid
                URL, connection failure, timeout, ...).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # soup.title.string is None for an empty <title> or one that contains
        # nested tags, so check the string itself and not only the tag.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        # Remove irrelevant elements before extracting text
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Extract all href links (absolute or relative), dropping anchors
        # that have no href or an empty one.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and cleaned text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# System prompt for the link-selection call: tells the model how to pick
# brochure-worthy links and shows the JSON shape the reply must follow.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

# Builds the user message that lists every scraped link for the selection call
def get_links_user_prompt(website):
    """
    Compose the user prompt asking the model to choose relevant links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    return intro + instructions + listing_header + "\n".join(website.links)

In [5]:
def _parse_links_json(result_text):
    """Return the ``{"links": [...]}`` object found in an LLM reply, or ``{}``."""
    if not isinstance(result_text, str):
        print("No valid JSON found in Ollama output:")
        print(result_text)
        return {}

    # Best case: the whole reply already is the JSON object that was requested
    # (possibly with extra keys the regex below would reject).
    try:
        parsed = json.loads(result_text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict) and "links" in parsed:
        return parsed

    # Otherwise cut the first {"links": [...]} block out of the surrounding
    # chatter or markdown code fences.
    match = re.search(r'\{\s*"links"\s*:\s*\[.*?\]\s*\}', result_text, re.DOTALL)
    if not match:
        print("No valid JSON found in Ollama output:")
        print(result_text)
        return {}

    json_str = match.group(0)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        print("Failed to decode JSON block:")
        print(json_str)
        return {}


# Uses Ollama to extract relevant links from the company's website
def get_links_ollama(url, timeout=120):
    """
    Ask the local Ollama chat endpoint which links on ``url`` suit a brochure.

    Args:
        url: Landing page to scrape for candidate links.
        timeout: Seconds to wait for the Ollama HTTP response. Local generation
            can be slow, so the default is generous, but it is still bounded so
            a dead server cannot hang the caller forever.

    Returns:
        A dict shaped like ``{"links": [{"type": ..., "url": ...}, ...]}``, or
        ``{}`` when the request fails or no usable JSON is found in the reply
        (a message is printed in those cases).
    """
    website = Website(url)
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        result_text = response.json()["message"]["content"]
    # Transport/HTTP failures plus a malformed response body; anything else is
    # a programming error and should surface instead of being swallowed.
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        print("Error during Ollama request:", e)
        return {}

    return _parse_links_json(result_text)

# Asks the OpenAI chat model to pick the brochure-worthy links on a page
def get_links_gpt(url):
    """
    Scrape ``url`` and let the GPT model select the relevant links.

    The request asks for a JSON-object response, and the reply content is
    decoded with ``json.loads`` and returned as-is.

    Args:
        url: Landing page whose links should be classified.

    Returns:
        The decoded JSON reply (expected shape: ``{"links": [...]}``).
    """
    page = Website(url)
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=GPT_MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [6]:
# Aggregates content from the main page and selected subpages (About, Careers, etc.)
def get_all_details(url, get_links_func):
    """
    Collect the text of the landing page plus every page the LLM selected.

    Args:
        url: Landing page address including the scheme.
        get_links_func: Callable taking ``url`` and returning a dict shaped like
            ``{"links": [{"type": ..., "url": ...}, ...]}`` (or something falsy
            / without a "links" key on failure).

    Returns:
        One string: the landing page section followed by one section per
        successfully fetched link, or a placeholder note when no links were
        returned.

    Notes:
        The link list comes from an LLM, so entries are treated as untrusted:
        entries without a usable "url" are skipped, relative URLs are resolved
        against ``url``, and a link that cannot be fetched is reported and
        skipped instead of aborting the whole brochure.
    """
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links_func(url)

    if not links or "links" not in links:
        sections.append("\n\n[No additional relevant links found or failed to extract JSON]\n")
        return "".join(sections)

    # Append contents of each relevant linked page
    for link in links.get("links") or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The prompt asks for full URLs, but models still return relative ones.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            print(f"Skipping {page_url}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'Unknown')}\n")
        sections.append(page.get_contents())

    return "".join(sections)

In [7]:
# System prompt for brochure generation: sets the audience, the output format
# (markdown) and what to include. Written as adjacent literals so every
# sentence boundary visibly carries its own trailing space (the previous
# backslash-continued form fused "markdown." and "Include" together).
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information. "
    "Also make the relevant links, hyperlinks."
)

# Builds a user prompt with detailed company info (landing + subpages), used for brochure creation
def get_brochure_user_prompt(company_name, url, model, max_chars=5_000):
    """
    Compose the user message for the brochure-generation call.

    Args:
        company_name: Name inserted into the prompt's first line.
        url: Landing page to scrape.
        model: "GPT" or "OLLAMA"; selects which backend picks the sub-pages.
            Any other value yields a prompt without scraped content.
        max_chars: Upper bound on the prompt length in characters (expected to
            be non-negative); the prompt is cut to this many characters to keep
            the request small. Pass ``None`` to disable truncation.

    Returns:
        The prompt text, at most ``max_chars`` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"

    if model == "GPT":
        get_links_func = get_links_gpt
    elif model == "OLLAMA":
        get_links_func = get_links_ollama
    else:
        get_links_func = None

    if get_links_func is not None:
        user_prompt += get_all_details(url, get_links_func)

    # Slicing with None leaves the string untouched, so no special case is needed.
    return user_prompt[:max_chars]

In [8]:
# Calls the selected LLM backend (GPT or Ollama) to generate the brochure content
def create_brochure(company_name, url, provider):
    """
    Generate the brochure markdown for a company with the chosen backend.

    Args:
        company_name: Company name used in the prompt.
        url: Landing page address including the scheme.
        provider: "GPT" (OpenAI chat completion) or "OLLAMA" (ollama.chat).

    Returns:
        The model's reply text.

    Raises:
        ValueError: If ``provider`` is not one of the supported values. The
            check runs before any scraping or LLM call, so an invalid choice
            fails fast instead of doing the expensive work and then crashing
            on an unassigned result variable.
    """
    if provider not in ("GPT", "OLLAMA"):
        raise ValueError(f"Invalid model: {provider!r}")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url, provider)}
    ]

    if provider == "GPT":
        response = openai.chat.completions.create(
            model=GPT_MODEL,
            messages=messages
        )
        return response.choices[0].message.content

    # provider == "OLLAMA"
    response = ollama.chat(
        model=OLLAMA_MODEL,
        messages=messages
    )
    return response['message']['content']


In [9]:
# Entry point wired into the Gradio interface
def brochure(company_name, url, model):
    """
    Validate the selected backend and delegate to ``create_brochure``.

    Args:
        company_name: Company name typed into the UI.
        url: Landing page address typed into the UI.
        model: Backend choice; must equal "GPT" or "OLLAMA".

    Returns:
        Whatever ``create_brochure`` returns for the chosen backend.

    Raises:
        ValueError: If ``model`` matches neither supported backend.
    """
    for provider in ("GPT", "OLLAMA"):
        if model == provider:
            return create_brochure(company_name, url, provider)
    raise ValueError("Unknown model")

In [10]:
# Gradio UI: Accepts company name, URL, and LLM model, returns brochure.
# The three inputs are listed in the same order as the parameters of
# brochure(company_name, url, model).
view = gr.Interface(
    fn=brochure,
    inputs=[
        gr.Textbox(label="Company name:"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
        # No default value is set here; brochure() raises ValueError("Unknown model")
        # for anything other than these two choices.
        # NOTE(review): confirm what Gradio passes when nothing is selected.
        gr.Dropdown(["GPT", "OLLAMA"], label="Select model")],
    # The returned brochure text is rendered as markdown.
    outputs=[gr.Markdown(label="Brochure:")],
    flagging_mode="never"
)
# Starts the local web server when this cell runs (see the captured output
# below for the address it reported).
view.launch()

* Running on local URL:  http://127.0.0.1:7874
* To create a public link, set `share=True` in `launch()`.


