In [1]:
import json
import os
import time
from urllib.parse import urljoin

import gradio as gr # For Gradio
import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Connecting to Nebius LLM via API
Set up the connection to the Nebius API.

In [2]:
# Locate the Nebius API key without changing the notebook's working directory.
# NEBIUS_KEY_DIR may point at the folder that holds the "nebius_api_key" file;
# it defaults to the original location so existing setups keep working.
nebius_key_dir = os.environ.get("NEBIUS_KEY_DIR", "C:\\Users\\vital\\PythonStuff\\keys")
nebius_key_path = os.path.join(nebius_key_dir, "nebius_api_key")

if os.path.isfile(nebius_key_path):
    with open(nebius_key_path, "r") as file:
        nebius_api_key = file.read().strip()
else:
    # No key file on this machine: fall back to a key already exported in the environment.
    nebius_api_key = os.environ.get("NEBIUS_API_KEY", "").strip()

if not nebius_api_key:
    raise RuntimeError(
        f"Nebius API key not found: expected a key file at {nebius_key_path!r} "
        "or a non-empty NEBIUS_API_KEY environment variable."
    )

os.environ["NEBIUS_API_KEY"] = nebius_api_key

# Nebius uses the same OpenAI() class, but with additional details
nebius_client = OpenAI(
    base_url="https://api.studio.nebius.ai/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Model identifiers passed as the `model` argument of chat completion requests.
llama_8b_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
llama_70b_model = "meta-llama/Llama-3.3-70B-Instruct"


# A Class to represent a Webpage

In [3]:
class Website:
    """A web page rendered with headless Chrome and reduced to title, text and links.

    Constructing an instance immediately scrapes ``url`` (see ``scrape``). If
    anything goes wrong the instance is still created, with a placeholder title
    and text and an empty link list.
    """

    # Tags removed from the parsed page before the readable text is extracted.
    _NOISE_TAGS = ["script", "style", "img", "input", "button", "nav", "footer", "header"]
    # href prefixes that are dropped from the collected links.
    _IGNORED_LINK_PREFIXES = ('javascript:', 'mailto:')
    # Upper bound on the number of cleaned text lines kept per page.
    _MAX_TEXT_LINES = 200

    def __init__(self, url):
        self.url = url      # address to load
        self.title = ""     # page title reported by the browser
        self.text = ""      # cleaned text of the main content
        self.links = []     # href values found on the page
        self.scrape()

    def _chrome_options(self):
        """Build headless Chrome options, using a local Chrome binary if one is found."""
        chrome_options = Options()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        # Candidate install locations; the first one that exists is used.
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(os.getenv('USERNAME')),
        ]
        for path in chrome_paths:
            if os.path.exists(path):
                chrome_options.binary_location = path
                break
        return chrome_options

    def _render_page(self):
        """Load ``self.url`` in Chrome and return ``(title, page_source)``.

        The browser is always closed, including when loading fails, so a failed
        scrape does not leave a Chrome process behind.
        """
        driver = webdriver.Chrome(options=self._chrome_options())
        try:
            driver.set_page_load_timeout(30)
            driver.get(self.url)

            # Fixed pause for the page to finish loading.
            time.sleep(5)

            # Best effort: wait for <main>, then fall back to <body>; carry on
            # with whatever has been rendered if neither shows up in time.
            for tag_name in ("main", "body"):
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, tag_name))
                    )
                    break
                except Exception:
                    continue

            return driver.title, driver.page_source
        finally:
            driver.quit()

    def scrape(self):
        """Populate ``title``, ``text`` and ``links`` from the rendered page.

        Any exception is caught and reported with ``print``; the instance is then
        left with a placeholder title/text and an empty link list.
        """
        try:
            self.title, page_source = self._render_page()
            print(f"✅ Page loaded: {self.title}")

            soup = BeautifulSoup(page_source, 'html.parser')

            # Collect links BEFORE stripping nav/header/footer: decompose() deletes
            # the anchors inside those elements, and the link list is meant to
            # cover the whole page. dict.fromkeys drops duplicates, keeping order.
            hrefs = [a.get('href') for a in soup.find_all('a', href=True)]
            self.links = list(dict.fromkeys(
                href for href in hrefs
                if href and not href.startswith(self._IGNORED_LINK_PREFIXES)
            ))

            # Remove elements that do not contribute readable content.
            for element in soup(self._NOISE_TAGS):
                element.decompose()

            # Prefer the most specific content container available. The ".content"
            # class needs a CSS selector; find() would treat it as a tag name.
            main = (
                soup.find('main')
                or soup.find('article')
                or soup.select_one('.content')
                or soup.find('body')
            )
            raw_text = (main or soup).get_text(separator="\n", strip=True)

            # Keep lines longer than two characters, capped at _MAX_TEXT_LINES.
            stripped_lines = (line.strip() for line in raw_text.split('\n'))
            lines = [line for line in stripped_lines if len(line) > 2]
            self.text = '\n'.join(lines[:self._MAX_TEXT_LINES])

        except Exception as e:
            print(f"❌ Error occurred: {e}")
            self.title = "Error occurred"
            self.text = "Could not scrape website content"
            self.links = []

    def get_contents(self):
        """Return the title and text formatted as a block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# Build the system prompt for URL links

In [4]:
# System prompt for the link-selection request: describes the task, demands a
# JSON-only reply, and ends with an example of the expected structure.
defineSystemPrompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "Your response must be valid JSON only.\n"
    "Do not include any explanation, text, or Markdown code fences.\n"
    "You MUST respond in JSON as in this example:"
    "\n"
    "{\n"
    '    "links": [\n'
    '        {"type": "about page", "url": "https://full.url/goes/here/about"},\n'
    '        {"type": "careers page", "url": "https://another.full.url/careers"}\n'
    "    ]\n"
    "}\n"
)

# Build the user prompt for URL links

In [5]:
# Function to build the user prompt for LLM
def user_prompt_for_links(website):
    """Compose the user message asking the LLM to pick brochure-relevant links.

    `website` must expose `url` (a string) and `links` (an iterable of strings);
    the links are listed one per line at the end of the prompt.
    """
    sections = [
        f"Here is the list of links on the website of {website.url} - ",
        "\nplease decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format.",
        "\nDo not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(sections)

# Function to chat with LLM model

In [6]:
def answer_with_llm(prompt: str,
                    system_prompt="You are a good assistant",
                    max_tokens=512,
                    client=nebius_client,
                    model=llama_8b_model,
                    prettify=True,
                    stream=True,
                    temperature=None) -> str:
    """Send a single prompt to a chat model and return the complete reply text.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Optional system message; omitted from the request when falsy.
        max_tokens: Upper bound on the length of the generated reply.
        client: OpenAI-compatible client used for the request.
        model: Model identifier passed to the API.
        prettify: Currently unused; kept so existing callers keep working.
        stream: Currently unused; the request is always made without streaming.
        temperature: Sampling temperature forwarded to the API as given.

    Returns:
        The reply text, or "" when the API returns no choices or no text content.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature
    )

    if not completion.choices:
        return ""
    # message.content may be None; honour the `-> str` contract for callers.
    return completion.choices[0].message.content or ""

# The function that helps to get relevant links by calling the LLM

In [7]:
def _parse_links_response(raw_response):
    """Decode an LLM reply into ``{"links": [...]}``; return None if it is unusable."""
    if not isinstance(raw_response, str):
        return None
    # The reply may be wrapped in prose or ```json fences despite the
    # instructions, so only the outermost {...} span is decoded.
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        parsed = json.loads(raw_response[start:end + 1])
    except json.JSONDecodeError:
        return None
    # Callers index result["links"], so anything else is treated as unusable.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
        return None
    return parsed


def getLinks(url):
    """Scrape ``url`` and ask the LLM which of its links belong in a brochure.

    Returns:
        A dict whose "links" key holds the list produced by the LLM. When the
        reply cannot be decoded into that shape, a message is printed and
        ``{"links": []}`` is returned.
    """
    website = Website(url)
    prompt = user_prompt_for_links(website)
    rawResponse = answer_with_llm(prompt, defineSystemPrompt)

    parsed = _parse_links_response(rawResponse)
    if parsed is None:
        print("❌ LLM did not return valid JSON")
        return {"links": []}
    return parsed

# Call getLinks to view the relevant links gathered from the website, in JSON format

In [8]:
#getLinks("https://www.citigroup.com/global/businesses/services")

# Build the brochure

In [9]:
def get_all_details(url):
    """Collect the text of a landing page and of the pages the LLM considers relevant.

    The landing page is scraped with ``Website``; ``getLinks(url)`` then returns
    the links chosen by the LLM, and each of those pages is scraped in turn.
    Note that ``getLinks`` builds its own ``Website`` for ``url``, so the landing
    page is loaded twice.

    Link entries that are not dicts, or that have no string "url", are skipped.
    Relative URLs are resolved against ``url`` before being scraped.

    Returns:
        One string: "Landing page:" followed by the landing page contents, then
        each linked page's type and contents.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = getLinks(url)

    print("Found links:", links)

    entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(entries, list):
        entries = []

    for link in entries:
        if not isinstance(link, dict):
            continue
        link_url = link.get("url")
        if not isinstance(link_url, str) or not link_url.strip():
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        page_url = urljoin(url, link_url.strip())
        result += f"\n\n{link.get('type', 'page')}\n"
        result += Website(page_url).get_contents()
    return result

#print(get_all_details("https://huggingface.co"))

# Using Gradio to generate Brochure

In [15]:
def stream_brochure(company_name, url, model_choice):
    """Generate a markdown brochure for a company, yielding it as it is written.

    Args:
        company_name: Name used in the prompt and in the progress message.
        url: Landing page URL passed to ``get_all_details``.
        model_choice: A display label ("Llama 8B", "Llama 70B", "Gemma 9B",
            "Qwen 72B"), one of the notebook variable names "llama_8b_model" /
            "llama_70b_model", or a model id from the map below. Anything else
            falls back to the Llama 8B model.

    Yields:
        First a progress heading, then the brochure text accumulated so far after
        each streamed token. If anything raises, a single error message is
        yielded instead.
    """
    yield f"### Generating brochure for {company_name} using {model_choice}...\n"
    try:
        # Step 1: Scrape and collect all website content
        content = get_all_details(url)

        # Step 2: Select model based on user input
        default_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        model_map = {
            "Llama 8B": default_model,
            "Llama 70B": "meta-llama/Llama-3.3-70B-Instruct",
            "Gemma 9B": "google/gemma-2-9b-it-fast",
            "Qwen 72B": "Qwen/Qwen2.5-72B-Instruct",
            # Variable-name style values, so a dropdown that offers these strings
            # does not silently fall back to the default model.
            "llama_8b_model": default_model,
            "llama_70b_model": "meta-llama/Llama-3.3-70B-Instruct",
        }
        if model_choice in model_map.values():
            selected_model = model_choice
        else:
            selected_model = model_map.get(model_choice, default_model)

        # Step 3: Build the prompts
        system_prompt = (
            "You are an assistant that analyzes several company pages and writes a short brochure in markdown, "
            "highlighting its business focus, culture, customers, and career opportunities."
        )

        # Only the first 5000 characters of the scraped content are sent.
        user_prompt = (
            f"You are looking at {company_name}.\n"
            f"Use the following website information to create a short, professional brochure in markdown.\n"
            f"{content[:5000]}"
        )

        # Step 4: Stream LLM output token-by-token
        stream_resp = nebius_client.chat.completions.create(
            model=selected_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=1024,
            stream=True
        )

        full_text = ""
        for chunk in stream_resp:
            # Some chunks carry no text; skip them.
            if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                full_text += chunk.choices[0].delta.content
                yield full_text  # live update in Gradio

        yield full_text

    except Exception as e:
        yield f"❌ Error: {e}"


In [16]:
# Gradio UI: company name, landing page URL and model choice in; streamed markdown out.
# The dropdown values are the labels that stream_brochure looks up in its model_map
# ("Llama 8B", "Llama 70B"); any other value falls back to the 8B model there.
view = gr.Interface(
    fn=stream_brochure,
    inputs=[
        gr.Textbox(label="Company name:"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
        gr.Dropdown(["Llama 8B", "Llama 70B"], value="Llama 8B", label="Select model")],
    outputs=[gr.Markdown(label="Brochure:")],
    flagging_mode="never"
)
view.launch(inbrowser=True)
#llama_8b_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#llama_70b_model ="meta-llama/Llama-3.3-70B-Instruct"

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


