# Connecting to Nebius LLM via API
Set up the connection to the Nebius API.

In [10]:
import glob
import os
import time

import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load the Nebius API key and build the OpenAI-compatible client.

# Directory that holds the key file. Override with the NEBIUS_KEYS_DIR environment
# variable instead of editing the notebook when running on another machine.
NEBIUS_KEYS_DIR = os.environ.get("NEBIUS_KEYS_DIR", "C:\\Users\\vital\\PythonStuff\\keys")
NEBIUS_KEY_FILE = os.path.join(NEBIUS_KEYS_DIR, "nebius_api_key")

# Prefer a key that is already exported in the environment; otherwise read it from the
# key file. The file is opened by full path so the kernel's working directory is left
# untouched (no os.chdir side effect for later cells).
nebius_api_key = os.environ.get("NEBIUS_API_KEY", "").strip()
if not nebius_api_key:
    with open(NEBIUS_KEY_FILE, "r", encoding="utf-8") as file:
        nebius_api_key = file.read().strip()

if not nebius_api_key:
    raise RuntimeError(
        f"Nebius API key is empty: set NEBIUS_API_KEY or put the key in {NEBIUS_KEY_FILE}"
    )

os.environ["NEBIUS_API_KEY"] = nebius_api_key

# Nebius uses the same OpenAI() class, but pointed at the Nebius endpoint
nebius_client = OpenAI(
    base_url="https://api.studio.nebius.ai/v1/",
    api_key=nebius_api_key,
)

# Default model used by the summarization helpers
llama_8b_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"


# A Class to represent a Webpage

In [21]:
class Website:
    """A web page rendered in headless Chrome and reduced to a title and plain text.

    Constructing an instance scrapes ``url`` immediately. After construction:

    * ``url``   - the address that was requested.
    * ``title`` - the browser's page title, or ``"Error occurred"`` if scraping failed.
    * ``text``  - up to ``MAX_LINES`` cleaned lines of page text, or
      ``"Could not scrape website content"`` if scraping failed.

    Scraping never raises: any exception is printed and recorded in the two
    fallback strings above.
    """

    PAGE_LOAD_TIMEOUT = 30      # seconds allowed for driver.get() to finish
    SETTLE_SECONDS = 5          # fixed pause after load so client-side rendering can run
    CONTENT_WAIT_SECONDS = 10   # per-tag explicit wait for <main>, then <body>
    MAX_LINES = 200             # number of cleaned lines kept in self.text
    MIN_LINE_CHARS = 3          # shorter lines are dropped as noise
    NOISE_TAGS = ["script", "style", "img", "input", "button", "nav", "footer", "header"]
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    def __init__(self, url):
        self.url = url
        self.title = ""
        self.text = ""
        self.scrape()

    def scrape(self):
        """Load ``self.url`` in headless Chrome and populate ``title`` and ``text``.

        The browser is always shut down, even when loading or waiting fails, so a
        failed scrape does not leave a Chrome process behind.
        """
        try:
            driver = webdriver.Chrome(options=self._build_chrome_options())
            try:
                driver.set_page_load_timeout(self.PAGE_LOAD_TIMEOUT)

                print(f"🔍 Loading: {self.url}")
                driver.get(self.url)

                # Give scripts a moment to run, then wait for a content container
                time.sleep(self.SETTLE_SECONDS)
                self._wait_for_content(driver)

                self.title = driver.title
                page_source = driver.page_source
            finally:
                # Release the browser before parsing, and on every failure path
                driver.quit()

            print(f"✅ Page loaded: {self.title}")

            self.text = self._extract_text(page_source)

            print(f"📄 Extracted {len(self.text)} characters")

        except Exception as e:
            print(f"❌ Error occurred: {e}")
            self.title = "Error occurred"
            self.text = "Could not scrape website content"

    def _build_chrome_options(self):
        """Return headless Chrome options, pointing at a local Chrome binary if one is found."""
        chrome_options = Options()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            f"--user-agent={self.USER_AGENT}",
        ):
            chrome_options.add_argument(argument)

        # Common Windows install locations; when none exists Selenium's own lookup is used
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
            r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(os.getenv('USERNAME')),
        ]
        chrome_binary = next((path for path in chrome_paths if os.path.exists(path)), None)
        if chrome_binary:
            chrome_options.binary_location = chrome_binary
        return chrome_options

    def _wait_for_content(self, driver):
        """Best-effort wait for a <main> element, falling back to <body>; timeouts are ignored."""
        for tag_name in ("main", "body"):
            try:
                WebDriverWait(driver, self.CONTENT_WAIT_SECONDS).until(
                    EC.presence_of_element_located((By.TAG_NAME, tag_name))
                )
                return
            except TimeoutException:
                continue  # Try the next tag, or carry on with whatever has loaded

    def _extract_text(self, page_source):
        """Strip non-content tags from the HTML and return the cleaned, truncated text."""
        soup = BeautifulSoup(page_source, 'html.parser')

        for element in soup(self.NOISE_TAGS):
            element.decompose()

        # '.content' is a CSS class selector, so it needs select_one(); find() would
        # look for a tag literally named ".content" and never match.
        container = (soup.find('main') or soup.find('article')
                     or soup.select_one('.content') or soup.find('body'))
        source = container if container else soup
        raw_text = source.get_text(separator="\n", strip=True)

        stripped = (line.strip() for line in raw_text.split('\n'))
        lines = [line for line in stripped if len(line) >= self.MIN_LINE_CHARS]
        return '\n'.join(lines[:self.MAX_LINES])


In [23]:
# Optional manual check of the scraper: uncomment to load one page and inspect
# the title and extracted text before sending anything to the LLM.
#ed = Website("https://openai.com")
#print(ed.title)
#print(ed.text)

In [12]:
# Function to build the user prompt for LLM
def user_prompt_for(website):
    """Return the user message for the LLM: page title, summary instructions, then page text.

    ``website`` is any object exposing ``title`` and ``text`` string attributes.
    """
    heading = f"You are looking at a website titled {website.title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + instructions + website.text

In [24]:
# System message sent with every summarization request unless the caller overrides it.
defineSystemPrompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [25]:
def answer_with_llm(prompt: str,
                    system_prompt=defineSystemPrompt,
                    max_tokens=512,
                    client=nebius_client,
                    model=llama_8b_model,
                    prettify=True,
                    temperature=None) -> str:
    """Send ``prompt`` to a chat-completions endpoint and return the first reply's text.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message; when falsy, no system message is sent.
        max_tokens: Forwarded as ``max_tokens`` to the completion call.
        client: Object exposing ``chat.completions.create``.
        model: Model identifier forwarded to the completion call.
        prettify: Accepted for compatibility; not used by the current implementation.
        temperature: Forwarded unchanged as ``temperature`` to the completion call.

    Returns:
        ``message.content`` of the first choice in the response.
    """
    # The user turn is always present; the system turn, if any, goes in front of it.
    conversation = [{"role": "user", "content": prompt}]
    if system_prompt:
        conversation.insert(0, {"role": "system", "content": system_prompt})

    response = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# The function that helps to summarize the website by calling the LLM

In [5]:
def summarize(url):
    """Scrape ``url``, build the user prompt from the page, and return the LLM's reply."""
    page = Website(url)
    return answer_with_llm(user_prompt_for(page))

# Function to display the summary nicely in Jupyter
This relies on IPython's rich display, so it does not work in plain Python scripts.

In [6]:
def display_summary(url):
    """Summarize ``url`` and render the result as Markdown in the notebook output."""
    display(Markdown(summarize(url)))

# Invoke action to summarize the website

In [7]:
# Scrape the page and render the LLM's Markdown summary as this cell's output.
display_summary("https://edwarddonner.com")

**Summary of Edward Donner's Website**
===============

Edward Donner is a co-founder and CTO of Nebula.io, a company that applies AI to help people discover their potential and pursue their reason for being. He is a code enthusiast and enjoys experimenting with Large Language Models (LLMs).

**News and Announcements**
-------------------------

* **Upcoming Events**
	+ AI in Production: Gen AI and Agentic AI on AWS at scale (September 15, 2025)
	+ 2025 AI Executive Briefing (April 21, 2025)
* **Courses and Training**
	+ Connecting my courses – become an LLM expert and leader (May 28, 2025)
	+ The Complete Agentic AI Engineering Course (May 18, 2025)

**About the Author**
---------------------

Edward Donner is a former founder and CEO of AI startup untapt, acquired in 2021. He is also a DJ and amateur electronic music producer.

In [8]:
# Same pipeline on a second site (the Hugging Face organization page).
display_summary("https://huggingface.co/huggingface")

**Website Summary**
===============

Hugging Face is a community-driven platform that aims to democratize good machine learning, one commit at a time. The website provides a hub for developers, researchers, and enthusiasts to share, collaborate, and learn from each other.

**Recent Activity**
-------------------

* A new Space has been updated by mishig about 4 hours ago.
* A new activity has been created by yjernite about 21 hours ago.
* A paper has been authored by burtenshaw 2 days ago, titled "A Cartography of Open Collaboration in Open Source AI: Mapping Practices, Motivations, and Governance in 14 Open Large Language Model Projects".

**Community**
-------------

* The Hugging Face community has 62,207 members and is growing rapidly.
* The community is building the future of AI and machine learning together.

**Models and Datasets**
------------------------

* The website provides a collection of pre-trained models, including DistilBERT, which has been updated on May 6, 2024.
* There are also 34 datasets available, including documentation images, transformer metadata, and policy documents.

**Spaces**
------------

* Spaces are interactive environments where users can experiment with models and datasets.
* There are 32 Spaces available, including the Inference Playground, AI Deadlines, and Number Tokenization Blog.

**News and Announcements**
---------------------------

* Organizations can now publish blog articles on the Hugging Face platform.
* The website has experienced significant growth in the past few months, with a 171% increase in team members and a 158% increase in organization card views.

In [14]:
# NOTE(review): the recorded output for this cell summarizes a page titled
# "Just a moment..." that asks for JavaScript and cookies, i.e. the scraper
# received an interstitial page rather than the real openai.com content.
display_summary("https://openai.com")

**Summary of Just a moment...**

This website appears to be a placeholder or a temporary page indicating that the user is waiting for a response from openai.com. The content is minimal, consisting of a single sentence advising the user to enable JavaScript and cookies to continue. There are no news or announcements on this website.