In [1]:
import json
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Connecting to Nebius LLM via API
Set up the connection to the Nebius API.

In [2]:
# Locate the Nebius API key.
# The key file is read by path instead of os.chdir()-ing into its directory, so
# the kernel's working directory is no longer changed for every later cell.
# The directory can be overridden with the NEBIUS_KEY_DIR environment variable;
# the default is the original location.
NEBIUS_KEY_DIR = os.environ.get("NEBIUS_KEY_DIR", "C:\\Users\\vital\\PythonStuff\\keys")
NEBIUS_KEY_FILE = os.path.join(NEBIUS_KEY_DIR, "nebius_api_key")

# Working directory of the kernel (kept as a module-level name for later cells).
cwd = os.getcwd()

# An already-exported NEBIUS_API_KEY takes precedence; otherwise fall back to the key file.
nebius_api_key = os.environ.get("NEBIUS_API_KEY", "").strip()
if not nebius_api_key:
    with open(NEBIUS_KEY_FILE, "r", encoding="utf-8") as file:
        nebius_api_key = file.read().strip()

# Fail here with a clear message rather than later with an opaque authentication error.
if not nebius_api_key:
    raise RuntimeError(f"Nebius API key is empty; check {NEBIUS_KEY_FILE} or set NEBIUS_API_KEY")

os.environ["NEBIUS_API_KEY"] = nebius_api_key

# Nebius uses the same OpenAI() class, but with additional details
nebius_client = OpenAI(
    base_url="https://api.studio.nebius.ai/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Model identifiers passed as `model=` to the chat completion endpoint.
llama_8b_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
llama_70b_model = "meta-llama/Llama-3.3-70B-Instruct"


# A Class to represent a Webpage

In [3]:
class Website:
    """A web page rendered with headless Chrome and reduced to title, text and links.

    Constructing an instance scrapes the page immediately. Scraping never
    raises: on any failure the instance is left in a placeholder state
    (see ``scrape``).

    Attributes:
        url: The address passed to the constructor.
        title: The browser-reported page title, or "Error occurred" on failure.
        text: Up to 200 cleaned lines of page text, or an error placeholder.
        links: ``href`` values found in the cleaned page (may be relative).
    """

    def __init__(self, url):
        self.url = url
        self.title = ""
        self.text = ""
        self.links = []   # hyperlinks found on the page
        self.scrape()

    def scrape(self):
        """Load ``self.url`` in headless Chrome and populate title, text and links.

        Any exception is caught and reported with ``print``; in that case
        ``title`` is set to "Error occurred", ``text`` to
        "Could not scrape website content" and ``links`` to ``[]``.
        """
        try:
            # Chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

            # Use the first Chrome binary found at a known Windows location;
            # if none exists, Selenium is left to locate the browser itself.
            chrome_paths = [
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(os.getenv('USERNAME')),
            ]
            chrome_binary = next((path for path in chrome_paths if os.path.exists(path)), None)
            if chrome_binary:
                chrome_options.binary_location = chrome_binary

            # Create driver
            driver = webdriver.Chrome(options=chrome_options)
            try:
                driver.set_page_load_timeout(30)
                driver.get(self.url)

                # Give client-side rendering a moment before inspecting the DOM
                time.sleep(5)

                # Best effort: wait for <main>, then fall back to <body>.
                # Failures here are deliberately ignored so the page is still parsed.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "main"))
                    )
                except Exception:
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.TAG_NAME, "body"))
                        )
                    except Exception:
                        pass  # Continue anyway

                # Get title and page source
                self.title = driver.title
                page_source = driver.page_source
            finally:
                # Always release the browser process, including when loading fails
                driver.quit()

            print(f"✅ Page loaded: {self.title}")

            # Parse with BeautifulSoup
            soup = BeautifulSoup(page_source, 'html.parser')

            # Remove unwanted elements
            for element in soup(["script", "style", "img", "input", "button", "nav", "footer", "header"]):
                element.decompose()

            # Get main content; ".content" is a CSS class selector, so it needs
            # select_one() -- find() would look for a tag literally named ".content".
            main = soup.find('main') or soup.find('article') or soup.select_one('.content') or soup.find('body')
            if main:
                self.text = main.get_text(separator="\n", strip=True)
            else:
                self.text = soup.get_text(separator="\n", strip=True)

            # Clean up text: drop blank lines and fragments of 2 characters or fewer
            lines = [line.strip() for line in self.text.split('\n') if line.strip() and len(line.strip()) > 2]
            self.text = '\n'.join(lines[:200])  # Limit to first 200 lines

            # Extract hyperlinks from the cleaned tree.
            # NOTE(review): links inside the removed nav/header/footer elements
            # are not collected because extraction runs after decompose().
            links = [a.get('href') for a in soup.find_all('a', href=True)]
            # Filter out empty, javascript:, and mailto: links
            self.links = [link for link in links if link and not link.startswith(('javascript:', 'mailto:'))]

        except Exception as e:
            print(f"❌ Error occurred: {e}")
            self.title = "Error occurred"
            self.text = "Could not scrape website content"
            self.links = []

    def get_contents(self):
        """Return the page title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
#web = Website("https://www.citibank.com.sg/")
#print(web.title)
#print(web.text)
#print(web.links)

# Build the system prompt for URL links

In [5]:
# System prompt for the link-selection request: explains the task, demands a
# JSON-only reply, and ends with an example of the expected JSON shape.
defineSystemPrompt = "".join(
    [
        "You are provided with a list of links found on a webpage. ",
        "You are able to decide which of the links would be most relevant to include in a brochure about the company, ",
        "such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
        "Your response must be valid JSON only.\n",
        "Do not include any explanation, text, or Markdown code fences.\n",
        "You MUST respond in JSON as in this example:",
        """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
""",
    ]
)

In [6]:
#print(defineSystemPrompt)

# Build the user prompt for URL links

In [7]:
# Function to build the user prompt for LLM
def user_prompt_for_links(website):
    """Compose the user message asking the LLM to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` and ``links`` (an iterable of strings).

    Returns:
        The prompt text: an intro naming the site, the selection instructions,
        and the links listed one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "\nplease decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format."
        "\nDo not include Terms of Service, Privacy, email links.\n"
    )
    link_listing = "Links (some might be relative links):\n" + "\n".join(website.links)
    return intro + instructions + link_listing

In [8]:
#print(user_prompt_for_links(web))

# Function to chat with LLM model

In [9]:
def answer_with_llm(prompt: str,
                    system_prompt="You are a good assistant",
                    max_tokens=512,
                    client=nebius_client,
                    model=llama_8b_model,
                    prettify=True,
                    stream=True,
                    temperature=None) -> str:
    """Send a single-turn chat request and return the reply text.

    Args:
        prompt: The user message.
        system_prompt: Optional system message; omitted from the request when falsy.
        max_tokens: Upper bound on generated tokens, forwarded to the API.
        client: OpenAI-compatible client used to issue the request.
        model: Model identifier forwarded to the API.
        prettify: Accepted for backward compatibility; currently unused.
        stream: Accepted for backward compatibility; currently unused
            (the request is always non-streaming).
        temperature: Sampling temperature; only forwarded when not ``None``.

    Returns:
        The content of the first choice, or ``""`` when the API returns no
        content, so callers always receive a string.
    """
    messages = []

    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    messages.append({"role": "user", "content": prompt})

    request_kwargs = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
    }
    # Leave temperature out entirely when unset so the server default applies,
    # instead of sending an explicit null.
    if temperature is not None:
        request_kwargs["temperature"] = temperature

    completion = client.chat.completions.create(**request_kwargs)

    # message.content is optional in the API response; normalise it to a string.
    content = completion.choices[0].message.content
    return content if content is not None else ""

# The function that helps to get relevant links by calling the LLM

In [10]:
def getLinks(url):
    """Scrape ``url`` and ask the LLM which of its links belong in a brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        A dict with a ``"links"`` list, as parsed from the LLM reply. When the
        reply is empty, is not valid JSON, or lacks a ``"links"`` list, a
        message is printed and ``{"links": []}`` is returned.
    """
    website = Website(url)
    prompt = user_prompt_for_links(website)
    rawResponse = answer_with_llm(prompt, defineSystemPrompt)

    # Models sometimes wrap the JSON in a Markdown code fence despite the
    # instructions; strip the opening fence line and the closing fence.
    cleaned = (rawResponse or "").strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
        cleaned = cleaned.rsplit("```", 1)[0].strip()

    try:
        # Convert JSON string into a Python dict
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        print("❌ LLM did not return valid JSON")
        return {"links": []}

    # Callers index the result with ["links"], so reject any other shape here.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
        print("❌ LLM JSON is missing a 'links' list")
        return {"links": []}

    return parsed

# Call getLinks to view the relevant links gathered from the website in JSON format

In [11]:
#getLinks("https://www.citigroup.com/global/businesses/services")

# Build the brochure

In [12]:
def get_all_details(url):
    """Collect the landing page and every LLM-selected page into one text blob.

    Args:
        url: Address of the company's landing page.

    Returns:
        A string starting with "Landing page:" and the landing page contents,
        followed by one section per selected link (its type label, then the
        scraped contents of that page).
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    # NOTE(review): getLinks() scrapes the landing page again internally.
    links = getLinks(url)

    print("Found links:", links)

    # Tolerate a malformed result rather than raising KeyError/TypeError.
    entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(entries, list):
        entries = []

    visited = {url}  # the landing page is already included above
    for link in entries:
        if not isinstance(link, dict) or not isinstance(link.get("url"), str) or not link["url"]:
            continue  # skip entries without a usable URL

        # The model may echo relative links; resolve them against the landing page.
        page_url = urljoin(url, link["url"])
        if page_url in visited:
            continue  # avoid launching the browser twice for the same page
        visited.add(page_url)

        result += f"\n\n{link.get('type', 'relevant page')}\n"
        result += Website(page_url).get_contents()
    return result

In [13]:
#print(get_all_details("https://huggingface.co"))

In [14]:
# System prompt for the brochure-writing request.
# Each fragment ends with an explicit space so that sentences are not glued
# together when the pieces are concatenated ("...in markdown. Include...").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = (
#     "You are an assistant that analyzes the contents of several relevant pages from a company website "
#     "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
#     "Include details of company culture, customers and careers/jobs if you have the information."
# )


In [15]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: Name of the company, inserted into the prompt.
        url: Landing page address, passed to ``get_all_details``.
        max_chars: Maximum length of the returned prompt in characters;
            longer prompts are cut off at this length. ``None`` disables
            truncation. Defaults to 5,000.

    Returns:
        The prompt text: a two-line instruction header followed by the
        gathered page contents, truncated to ``max_chars``.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so max_chars=None means "no limit".
    return user_prompt[:max_chars]

In [16]:
#get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [17]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it as Markdown.

    Args:
        company_name: Name of the company, used in the user prompt.
        url: Landing page address the brochure content is gathered from.

    Returns:
        None. The brochure is shown with ``display(Markdown(...))``.
    """
    brochure_prompt = get_brochure_user_prompt(company_name, url)
    brochure_markdown = answer_with_llm(brochure_prompt, system_prompt)
    rendered = Markdown(brochure_markdown)
    display(rendered)

In [19]:
# Build and render the brochure for Citi's Services business page.
create_brochure(
    company_name="Citibank Transaction",
    url="https://www.citigroup.com/global/businesses/services",
)

✅ Page loaded: Citi | Services
✅ Page loaded: Citi | Services
Found links: {'links': [{'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us'}, {'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us/strategy'}, {'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us/leadership'}, {'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us/global-presence'}, {'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us/heritage'}, {'type': 'about page', 'url': 'https://www.citigroup.com/global/about-us/people-engagement-inclusion'}, {'type': 'careers page', 'url': 'https://jobs.citi.com/'}, {'type': 'alumni page', 'url': 'https://alumni.citi.com/'}, {'type': 'contact page', 'url': 'https://www.citigroup.com/global/contact-us'}, {'type': 'investors page', 'url': 'https://www.citigroup.com/global/investors'}, {'type': 'insights page', 'url': 'https://www.citigroup.com/global/insights'}, {'type': 'news page', 'url

Here is a short brochure about Citibank Transaction:

**Welcome to Citibank Transaction**

**About Us**

At Citibank Transaction, we are a global bank that is uniquely positioned to navigate the complexities of the modern world. With a network that spans over 180 countries, we have the expertise and experience to connect the dots for our clients and deliver financial services that enable growth and economic progress.

**Our Mission**

Our mission is to serve as a trusted partner to our clients by responsibly providing financial services that enable growth and economic progress. We are committed to delivering excellence, taking ownership, and succeeding together.

**Our Strengths**

* **Global Presence**: We have a physical presence in 94 markets, allowing us to connect the dots and anticipate change.
* **Financial Strength**: We manage nearly $5 trillion in financial flows, making us a leader in the industry.
* **Client Base**: We serve 19,000 institutional clients, including 85% of Fortune 500 companies.
* **ESG Commitment**: We have committed to $1+ trillion in ESG investments, playing a leading role in solving interconnected societal challenges.

**Our Services**

* **Liquidity Management**: We help our clients move, manage, and invest balances efficiently, while managing risk and optimizing financial outcomes.
* **Banking and International**: We offer a range of banking and international services to meet the needs of our clients.
* **Wealth**: We provide wealth management services to help our clients achieve their financial goals.

**Our Culture**

* **Diversity and Inclusion**: We are committed to creating a workplace that is inclusive and diverse.
* **People, Engagement & Inclusion**: We prioritize the well-being and engagement of our employees.

**Careers**

If you are looking for a challenging and rewarding career, consider joining our team. We offer a range of career opportunities across our global network.

**Investors**

We welcome investors who share our vision for a more sustainable and inclusive financial system. Learn more about our quarterly earnings, events, and presentations.

**Contact Us**

For more information about Citibank Transaction, please visit our website or contact us directly.

We hope this brochure provides a brief overview of our company and services. Thank you for considering Citibank Transaction as your partner in financial services.