In [None]:
!pip install requests beautifulsoup4 llama_stack llama-stack-client selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time

# --- Configure Selenium (headless Chrome) ---
# The Chrome binary and chromedriver are referenced by fixed absolute paths,
# so both must already exist at these locations in the runtime image.
options = Options()
options.binary_location = "/usr/bin/google-chrome-stable"  # point to Chrome
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")

driver = webdriver.Chrome(
    service=Service("/usr/local/bin/chromedriver"),  # baked-in driver
    options=options
)

# One dict per exhibitor row, accumulated across all result pages.
exhibitors = []

# try/finally guarantees the browser is shut down even if a page load,
# parse step, or click raises part-way through the crawl.
try:
    driver.get("https://tip25.myexpoonline.com/exhibitors")

    while True:
        # Let JS render
        time.sleep(2)

        # Parse current page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.select_one("table.table.table-sm")
        if table is None:
            # No exhibitor table on this page: nothing (more) to scrape.
            break

        for row in table.select("tbody tr"):
            cols = row.find_all("td")
            # Rows with fewer than 7 cells cannot hold all fields read below.
            if len(cols) < 7:
                continue

            # Column 2: company name + link
            company_link = cols[1].find("a")
            company_name = company_link.get_text(strip=True) if company_link else None
            # .get() tolerates an anchor without an href instead of raising KeyError.
            company_url = company_link.get("href") if company_link else None

            # Column 3: booth
            booth = cols[2].get_text(strip=True)

            # Column 4: first-time exhibitor (flagged by an icon's alt text)
            img = cols[3].find("img")
            first_time = img is not None and img.get("alt") == "First Time Exhibitor"

            # Column 5: AFCEA member (flagged by an icon's alt text)
            img = cols[4].find("img")
            afcea_member = img is not None and img.get("alt") == "AFCEA Member"

            # Column 7: press release (flagged by a span titled "Digital Listing")
            span = cols[6].find("span", {"title": "Digital Listing"})
            press_release = span is not None

            exhibitors.append({
                "company": company_name,
                "url": company_url,
                "booth": booth,
                "first_time_exhibitor": first_time,
                "afcea_member": afcea_member,
                "press_release": press_release
            })

        # Try to click the "next" button
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "a.pager-right-next")
            # If button is disabled, break
            if "disabled" in next_btn.get_attribute("class"):
                break
            next_btn.click()
        except NoSuchElementException:
            # No pager link on the page: treat this as the last page.
            break
finally:
    driver.quit()

# Convert to DataFrame for Jupyter exploration
df = pd.DataFrame(exhibitors)
print(f"Scraped {len(df)} exhibitors")
df.head()

In [None]:
from llama_stack_client import LlamaStackClient

# Client used by every later cell to talk to the Llama Stack server at this base URL.
client = LlamaStackClient(
    base_url="http://lsd-llama-milvus-service.rag.svc.cluster.local:8321",
)

In [None]:

# Fetch all registered models
models = client.models.list()

# Use the first model whose type is "embedding". Fail with a descriptive error
# instead of a bare StopIteration when the server has none registered.
embedding_model = next((m for m in models if m.model_type == "embedding"), None)
if embedding_model is None:
    raise RuntimeError(
        "No model with model_type == 'embedding' is registered on the Llama Stack server"
    )

embedding_model_id = embedding_model.identifier
# The vector dimension is read from the model's metadata and is passed on
# when the vector DB is registered.
embedding_dimension = embedding_model.metadata["embedding_dimension"]

print(f"Using embedding model: {embedding_model_id}")

In [None]:
# Vector store identifiers: the provider to register with, and the ID under
# which the store is registered.
provider_id = "milvus"
vector_db_id = "my-milvus-db"

# Register the vector DB using the embedding model and dimension selected
# earlier; the call's return value is intentionally discarded.
_ = client.vector_dbs.register(
    provider_id=provider_id,
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
)
print(f"Registered vector DB: {vector_db_id}")

In [None]:
def exhibitor_to_text(exhibitor):
    """Render one exhibitor record as newline-separated, searchable text.

    The company and booth lines are always emitted. A line is added for each
    truthy flag (first-time exhibitor, AFCEA member, press release), followed
    by a URL line when the record has a non-empty URL.

    Args:
        exhibitor: Mapping with the keys ``company``, ``booth``,
            ``first_time_exhibitor``, ``afcea_member``, ``press_release``
            and ``url``.

    Returns:
        The assembled lines joined with ``"\\n"``.
    """
    lines = [
        f"Company: {exhibitor['company']}",
        f"Booth: {exhibitor['booth']}",
    ]

    # Each flag contributes a fixed label only when it is set.
    flag_labels = (
        ("first_time_exhibitor", "First Time Exhibitor at TIP25"),
        ("afcea_member", "AFCEA Member"),
        ("press_release", "Has Digital Listing/Press Release"),
    )
    lines.extend(label for key, label in flag_labels if exhibitor[key])

    url = exhibitor['url']
    if url:
        lines.append(f"URL: {url}")

    return "\n".join(lines)

In [None]:
from llama_stack_client import RAGDocument

# Build one RAGDocument per scraped exhibitor. The document text comes from
# exhibitor_to_text(); the structured fields are repeated in the metadata.
documents = []
for doc_index, exhibitor in enumerate(exhibitors, start=1):
    text = exhibitor_to_text(exhibitor)
    documents.append(
        RAGDocument(
            document_id=f"exhibitor-{doc_index}",
            content=text,
            mime_type="text/plain",
            metadata={
                "source": "https://tip25.myexpoonline.com/exhibitors",
                "type": "exhibitor",
                "company": exhibitor["company"],
                "booth": exhibitor["booth"],
                "first_time_exhibitor": exhibitor["first_time_exhibitor"],
                "afcea_member": exhibitor["afcea_member"],
                "press_release": exhibitor["press_release"],
                # The scraper always sets "url" (to None when a row has no
                # link), so a .get() default never applies; `or ""` is what
                # actually turns a missing link into an empty string.
                "url": exhibitor.get("url") or ""
            }
        )
    )

# Insert in fixed-size batches rather than sending all documents in one call.
batch_size = 10
for batch_start in range(0, len(documents), batch_size):
    batch = documents[batch_start:batch_start + batch_size]
    client.tool_runtime.rag_tool.insert(
        documents=batch,
        # Reuse the ID from the registration cell so the two cannot drift apart.
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=200,
        timeout=60
    )
print(f"Exhibitors ingested successfully: {len(documents)} exhibitors")