In [None]:
from chromadb import PersistentClient
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# On-disk location of the Chroma database and the name of the collection used below
CHROMA_PATH = "./chroma"
COLLECTION_NAME = "data"

# Embedding function backed by the Ollama server at localhost:11434
embeddings = OllamaEmbeddings(base_url="http://localhost:11434", model="qwen3")

# Disk-backed Chroma client rooted at CHROMA_PATH
persistent_client = PersistentClient(path=CHROMA_PATH)

# Raw collection handle, used later for direct ID lookups and metadata updates
collection = persistent_client.get_or_create_collection(name=COLLECTION_NAME)

# LangChain wrapper over the same client and collection, used for inserting documents
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    client=persistent_client,
)

# Add documents to the vector store
def add_documents_to_vector_store(sub_documents, sources=None):
    """Insert ``sub_documents`` into ``vector_store`` under deterministic IDs.

    Each ID is ``<source_type>_<source>_<chunk_number>``, built from the
    document's metadata, so re-adding the same chunk reuses the same ID.

    Args:
        sub_documents: Documents exposing a ``metadata`` mapping with the keys
            ``source_type``, ``source`` and ``chunk_number``.
        sources: Unused. Kept (now optional) so both one- and two-argument
            call sites keep working.

    Raises:
        KeyError: If a document's metadata lacks one of the ID keys.
    """
    ids = [
        f"{doc.metadata['source_type']}_{doc.metadata['source']}_{doc.metadata['chunk_number']}"
        for doc in sub_documents
    ]

    # Nothing to embed or store for an empty batch, so skip the call entirely
    if sub_documents:
        vector_store.add_documents(documents=sub_documents, ids=ids)

    # Report the size of this batch (not the size of the global crawl results)
    print(f"Added {len(sub_documents)} documents to the vector store.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [5]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin
import time

# URL path suffixes that identify non-page assets the crawler should not follow
SKIP_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".pdf", ".zip", ".tar", ".gz", ".mp4", ".webp")

def is_valid(url):
    """Return True when ``url`` is an http(s) page on rc.virginia.edu worth crawling.

    A URL is rejected if its scheme is not http/https, if its network location
    is not exactly ``rc.virginia.edu``, or if its path (compared
    case-insensitively) ends with one of ``SKIP_EXTENSIONS``.
    """
    parts = urlparse(url)
    if parts.scheme not in {"http", "https"}:
        return False
    if parts.netloc != "rc.virginia.edu":
        return False
    return not parts.path.lower().endswith(SKIP_EXTENSIONS)

# Crawl state shared across calls: every URL already requested, and the
# extracted page records keyed by their (post-redirect) URL.
visited = set()
documents = {}

def _article_to_record(article, url):
    """Convert one ``<article>`` element into the record stored in ``documents``.

    Args:
        article: The single ``<article>`` tag found on the page.
        url: Final URL of the page; used to absolutize links and as ``source``.

    Returns:
        A dict with ``text`` (markdown-flavoured plain text) and ``metadata``
        (source_type, source, chunk_number, tags, date_updated).
    """
    # Re-parse the article so the edits below never touch the page-level soup,
    # which the caller still needs intact for link discovery.
    article_soup = BeautifulSoup(str(article), "html.parser")

    # Remove images before text extraction
    for tag in article_soup.find_all("img"):
        tag.decompose()

    # Remove "« Return to [previous page]" link
    return_link = article_soup.find("a", string=re.compile(r"^« Return to"))
    if return_link:
        return_link.decompose()

    metadata_tag = article_soup.find("p", class_="blog-post-meta")

    # Tags come from links inside the metadata paragraph when it exists,
    # otherwise from any link in the article whose href matches ^/tag/*
    if metadata_tag:
        tag_links = metadata_tag.find_all("a")
    else:
        tag_links = article_soup.find_all("a", href=re.compile(r"^/tag/*"))
    tags = [label for label in (a_tag.get_text(strip=True) for a_tag in tag_links) if label]

    date_tag = metadata_tag.find("time") if metadata_tag else None
    # .get() so a <time> element without a datetime attribute yields None
    # instead of raising KeyError and discarding the whole page
    date_updated = date_tag.get("datetime") if date_tag else None

    # Convert links to absolute URLs and convert to markdown format
    for a_tag in article_soup.find_all("a", href=True):
        href = a_tag['href']
        if not href.startswith("http"):
            href = urljoin(url, href)
        a_tag['href'] = href
        a_tag.string = f"[{a_tag.get_text(strip=True)}]({href})"

    # Convert blog-post-title class to markdown
    title_tag = article_soup.find("h2", class_="blog-post-title")
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        title_tag.string = f"# {title_text}\n\n"

    # Convert h1 tags to markdown
    for h1_tag in article_soup.find_all("h1"):
        h1_text = h1_tag.get_text(strip=True)
        h1_tag.string = f"\n\n## {h1_text}\n"

    # Drop the metadata paragraph (tags/date were captured above) from the body text
    if metadata_tag:
        metadata_tag.decompose()

    # Extract and clean text
    text = article_soup.get_text()
    text = re.sub(r"https?:\/\/\S+?\.png", "", text)  # remove .png URLs
    text = re.sub(r"\S+\.png", "", text)              # remove local refs
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[ \t]+", " ", text)

    return {
        "text": text.strip(),
        "metadata": {
            "source_type": "website",
            "source": url,
            "chunk_number": 0,  # Assuming single chunk for simplicity
            "tags": ["website", "blog-post"] + tags,
            "date_updated": date_updated,
        }
    }


def crawl(url):
    """Crawl every page reachable from ``url`` and record its article text.

    Pages are visited depth-first in link order. Each page is requested once;
    URLs (including redirect targets) are tracked in the module-level
    ``visited`` set. Pages containing exactly one ``<article>`` element are
    converted and stored in the module-level ``documents`` dict, keyed by the
    final URL. Only links accepted by ``is_valid`` are followed.

    Traversal uses an explicit stack rather than recursion, so the depth of a
    link chain is not bounded by the interpreter's recursion limit.

    Failures while handling a page are printed and the crawl continues with
    the remaining pages.

    Args:
        url: URL to start crawling from.
    """
    pending = [url]

    while pending:
        page_url = pending.pop()
        if page_url in visited:
            continue
        visited.add(page_url)

        # Links found on this page; pushed after the try block so links gathered
        # before a failure are still followed.
        links = []

        try:
            response = requests.get(page_url, timeout=5)

            # Pause after every request so the site is never hit back-to-back
            time.sleep(0.1)

            if response.url != page_url:
                print(f"Redirected from {page_url} to {response.url}")
                page_url = response.url
                if page_url in visited:
                    continue
                visited.add(page_url)

            content_type = response.headers.get("Content-Type", "")

            # Skip non-HTML content (e.g. image files served without .png in URL)
            if "text/html" not in content_type:
                print(f"Skipping non-HTML URL: {page_url}")
                continue

            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            articles = soup.find_all("article")
            if len(articles) != 1:
                print(f"Skipping {page_url} as it does not contain exactly one article")
            else:
                documents[page_url] = _article_to_record(articles[0], page_url)
                print(f"Extracted text from {page_url}")

            # Collect linked pages to crawl next
            for a_tag in soup.find_all("a", href=True):
                # Resolve relative links and drop any #fragment
                next_url = urljoin(page_url, a_tag['href']).split('#')[0]

                if is_valid(next_url) and next_url not in visited:
                    links.append(next_url)

        except Exception as e:
            print(f"Failed to crawl {page_url}: {e}")

        # Push in reverse so the first link on the page is popped (visited) first
        pending.extend(reversed(links))

# Start the crawl at the site root; results accumulate in `documents` and `visited`
crawl("https://rc.virginia.edu/")

Skipping https://rc.virginia.edu/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/userinfo/hpc/access/
Extracted text from https://rc.virginia.edu/userinfo/storage/
Skipping https://rc.virginia.edu/categories/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/form/support-request/
Extracted text from https://rc.virginia.edu/userinfo/systems/
Extracted text from https://rc.virginia.edu/userinfo/user-guide/
Skipping https://rc.virginia.edu/tags/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/about/mission/
Skipping https://rc.virginia.edu/about/people/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/about/students/
Extracted text from https://rc.virginia.edu/service/high-performance-computing/
Extracted text from https://rc.virginia.edu/service/dac/
Extracted text from https://rc.virginia.edu/service/dtc/
Extracted text from https://rc

In [6]:
# Number of pages whose article text was stored by the crawl
print(len(documents))

372


In [7]:
from langchain.schema import Document

# Convert each crawled page record into a LangChain Document, skipping pages
# whose extracted text is blank. The tag list is stored as its string repr.
website_documents = []
for url, record in documents.items():
    page_text = record["text"]
    page_meta = record["metadata"]

    if not page_text.strip():
        print(f"Skipping empty document for URL: {url}")
        continue

    website_documents.append(
        Document(
            page_content=page_text,
            metadata={
                "source_type": page_meta["source_type"],
                "source": page_meta["source"],
                "chunk_number": page_meta["chunk_number"],
                "tags": str(page_meta["tags"]),
                "date_updated": page_meta.get("date_updated"),
            },
        )
    )

In [8]:
# Spot-check one converted Document (page content plus metadata)
print(website_documents[5])

page_content='# [Mission](https://rc.virginia.edu/about/mission/)

Research Computing empowers UVA researchers to achieve more with cutting-edge computational resources. Our support team strives to create innovative solutions for researchers who need 
help solving complex optimization, parallelization, workflow, and data analysis issues. We build and maintain the University's best computing platforms while educating the next 
generation of researchers on the power of advanced computing.' metadata={'source_type': 'website', 'source': 'https://rc.virginia.edu/about/mission/', 'chunk_number': 0, 'tags': "['website', 'blog-post']", 'date_updated': None}


In [9]:
def document_to_id(doc):
    """Build the storage ID for ``doc`` as ``<source_type>_<source>_<chunk_number>``.

    The three parts are read from ``doc.metadata``; a missing key raises KeyError.
    """
    meta = doc.metadata
    id_parts = (meta['source_type'], meta['source'], meta['chunk_number'])
    return "_".join(str(part) for part in id_parts)

In [None]:
# Refresh the stored metadata of every crawled document that already exists
# in the collection. Documents not yet stored are left for the insert step.

# Fetch the stored IDs once: updating metadata never changes the set of IDs,
# so there is no need to re-read the whole collection for every document.
stored_ids = set(collection.get()["ids"])

for i, doc in enumerate(website_documents):
    doc_id = document_to_id(doc)

    # only documents already present in the collection can be updated
    if doc_id not in stored_ids:
        continue

    collection.update(
        ids=[doc_id],
        metadatas=[doc.metadata],
    )
    print(f"Updated document {i + 1}/{len(website_documents)} with ID: {doc_id}")

Updated document 1/372 with ID: website_https://rc.virginia.edu/userinfo/hpc/access/_0
Updated document 2/372 with ID: website_https://rc.virginia.edu/userinfo/storage/_0
Updated document 3/372 with ID: website_https://rc.virginia.edu/form/support-request/_0
Updated document 4/372 with ID: website_https://rc.virginia.edu/userinfo/systems/_0
Updated document 5/372 with ID: website_https://rc.virginia.edu/userinfo/user-guide/_0
Updated document 6/372 with ID: website_https://rc.virginia.edu/about/mission/_0
Updated document 7/372 with ID: website_https://rc.virginia.edu/about/students/_0
Updated document 8/372 with ID: website_https://rc.virginia.edu/service/high-performance-computing/_0
Updated document 9/372 with ID: website_https://rc.virginia.edu/service/dac/_0
Updated document 10/372 with ID: website_https://rc.virginia.edu/service/dtc/_0
Updated document 11/372 with ID: website_https://rc.virginia.edu/service/imaging/_0
Updated document 12/372 with ID: website_https://rc.virginia.e

Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionUpdateEvent: capture() takes 1 positional argument but 3 were given


Updated document 295/372 with ID: website_https://rc.virginia.edu/2024/06/production-release-of-the-afton-hpc-system-july-2-2024/_0
Updated document 296/372 with ID: website_https://rc.virginia.edu/2024/05/rivanna-maintenance-may-28-2024/_0
Updated document 297/372 with ID: website_https://rc.virginia.edu/2024/05/rivanna-maintenance-schedule-for-2024/_0
Updated document 298/372 with ID: website_https://rc.virginia.edu/2024/01/rivanna-maintenance-february-6-2024/_0
Updated document 299/372 with ID: website_https://rc.virginia.edu/2023/10/rcs-data-analytics-center-dac-now-serving-uvas-research-community/_0
Updated document 300/372 with ID: website_https://rc.virginia.edu/2024/10/virginia-women-in-hpc-student-lightning-talks-november-12-2024/_0
Updated document 301/372 with ID: website_https://rc.virginia.edu/2024/08/virginia-women-in-hpc-building-and-maintaining-supportive-communities/_0
Updated document 302/372 with ID: website_https://rc.virginia.edu/2023/09/virginia-women-in-hpc-event

In [10]:
# Maximum number of documents sent to the vector store per call
BATCH_SIZE = 100

# Only insert documents whose ID is not already stored in the collection
existing_ids = set(collection.get()["ids"])
to_add = [doc for doc in website_documents if document_to_id(doc) not in existing_ids]

print(f"Documents to add: {len(to_add)}/{len(website_documents)} total documents")

# Ceiling division: a final partial batch still counts as a batch
total_batches = (len(to_add) + BATCH_SIZE - 1) // BATCH_SIZE

# Add documents to the vector store in batches
for batch_number, start in enumerate(range(0, len(to_add), BATCH_SIZE), start=1):
    batch = to_add[start:start + BATCH_SIZE]
    # Pass the sources explicitly: the helper's signature takes (sub_documents, sources)
    add_documents_to_vector_store(batch, [doc.metadata["source"] for doc in batch])
    print(f"Processed batch {batch_number}/{total_batches}")

Documents to add: 30/372 total documents
Added 30 documents to the vector store.
Processed batch 1/1
