# Create Vector DBs

In [None]:
from langchain_community.document_loaders import RecursiveUrlLoader
import re
from bs4 import BeautifulSoup
import os
import subprocess
import argparse
from uuid import uuid4

from langchain_community.document_loaders import NotebookLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
#from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [None]:
def bs4_extractor(html: str) -> str:
    """Return the text of an HTML document with blank-line runs collapsed.

    Args:
        html: Raw HTML markup to convert.

    Returns:
        The document text as parsed by BeautifulSoup with the ``lxml``
        parser, where every run of two or more newlines is reduced to
        exactly two and leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

In [None]:
def load_kb_pages(url, max_depth=4):
    """Crawl the knowledge base starting at ``url`` and return its usable pages.

    Args:
        url: Root URL at which the recursive crawl starts.
        max_depth: Maximum link depth handed to ``RecursiveUrlLoader``.

    Returns:
        A list of the loaded documents whose ``content_type`` metadata
        contains ``text/html`` and whose title contains neither
        ``Page not found`` nor ``Log In``. Every returned document has a
        ``title`` metadata entry; it is the empty string when the loader
        did not supply one.
    """
    kb_loader = RecursiveUrlLoader(
        url,
        max_depth=max_depth,
        timeout=10,
        # Site root rather than the narrower ".../knowledge-base/" prefix.
        base_url="https://learn.fabric-testbed.net",
        extractor=bs4_extractor,
        # Not set here: use_async, metadata_extractor,
        # check_response_status, continue_on_failure, prevent_outside.
    )

    kb_pages = []

    for doc in kb_loader.lazy_load():
        if 'text/html' not in doc.metadata['content_type']:
            continue

        # RecursiveUrlLoader's default metadata extractor only sets "title"
        # when the page has a <title> element. Default it to "" instead of
        # raising KeyError and aborting the whole crawl, and store it so
        # later metadata['title'] lookups on the document keep working.
        title = doc.metadata.setdefault('title', '')
        if 'Page not found' in title or 'Log In' in title:
            continue

        kb_pages.append(doc)

    return kb_pages

In [None]:
def load_forum_pages(url, max_depth=4):
    """Crawl the forums starting at ``url`` and return the usable pages.

    Args:
        url: Root URL at which the recursive crawl starts.
        max_depth: Maximum link depth handed to ``RecursiveUrlLoader``.

    Returns:
        A list of the loaded documents whose ``content_type`` metadata
        contains ``text/html`` and whose title contains neither
        ``Page not found`` nor ``Log In``. Every returned document has a
        ``title`` metadata entry; it is the empty string when the loader
        did not supply one.
    """
    forum_loader = RecursiveUrlLoader(
        url,
        max_depth=max_depth,
        timeout=10,
        base_url="https://learn.fabric-testbed.net/forums",
        extractor=bs4_extractor,
        # Not set here: use_async, metadata_extractor,
        # check_response_status, continue_on_failure, prevent_outside.
    )

    forum_pages = []

    for doc in forum_loader.lazy_load():
        if 'text/html' not in doc.metadata['content_type']:
            continue

        # RecursiveUrlLoader's default metadata extractor only sets "title"
        # when the page has a <title> element. Default it to "" instead of
        # raising KeyError and aborting the whole crawl, and store it so
        # later metadata['title'] lookups on the document keep working.
        title = doc.metadata.setdefault('title', '')
        if 'Page not found' in title or 'Log In' in title:
            continue

        forum_pages.append(doc)

    return forum_pages

In [None]:
def create_vectorstore(documents, database_loc, embedding="all-mpnet-base-v2"):
    """Embed ``documents`` and add them to a Chroma store at ``database_loc``.

    Args:
        documents: Documents to add; each one is stored under a freshly
            generated random UUID.
        database_loc: Directory passed to Chroma as ``persist_directory``.
        embedding: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        None. A confirmation line is printed after the documents are added.
    """
    store = Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=embedding),
        persist_directory=database_loc,
    )

    # One random id per document, in document order.
    doc_count = len(documents)
    doc_ids = [str(uuid4()) for _ in range(doc_count)]
    store.add_documents(documents, ids=doc_ids)

    print(f"All notebooks have been processed and stored at {database_loc}.")

In [None]:
# Crawl entry points: the three knowledge-base article categories followed by
# the general questions-and-discussion forum.
all_urls = [
    "https://learn.fabric-testbed.net/article-categories/getting-started/",
    "https://learn.fabric-testbed.net/article-categories/technical-guides/",
    "https://learn.fabric-testbed.net/article-categories/teaching-on-fabric/",
    "https://learn.fabric-testbed.net/forums/forum/fabric-general-questions-and-discussion/",
]

## Create a Knowledge Base DB

In [None]:
# Directory passed to create_vectorstore as the persist location of the
# knowledge-base-only Chroma store.
kb_db_loc = "Vectorstore/kb_only"

In [None]:
# Knowledge-base article categories used as crawl entry points.
root_urls = [
    "https://learn.fabric-testbed.net/article-categories/getting-started/",
    "https://learn.fabric-testbed.net/article-categories/technical-guides/",
    "https://learn.fabric-testbed.net/article-categories/teaching-on-fabric/",
]

kb_pages = []
source_urls = set()
titles = set()

for url in root_urls:
    print(f"\n\n parsing URL: {url}")
    docs = load_kb_pages(url, max_depth=6)
    for doc in docs:
        source = doc.metadata['source']
        title = doc.metadata['title']

        # A page is kept only if neither its URL nor its title was seen
        # before; anything else is reported as a duplicate and dropped.
        if source in source_urls or title in titles:
            print(f"Found duplicates: {source}: {title}")
            continue

        source_urls.add(source)
        titles.add(title)
        kb_pages.append(doc)
        print(f"adding {source}: {title}")

print(len(kb_pages))

create_vectorstore(kb_pages, kb_db_loc, embedding="all-mpnet-base-v2")

## Create a Forum DB

In [None]:
# Directory passed to create_vectorstore as the persist location of the
# forum-only Chroma store.
forum_db_loc = "Vectorstore/forum_only"

In [None]:
# Forum used as the crawl entry point.
root_urls = [
    "https://learn.fabric-testbed.net/forums/forum/fabric-general-questions-and-discussion/",
]

forum_pages = []
source_urls = set()
titles = set()

for url in root_urls:
    docs = load_forum_pages(url, max_depth=6)
    for doc in docs:
        source = doc.metadata['source']
        title = doc.metadata['title']

        # A page is kept only if neither its URL nor its title was seen
        # before; anything else is reported as a duplicate and dropped.
        if source in source_urls or title in titles:
            print(f"Found duplicates: {source}: {title}")
            continue

        source_urls.add(source)
        titles.add(title)
        forum_pages.append(doc)
        print(f"adding {source}: {title}")

print(len(forum_pages))

create_vectorstore(forum_pages, forum_db_loc, embedding="all-mpnet-base-v2")

## Create a Combined DB