In [None]:
import os
import re
from pathlib import Path

# Folder that is searched for the "*.md" source files further down in this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_FOLDER = Path("../chromadb/data")

# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, SeleniumURLLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate

In [69]:
# website = "https://hackernoon.com/vector-databases-getting-started-with-chromadb-and-more"

# def load_document(loader_class, website_url):
#    loader = loader_class([website_url])
#    return loader.load()

# wb_loader_doc = load_document(WebBaseLoader, website)
# wb_loader_doc[0].page_content


In [70]:
# selenium_loader_doc = load_document(SeleniumURLLoader, website)
# selenium_loader_doc[0].page_content

In [71]:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=200)
# splits = text_splitter.split_documents(selenium_loader_doc)
# splits[0]

In [29]:
import chromadb

# Connect to the Chroma server over HTTP and print its heartbeat as a quick
# check that the server is reachable.
client = chromadb.HttpClient(
    port=8000,
    host="localhost",
)
print("HEARTBEAT:", client.heartbeat())

HEARTBEAT: 1737548230942146000


In [7]:
import chromadb.utils.embedding_functions as embedding_functions

# Embedding function used by the Chroma collections below.
# The API key is read from the environment: no OPENAI_API_KEY variable is defined
# anywhere in this notebook, so referencing the bare name fails on a fresh kernel.
# Indexing os.environ (instead of os.getenv) fails fast with a KeyError naming the
# missing variable.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-3-small",
)

In [74]:
# Collect the markdown files in sorted order. Path.glob yields entries in arbitrary
# (filesystem-dependent) order, and the documents built from these files receive
# positional ids ("id0", "id1", ...), so an unsorted listing would make the
# id-to-document mapping differ between runs or machines.
documents = []
md_files = sorted(DATA_FOLDER.glob("*.md"))


def extract_sections(file_path: Path) -> list[dict]:
    """Split a text file into ``#Title``-delimited sections.

    A section starts at a ``#`` immediately followed by one or more
    non-whitespace characters (the title, e.g. ``#Text``) and extends to the
    next such marker or to the end of the file. The marker is matched anywhere
    in the text, not only at the start of a line, so a ``#word`` inside a
    section body (for example a URL fragment) also starts a new section.

    Args:
        file_path: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        One dict per section, in file order, with the keys ``"title"`` (the
        marker including its leading ``#``) and ``"content"`` (the section
        body with surrounding whitespace stripped). An empty list is returned
        when the file contains no marker.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    pattern = re.compile(r"(#\S+)(.*?)(?=(#\S+)|\Z)", re.DOTALL)
    # Explicit encoding: without it the decoding depends on the platform locale,
    # which can raise or mis-decode non-ASCII text on some systems.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    return [
        {"title": match.group(1).strip(), "content": match.group(2).strip()}
        for match in pattern.finditer(content)
    ]


# Parse every markdown file into its list of {"title", "content"} sections.
data = [extract_sections(md_file) for md_file in md_files]

# Build one "<text> Source: <source>" document per file.
documents = []
for md_file, entry in zip(md_files, data):
    # Index this file's sections by title; if a title repeats, the last one wins.
    sections = {item["title"]: item["content"] for item in entry}
    missing = [title for title in ("#Text", "#Article") if title not in sections]
    if missing:
        # Resolve the sections per file and skip incomplete files, so a file's text
        # is never paired with a value left over from a previously processed file.
        print(f"Skipping {md_file.name}: missing section(s) {', '.join(missing)}")
        continue
    text_content = sections["#Text"]
    source = sections["#Article"]
    documents.append(f"{text_content} Source: {source}")

# Positional ids: "id0", "id1", ... in the same order as `documents`.
document_ids = [f"id{index}" for index in range(len(documents))]
print(len(documents))

51


In [30]:
# collection = client.get_or_create_collection(name="asylumineurope", embedding_function=openai_ef)
# print(collection)

name='asylumineurope' id=UUID('4a70b0ac-5a28-4731-8d8e-2b1e6dd10ffe') metadata=None tenant=None database=None


In [30]:
# Fetch the "test_collection" collection, creating it if it does not exist yet,
# and attach the OpenAI embedding function to it.
collection = client.get_or_create_collection(
    embedding_function=openai_ef,
    name="test_collection",
)
print(collection)

Collection(id=5e8409c8-34a3-459d-b660-e3f9d791a63e, name=test_collection)


In [77]:
# collection.add(documents=documents, ids=document_ids)

In [31]:
# Small hand-written corpus stored in the test collection.
documents = [
    "P-A studied Business Administration in Aix-en-Provence. He has an assosciate degree.",
    "P-A enjoys Asian food a lot at the moment. Especially hand-pulled noodles.",
    "P-A works at Signavio.NEXT, the innovation team at SAP Signavio.",
    "P-A also studied ICT at 42 Berlin.",
    "P-A learns Mandarin on Duolingo, after finishing the Portuguese course.",
    "P-A is 31 years old.",
]

# Positional ids: "id0", "id1", ... in the same order as `documents`.
document_ids = [f"id{position}" for position in range(len(documents))]

# (id, text) tuples for the same documents.
documents_with_ids = list(zip(document_ids, documents))
print(document_ids)

['id0', 'id1', 'id2', 'id3', 'id4', 'id5']


In [32]:
# upsert rather than add: the ids are fixed ("id0", "id1", ...) and the collection
# is obtained with get_or_create_collection, so it may already contain these ids
# from an earlier run. upsert inserts missing records and updates existing ones,
# which keeps this cell safe to re-run and picks up edits to the document texts.
collection.upsert(documents=documents, ids=document_ids)

In [33]:
# Show how many records the collection currently holds.
collection.count()

6

In [34]:
# Run a placeholder query and request the top two results, including only the
# document texts in the response.
collection.query(
    query_texts=["question"],
    n_results=2,
    include=["documents"],
)

{'ids': [['id0', 'id1']],
 'distances': None,
 'embeddings': None,
 'metadatas': None,
 'documents': [['P-A studied Business Administration in Aix-en-Provence. He has an assosciate degree.',
   'P-A enjoys Asian food a lot at the moment. Especially hand-pulled noodles.']],
 'uris': None,
 'data': None,
 'included': ['documents']}

In [32]:
# Note: the following code does not work because the embeddings cannot be retrieved

# vectordb = Chroma(persist_directory="../src/chromadb/chroma_data", collection_name="test_collection", embedding_function=openai_ef)
# vectordb._collection.count()


In [33]:
# prompt = PromptTemplate.from_template(
#     """
#     Answer the question based only on the following context:
#     Context: {context}
#     Question: {question}
#     """
# )

In [34]:
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [35]:
# vectorstore = Chroma.from_texts(["PA is working at SAP",
#                                 "PA also participates in a non-profit project",
#                                 "PA's fav pokemon is Snorlax",
#                                 "PA likes tacos"], embedding=OpenAIEmbeddings())
# vectorstore.persist()
# retriever = vectordb.as_retriever()

In [36]:
# rag_chain = (
#     {"context": retriever, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

In [37]:
# response = rag_chain.invoke("What can you tell me about PA?")
# response