### Useful links

https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/#retrieval-and-generation-generate

https://docs.trychroma.com/guides

# Imports

In [None]:
import os
import getpass
import openai
from openai import OpenAI

import chromadb
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Data Preparation
# Directory of the persistent vector database.
PATH_DB = "db"
# Directory containing the documents to index.
PATH_DATA = "documents/Arbetsmiljö"
# Name of the Chroma collection.
COLLECTION_NAME = "policy"

# Chroma DB

### Upload documents in directory to vector database

In [None]:
file_type = ".pdf"
documents = []

# Load every PDF in the data directory. The extension is matched
# case-insensitively; files whose extension is not already lowercase
# (e.g. '.PDF') are renamed on disk AND loaded in the same run.
for filename in sorted(os.listdir(PATH_DATA)):
    stem, extension = os.path.splitext(filename)
    if extension.lower() != file_type:
        continue

    file_path = os.path.join(PATH_DATA, filename)
    if extension != file_type:
        # Change e.g. '.PDF' to '.pdf'. Only the extension is rewritten,
        # never a '.PDF' occurring elsewhere in the name.
        target_path = os.path.join(PATH_DATA, stem + file_type)
        # Do not rename onto a *different* existing file: os.rename may
        # silently overwrite it. On case-insensitive filesystems the target
        # "exists" but is the very same file, which is safe to rename.
        clashes = os.path.exists(target_path) and not os.path.samefile(
            file_path, target_path
        )
        if not clashes:
            os.rename(file_path, target_path)
            file_path = target_path

    # Best effort: a file that cannot be parsed is reported and skipped
    # so that one bad PDF does not abort the whole import.
    try:
        loader = PyPDFLoader(file_path)
        document = loader.load()
        documents.extend(document)
    except Exception as e:
        print(e, f"=> Skipping file: {filename}...", sep="\n")
print(f"=> Loaded {len(documents)} documents.")

# LangChain embedding model; it drives the semantic chunking below.
embedder = OllamaEmbeddings(model="mxbai-embed-large")

# Cut the loaded documents into chunks with the semantic splitter.
text_splitter = SemanticChunker(embeddings=embedder)
new_documents = text_splitter.split_documents(documents)
chunk_count = len(new_documents)
print(f"=> Split the documents into {chunk_count} chunks.")

# Build a Chroma collection from the chunks and persist it under PATH_DB.
print("=> Loading documents into Chroma DB.")
store_options = {
    "documents": new_documents,
    "embedding": embedder,
    "collection_name": COLLECTION_NAME,
    "persist_directory": PATH_DB,
}
vectorstore = Chroma.from_documents(**store_options)

# Report the collection name as stored by the vector store itself.
stored_collection_name = vectorstore._collection.name
print(f"Added {len(new_documents)} chunks to the collection: {stored_collection_name}")

### Initialize existing persisting storage

Note that the embedding function used here is from Chroma and the one used in uploading data to the database is from Langchain.

When uploading multiple documents to Chroma, LangChain's Chroma wrapper is used, and hence LangChain's embedding function is used there. However, when instantiating an existing Chroma DB, Chroma's own library is used, hence the other embedding function.

It is possible to use only one of them, but there were some problems with instantiating a persistent DB using LangChain, hence the mix.

In [None]:
# Open a persistent Chroma client on the database directory.
# Collections saved there earlier become available through it.
client_db = chromadb.PersistentClient(path=PATH_DB)

# Chroma's own Ollama embedding function, pointed at the local Ollama server.
ollama_embedding_function = OllamaEmbeddingFunction(
    url="http://localhost:11434/api/embeddings",
    model_name="mxbai-embed-large",
)

# Fetch the existing collection and show the metadata of its entries.
collection = client_db.get_collection(
    name=COLLECTION_NAME,
    embedding_function=ollama_embedding_function,
)
collection.get()["metadatas"]

# LLM

In [None]:
query = """Discuss why the governing documents does allow you to drink beer during working hours."""

# Ask the collection for the 5 entries most relevant to the query.
query_response = collection.query(
    query_texts=[query],
    n_results=5,
    include=["documents", "metadatas"],
)

# Only one query text was sent, so take the first list of documents.
results = query_response["documents"][0]
print(results)

In [None]:
# Resolve the API key: use the OPENAI_API_KEY environment variable when it is
# set and non-empty, otherwise prompt the user for it.
api_key = os.getenv('OPENAI_API_KEY') or getpass.getpass("Enter your OpenAI API key: ")
openai.api_key = api_key  # kept for code that relies on the module-level key

# Pass the key explicitly: OpenAI() on its own only looks at the environment
# variable and would fail when the key was entered at the prompt above.
openai_client = OpenAI(api_key=api_key)
model_name = "gpt-4o-mini"

# The system message sets the answering rules; the user message carries the
# question together with the retrieved chunks as context.
response = openai_client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "system", 
            "content": """
                I am going to ask you a question, which I would like you to answer based only on the provided context, and not any other information.
                If there is not enough information in the context to answer the question, say I am not sure, then try to make a guess.
                Break your answer up into nicely readable paragraphs.
            """
        }, {"role": "user", "content": f"Question: {query}. Context: {results}"}
    ],
).choices[0].message.content
print(response)

# Visualize Chunk Embeddings in 2D (experimental)

In [None]:
import pacmap
import plotly.express as px
import pandas as pd

def visualize_chunks(query, query_vector, collection, path_split):
    """Plot a 2D PaCMAP projection of all chunk embeddings plus the query.

    Args:
        query: The query text; shown as the hover text of the query point.
        query_vector: Embedding of the query. It must have the same
            dimension as the embeddings stored in the collection.
        collection: Collection whose ``get`` returns 'documents',
            'metadatas' and 'embeddings'.
        path_split: Separator used to split each chunk's 'source' metadata;
            the second part is used as the colour label. When the source
            does not contain the separator, the whole source is used.

    Raises:
        ValueError: If the collection contains no documents.
    """
    # Local import so the function does not depend on a file-level numpy import.
    import numpy as np

    print('=> Fitting data to 2D...')    
    data = collection.get(include=['documents', 'metadatas', 'embeddings'])
    metadatas = data['metadatas']
    documents = data['documents']
    if len(documents) == 0:
        raise ValueError('The collection contains no documents to visualize.')

    # Append the query vector as an extra LAST row. (Adding a list to a
    # numpy array with `+` would broadcast-add the query to every embedding
    # instead of appending it.)
    embeddings = np.vstack(
        [np.asarray(data['embeddings']), np.asarray(query_vector).reshape(1, -1)]
    )

    print('=> Extracting info...')
    embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

    # Fit the data (the index of transformed data corresponds to the index of the original data)
    documents_projected = embedding_projector.fit_transform(embeddings, init='pca')

    def source_label(metadata):
        # Second path component of the source, or the full source when the
        # separator does not occur in it.
        parts = metadata['source'].split(path_split)
        return parts[1] if len(parts) > 1 else parts[0]

    # One row per chunk (all projected rows but the last) ...
    rows = [
        {
            'x': x,
            'y': y,
            'source': source_label(metadata),
            'extract': text[:100] + '...',
            'symbol': 'circle',
            'size_col': 1,
        }
        for (x, y), metadata, text in zip(documents_projected[:-1], metadatas, documents)
    ]
    # ... plus one row for the query, which is the last projected row.
    rows.append(
        {
            'x': documents_projected[-1, 0],
            'y': documents_projected[-1, 1],
            'source': 'User query',
            'extract': query,
            'size_col': 1,
            'symbol': 'star',
        }
    )
    df = pd.DataFrame.from_dict(rows)

    # Visualize the chunk vector embeddings
    print('=> Visualizing...')
    fig = px.scatter(df, x='x', y='y', width=800, height=500,
        color='source',
        hover_data='extract',
        size='size_col',
        symbol='symbol',
        color_discrete_map={'User query': 'black'},
    )
    fig.update_traces(
        marker=dict(opacity=1, line=dict(width=0, color='DarkSlateGrey')),
        selector=dict(mode='markers'),
    )
    fig.update_layout(
        legend_title_text='<b>Chunk source</b>',
        title='<b>2D Projection of Chunk Embeddings via PaCMAP</b>',
    )
    fig.show()

Print the source path of a chunk to see which separator the path should be split on.
Set `path_split` accordingly in the next code cell.

In [None]:
# Fetch the collection from the persistent client.
collection = client_db.get_collection(COLLECTION_NAME)

# Show the 'source' metadata of the first entry.
first_metadata = collection.get()['metadatas'][0]
print(first_metadata['source'])

### Visualize from collection

In [None]:
import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

query = """Discuss why the governing documents does allow you to drink beer during working hours."""

# Embed the query with the LangChain Ollama embedding model.
query_embedder = OllamaEmbeddings(model="mxbai-embed-large")
query_vector = query_embedder.embed_query(query)

# Plot the chunks together with the query.
# Change `path_split` according to the output of the previous code cell.
visualize_chunks(
    query=query,
    query_vector=query_vector,
    collection=collection,
    path_split='/',
)