# Vector Store with Custom Embeddings Using LangChain and Chroma

In [3]:
from utils.sentence_chunking import get_sentence_chunks

import chromadb
from pathlib import Path
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter as Rec

# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
# model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

from langchain.embeddings.base import Embeddings
from typing import List

class MyEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    The model is loaded once in ``__init__`` and reused for every call.
    """

    def __init__(self, model_name: str = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'):
        """Load the SentenceTransformer model identified by *model_name*.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
                Defaults to the model this file has always used.
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (a list of floats) per entry of *texts*.

        Args:
            texts: The document strings to embed.

        Returns:
            A list with the same length and order as *texts*.
        """
        texts = list(texts)
        if not texts:
            return []
        # Pass the whole list in one call so the model can batch the work
        # instead of running one forward pass per text.
        # NOTE(review): batched results may differ from per-text encoding by
        # floating-point rounding only - confirm if bit-exact vectors matter.
        return self.model.encode(texts).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Return the embedding of a single query string as a list of floats."""
        return self.model.encode(query).tolist()

# One shared embedder instance; it is handed to the vector store below.
embedding_func = MyEmbeddings()

# Chroma client that keeps its data under db/pdfs.
# (Alternative that was tried here: chromadb.Client().)
client = chromadb.PersistentClient(path="db/pdfs")

# The collection is not created by hand via client.get_or_create_collection();
# the LangChain wrapper below is given the collection name instead.

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# LangChain vector store over the "markdown_chunks_collection" collection,
# built on the client above and embedding through `embedding_func`.
# The client already carries the storage path, so no persist_directory is set.
vector_store = Chroma(
    client=client,
    collection_name="markdown_chunks_collection",
    embedding_function=embedding_func,
)

def split_chunks(md_dir: str = "data/md/", chunk_size: int = 1000, chunk_overlap: int = 500) -> None:
    """Chunk every ``*.md`` file in *md_dir* and add the chunks to ``vector_store``.

    Each chunk becomes a ``Document`` whose ``source`` metadata is the path of
    the file it came from. Chunk IDs are the strings "1", "2", ... assigned in
    processing order across all files.

    Any exception is caught and reported with ``print``; nothing is raised and
    nothing is returned.

    Args:
        md_dir: Directory searched (non-recursively) for markdown files.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by neighbouring chunks.
    """
    try:
        # Built once: the splitter configuration does not depend on the file.
        # NOTE(review): add_start_index appears to matter only for
        # create_documents()/split_documents(); split_text() returns plain
        # strings, so no start index ends up in the metadata - confirm
        # whether that metadata is wanted.
        text_splitter = Rec(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            add_start_index=True,
        )

        documents = []
        # Sorted so that the counter-based IDs are assigned in the same order
        # on every run; glob order is otherwise not guaranteed.
        for md_file in sorted(Path(md_dir).glob("*.md")):
            # Explicit encoding: the platform default may not be UTF-8.
            md_content = md_file.read_text(encoding="utf-8")

            ## Chunk Method 3: Recursive Character Chunking
            # (Methods 1 and 2 - sentence / CST token chunking - are not used.)
            documents.extend(
                Document(page_content=chunk, metadata={"source": str(md_file)})
                for chunk in text_splitter.split_text(md_content)
            )

        if not documents:
            # Avoid calling the store with empty lists.
            print(f"No markdown chunks found in {md_dir}; nothing added.")
            return

        # One sequential ID per chunk, starting at 1.
        ids = [str(n) for n in range(1, len(documents) + 1)]
        vector_store.add_documents(documents=documents, ids=ids)
    except Exception as e:
        # Best-effort boundary: report the failure instead of raising.
        print(f"Error: {e}")

if __name__ == "__main__":
    # Index the markdown corpus before building the retriever.
    split_chunks()

    # A quick sanity check is to run
    # vector_store.similarity_search(query="insertion sort", k=1)
    # and print each hit's page_content and metadata.

    # Retriever over the vector store using the "mmr" search type with
    # k=10, fetch_k=20 and lambda_mult=0.5.
    retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 10, "fetch_k": 20, "lambda_mult": 0.5},
    )

# Question-Answering Using LangChain and Retrieval-Augmented Generation (RAG)

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

import os
from langchain_openai import ChatOpenAI

# Chat model used to generate the answers. The API key is read from the
# OPENAI_API_KEY environment variable (os.environ.get yields None if unset).
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# System instructions for the question-answering chat prompt.
# The pieces are adjacent string literals joined by the parser, so every
# piece that continues a sentence must end with its own trailing space.
# "{context}" is a template placeholder and is kept verbatim.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If the answer is not in the retrieved context, "
    "say that you don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions, then the user's question
# supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the LLM and prompt into a document chain, then put the retriever
# in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Ask one question; the bare expression on the last line is what the
# notebook displays as the cell result.
response = rag_chain.invoke({"input": "What is Elon Musk's Son name?"})
response["answer"]


"I don't know."

# Search relevant documents based on query

In [4]:
# Fetch the single best match (k=1) for the query and print its text
# followed by its metadata.
results = vector_store.similarity_search(query="insertion sort", k=1)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* _CHAPTER 8. SORTING_ 67

###### 8.4 Insertion Sort

Insertion sort is a somewhat interesting algorithm with an expensive runtime of
_O(n[2]). It can be best thought of as a sorting scheme similar to that of sorting_
a hand of playing cards, i.e. you take one card and then look at the rest with
the intent of building up an ordered set of cards in your hand.

4 75 74

4 75 74 2 54 4 75 74 2 54 4 75 74 2 54

2 54

4 74 75 2 54 2 4 74 75 54 2 4 54 74 75

Figure 8.4: Insertion Sort Iterations

1) algorithm Insertionsort(list)
2) **Pre:** _list_ =
_̸_ _∅_
3) **Post: list has been sorted into values of ascending order**
4) _unsorted_ 1
_←_
5) **while unsorted < list.Count**
6) _hold_ _list[unsorted]_
_←_
7) _i_ _unsorted_ 1
_←_ _−_
8) **while i** 0 and hold < list[i]
_≥_
9) _list[i + 1]_ _list[i]_
_←_
10) _i_ _i_ 1
_←_ _−_
11) **end while**
12) _list[i + 1]_ _hold_
_←_
13) _unsorted_ _unsorted + 1_
_←_
14) **end while**
15) **return list**
16) end Insertionsort

|4|Col2|
|---|---|
||| [{'so