# Vector Store with Custom Embeddings Using LangChain and Chroma

## Run the cell below only if the database (`db/pdfs`) is empty — it chunks the markdown files and adds them to the collection

In [1]:
from utils.sentence_chunking import get_sentence_chunks

import chromadb
from pathlib import Path
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter as Rec

# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
# model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

from langchain.embeddings.base import Embeddings
from typing import List

class MyEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    Args:
        model_name: Model name or path handed to ``SentenceTransformer``.
            Defaults to the multi-qa MiniLM checkpoint used by this notebook.
    """

    def __init__(
        self,
        model_name: str = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
    ):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts.

        Args:
            texts: The texts to embed.

        Returns:
            One embedding (a list of floats) per input text, in input order.
            An empty input yields an empty list.
        """
        # Materialise once so any iterable works and the empty check is reliable.
        texts = list(texts)
        if not texts:
            return []
        # A single batched encode() call replaces one model call per text.
        # NOTE(review): batched and per-text results may differ by float
        # rounding — confirm that is acceptable for an already-populated store.
        return self.model.encode(texts).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string and return its vector as a list of floats."""
        return self.model.encode(query).tolist()

# One shared embedding adapter, reused by the vector store below.
embedding_func = MyEmbeddings()

# ChromaDB client created with the on-disk path "db/pdfs".
client = chromadb.PersistentClient(path="db/pdfs")

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# LangChain vector store bound to the client and embedding adapter above.
vector_store = Chroma(
    collection_name="markdown_chunks_collection",
    embedding_function=embedding_func,
    client=client,
)

def split_chunks():
    """Chunk every markdown file in ``data/md/`` and add the chunks to ``vector_store``.

    Each ``*.md`` file is read as UTF-8 and split with the recursive character
    splitter (1000-character chunks, 200-character overlap). Every chunk
    becomes a ``Document`` whose ``source`` metadata is the file path. Chunks
    receive the string IDs "1", "2", ... in sorted file-name order, so the
    numbering is reproducible across runs.

    Returns:
        None. Any exception is reported on stdout as ``Error: ...`` rather
        than propagated, and an empty corpus is reported without touching the
        store.
    """
    try:
        md_dir = Path("data/md/")

        # The splitter configuration never changes, so build it once rather
        # than once per file.
        # NOTE(review): split_text returns plain strings, so add_start_index
        # presumably has no effect here — confirm against the splitter docs.
        text_splitter = Rec(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            add_start_index=True,
        )

        documents = []
        # glob() order is unspecified; sorting keeps the sequential IDs stable.
        for md_file in sorted(md_dir.glob("*.md")):
            # Explicit encoding: the platform default is not always UTF-8 and
            # the corpus contains non-ASCII characters.
            md_content = md_file.read_text(encoding="utf-8")
            source = str(md_file)
            documents.extend(
                Document(page_content=chunk, metadata={"source": source})
                for chunk in text_splitter.split_text(md_content)
            )

        if not documents:
            # Avoid handing empty lists to the vector store.
            print(f"No markdown chunks found in {md_dir}; nothing to add.")
            return

        # IDs are 1-based positions in the flattened chunk list.
        ids = [str(i) for i in range(1, len(documents) + 1)]
        vector_store.add_documents(documents=documents, ids=ids)
    except Exception as e:
        # Notebook-level boundary: report the failure instead of raising.
        print(f"Error: {e}")

if __name__ == "__main__":
    # Populate the collection from the markdown files.
    split_chunks()

    # Example (disabled): quick sanity check of the populated store.
    #   results = vector_store.similarity_search(query="insertion sort", k=1)
    #   for doc in results:
    #       print(f"* {doc.page_content} [{doc.metadata}]")
    #
    # Example (disabled): MMR retriever over the same store.
    #   retriever = vector_store.as_retriever(
    #       search_type="mmr",
    #       search_kwargs={"k": 10, "fetch_k": 20, "lambda_mult": 0.5},
    #   )

  from .autonotebook import tqdm as notebook_tqdm


Checking for punkt data in C:\Users\elroy/nltk_data\tokenizers\punkt
punkt data found in C:\Users\elroy/nltk_data\tokenizers\punkt
Checking for punkt_tab data in C:\Users\elroy/nltk_data\tokenizers\punkt_tab
punkt_tab data found in C:\Users\elroy/nltk_data\tokenizers\punkt_tab


# Search for relevant documents based on a query

In [5]:
# Fetch the five chunks most similar to the query and print each one
# together with its metadata.
results = vector_store.similarity_search(query="insertion sort", k=5)
for doc in results:
    content, metadata = doc.page_content, doc.metadata
    print(f"* {content} [{metadata}]")

* |2|4|54|74|75|
|---|---|---|---|---|

|4|74|75|2|54|
|---|---|---|---|---|
||||||

|2|4|74|75|54|
|---|---|---|---|---|
||||||


-----

_CHAPTER 8. SORTING_ 68

###### 8.5 Shell Sort

Put simply shell sort can be thought of as a more efficient variation of insertion
sort as described in 8.4, it achieves this mainly by comparing items of varying
_§_
distances apart resulting in a run time complexity of O(n log[2] _n)._

Shell sort is fairly straight forward but may seem somewhat confusing at
first as it differs from other sorting algorithms in the way it selects items to
compare. Figure 8.5 shows shell sort being ran on an array of integers, the red
coloured square is the current value we are holding. [{'source': 'data\\md\\dsa.md'}]
* 4 75 74

4 75 74 2 54 4 75 74 2 54 4 75 74 2 54

2 54

4 74 75 2 54 2 4 74 75 54 2 4 54 74 75

Figure 8.4: Insertion Sort Iterations

1) algorithm Insertionsort(list)
2) **Pre:** _list_ =
_̸_ _∅_
3) **Post: list has been sorted into values of ascending 