In [1]:
from utils.convert_to_md import convert_all_pdfs_to_md
from utils.sentence_chunking import get_sentence_chunks

import chromadb
from pathlib import Path
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter as Rec
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

splitted_chunks = []

# Initialize ChromaDB client
# client = chromadb.Client()
client = chromadb.PersistentClient(path="db/pdfs")


# Create a collection in ChromaDB (will store embeddings)
collection = client.get_or_create_collection(name="markdown_chunks_collection")

def process_all_markdowns_in_chroma():
    try:
        # Path to markdown directory
        md_dir = Path("data/md/")
        chunk_id_counter = 0  # Initialize a counter for unique chunk IDs

        # Loop through all markdown files in the md directory
        for md_file in md_dir.glob("*.md"):
            with open(md_file, "r") as f:
                md_content = f.read()

            # Chunk the markdown content
            ## Chunk Method 1: Sentence Chunking
            # chunks = get_sentence_chunks(md_content, tokenizer)
            
            ## Chunk Method 2: CST Token Chunking
            # chunks = get_cst_token_chunks(md_content, tokenizer)
            
            ## Chunk Method 3: Recursive Character Chunking
            text_splitter = Rec(
                chunk_size=1000,
                chunk_overlap=500,
                length_function=len,
                add_start_index=True
            )
            chunks = text_splitter.split_text(md_content)

            # Embed and add each chunk to ChromaDB
            for chunk in chunks:
                # Embed the chunk
                chunk_embedding = model.encode(chunk)

                # Use a standardized format for the source path (absolute path)
                source_path = str(md_file)

                # Add the chunk and its embedding to ChromaDB
                collection.add(
                    documents=[chunk],  # Pass chunk as a list
                    embeddings=[chunk_embedding.tolist()],  # Pass embedding as a list of lists
                    metadatas=[{"source": source_path}],
                    ids=[str(chunk_id_counter)]  # Use a unique integer as the ID
                )
                chunk_id_counter += 1  # Increment the ID counter

            print(f"Processed and stored {len(chunks)} chunks from {md_file}")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # Convert all PDFs to markdown
    convert_all_pdfs_to_md()

    # Process and store all markdown files in ChromaDB
    process_all_markdowns_in_chroma()

  from .autonotebook import tqdm as notebook_tqdm


Checking for punkt data in C:\Users\elroy/nltk_data\tokenizers\punkt
punkt data found in C:\Users\elroy/nltk_data\tokenizers\punkt
Checking for punkt_tab data in C:\Users\elroy/nltk_data\tokenizers\punkt_tab
punkt_tab data found in C:\Users\elroy/nltk_data\tokenizers\punkt_tab




Processing data\books\cp1.pdf...
Markdown file created: data\md\cp1.md
Processing data\books\dsa.pdf...
Markdown file created: data\md\dsa.md
Processed and stored 683 chunks from data\md\cp1.md
Processed and stored 273 chunks from data\md\dsa.md


# Document Processing and Embedding with ChromaDB and Sentence Transformers

In [6]:
import chromadb
from sentence_transformers import SentenceTransformer

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
import os

llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

client = chromadb.PersistentClient(path="db/pdfs")
md_collection = client.get_or_create_collection("markdown_chunks_collection")

def retrieve_relevant_chunks(query, collection, model, top_k=5):
    # Encode the query using the same SentenceTransformer model
    query_embedding = model.encode(query)

    # Query the ChromaDB collection for the top_k similar chunks
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k,
    )
    
    # Extract the documents from the query results
    relevant_chunks = results["documents"][0]  # Since there's only one query
    return relevant_chunks



# Create system-level prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

# Create a question-answering chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Function to generate a response using retrieval
def get_response(query, collection, model, question_answer_chain):
    # Retrieve relevant chunks from ChromaDB
    relevant_chunks = retrieve_relevant_chunks(query, collection, model)
    
    # Combine the retrieved chunks into a single context string
    context = "\n".join(relevant_chunks)
    
    # Invoke the QA chain with the provided context
    response = question_answer_chain.invoke({
        "context": context,
        "input": query
    })
    
    return response

# Run a query
query = "What is Insertion Sort?"
response = get_response(query, md_collection, model, question_answer_chain)
print(response)
print(response["answer"])

#Load the model
# model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# ### QUERY ###
# query = "How does the insertion sort algorithm work?"

# sentence = model.encode(query)

# results = md_collection.query(
#     query_embeddings = sentence.tolist(),
#     n_results = 10
# )

# for i, document in enumerate(results["documents"][0]):
#     print(f"Result {i+1}: {document}")
#     print(f"ID: {results['ids'][0][i]}")
#     # print(f"Metadata: {results['metadatas'][0][i]}")
#     # print(f"Similarity score: {results['distances'][0][i]}\n")

DEBUG: Retrieved Results: {'ids': [['860', '862', '738', '859', '736']], 'distances': [[0.6218289732933044, 0.8659114241600037, 0.9529175162315369, 1.015608310699463, 1.0191012620925903]], 'metadatas': [[{'source': 'data\\md\\dsa.md'}, {'source': 'data\\md\\dsa.md'}, {'source': 'data\\md\\dsa.md'}, {'source': 'data\\md\\dsa.md'}, {'source': 'data\\md\\dsa.md'}]], 'embeddings': None, 'documents': [['_CHAPTER 8. SORTING_ 67\n\n###### 8.4 Insertion Sort\n\nInsertion sort is a somewhat interesting algorithm with an expensive runtime of\n_O(n[2]). It can be best thought of as a sorting scheme similar to that of sorting_\na hand of playing cards, i.e. you take one card and then look at the rest with\nthe intent of building up an ordered set of cards in your hand.\n\n4 75 74\n\n4 75 74 2 54 4 75 74 2 54 4 75 74 2 54\n\n2 54\n\n4 74 75 2 54 2 4 74 75 54 2 4 54 74 75\n\nFigure 8.4: Insertion Sort Iterations\n\n1) algorithm Insertionsort(list)\n2) **Pre:** _list_ =\n_̸_ _∅_\n3) **Post: list has 

AttributeError: 'str' object has no attribute 'page_content'

In [2]:
results2 = md_collection.get()

gathered_chunks = []
for idx, chunk in enumerate(results2['documents']):
    # print(chunk[0])
    gathered_chunks.append(chunk)
    

In [3]:
from utils.bm25_ranking import find_closest_chunks_bm25, new_bm25, re_rank_chunks_with_embeddings
print("OLD BM25")
results_top_n = find_closest_chunks_bm25(query, results2, top_n=10)
for res in results_top_n:
    print(res['score']," ",res['id'],"\n")
    # print(res['document'],"\n")


print("NEW BM25")
bm25_results = new_bm25(query, results2, top_n=10)
sorted_results = re_rank_chunks_with_embeddings(query, bm25_results)

# print(results_top_n)
# for res in results_top_n:
#     print(res['score']," ",res['id'],"\n")
#     print(res['document'],"\n")

# organised_list = {}
# for item in results2:
#     print(item)
#     # organised_list[results2[item]] = ## id and document
# print(organised_list)
# print(results_top_n)
# print(md_collection.get(ids="297"))

OLD BM25
14.584834019480919   377 

12.523017371922746   378 

12.050891594269249   375 

11.880403985989389   866 

11.260478820127473   535 

11.123203021744485   815 

11.035082004738484   814 

11.002614755875888   406 

10.965935890038589   260 

10.352130470797842   805 

NEW BM25
Chunk ID: 815, BM25 Score: 11.123203021744485, Embedding Score: 0.4674035310745239, Source: data\md\dsa.md
Most libraries provide implementations of unordered sets and so DSA does
not; we simply mention it here to disambiguate between an unordered set and
ordered set.

We will only look at insertion for an unordered set and cover briefly why a
hash table is an efficient data structure to use for its implementation.

###### 5.1.1 Insertion

An unordered set can be efficiently implemented using a hash table as its backing
data structure. As mentioned previously we only add an item to a set if that
item is not already in the set, so the backing data structure we use must have
a quick look up and insertion 

In [22]:
print(md_collection.get(ids=['220']))

{'ids': ['220'], 'embeddings': None, 'metadatas': [{'source': 'data\\md\\cp1.md'}], 'documents': ['best possible acorns collected when Jayjay is at this height. The bottom-up DP code that requires\n\nonly 2000 = 2K states and time complexity of 2000 2000 = 4M is as follow:\n_×_\n\nfor (int tree = 0; tree < t; tree++) // initialization\ndp[h] = max(dp[h], acorn[tree][h]);\nfor (int height = h - 1; height >= 0; height--)\nfor (int tree = 0; tree < t; tree++) {\nacorn[tree][height] +=\nmax(acorn[tree][height + 1], // from this tree, +1 above\n((height + f <= h) ? dp[height + f] : 0)); // best from tree at height + f\ndp[height] = max(dp[height], acorn[tree][height]); // update this too\n}\nprintf("%d\\n", dp[0]); // solution will be here\n\nLesson: When na¨ıve DP states are too large causing the overall DP time complexity not-doable,\n\nthink of different ways other than the obvious to represent the possible states. Remember that no\n\nprogramming contest problem is unsolvable, the proble