In [14]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
import os

# Specify the path to your folder in Google Drive containing the PDF files
folder_path = "/content/drive/MyDrive/Data"

# Change the current working directory to the specified folder
os.chdir(folder_path)

# List all files in the current directory (optional)
files = os.listdir()
print("Files in the folder:", files)




Mounted at /content/drive
Files in the folder: ['inputFile.txt', 'data.docx', 'Samsung_TC.txt', 'Netflix_TC.txt', 'jjj.txt', 'jjj.index']


In [53]:
class ChunkerBase:
    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150):
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.ids=[]
        self.chunk_embeddings = []
        self.collection = collection  # Use ChromaDB collection for storing documents
        self.max_chunk_size = max_chunk_size

    def embed_chunks(self):
        print("Num of input chunks: ", len(self.chunks))
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)
        print("Num of retrieved embeddings for chunks: ", len(self.chunk_embeddings))
        documents=[]
        # list(map(str, self.chunks))
        # embeddings_as_lists = [embedding for embedding in self.chunk_embeddings]


        self.ids = [f"idx{i+1}" for i in range(len(self.chunks))]

        print(self.chunks)
        print(self.ids)
        # self.collection.add(self.chunks,self.ids)
        self.collection.add(embeddings=self.chunk_embeddings, documents=self.chunks, ids=self.ids)

    # for document_id, document in collection.items():
    #     print(f"Document ID: {document_id}")
    #     print(f"Content: {document['content']}")  # Replace with your document structure
    #     print(f"Metadata: {document['metadata']}")  # Replace with your document structure
    #     print("-" * 50)
        # Store chunks and embeddings in ChromaDB collection
        # for i, chunk in enumerate(self.chunks):
        #     embedding = self.chunk_embeddings[i]
        #     document = {
        #         'chunk': chunk,
        #         'embedding': embedding  # Convert embedding to list for storage
        #     }
        #     documents.append(document)

        # self.collection.add(documents)

    def embed_query(self, query):
        return self.embeddings_model.embed_documents([query])[0]

    def search_similar_chunks(self, query_embedding, top_k=5):
        print(query_embedding)
        results = self.collection.query(query_embedding, n_results=top_k)

        if not results:
            print("No similar chunks found.")
            return [], []

        print(results)

        similar_chunks = []
        for doc_list, score_list in zip(results['documents'], results['distances']):
            for doc, score in zip(doc_list, score_list):
                similar_chunks.append((doc, score))

        return similar_chunks


    def get_all_chunks(self):
        return self.chunks

In [54]:
class RecursiveChunker(ChunkerBase):
    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150, max_paragraph_words=80):
        super().__init__(model_name, max_chunk_size)
        self.max_paragraph_words = max_paragraph_words

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def recursive_chunk_text(self, text):
        words = text.split()

        if len(words) <= self.max_chunk_size:
            return [text]

        # Find the best splitting point
        split_point = self.max_chunk_size
        while split_point > 0 and not words[split_point - 1].endswith(('.', '!', '?')):
            split_point -= 1

        if split_point == 0:  # No good splitting point found, force split at max_chunk_size
            split_point = self.max_chunk_size

        # Split the text
        chunk = ' '.join(words[:split_point])
        remaining_text = ' '.join(words[split_point:])

        # Recursive call
        chunks = [chunk] + self.recursive_chunk_text(remaining_text)

        return chunks

    def chunk_text(self, text):
        paragraphs = text.split('\n\n')
        self.chunks = []

        for paragraph in paragraphs:
            if len(paragraph.split()) > self.max_paragraph_words:
                indented_chunks = self.recursive_chunk_text(paragraph)
                self.chunks.extend(indented_chunks)
            else:
                self.chunks.append(paragraph)

        print('Total number of chunks:', len(self.chunks))

    def read_text_from_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        return text

# Initialize the chunker
chunker = RecursiveChunker()

# File path to read
file_path = 'inputFile.txt'

# Read text from file
text_from_file = chunker.read_text_from_file(file_path)

# Chunk the text
chunker.chunk_text(text_from_file)
chunker.embed_chunks()

print('Total number of chunks:', len(chunker.chunks))




Total number of chunks: 37
Num of input chunks:  37




Num of retrieved embeddings for chunks:  37
['Introduction to Quantum Physics', 'Quantum physics, also known as quantum mechanics or quantum theory, is a fundamental branch of physics that deals with the behavior of matter and energy at the smallest scales, typically at the level of atoms and subatomic particles. Developed in the early 20th century, quantum physics has revolutionized our understanding of the universe, providing insights into the nature of reality that are often counterintuitive and challenging to grasp.', 'Historical Background', 'The roots of quantum physics can be traced back to the late 19th and early 20th centuries when classical physics could not explain certain experimental observations. Key milestones in the development of quantum theory include:', " Blackbody Radiation: Max Planck's solution to the ultraviolet catastrophe in 1900 introduced the concept of quantized energy levels, laying the groundwork for quantum theory.", 'Photoelectric Effect: In 1905, Albert

In [55]:
# Query similar chunks
query = "What is the purpose of life?"
query_embedding = chunker.embed_query(query)

# Search for similar chunks
similar_chunks = chunker.search_similar_chunks(query_embedding,15 )

print(f"Top {len(similar_chunks)} Similar Chunks for Query: '{query}'")
print("=" * 50)
for i, (chunk_content, similarity_score) in enumerate(similar_chunks, start=1):
    print(f"{i}. Similarity Score: {similarity_score:.4f}")
    print(f"   Chunk Content: {chunk_content}")
    print("-" * 50)

[-0.05420077592134476, 0.09739908576011658, -0.04984784498810768, -0.027614671736955643, 0.04167580604553223, 0.041564375162124634, 0.10489417612552643, -0.04742909595370293, 0.06994100660085678, 0.021041879430413246, 0.011390548199415207, -0.036631301045417786, -0.03106766566634178, 0.02023126184940338, 0.03306577727198601, -0.04286669194698334, -0.09495856612920761, -0.020348843187093735, -0.011758950538933277, 0.0028310164343565702, 0.022865736857056618, 0.012151896953582764, -0.01715513877570629, -0.007286441046744585, -0.10938718169927597, 0.0725872814655304, 0.0079837366938591, 0.0024699384812265635, -0.005898844916373491, -0.0177479051053524, 0.10306547582149506, 0.0350564569234848, 0.08130563795566559, -0.027693582698702812, -0.013340256176888943, 0.028055788949131966, 0.10737166553735733, -0.044121429324150085, 0.007881196215748787, -0.022034524008631706, -0.030411357060074806, -0.046319566667079926, -0.005548986606299877, 0.008468106389045715, -0.0020491129253059626, -0.03435