In [2]:
import os

from google.colab import drive

# Make the user's Google Drive visible under /content/drive
drive.mount('/content/drive')

# Drive folder that holds the data files used by this notebook
folder_path = "/content/drive/MyDrive/Data"

# Work from inside that folder so later cells can refer to files by bare name
os.chdir(folder_path)

# Informational listing of what the folder contains
files = os.listdir()
print("Files in the folder:", files)

Mounted at /content/drive
Files in the folder: ['paper.pdf', 'bert_embeddings.npy', 'resume.pdf', 'inputFile.txt', 'indices.faiss', 'food', 'Book.txt', 'merged.txt', 'business', 'data.docx', 'mergedbusiness.txt', 'Netflix_TC.txt', 'Samsung_TC.txt', 'data_base.index', 'data_base1.index']


In [148]:
import re
import os
import numpy as np
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings
from docx import Document

class ChunkerBase:
    """Shared embedding and FAISS-lookup plumbing for the chunker classes.

    Subclasses fill ``self.chunks`` with text chunks; this class embeds them
    and maps FAISS search hits back onto those chunks.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150):
        """Create the embedding model and empty chunk/embedding stores.

        Args:
            model_name: Name passed to ``HuggingFaceEmbeddings``.
            max_chunk_size: Stored on the instance; not used by this base class.
        """
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []
        self.max_chunk_size = max_chunk_size

    def embed_chunks(self):
        """Embed ``self.chunks`` and store the vectors in ``self.chunk_embeddings``."""
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)

    def embed_query(self, query):
        """Return the embedding vector for a single query string."""
        # Uses the document encoder with a one-element batch.
        return self.embeddings_model.embed_documents([query])[0]

    def store_embeddings_in_memory(self):
        """Return ``self.chunk_embeddings`` as a float32 NumPy array."""
        return np.array(self.chunk_embeddings).astype(np.float32)

    def search_similar_chunks(self, query_embedding, top_k=5, index_path="all_files_index.index"):
        """Look up the chunks closest to ``query_embedding`` in a saved FAISS index.

        Args:
            query_embedding: One query vector (sequence of floats).
            top_k: Number of neighbours requested from FAISS.
            index_path: Path of the FAISS index file to read.

        Returns:
            ``(results, valid_indices)`` where ``results`` is a list of
            ``(chunk_text, score)`` tuples and ``valid_indices`` the matching
            positions in ``self.chunks``. Both are empty lists when the index
            file is missing or no hit maps onto a stored chunk. The score is
            whatever the stored index reports (its meaning depends on the
            index type that was written to ``index_path``).
        """
        if not os.path.exists(index_path):
            print(f"No FAISS index found at {index_path}")
            return [], []

        index = faiss.read_index(index_path)
        query_embedding = np.array([query_embedding]).astype(np.float32)
        distances, indices = index.search(query_embedding, top_k)

        if indices.size == 0:
            print("No similar chunks found.")
            return [], []

        # Keep each id paired with its own score while filtering. FAISS pads
        # missing neighbours with -1, which must be rejected explicitly: a
        # negative id would otherwise silently select self.chunks[-1].
        chunk_count = len(self.chunks)
        hits = [
            (int(idx), score)
            for idx, score in zip(indices.flatten(), distances.flatten())
            if 0 <= idx < chunk_count
        ]

        if not hits:
            print("No valid indices found within range.")
            return [], []

        results = [(self.chunks[idx], score) for idx, score in hits]
        valid_indices = [idx for idx, _ in hits]

        return results, valid_indices


In [150]:
class SemanticChunker(ChunkerBase):
    """Chunker that merges consecutive sentences while they stay similar."""

    def __init__(self, similarity_threshold=0.6, model_name="all-MiniLM-L6-v2", max_chunk_size=200):
        """Set up the base chunker and remember the merge threshold.

        Args:
            similarity_threshold: Minimum cosine similarity for the next
                sentence to be appended to the chunk being built.
            model_name: Forwarded to ``ChunkerBase``.
            max_chunk_size: Forwarded to ``ChunkerBase``.
        """
        super().__init__(model_name, max_chunk_size)
        self.similarity_threshold = similarity_threshold

    def chunk_text(self, text):
        """Split ``text`` into sentences, merge similar neighbours, print the chunks.

        The result replaces ``self.chunks``; every chunk is also printed with
        a 1-based counter.
        """
        sentences = re.split(r'(?<=[.?!])\s+', text)
        self.chunks = []
        running = sentences[0]

        for sentence in sentences[1:]:
            # Compare the whole chunk built so far against the next sentence.
            keep_together = self._calculate_similarity(running, sentence) >= self.similarity_threshold
            if keep_together:
                running = running + ' ' + sentence
            else:
                self.chunks.append(running)
                running = sentence

        # The chunk still being built when the sentences run out.
        self.chunks.append(running)

        for position, chunk in enumerate(self.chunks, start=1):
            print(position, " :", chunk)

    def _calculate_similarity(self, sentence1, sentence2):
        """Return the cosine similarity between the embeddings of two texts."""
        vectors = self.embeddings_model.embed_documents([sentence1, sentence2])
        return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

class RecursiveChunker(ChunkerBase):
    """Chunker that splits long paragraphs into word-limited pieces."""

    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150, max_paragraph_words=80):
        """Set up the base chunker and the paragraph-length cutoff.

        Args:
            model_name: Forwarded to ``ChunkerBase``.
            max_chunk_size: Maximum number of words per emitted piece.
            max_paragraph_words: Paragraphs with more words than this are
                passed through ``recursive_chunk_text``.
        """
        super().__init__(model_name, max_chunk_size)
        self.max_paragraph_words = max_paragraph_words

    def _split_sentences(self, text):
        """Split ``text`` after '.', '?' or '!' followed by whitespace."""
        return re.split(r'(?<=[.?!])\s+', text)

    def recursive_chunk_text(self, text):
        """Split ``text`` into pieces of at most ``max_chunk_size`` words.

        Each piece ends at the last word within the limit that finishes with
        '.', '!' or '?'; when no such word exists the piece is cut at exactly
        ``max_chunk_size`` words. Text already within the limit is returned
        unchanged as a single-element list.

        Implemented as a loop rather than one recursive call per piece, so
        very long inputs cannot exhaust the interpreter's recursion limit and
        the remaining text is not re-joined and re-split for every piece.

        Raises:
            ValueError: If the text needs splitting but ``max_chunk_size`` is
                not positive (no piece could ever make progress).
        """
        limit = self.max_chunk_size
        words = text.split()

        if len(words) <= limit:
            return [text]

        if limit <= 0:
            raise ValueError("max_chunk_size must be a positive integer")

        chunks = []
        start = 0
        while len(words) - start > limit:
            # Walk back from the hard limit to the nearest sentence-ending word.
            end = start + limit
            while end > start and not words[end - 1].endswith(('.', '!', '?')):
                end -= 1

            if end == start:  # No sentence boundary in range, force split at the limit
                end = start + limit

            chunks.append(' '.join(words[start:end]))
            start = end

        # Whatever is left fits within the limit and becomes the final piece.
        chunks.append(' '.join(words[start:]))
        return chunks

    def chunk_text(self, text):
        """Replace ``self.chunks`` with chunks built from blank-line separated paragraphs.

        Paragraphs longer than ``max_paragraph_words`` words are split with
        ``recursive_chunk_text``; shorter ones are kept as they are.
        """
        paragraphs = text.split('\n\n')
        self.chunks = []

        for paragraph in paragraphs:
            if len(paragraph.split()) > self.max_paragraph_words:
                self.chunks.extend(self.recursive_chunk_text(paragraph))
            else:
                self.chunks.append(paragraph)


In [151]:
def read_text_from_file(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        The decoded file contents.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

def read_docx(file_path):
    """Return the text of a .docx file as one newline-joined string.

    Paragraph texts come first, in document order. Every table row follows
    as a single line whose stripped cell texts are separated by tabs.
    """
    doc = Document(file_path)

    # Body paragraphs, one entry each.
    lines = [para.text for para in doc.paragraphs]

    # Table rows are appended after all paragraphs, one tab-separated line per row.
    lines.extend(
        '\t'.join(cell.text.strip() for cell in row.cells)
        for table in doc.tables
        for row in table.rows
    )

    return '\n'.join(lines)

# def query_similar_chunks(query, chunker):
#     query_embedding = chunker.embed_query(query)
#     top_k = 5
#     results, indices = chunker.search_similar_chunks(query_embedding, top_k)

#     # for idx in indices:
#     #   if 0 <= idx < len(chunker.chunks):
#     #     print(f"Chunk at index {idx}: {chunker.chunks[idx]}")
#     #   else:
#     #     print(f"Index {idx} is out of bounds.")

#     for i in range(min(top_k, len(results))):
#       result, similarity_score = results[i]
#       idx = indices[i]
#       print(f"{i+1}. Chunk: {result}")
#       print(f"   Similarity Score: {similarity_score:.4f}")
#       print(f"   Index in original list: {idx}")
#       print("-" * 40)
def query_similar_chunks(query, chunker, top_k=5):
    """Embed ``query``, search the chunker's FAISS index and print the hits.

    Args:
        query: Query text.
        chunker: Object providing ``embed_query(query)`` and
            ``search_similar_chunks(query_embedding, top_k)``.
        top_k: Maximum number of hits to request and print.

    Returns:
        None. Results are printed only. The value labelled "Similarity Score"
        is the raw score returned by the search, so how to read it depends on
        the index type that was searched.
    """
    query_embedding = chunker.embed_query(query)
    results, indices = chunker.search_similar_chunks(query_embedding, top_k)

    print(f"Indices: {indices}")  # Print indices for debugging
    # A length check works for lists and NumPy arrays alike; truth-testing a
    # multi-element array would raise.
    if len(indices) == 0:
        print("No similar chunks found.")
        return

    for i in range(min(top_k, len(results))):
        chunk, similarity_score = results[i]
        idx = indices[i]
        print(f"{i+1}. Chunk: {chunk}")
        print(f"   Similarity Score: {similarity_score:.4f}")
        print(f"   Index in original list: {idx}")
        print("-" * 40)



# Semantic Chunking

In [141]:
# Example usage
# file_names = [ 'Samsung_TC.txt','Netflix_TC.txt' ]  # List of file names to process
file_names = ['Netflix_TC.txt']  # List of file names to process
chunker = SemanticChunker()  # Initialize chunker
all_chunks = []  # Chunk texts of every processed file, in index order
all_embeddings = []  # One embedding matrix per processed file

for file_name in file_names:
    if file_name.endswith('.docx'):
        text = read_docx(file_name)
    elif file_name.endswith('.txt'):
        text = read_text_from_file(file_name)
    else:
        print(f"Unsupported file type for {file_name}")
        continue

    chunker.chunk_text(text)  # Replaces chunker.chunks with this file's chunks
    chunker.embed_chunks()   # Embed this file's chunks
    embeddings = chunker.store_embeddings_in_memory()  # Get embeddings as numpy array
    all_chunks.extend(chunker.chunks)  # Keep the texts in the same order as the vectors
    all_embeddings.append(embeddings)  # Append embeddings to the list

if not all_embeddings:
    raise ValueError("No supported files were processed; nothing to index.")

# Combine all embeddings into a single numpy array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# chunk_text() resets chunker.chunks for every file, so after the loop the
# chunker only knows the last file. Restore the full lists so that row i of
# the index corresponds to chunker.chunks[i] when searching.
chunker.chunks = all_chunks
chunker.chunk_embeddings = all_embeddings.tolist()

# Store all embeddings in a single FAISS index
# (IndexFlatL2 ranks by L2 distance, so smaller scores mean closer matches)
index_path = "all_files_index.index"
d = all_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(all_embeddings)
faiss.write_index(index, index_path)
print(f"Number of embeddings stored in FAISS: {index.ntotal}")
print(f"FAISS index saved at: {index_path}")



1  : Netflix Terms of Use
Netflix provides a personalized subscription service that allows our members to access entertainment content (“Netflix content”) over the Internet on certain Internet-connected TVs, computers and other devices ("Netflix ready devices").
2  : These Terms of Use govern your use of our service.
3  : As used in these Terms of Use, "Netflix service", "our service" or "the service" means the personalized service provided by Netflix for discovering and accessing Netflix content, including all features and functionalities, recommendations and reviews, our websites, and user interfaces, as well as all content and software associated with our service. References to ‘you’ in these Terms of Use indicate the member who created the Netflix account and whose payment method is charged.
4  : Membership
1.1.
5  : Your Netflix membership will continue until terminated.
6  : To use the Netflix service you must have Internet access and a Netflix ready device, and provide us with o

# Recursive Chunking

In [167]:
# Example usage
# file_names = [ 'Samsung_TC.txt' ]  # List of file names to process
file_names = ['Netflix_TC.txt']  # List of file names to process
chunker = RecursiveChunker()  # Initialize chunker
all_chunks = []  # Chunk texts of every processed file, in index order
all_embeddings = []  # One embedding matrix per processed file

for file_name in file_names:
    if file_name.endswith('.docx'):
        text = read_docx(file_name)
    elif file_name.endswith('.txt'):
        text = read_text_from_file(file_name)
    else:
        print(f"Unsupported file type for {file_name}")
        continue

    chunker.chunk_text(text)  # Replaces chunker.chunks with this file's chunks
    chunker.embed_chunks()   # Embed this file's chunks
    embeddings = chunker.store_embeddings_in_memory()  # Get embeddings as numpy array
    all_chunks.extend(chunker.chunks)  # Keep the texts in the same order as the vectors
    all_embeddings.append(embeddings)  # Append embeddings to the list

if not all_embeddings:
    raise ValueError("No supported files were processed; nothing to index.")

# Combine all embeddings into a single numpy array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# chunk_text() resets chunker.chunks for every file, so after the loop the
# chunker only knows the last file. Restore the full lists so that row i of
# the index corresponds to chunker.chunks[i] when searching.
chunker.chunks = all_chunks
chunker.chunk_embeddings = all_embeddings.tolist()

# Store all embeddings in a single FAISS index
# (IndexFlatIP ranks by inner product, so larger scores mean closer matches)
index_path = "all_files_index.index"
d = all_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(all_embeddings)
faiss.write_index(index, index_path)
print(f"Number of embeddings stored in FAISS: {index.ntotal}")
print(f"FAISS index saved at: {index_path}")

Number of embeddings stored in FAISS: 36
FAISS index saved at: all_files_index.index


In [168]:

# Querying similar chunks based on user input

# Trim stray whitespace so it does not become part of the embedded query.
query = input("Enter your query: ")
query = query.strip()
query_similar_chunks(query, chunker)

Enter your query: What are Payment Methods?
Query embedding: [-0.02443544752895832, 0.06432610005140305, -0.02624577097594738, -0.02540670521557331, -0.11863012611865997, -0.003059344133362174, 0.040795039385557175, -0.0007149440352804959, 0.017736826092004776, 0.036992646753787994, 0.03377222269773483, 0.013538331724703312, -0.007383543066680431, -0.028677470982074738, -0.0419476144015789, -0.09126945585012436, -0.009491856209933758, 0.020931117236614227, 0.06351090222597122, 0.04956262558698654, 0.0016210480825975537, -0.027426041662693024, -0.06253906339406967, 0.011167054064571857, 0.08678759634494781, -0.01248201448470354, -0.00705499155446887, -0.009299100376665592, 0.04435496777296066, 0.0064119938760995865, 0.04559795558452606, 0.05215005204081535, 0.019344573840498924, -0.03354409709572792, -0.13030223548412323, 0.061955008655786514, 0.03317441791296005, -0.024116361513733864, -0.08416393399238586, 0.027248471975326538, -0.02960111014544964, -0.022300606593489647, -0.001004706

In [163]:
# Example usage
# file_names = [ 'Samsung_TC.txt' ]  # List of file names to process
file_names = ['Samsung_TC.txt','Netflix_TC.txt']  # List of file names to process
chunker = RecursiveChunker()  # Initialize chunker
all_chunks = []  # Chunk texts of every processed file, in index order
all_embeddings = []  # One embedding matrix per processed file

for file_name in file_names:
    if file_name.endswith('.docx'):
        text = read_docx(file_name)
    elif file_name.endswith('.txt'):
        text = read_text_from_file(file_name)
    else:
        print(f"Unsupported file type for {file_name}")
        continue

    chunker.chunk_text(text)  # Replaces chunker.chunks with this file's chunks
    chunker.embed_chunks()   # Embed this file's chunks
    embeddings = chunker.store_embeddings_in_memory()  # Get embeddings as numpy array
    all_chunks.extend(chunker.chunks)  # Keep the texts in the same order as the vectors
    all_embeddings.append(embeddings)  # Append embeddings to the list

if not all_embeddings:
    raise ValueError("No supported files were processed; nothing to index.")

# Combine all embeddings into a single numpy array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# chunk_text() resets chunker.chunks for every file, so after the loop the
# chunker only knows the last file while the index holds every file. Restore
# the full lists so that row i of the index corresponds to chunker.chunks[i];
# otherwise hits from earlier files are mapped onto the wrong text and hits
# beyond the last file's chunk count are dropped.
chunker.chunks = all_chunks
chunker.chunk_embeddings = all_embeddings.tolist()

# Store all embeddings in a single FAISS index
# (IndexFlatIP ranks by inner product, so larger scores mean closer matches)
index_path = "all_files_index.index"
d = all_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(all_embeddings)
faiss.write_index(index, index_path)
print(f"Number of embeddings stored in FAISS: {index.ntotal}")
print(f"FAISS index saved at: {index_path}")

Number of embeddings stored in FAISS: 51
FAISS index saved at: all_files_index.index


In [166]:
# Querying similar chunks based on user input

# Trim stray whitespace so it does not become part of the embedded query.
query = input("Enter your query: ")
query = query.strip()
query_similar_chunks(query, chunker)

Enter your query: What are Payment Methods?
Query embedding: [-0.02443544752895832, 0.06432610005140305, -0.02624577097594738, -0.02540670521557331, -0.11863012611865997, -0.003059344133362174, 0.040795039385557175, -0.0007149440352804959, 0.017736826092004776, 0.036992646753787994, 0.03377222269773483, 0.013538331724703312, -0.007383543066680431, -0.028677470982074738, -0.0419476144015789, -0.09126945585012436, -0.009491856209933758, 0.020931117236614227, 0.06351090222597122, 0.04956262558698654, 0.0016210480825975537, -0.027426041662693024, -0.06253906339406967, 0.011167054064571857, 0.08678759634494781, -0.01248201448470354, -0.00705499155446887, -0.009299100376665592, 0.04435496777296066, 0.0064119938760995865, 0.04559795558452606, 0.05215005204081535, 0.019344573840498924, -0.03354409709572792, -0.13030223548412323, 0.061955008655786514, 0.03317441791296005, -0.024116361513733864, -0.08416393399238586, 0.027248471975326538, -0.02960111014544964, -0.022300606593489647, -0.001004706

In [165]:

 #Querying similar chunks based on user input

# Trim stray whitespace so it does not become part of the embedded query.
query = input("Enter your query: ")
query = query.strip()
query_similar_chunks(query, chunker)

Enter your query: Tell me about Samsung Repair Service?
Query embedding: [-0.17534057796001434, 0.038957901298999786, 0.07168073952198029, -0.05924196168780327, -0.016197102144360542, -0.007155026309192181, 0.007627218030393124, 0.06780639290809631, -0.05567319691181183, 0.03392814099788666, 0.0023585734888911247, 0.05499417707324028, 0.05208814889192581, -0.01727146841585636, -0.04349059611558914, -0.09509707242250443, -0.010309889912605286, -0.009285918436944485, -0.06406436860561371, -0.008375595323741436, -0.04124922677874565, 0.04421263933181763, -0.05326984077692032, -0.02845717780292034, 0.022765368223190308, -0.027410104870796204, -0.023140957579016685, -0.019680742174386978, 0.01370469480752945, -0.06446986645460129, -0.0015020162099972367, 0.05304665490984917, 0.028970342129468918, 0.031401291489601135, -0.04569243639707565, -0.04462385177612305, -0.008950679562985897, -0.02787860296666622, -0.02806185930967331, 0.00119238521438092, -0.08007792383432388, 0.03224433586001396, 