In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
import os

# Specify the path to your folder in Google Drive containing the PDF files
folder_path = "/content/drive/MyDrive/Data"

# Change the current working directory to the specified folder
os.chdir(folder_path)

# List all files in the current directory (optional)
files = os.listdir()
print("Files in the folder:", files)




Mounted at /content/drive
Files in the folder: ['paper.pdf', 'bert_embeddings.npy', 'resume.pdf', 'inputFile.txt', 'indices.faiss', 'food', 'Book.txt', 'merged.txt', 'business', 'data.docx', 'mergedbusiness.txt', 'Samsung_TC.txt', 'data_base.index', 'data_base1.index', 'data.docx_index.index', 'Netflix_TC.txt', 'all_files_index.index']


# LOADING DIFFERENT FILES, MAKING THEIR CHUNKS SAVING THEM INTO A FILE, MAKING THE EMBEDDINGS AND SAVING IN ONE FAISS, AND THEN QUERYING AGAINST EITHER OF INFORMATION IN THE FILES

In [126]:
import re
import os
import numpy as np
import faiss
from langchain_community.embeddings import HuggingFaceEmbeddings
from docx import Document

class ChunkerBase:
    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150):
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []
        self.total_embeddings=[]
        self.max_chunk_size = max_chunk_size

    def embed_chunks(self):
        print("Num of input chunks: ", len(self.chunks))
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)
        print("Num of retrieved embeddings for chunks: ", len(self.chunk_embeddings))

    def embed_query(self, query):
        return self.embeddings_model.embed_documents([query])[0]

    def store_embeddings_in_memory(self):
        return np.array(self.chunk_embeddings).astype(np.float32)

    def search_similar_chunks(self, query_embedding, top_k=5, index_path="jjj.index"):
        if not os.path.exists(index_path):
            print(f"No FAISS index found at {index_path}")
            return [], []

        print('Searching in index:', index_path)
        index = faiss.read_index(index_path)
        query_embedding = np.array([query_embedding]).astype(np.float32)

        distances, indices = index.search(query_embedding, top_k)

        if indices.size == 0:
            print("No similar chunks found.")
            return [], []

        print("Indices of chunks are:", indices)
        print("Distances are:", distances)

        for i, chunk in enumerate(self.get_all_chunks()):
            print(f"Chunk {i+1}: {chunk}")
            print("-" * 40)


        if not os.path.exists(index_path):
            print(f"FAISS index file '{index_path}' not found.")
            return

        try:
            index = faiss.read_index(index_path)
            ntotal = index.ntotal
            d = index.d
            embeddings = np.zeros((ntotal, d), dtype=np.float32)
            index.reconstruct_n(0, ntotal, embeddings)
            self.total_embeddings = embeddings.tolist()
            print(f"Loaded {ntotal} embeddings from FAISS index '{index_path}' into self.total_embeddings.")
        except Exception as e:
            print(f"Error loading FAISS index '{index_path}': {e}")

        retrieved_embeddings = np.array([self.total_embeddings[i] for i in indices[0]]).astype(np.float32)
        results = [(self.chunks[i], distances[0][j]) for j, i in enumerate(indices[0])]
        return results, indices[0]

    def get_all_chunks(self):
        return self.chunks


class RecursiveChunker(ChunkerBase):
    def __init__(self, model_name="all-MiniLM-L6-v2", max_chunk_size=150, max_paragraph_words=80):
        super().__init__(model_name, max_chunk_size)
        self.max_paragraph_words = max_paragraph_words

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def recursive_chunk_text(self, text):
        words = text.split()

        if len(words) <= self.max_chunk_size:
            return [text]

        # Find the best splitting point
        split_point = self.max_chunk_size
        while split_point > 0 and not words[split_point - 1].endswith(('.', '!', '?')):
            split_point -= 1

        if split_point == 0:  # No good splitting point found, force split at max_chunk_size
            split_point = self.max_chunk_size

        # Split the text
        chunk = ' '.join(words[:split_point])
        remaining_text = ' '.join(words[split_point:])

        # Recursive call
        chunks = [chunk] + self.recursive_chunk_text(remaining_text)

        return chunks

    def chunk_text(self, text):
        paragraphs = text.split('\n\n')
        self.chunks = []

        for paragraph in paragraphs:
            if len(paragraph.split()) > self.max_paragraph_words:
                indented_chunks = self.recursive_chunk_text(paragraph)
                self.chunks.extend(indented_chunks)
            else:
                self.chunks.append(paragraph)

        print('Total number of chunks:', len(self.chunks))


def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def read_docx(file_path):
    # Open the document
    doc = Document(file_path)

    full_text = []
    # Extract text from paragraphs
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                row_data.append(cell.text.strip())
            full_text.append('\t'.join(row_data))

    return '\n'.join(full_text)

def save_chunks_to_file(chunks, file_path):
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8'):
            pass  # Create the file if it doesn't exist

    with open(file_path, 'a', encoding='utf-8') as file:
        for chunk in chunks:
            file.write(chunk + '\n\n')  # Separate chunks by double newline
    print(f"Chunks saved to: {file_path}")

def load_chunks_and_add(file_path, chunker):
    if not os.path.exists(file_path):
        print(f"File '{file_path}' does not exist. Cannot load chunks.")
        return
    chunker.chunks = []  # Clear existing chunks
    loaded_chunks = read_chunks_from_file(file_path)
    chunker.chunks.extend(loaded_chunks)
    print(f"Loaded {len(loaded_chunks)} chunks from file: {file_path}")

def read_chunks_from_file(file_path):
    chunks = []
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().strip()
        chunks = text.split('\n\n')  # Split by double newline to separate chunks
    return chunks

def query_similar_chunks(query, chunker):
    query_embedding = chunker.embed_query(query)

    top_k = 5
    results, indices = chunker.search_similar_chunks(query_embedding, top_k)

    if len(indices) == 0:
        print("No similar chunks found.")
        return

    for i in range(min(top_k, len(results))):
        result, similarity_score = results[i]
        idx = indices[i]
        print(f"{i+1}. Chunk: {result}")
        print(f"   Similarity Score: {similarity_score:.4f}")
        print(f"   Index in original list: {idx}")
        print("-" * 40)

# Example usage to load chunks from file and process a new file
file_path = 'jjj.txt'  # Replace with your file path
chunker = RecursiveChunker()



# Process a new file
new_file_path = 'Netflix_TC.txt'  # Replace with your new file path
new_text = read_text_from_file(new_file_path)
chunker.chunk_text(new_text)
chunker.embed_chunks()

# Save updated chunks back to file
save_chunks_to_file(chunker.chunks, file_path)

index_path = "jjj.index"

# Initialize FAISS index if it doesn't exist
if not os.path.exists(index_path):
    all_embeddings = chunker.store_embeddings_in_memory()  # Assuming chunker has been initialized and processed
    d = all_embeddings.shape[1]  # Dimensionality of embeddings
    index = faiss.IndexFlatIP(d)  # Initialize new index
    index.add(all_embeddings)  # Add embeddings to the index
    faiss.write_index(index, index_path)  # Save the index to file
    print(f"Created and saved FAISS index with {index.ntotal} embeddings.")
else:
    # Index file exists, load existing index
    try:
        index = faiss.read_index(index_path)
        print(f"Existing FAISS index loaded from '{index_path}'")

        # Add new embeddings to the existing index
        new_embeddings = chunker.store_embeddings_in_memory()
        index.add(new_embeddings)
        faiss.write_index(index, index_path)  # Save the updated index to file
        print(f"Added {new_embeddings.shape[0]} new embeddings to the existing index. Total embeddings: {index.ntotal}")

    except Exception as e:
        print(f"Error loading existing index from '{index_path}': {e}")




Total number of chunks: 36
Num of input chunks:  36
Num of retrieved embeddings for chunks:  36
Chunks saved to: jjj.txt
Existing FAISS index loaded from 'jjj.index'
Added 36 new embeddings to the existing index. Total embeddings: 81


In [128]:
# Querying similar chunks
query = "What are billing and cancellation terms"  # Replace with your query
load_chunks_and_add(file_path, chunker)  # Load chunks again for querying
query_similar_chunks(query, chunker)

Loaded 81 chunks from file: jjj.txt
Searching in index: jjj.index
Indices of chunks are: [[50 54 52 47 56]]
Distances are: [[0.5222173  0.4541843  0.39985222 0.3970065  0.38186896]]
Chunk 1: Terms and Conditions
IMPORTANT – PLEASE READ CAREFULLY BEFORE REQUESTING A REPAIR SERVICE AND/OR OTHER SERVICES. THESE TERMS AND CONDITIONS CONSTITUTE YOUR AGREEMENT WITH SAMSUNG GULF ELECTRONICS FZE (“SAMSUNG”) FOR REPAIR SERVICE AND ANY RELEVANT DEVICE OR SOFTWARE UPGRADE PROVIDED BY SAMSUNG, ITS AFFILIATES, ITS THIRD-PARTY SUPPLIERS, AND/OR WORKSHOPS (COLLECTIVELY, THE “SERVICE”).
----------------------------------------
Chunk 2: THE BENEFITS CONFERRED BY THESE TERMS AND CONDITIONS ARE IN ADDITION TO ALL RIGHTS AND REMEDIES CONVEYED BY THE CONSUMER PROTECTION LAWS AND REGULATIONS APPLICABLE IN YOUR COUNTRY OF RESIDENCE. PLEASE NOTE THAT SAMSUNG’S THIRD-PARTY SUPPLIERS MAY ALSO HAVE THEIR OWN TERMS AND CONDITIONS THAT GOVERN THE PROVISION OF THE SERVICE IN ADDITION TO THESE TERMS AND CONDITIONS I