# Mounting and merging all the file in folder to one txt file


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
import os


def merge_text_files(folder_path, output_file):
    # Create/Open the output file in write mode
    with open(output_file, 'w') as outfile:
        # Check if folder exists
        if not os.path.isdir(folder_path):
            print(f"Folder '{folder_path}' does not exist.")
            return

        # Check if the folder contains any text files
        text_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
        if not text_files:
            print(f"No text files found in folder '{folder_path}'.")
            return

        # Iterate over each text file in the folder
        for filename in text_files:
            file_path = os.path.join(folder_path, filename)
            # Print the name of the file being processed
            print(f'Processing file: {filename}')
            try:
                # Open the text file in read mode
                with open(file_path, 'r') as infile:
                    # Read the content of the file
                    content = infile.read()
                    # Print the content of the file
                    print(f'Content of {filename}:\n{content}')
                    # Write the content to the output file
                    outfile.write(content)
                    outfile.write("\n")  # Optional: Add a newline to separate the contents of different files
            except Exception as e:
                print(f"Error reading file {filename}: {e}")

# Example usage
folder_path = '/content/drive/MyDrive/Data/business'  # Replace with the path to your folder containing the text files
output_file = '/content/drive/MyDrive/Data/mergedbusiness.txt'  # Output file name

merge_text_files(folder_path, output_file)
print(f'All files have been merged into {output_file}')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing file: business_100.txt
Content of business_100.txt:
US economy still growing says Fed

Most areas of the US saw their economy continue to expand in December and early January, the US Federal Reserve said in its latest Beige Book report.

Of the 12 US regions it identifies for the study, 11 showed stronger economic growth, with only the Cleveland area falling behind with a "mixed" rating. Consumer spending was higher in December than November, and festive sales were also up on 2003. The employment picture also improved, the Fed said.

"Labour markets firmed in a number of districts, but wage pressures generally remained modest," the Beige Book said. "Several districts reported higher prices for building materials and manufacturing inputs, but most reported steady or only slightly higher overall price levels." The report added that residential real e

In [None]:
import os

# Specify the path to your folder in Google Drive containing the PDF files
folder_path = "/content/drive/MyDrive/Data"

# Change the current working directory to the specified folder
os.chdir(folder_path)

# List all files in the current directory (optional)
files = os.listdir()
print("Files in the folder:", files)


def read_txt(file_path):
    """
    Reads text from a .txt file and returns it as a string.

    Args:
        file_path (str): Path to the .txt file.

    Returns:
        str: The extracted text from the .txt file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Example usage
file_path = 'mergedbusiness.txt'
txt_text = read_txt(file_path)
# print(txt_text)


Files in the folder: ['paper.pdf', 'bert_embeddings.npy', 'resume.pdf', 'inputFile.txt', 'indices.faiss', 'food', 'merged.txt', 'Book.txt', 'business', 'mergedbusiness.txt']


# STORING THE EMBEDDINGS IN VECTOR DB

In [None]:
import re
import numpy as np
import faiss  # Import Faiss for vector database functionality
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5"):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.index = faiss.IndexFlatL2(768)  # Initialize Faiss index for 768-dimensional embeddings

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def _embed_sentences(self, sentences):
        embeddings = self.embeddings_model.embed_documents(sentences)
        return embeddings

    def _add_to_faiss_index(self, embeddings):
        # Convert embeddings to numpy array
        embeddings_np = np.array(embeddings)
        # Add embeddings to Faiss index
        self.index.add(embeddings_np.astype(np.float32))

    def _search_in_faiss_index(self, query_embedding, top_k=5):
        # Perform similarity search in Faiss index
        distances, indices = self.index.search(np.array([query_embedding]).astype(np.float32), top_k)
        return distances, indices

    def chunk_text(self, text):
        # Split text into individual sentences
        sentences = self._split_sentences(text)

        # Embed sentences
        embeddings = self._embed_sentences(sentences)

        # Add embeddings to Faiss index
        self._add_to_faiss_index(embeddings)

        # Initialize variables
        chunks = []
        current_chunk = sentences[0]
        current_embedding = embeddings[0]

        # Iterate over sentences to form chunks
        for i in range(1, len(sentences)):
            next_sentence = sentences[i]
            next_embedding = embeddings[i]

            # Calculate cosine similarity between current and next sentence embeddings
            similarity_score = cosine_similarity([current_embedding], [next_embedding])[0][0]

            # Check if similarity score is above threshold
            if similarity_score >= self.similarity_threshold:
                current_chunk += ' ' + next_sentence
                current_embedding = np.mean([current_embedding, next_embedding], axis=0)
            else:
                chunks.append(current_chunk)
                current_chunk = next_sentence
                current_embedding = next_embedding

        # Append the last chunk
        chunks.append(current_chunk)

        # Print the number of chunks
        print(f'Number of chunks formed: {len(chunks)}')
        # Print each chunk
        for i, chunk in enumerate(chunks):
            print(f'Chunk {i+1}: {chunk}\n')

        return chunks

    def similarity_search(self, query, text, top_k=5):
        # Chunk the text
        chunks = self.chunk_text(text)

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Search in Faiss index
        distances, indices = self._search_in_faiss_index(query_embedding, top_k)

        # Retrieve top-k chunks based on Faiss indices
        top_k_chunks = [chunks[idx] for idx in indices[0]]
        top_k_similarities = [1 - dist for dist in distances[0]]  # Convert distance to similarity

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"

# Example usage
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

file_path = "mergedbusiness.txt"
text = read_text_from_file(file_path)

sentence_chunker = SentenceChunker()
chunks = sentence_chunker.chunk_text(text)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

AssertionError: 

# SEMANTIC CHUNKING FOR BGE-SMALL
embedding of chunks is mean of combined sentences

In [None]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5"):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def _embed_sentences(self, sentences):
        embeddings = self.embeddings_model.embed_documents(sentences)
        return embeddings

    def chunk_text(self, text):
        # Split text into individual sentences
        sentences = self._split_sentences(text)

        # Embed sentences
        embeddings = self._embed_sentences(sentences)

        # Initialize variables
        self.chunks = []
        self.chunk_embeddings = []
        current_chunk = sentences[0]
        current_embedding = embeddings[0]

        # Iterate over sentences to form chunks
        for i in range(1, len(sentences)):
            next_sentence = sentences[i]
            next_embedding = embeddings[i]

            # Calculate cosine similarity between current and next sentence embeddings
            similarity_score = cosine_similarity([current_embedding], [next_embedding])[0][0]

            # Check if similarity score is above threshold
            if similarity_score >= self.similarity_threshold:
                current_chunk += ' ' + next_sentence
                current_embedding = np.mean([current_embedding, next_embedding], axis=0)
            else:
                self.chunks.append(current_chunk)
                self.chunk_embeddings.append(current_embedding)
                current_chunk = next_sentence
                current_embedding = next_embedding

        # Append the last chunk
        self.chunks.append(current_chunk)
        self.chunk_embeddings.append(current_embedding)

        # Print the number of chunks
        print(f'Number of chunks formed: {len(self.chunks)}')
        # Print each chunk
        for i, chunk in enumerate(self.chunks):
            print(f'Chunk {i+1}: {chunk}\n')

    def similarity_search(self, query, top_k=5):
        # Check if chunks are already formed
        if not self.chunks or not self.chunk_embeddings:
            raise ValueError("Text must be chunked before performing a similarity search.")

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Calculate similarity between query embedding and each chunk
        similarities = []
        for chunk_embedding in self.chunk_embeddings:
            similarity_score = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
            similarities.append(similarity_score)

        # Sort chunks based on similarity scores (descending order)
        sorted_indices = np.argsort(similarities)[::-1]
        top_k_chunks = [self.chunks[idx] for idx in sorted_indices[:top_k]]
        top_k_similarities = [similarities[idx] for idx in sorted_indices[:top_k]]

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"

# Example usage
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

file_path = "mergedbusiness.txt"
text = read_text_from_file(file_path)

sentence_chunker = SentenceChunker()

# Chunk the text once
sentence_chunker.chunk_text(text)






Number of chunks formed: 186
Chunk 1: US economy still growing says Fed

Most areas of the US saw their economy continue to expand in December and early January, the US Federal Reserve said in its latest Beige Book report.

Chunk 2: Of the 12 US regions it identifies for the study, 11 showed stronger economic growth, with only the Cleveland area falling behind with a "mixed" rating.

Chunk 3: Consumer spending was higher in December than November, and festive sales were also up on 2003.

Chunk 4: The employment picture also improved, the Fed said.

Chunk 5: "Labour markets firmed in a number of districts, but wage pressures generally remained modest," the Beige Book said. "Several districts reported higher prices for building materials and manufacturing inputs, but most reported steady or only slightly higher overall price levels." The report added that residential real estate activity remained strong and that commercial real estate activity strengthened in most districts. "Office leas

In [None]:
# Perform similarity search against a query
query ="What are the effects of rising fuel costs?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print the top-k chunks and their similarity scores
for i, (chunk, similarity) in enumerate(zip(top_k_chunks, top_k_similarities)):
    print(f"Top {i+1} Chunk: {chunk}\nSimilarity Score: {similarity}")

Top 1 Chunk: Small firms 'hit by rising costs'

Rising fuel and materials costs are hitting confidence among the UK's small manufacturers despite a rise in output, business lobby group the CBI says. A CBI quarterly survey found output had risen by the fastest rate in seven years but many firms were seeing the benefits offset by increasing expenses. The CBI also found spending on innovation, training and retraining is forecast to go up over the next year. However, firms continue to scale back investment in buildings and machinery.
Similarity Score: 0.7137263600257702
Top 2 Chunk: The curbs were introduced earlier this year to ward off the risk that rapid expansion might lead to soaring prices. There were also fears that too much stress might be placed on the fragile banking system.
Similarity Score: 0.702712698325919
Top 3 Chunk: World oil prices have risen by more than 60% since the start of the year as production struggles to keep pace with soaring demand.
Similarity Score: 0.64916575

# Embedding the CHUNKS
embedding is done after chunks are formed
more similarity score in this case

In [None]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5"):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def chunk_text(self, text):
        # Split text into individual sentences
        sentences = self._split_sentences(text)

        # Initialize variables
        self.chunks = []
        current_chunk = sentences[0]

        # Iterate over sentences to form chunks
        for i in range(1, len(sentences)):
            next_sentence = sentences[i]

            # Calculate cosine similarity between current and next sentence embeddings
            similarity_score = self._calculate_similarity(current_chunk, next_sentence)

            # Check if similarity score is above threshold
            if similarity_score >= self.similarity_threshold:
                current_chunk += ' ' + next_sentence
            else:
                self.chunks.append(current_chunk)
                current_chunk = next_sentence

        # Append the last chunk
        self.chunks.append(current_chunk)

        # Print the number of chunks formed
        print(f'Number of chunks formed: {len(self.chunks)}')

    def _calculate_similarity(self, sentence1, sentence2):
        # Embed the sentences
        embeddings = self.embeddings_model.embed_documents([sentence1, sentence2])
        embedding1 = embeddings[0]
        embedding2 = embeddings[1]

        # Calculate cosine similarity between embeddings
        similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
        return similarity_score

    def embed_chunks(self):
        # Embed each chunk using embeddings model
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)

    def similarity_search(self, query, top_k=5):
        # Check if chunks are already formed and embedded
        if not self.chunks or not self.chunk_embeddings:
            raise ValueError("Text must be chunked and embedded before performing a similarity search.")

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Calculate similarity between query embedding and each chunk embedding
        similarities = []
        for chunk_embedding in self.chunk_embeddings:
            similarity_score = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
            similarities.append(similarity_score)

        # Sort chunks based on similarity scores (descending order)
        sorted_indices = np.argsort(similarities)[::-1]
        top_k_chunks = [self.chunks[idx] for idx in sorted_indices[:top_k]]
        top_k_similarities = [similarities[idx] for idx in sorted_indices[:top_k]]

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"

# Example usage
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

file_path = "mergedbusiness.txt"
text = read_text_from_file(file_path)

sentence_chunker = SentenceChunker()

# Chunk the text once
sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()






Number of chunks formed: 248


In [None]:
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What are the effects of rising fuel costs?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

# Generate prompt example
# query = "How does globalization impact business operations?"
# similar_text = "Globalization affects businesses by opening up new markets and increasing competition."
# prompt = sentence_chunker.generate_prompt(query, similar_text)
# print(prompt)

Chunk 1: US economy still growing says Fed

Most areas of the US saw their economy continue to expand in December and early January, the US Federal Reserve said in its latest Beige Book report.
Chunk 2: Of the 12 US regions it identifies for the study, 11 showed stronger economic growth, with only the Cleveland area falling behind with a "mixed" rating.
Chunk 3: Consumer spending was higher in December than November, and festive sales were also up on 2003.
Chunk 4: The employment picture also improved, the Fed said.
Chunk 5: "Labour markets firmed in a number of districts, but wage pressures generally remained modest," the Beige Book said. "Several districts reported higher prices for building materials and manufacturing inputs, but most reported steady or only slightly higher overall price levels." The report added that residential real estate activity remained strong and that commercial real estate activity strengthened in most districts.
Chunk 6: "Office leasing was especially brisk

# Recursive Chunking

In [54]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5", max_chunk_size=200, max_paragraph_words=100):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []
        self.max_chunk_size = max_chunk_size
        self.max_paragraph_words = max_paragraph_words

    def _split_paragraphs(self, text):
        # Split text into paragraphs based on newline characters
        return text.split('\n')

    def _split_sentences(self, paragraph):
        # Split paragraph into sentences
        return re.split(r'(?<=[.?!])\s+', paragraph)

    def chunk_text(self, text):
        # Split text into paragraphs
        paragraphs = self._split_paragraphs(text)

        # Initialize variables
        self.chunks = []

        for paragraph in paragraphs:
            # Check if paragraph exceeds word limit
            if len(paragraph.split()) > self.max_paragraph_words:
                # Split paragraph into sentences
                sentences = self._split_sentences(paragraph)

                current_chunk = sentences[0]

                # Iterate over sentences to form chunks
                for i in range(1, len(sentences)):
                    next_sentence = sentences[i]

                    # Check if similarity score is above threshold or chunk size is within limit
                    similarity_score = self._calculate_similarity(current_chunk, next_sentence)
                    if similarity_score >= self.similarity_threshold or len(current_chunk.split()) + len(next_sentence.split()) <= self.max_chunk_size:
                        current_chunk += ' ' + next_sentence
                    else:
                        self.chunks.append(current_chunk)
                        current_chunk = next_sentence

                # Append the last chunk
                self.chunks.append(current_chunk)

            else:
                self.chunks.append(paragraph)

        # Print the number of chunks formed
        print(f'Number of chunks formed: {len(self.chunks)}')

    def _calculate_similarity(self, sentence1, sentence2):
        # Embed the sentences
        embeddings = self.embeddings_model.embed_documents([sentence1, sentence2])
        embedding1 = embeddings[0]
        embedding2 = embeddings[1]

        # Calculate cosine similarity between embeddings
        similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
        return similarity_score

    def embed_chunks(self):
        # Embed each chunk using embeddings model
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)

    def similarity_search(self, query, top_k=5):
        # Check if chunks are already formed and embedded
        if not self.chunks or not self.chunk_embeddings:
            raise ValueError("Text must be chunked and embedded before performing a similarity search.")

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Calculate similarity between query embedding and each chunk embedding
        similarities = []
        for chunk_embedding in self.chunk_embeddings:
            similarity_score = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
            similarities.append(similarity_score)

        # Sort chunks based on similarity scores (descending order)
        sorted_indices = np.argsort(similarities)[::-1]
        top_k_chunks = [self.chunks[idx] for idx in sorted_indices[:top_k]]
        top_k_similarities = [similarities[idx] for idx in sorted_indices[:top_k]]

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"

# Example usage
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

file_path = "mergedbusiness.txt"
text = read_text_from_file(file_path)

sentence_chunker = SentenceChunker(max_paragraph_words=100)

# Chunk the text using paragraph and sentence chunking
sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()




Number of chunks formed: 294


In [55]:
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What are the effects of rising fuel costs?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Chunk 1: US economy still growing says Fed
Chunk 2: 
Chunk 3: Most areas of the US saw their economy continue to expand in December and early January, the US Federal Reserve said in its latest Beige Book report.
Chunk 4: 
Chunk 5: Of the 12 US regions it identifies for the study, 11 showed stronger economic growth, with only the Cleveland area falling behind with a "mixed" rating. Consumer spending was higher in December than November, and festive sales were also up on 2003. The employment picture also improved, the Fed said.
Chunk 6: 
Chunk 7: "Labour markets firmed in a number of districts, but wage pressures generally remained modest," the Beige Book said. "Several districts reported higher prices for building materials and manufacturing inputs, but most reported steady or only slightly higher overall price levels." The report added that residential real estate activity remained strong and that commercial real estate activity strengthened in most districts. "Office leasing was espec

# Testing Semantic Chunking on Letter
Result=> Good chunks formed

In [75]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5"):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []

    def _split_sentences(self, text):
        return re.split(r'(?<=[.?!])\s+', text)

    def chunk_text(self, text):
        # Split text into individual sentences
        sentences = self._split_sentences(text)

        # Initialize variables
        self.chunks = []
        current_chunk = sentences[0]

        # Iterate over sentences to form chunks
        for i in range(1, len(sentences)):
            next_sentence = sentences[i]

            # Calculate cosine similarity between current and next sentence embeddings
            similarity_score = self._calculate_similarity(current_chunk, next_sentence)

            # Check if similarity score is above threshold
            if similarity_score >= self.similarity_threshold:
                current_chunk += ' ' + next_sentence
            else:
                self.chunks.append(current_chunk)
                current_chunk = next_sentence

        # Append the last chunk
        self.chunks.append(current_chunk)

        # Print the number of chunks formed
        print(f'Number of chunks formed: {len(self.chunks)}')

    def _calculate_similarity(self, sentence1, sentence2):
        # Embed the sentences
        embeddings = self.embeddings_model.embed_documents([sentence1, sentence2])
        embedding1 = embeddings[0]
        embedding2 = embeddings[1]

        # Calculate cosine similarity between embeddings
        similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
        return similarity_score

    def embed_chunks(self):
        # Embed each chunk using embeddings model
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)

    def similarity_search(self, query, top_k=5):
        # Check if chunks are already formed and embedded
        if not self.chunks or not self.chunk_embeddings:
            raise ValueError("Text must be chunked and embedded before performing a similarity search.")

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Calculate similarity between query embedding and each chunk embedding
        similarities = []
        for chunk_embedding in self.chunk_embeddings:
            similarity_score = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
            similarities.append(similarity_score)

        # Sort chunks based on similarity scores (descending order)
        sorted_indices = np.argsort(similarities)[::-1]
        top_k_chunks = [self.chunks[idx] for idx in sorted_indices[:top_k]]
        top_k_similarities = [similarities[idx] for idx in sorted_indices[:top_k]]

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"

text="""
Acme Corporation
123 Business Street
Business City, BC 12345
Tel: (123) 456-7890
Email: info@acmecorp.com

[Date]

[Recipient's Name]
[Recipient's Title]
XYZ Solutions
456 Tech Avenue
Tech City, TC 54321

Dear [Recipient's Name],

I hope this letter finds you in good health and high spirits. I am writing to follow up on our recent discussions regarding the upcoming collaboration between Acme Corporation and XYZ Solutions.

As agreed upon, Acme Corporation will be providing software integration services to enhance the efficiency of your production processes. Our team has conducted a thorough analysis of your current systems and has formulated a detailed plan to integrate the new software seamlessly.

The project timeline has been outlined as follows:
- Phase 1: Requirements Gathering and Analysis (Estimated Completion: [Date])
- Phase 2: Software Development and Testing (Estimated Completion: [Date])
- Phase 3: Implementation and Training (Estimated Completion: [Date])
- Phase 4: Post-Implementation Support and Optimization (Estimated Completion: [Date])

Please find attached the detailed project proposal, which includes scope, objectives, deliverables, and timelines. We are committed to ensuring that each phase of the project is executed with precision and meets the highest standards of quality.

Should you have any questions or require further information, please do not hesitate to contact me directly at (123) 456-7890 or via email at info@acmecorp.com. Our team looks forward to working closely with XYZ Solutions to achieve mutual success and deliver tangible results.

Thank you for your attention to this matter. I appreciate your continued partnership and collaboration.

Best regards,

[Your Name]
[Your Position]
Acme Corporation
"""
sentence_chunker = SentenceChunker()

# Chunk the text once
sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()






Number of chunks formed: 6


In [58]:
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What is the project timeline ?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Chunk 1:  
Acme Corporation
123 Business Street
Business City, BC 12345
Tel: (123) 456-7890
Email: info@acmecorp.com

[Date]

[Recipient's Name]
[Recipient's Title]
XYZ Solutions
456 Tech Avenue
Tech City, TC 54321

Dear [Recipient's Name],

I hope this letter finds you in good health and high spirits. I am writing to follow up on our recent discussions regarding the upcoming collaboration between Acme Corporation and XYZ Solutions. As agreed upon, Acme Corporation will be providing software integration services to enhance the efficiency of your production processes.
Chunk 2: Our team has conducted a thorough analysis of your current systems and has formulated a detailed plan to integrate the new software seamlessly.
Chunk 3: The project timeline has been outlined as follows:
- Phase 1: Requirements Gathering and Analysis (Estimated Completion: [Date])
- Phase 2: Software Development and Testing (Estimated Completion: [Date])
- Phase 3: Implementation and Training (Estimated Completion

# Testing Semantic on Data with numbers having dots after them (1.)
result=bad forms seperate chunks for numbers

In [76]:
text="""1. In the midst of winter, I found there was, within me, an invincible summer.
2. And that makes me happy. For it says that no matter how hard the world pushes against me, within me, there’s something stronger – something better, pushing right back.
3. A person is a person because they are there, they are alive, and they are made in the image of God."""

sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What happened in the midst of winter"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Number of chunks formed: 7
Chunk 1: 1.
Chunk 2: In the midst of winter, I found there was, within me, an invincible summer.
Chunk 3: 2.
Chunk 4: And that makes me happy.
Chunk 5: For it says that no matter how hard the world pushes against me, within me, there’s something stronger – something better, pushing right back.
Chunk 6: 3.
Chunk 7: A person is a person because they are there, they are alive, and they are made in the image of God.
Similar Chunk 1: In the midst of winter, I found there was, within me, an invincible summer.
Similarity Score: 0.7178308683418031

Similar Chunk 2: 3.
Similarity Score: 0.5189677969452556

Similar Chunk 3: 1.
Similarity Score: 0.5163402516378007

Similar Chunk 4: 2.
Similarity Score: 0.5075511573425232

Similar Chunk 5: A person is a person because they are there, they are alive, and they are made in the image of God.
Similarity Score: 0.5054104757617178



# Testing Letters using Recursive Chunking  
results=> bad chunks

In [103]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

class SentenceChunker:
    def __init__(self, similarity_threshold=0.6, model_name="BAAI/bge-small-en-v1.5", max_chunk_size=200, max_paragraph_words=100):
        self.similarity_threshold = similarity_threshold
        self.embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        self.chunks = []
        self.chunk_embeddings = []
        self.max_chunk_size = max_chunk_size
        self.max_paragraph_words = max_paragraph_words

    def _split_paragraphs(self, text):
        # Split text into paragraphs based on newline characters
        return text.split('\n\n')

    def _split_sentences(self, paragraph):
        # Split paragraph into sentences
        return re.split(r'(?<=[.?!])\s+', paragraph)

    def chunk_text(self, text):
        # Split text into paragraphs
        paragraphs = self._split_paragraphs(text)

        # Initialize variables
        self.chunks = []

        for paragraph in paragraphs:
            # Check if paragraph exceeds word limit
            if len(paragraph.split()) > self.max_paragraph_words:
                # Split paragraph into sentences
                sentences = self._split_sentences(paragraph)

                current_chunk = sentences[0]

                # Iterate over sentences to form chunks
                for i in range(1, len(sentences)):
                    next_sentence = sentences[i]

                    # Check if similarity score is above threshold or chunk size is within limit
                    similarity_score = self._calculate_similarity(current_chunk, next_sentence)
                    if similarity_score >= self.similarity_threshold or len(current_chunk.split()) + len(next_sentence.split()) <= self.max_chunk_size:
                        current_chunk += ' ' + next_sentence
                    else:
                        self.chunks.append(current_chunk)
                        current_chunk = next_sentence

                # Append the last chunk
                self.chunks.append(current_chunk)

            else:
                self.chunks.append(paragraph)

        # Print the number of chunks formed
        print(f'Number of chunks formed: {len(self.chunks)}')

    def _calculate_similarity(self, sentence1, sentence2):
        # Embed the sentences
        embeddings = self.embeddings_model.embed_documents([sentence1, sentence2])
        embedding1 = embeddings[0]
        embedding2 = embeddings[1]

        # Calculate cosine similarity between embeddings
        similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
        return similarity_score

    def embed_chunks(self):
        # Embed each chunk using embeddings model
        self.chunk_embeddings = self.embeddings_model.embed_documents(self.chunks)

    def similarity_search(self, query, top_k=5):
        # Check if chunks are already formed and embedded
        if not self.chunks or not self.chunk_embeddings:
            raise ValueError("Text must be chunked and embedded before performing a similarity search.")

        # Embed the query
        query_embedding = self.embeddings_model.embed_documents([query])[0]

        # Calculate similarity between query embedding and each chunk embedding
        similarities = []
        for chunk_embedding in self.chunk_embeddings:
            similarity_score = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
            similarities.append(similarity_score)

        # Sort chunks based on similarity scores (descending order)
        sorted_indices = np.argsort(similarities)[::-1]
        top_k_chunks = [self.chunks[idx] for idx in sorted_indices[:top_k]]
        top_k_similarities = [similarities[idx] for idx in sorted_indices[:top_k]]

        return top_k_chunks, top_k_similarities

    def generate_prompt(self, query, similar_text):
        return f"Query: {query}\nSimilar Text: {similar_text}"


sentence_chunker = SentenceChunker(max_paragraph_words=100)

# Chunk the text using paragraph and sentence chunking

text="""
Acme Corporation
123 Business Street
Business City, BC 12345
Tel: (123) 456-7890
Email: info@acmecorp.com

[Date]

[Recipient's Name]
[Recipient's Title]
XYZ Solutions
456 Tech Avenue
Tech City, TC 54321

Dear [Recipient's Name],

I hope this letter finds you in good health and high spirits. I am writing to follow up on our recent discussions regarding the upcoming collaboration between Acme Corporation and XYZ Solutions.

As agreed upon, Acme Corporation will be providing software integration services to enhance the efficiency of your production processes. Our team has conducted a thorough analysis of your current systems and has formulated a detailed plan to integrate the new software seamlessly.

The project timeline has been outlined as follows:
- Phase 1: Requirements Gathering and Analysis (Estimated Completion: [Date])
- Phase 2: Software Development and Testing (Estimated Completion: [Date])
- Phase 3: Implementation and Training (Estimated Completion: [Date])
- Phase 4: Post-Implementation Support and Optimization (Estimated Completion: [Date])

Please find attached the detailed project proposal, which includes scope, objectives, deliverables, and timelines. We are committed to ensuring that each phase of the project is executed with precision and meets the highest standards of quality.

Should you have any questions or require further information, please do not hesitate to contact me directly at (123) 456-7890 or via email at info@acmecorp.com. Our team looks forward to working closely with XYZ Solutions to achieve mutual success and deliver tangible results.

Thank you for your attention to this matter. I appreciate your continued partnership and collaboration.

Best regards,

[Your Name]
[Your Position]
Acme Corporation
"""
sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()


Number of chunks formed: 12


In [104]:
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What is the project timeline ?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Chunk 1:  
Acme Corporation
123 Business Street
Business City, BC 12345
Tel: (123) 456-7890
Email: info@acmecorp.com
Chunk 2: [Date]
Chunk 3: [Recipient's Name]
[Recipient's Title]
XYZ Solutions
456 Tech Avenue
Tech City, TC 54321
Chunk 4: Dear [Recipient's Name],
Chunk 5: I hope this letter finds you in good health and high spirits. I am writing to follow up on our recent discussions regarding the upcoming collaboration between Acme Corporation and XYZ Solutions.
Chunk 6: As agreed upon, Acme Corporation will be providing software integration services to enhance the efficiency of your production processes. Our team has conducted a thorough analysis of your current systems and has formulated a detailed plan to integrate the new software seamlessly.
Chunk 7: The project timeline has been outlined as follows:
- Phase 1: Requirements Gathering and Analysis (Estimated Completion: [Date])
- Phase 2: Software Development and Testing (Estimated Completion: [Date])
- Phase 3: Implementation an

# Testing Recursive Chunking on Data with numbers having dots after them (1.)
result=good form chunks having numbers with in them

In [73]:
text="""1. In the midst of winter, I found there was, within me, an invincible summer.
2. And that makes me happy. For it says that no matter how hard the world pushes against me, within me, there’s something stronger – something better, pushing right back.
3. A person is a person because they are there, they are alive, and they are made in the image of God."""

sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="What happened in the midst of winter"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Number of chunks formed: 3
Chunk 1: 1. In the midst of winter, I found there was, within me, an invincible summer.
Chunk 2: 2. And that makes me happy. For it says that no matter how hard the world pushes against me, within me, there’s something stronger – something better, pushing right back.
Chunk 3: 3. A person is a person because they are there, they are alive, and they are made in the image of God.
Similar Chunk 1: 1. In the midst of winter, I found there was, within me, an invincible summer.
Similarity Score: 0.7483681803770207

Similar Chunk 2: 3. A person is a person because they are there, they are alive, and they are made in the image of God.
Similarity Score: 0.4713376051650267

Similar Chunk 3: 2. And that makes me happy. For it says that no matter how hard the world pushes against me, within me, there’s something stronger – something better, pushing right back.
Similarity Score: 0.47061808967632635



# Doing on Spotify Terms and Condition Semantic Chunking

In [77]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [96]:
from docx import Document

def read_docx(file_path):
    # Open the document
    doc = Document(file_path)

    full_text = []
    # Extract text from paragraphs
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                row_data.append(cell.text.strip())
            full_text.append('\t'.join(row_data))

    return '\n'.join(full_text)

# Example usage
file_path = 'data.docx'  # Replace with your file path
text = read_docx(file_path)


In [80]:

sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')
# Perform similarity search
query ="How to create a spotify account?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Number of chunks formed: 41
Chunk 1: Spotify Terms of Use
1.
Chunk 2: Introduction
2.
Chunk 3: The Spotify Service Provided by Us
3. Your Use of the Spotify Service
4.
Chunk 4: Content and Intellectual Property Rights
5.
Chunk 5: Customer Support, Information, Questions and Complaints
6. Problems and Disputes
7.
Chunk 6: About these Terms
1. Introduction
Please read these Terms of Use (these "Terms") carefully as they govern your use of (which includes access to) Spotify's personalized services for streaming music and other content, including all of our websites and software applications that incorporate or link to these Terms (collectively, the "Spotify Service") and any music, videos, podcasts, or other material that is made available through the Spotify Service (the "Content"). Use of the Spotify Service may be subject to additional terms and conditions presented by Spotify, which are hereby incorporated by this reference into these Terms. By signing up for, or otherwise using, the 

# Doing on Spotify Terms and Condition Recursive Chunking

In [100]:
from docx import Document

def read_docx(file_path):
    # Open the document
    doc = Document(file_path)

    full_text = []
    # Extract text from paragraphs
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                row_data.append(cell.text.strip())
            full_text.append('\t'.join(row_data))

    return '\n'.join(full_text)

# Example usage
file_path = 'data.docx'  # Replace with your file path
text = read_docx(file_path)
sentence_chunker.chunk_text(text)

# Embed the chunks
sentence_chunker.embed_chunks()
for i, chunk in enumerate(sentence_chunker.chunks):
    print(f'Chunk {i+1}: {chunk}')

print('\n\n')
# Perform similarity search
query ="How to create a spotify account?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Number of chunks formed: 18
Chunk 1: Spotify Terms of Use
1. Introduction
2. The Spotify Service Provided by Us
3. Your Use of the Spotify Service
4. Content and Intellectual Property Rights
5. Customer Support, Information, Questions and Complaints
6. Problems and Disputes
7. About these Terms
1. Introduction
Please read these Terms of Use (these "Terms") carefully as they govern your use of (which includes access to) Spotify's personalized services for streaming music and other content, including all of our websites and software applications that incorporate or link to these Terms (collectively, the "Spotify Service") and any music, videos, podcasts, or other material that is made available through the Spotify Service (the "Content"). Use of the Spotify Service may be subject to additional terms and conditions presented by Spotify, which are hereby incorporated by this reference into these Terms. By signing up for, or otherwise using, the Spotify Service, you agree to these Terms. If

In [101]:
query ="Tell me about withdrawal rights?"
top_k_chunks, top_k_similarities = sentence_chunker.similarity_search(query)

# Print top-k similar chunks
for i, chunk in enumerate(top_k_chunks):
    print(f"Similar Chunk {i+1}: {chunk}")
    print(f"Similarity Score: {top_k_similarities[i]}\n")

Similar Chunk 1: Contact our Customer Support team here for instructions on how to cancel. The cancellation will take effect the day after the last day of the current subscription period, and you will be downgraded to the free version of the Spotify Service. We do not provide refunds or credits for any partial subscription periods, except as expressly stated in these Terms. If you have purchased a Paid Subscription using a Code, your subscription will automatically terminate at the end of the period stated in the Code, or when there is an insufficient pre-paid balance to pay for the Spotify Service. Withdrawal right
If you sign up for a Trial, you agree that the withdrawal right for the Paid Subscription for which you are receiving a Trial ends fourteen (14) days after you start the Trial. If you don't cancel the Paid Subscription before the Trial ends, you lose your right of withdrawal and authorise Spotify to automatically charge you the agreed price each month until you cancel the P

Sending to LLM for a response

In [88]:
from transformers import pipeline

# Define the pipeline with the T5 model
pipe = pipeline("text2text-generation", model="MBZUAI/LaMini-Flan-T5-248M")

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [89]:
combined_text = ' '.join(top_k_chunks)
print(combined_text)

Contact our Customer Support team here for instructions on how to cancel. The cancellation will take effect the day after the last day of the current subscription period, and you will be downgraded to the free version of the Spotify Service. We do not provide refunds or credits for any partial subscription periods, except as expressly stated in these Terms. If you have purchased a Paid Subscription using a Code, your subscription will automatically terminate at the end of the period stated in the Code, or when there is an insufficient pre-paid balance to pay for the Spotify Service. Withdrawal right
If you sign up for a Trial, you agree that the withdrawal right for the Paid Subscription for which you are receiving a Trial ends fourteen (14) days after you start the Trial. If you don't cancel the Paid Subscription before the Trial ends, you lose your right of withdrawal and authorise Spotify to automatically charge you the agreed price each month until you cancel the Paid Subscription.

In [93]:
def generate_responses(query_text, combined_text):
    """
    Generates responses using the T5 model based on the query and similar paragraphs as context.
    """
    responses = []

    # Format input for the T5 model
    input_text = f"Query: {query_text}. Context: {combined_text} Give answer from the content given above. If you are not sure about the answer, reply NOT SURE"

    # Generate response using the T5 model
    response = pipe(input_text, max_length=150, num_return_sequences=1)
    responses.append(response[0]['generated_text'])

    return responses

response=generate_responses(query,combined_text)
print(response)

['The withdrawal rights for the Paid Subscription are limited and exclusive, and you have the right to withdraw and authorise Spotify to automatically charge you automatically each month until you cancel the Paid Subscription.']
