## Step 1: Extract content from PDF files using PyPDF2

In [3]:
!pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


## Step 2: Chunk the extracted text into short contiguous passages using NLTK (the Step 1 extraction code is included in the same cell)

In [4]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [5]:
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer

# Download the Punkt tokenizer data used by the NLTK tokenizers imported above.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The result of ``extract_text()`` for each page, concatenated in page
        order with no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file handle is still open.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

def chunk_text(text, chunk_size=100, sentence_tokenizer=None, word_tokenizer=None):
    """Split text into chunks of whole sentences, each at most ``chunk_size`` tokens.

    Sentences are never split: a single sentence longer than ``chunk_size``
    becomes its own (oversized) chunk. Each chunk is the space-joined token
    list of its sentences.

    Args:
        text: The text to split.
        chunk_size: Maximum number of word tokens per chunk (exceeded only by
            a single sentence that is itself longer).
        sentence_tokenizer: Optional callable ``str -> list[str]`` producing
            sentences. Defaults to NLTK's ``sent_tokenize``.
        word_tokenizer: Optional callable ``str -> list[str]`` producing word
            tokens. Defaults to NLTK's ``word_tokenize``.

    Returns:
        A list of non-empty chunk strings in document order; ``[]`` when the
        text yields no tokens.
    """
    if sentence_tokenizer is None:
        sentence_tokenizer = sent_tokenize
    if word_tokenizer is None:
        word_tokenizer = word_tokenize

    chunks = []
    current_chunk = []

    for sentence in sentence_tokenizer(text):
        sentence_tokens = word_tokenizer(sentence)
        # Flush only a non-empty buffer: an over-long sentence arriving on an
        # empty buffer must not emit an empty "" chunk.
        if current_chunk and len(current_chunk) + len(sentence_tokens) > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
        current_chunk.extend(sentence_tokens)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def embed_chunks(chunks):
    """Encode text chunks with the notebook-level ``model``.

    Args:
        chunks: The texts to embed.

    Returns:
        Whatever ``model.encode(chunks)`` returns. ``model`` is looked up as a
        global at call time, so it must be defined before this is called.
    """
    return model.encode(chunks)

# SBERT model shared by every embedding call in this notebook
model = SentenceTransformer('all-MiniLM-L6-v2')


# Locations of the three source books
pdf_path1 = '/content/21 Lessons for the 21st Century ( PDFDrive ).pdf'
pdf_path2 = '/content/Homo Deus_ A Brief History of Tomorrow ( PDFDrive ).pdf'
pdf_path3 = '/content/Sapiens_ A Brief History of Humankind ( PDFDrive ).pdf'

# Raw text per book, plus a single string holding all three back to back
text1, text2, text3 = (
    extract_text_from_pdf(book_path)
    for book_path in (pdf_path1, pdf_path2, pdf_path3)
)
all_texts = "".join((text1, text2, text3))

# To inspect the extraction, print a slice of a book, e.g. text2[:1000]

# Sentence-aligned chunks per book
chunks1, chunks2, chunks3 = (
    chunk_text(book_text) for book_text in (text1, text2, text3)
)

# One embedding row per chunk, per book
embeddings1, embeddings2, embeddings3 = (
    embed_chunks(book_chunks) for book_chunks in (chunks1, chunks2, chunks3)
)

# Report the shape of each book's embedding array, one per line
print(embeddings1.shape, embeddings2.shape, embeddings3.shape, sep="\n")

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(1522, 384)
(1896, 384)
(1864, 384)



## Step 3: Implement the RAPTOR indexing method

- Clustering the Embedded Chunks: We'll use Gaussian Mixture Models (GMMs) with soft clustering.
- Summarizing the Clusters: We'll use a local large language model (LLM), loaded through the Hugging Face `transformers` library, to create concise representations of the grouped texts.
- Recursive Clustering and Summarization: We'll recursively apply the clustering and summarization process until a hierarchical tree structure is formed.

For summarization, we'll use a model like BART or T5 from Hugging Face.

In [1]:
# scikit-learn supplies the GaussianMixture class imported in Step 3.1.
!pip install scikit-learn



## Step 3.1: Clustering with Gaussian Mixture Models

In [6]:
import numpy as np
from sklearn.mixture import GaussianMixture
# Stack the per-book chunk embeddings row-wise, in book order (1, 2, 3), into one array.
all_embeddings = np.vstack([embeddings1, embeddings2, embeddings3])


def cluster_embeddings(embeddings, n_components=5, random_state=42):
    """Fit a Gaussian mixture to the embeddings and assign each row a cluster.

    Args:
        embeddings: NumPy array of embeddings, one row per sample. A 1-D array
            is reshaped to a single column (see the note below).
        n_components: Requested number of mixture components. Capped at the
            number of samples, because a mixture cannot be fitted with more
            components than samples.
        random_state: Seed passed to ``GaussianMixture`` so that repeated runs
            produce the same clustering. Pass ``None`` for unseeded behavior.

    Returns:
        A ``(cluster_labels, gmm)`` tuple: the hard label predicted for each
        row, and the fitted ``GaussianMixture``.
    """
    # Reshape embeddings to 2D if it's 1D
    if embeddings.ndim == 1:
        # NOTE(review): this makes every scalar its own one-feature sample; if a
        # 1-D input is really a single embedding vector, reshape(1, -1) may be
        # what was intended — confirm with callers.
        embeddings = embeddings.reshape(-1, 1)  # Reshape to a column vector

    # Avoid the error raised when more components than samples are requested.
    n_components = min(n_components, embeddings.shape[0])

    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type='full',
        random_state=random_state,
    )
    gmm.fit(embeddings)
    cluster_labels = gmm.predict(embeddings)
    return cluster_labels, gmm

# Cluster the stacked chunk embeddings of all three books (`all_embeddings`);
# `cluster_labels` has one entry per row of `all_embeddings`.
cluster_labels, gmm = cluster_embeddings(all_embeddings)



## Step 3.2: Milvus Integration
After obtaining the embeddings and cluster_labels, insert the embeddings into Milvus for efficient retrieval and further processing.

In [10]:
# Remove the installed grpcio and google-cloud-pubsub packages; the next cell installs
# a pinned grpcio version.
!pip uninstall -y grpcio google-cloud-pubsub

Found existing installation: grpcio 1.65.1
Uninstalling grpcio-1.65.1:
  Successfully uninstalled grpcio-1.65.1
Found existing installation: google-cloud-pubsub 2.22.0
Uninstalling google-cloud-pubsub-2.22.0:
  Successfully uninstalled google-cloud-pubsub-2.22.0


In [11]:
# Install the pinned grpcio and pymilvus versions used by the Milvus cells below.
!pip install grpcio==1.49.1
!pip install pymilvus==2.1.0

Collecting grpcio==1.49.1
  Using cached grpcio-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Using cached grpcio-1.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
Installing collected packages: grpcio
Successfully installed grpcio-1.49.1


In [None]:
## Prerequisite: a running Milvus server must be reachable before the connection cell below will work.
## Milvus is typically started through Docker, which has to be set up separately.
## Setup reference: https://github.com/zilliz-bootcamp/milvus_tutorials_on_googlecolab/blob/main/Milvus_tutorial.ipynb

### Milvus Connection

In [12]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect to Milvus server (a Milvus instance must already be running at this address)
connections.connect("default", host="localhost", port="19530")  # Update host and port if different

# Define schema for the collection; the vector dimension is taken from the data
# so it always matches the embedding model's output.
embedding_dim = all_embeddings.shape[1]
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)
]
schema = CollectionSchema(fields, "Collection for sentence embeddings")

# Create the collection
collection_name = "sentence_embeddings"
collection = Collection(name=collection_name, schema=schema)

# Insert embeddings into Milvus. `all_embeddings` is the stacked array built in
# Step 3.1; the id of each entity is its row index in that array.
# NOTE(review): re-running this cell inserts the same ids again — drop the
# collection first if a clean re-run is needed.
ids = list(range(len(all_embeddings)))  # Generate IDs for each embedding
entities = [
    ids,
    all_embeddings.tolist()  # plain Python lists of floats, one per entity
]

collection.insert(entities)
collection.load()

# Now you can perform further operations with Milvus like querying or updating embeddings

MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530. Timeout)>

## Step 3.3: Summarize Clusters Using an LLM

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# reuse them instead of reloading the checkpoint each time.
_SUMMARIZER_CACHE = {}


# Function to summarize a list of texts
def summarize_texts(texts, model_name="facebook/bart-large-cnn"):
    """Summarize each text independently with a seq2seq model.

    Args:
        texts: Iterable of strings to summarize.
        model_name: Hugging Face checkpoint name, loaded with ``AutoTokenizer``
            and ``AutoModelForSeq2SeqLM``. It is loaded on first use and cached
            for later calls.

    Returns:
        A list with one summary string per input text, in input order. An
        empty input returns ``[]`` without loading any model.
    """
    texts = list(texts)
    if not texts:
        return []

    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    # Local name `summarizer` keeps this distinct from the notebook's global
    # SentenceTransformer `model`.
    tokenizer, summarizer = _SUMMARIZER_CACHE[model_name]

    summaries = []
    for text in texts:
        # Inputs longer than 512 tokens are truncated before generation.
        # NOTE(review): the "summarize: " prefix is a T5-style prompt; confirm
        # it is wanted for the default BART checkpoint.
        inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = summarizer.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries

# `cluster_labels` has one label per row of `all_embeddings`, and those rows are
# the chunk embeddings of book 1, 2, 3 in order. The matching texts are therefore
# the chunk lists in the same order. (`all_texts` is a single concatenated string,
# so iterating over it would pair individual characters with the labels.)
all_chunks = chunks1 + chunks2 + chunks3
if len(all_chunks) != len(cluster_labels):
    raise ValueError(
        f"Expected one cluster label per chunk, got {len(all_chunks)} chunks "
        f"and {len(cluster_labels)} labels"
    )

# Group the chunk texts by their cluster labels
cluster_texts = {i: [] for i in range(int(max(cluster_labels)) + 1)}
for text, label in zip(all_chunks, cluster_labels):
    cluster_texts[int(label)].append(text)

# Summarize each cluster (one summary per chunk in the cluster)
cluster_summaries = {}
for label, texts in cluster_texts.items():
    cluster_summaries[label] = summarize_texts(texts)

# Print the summaries
for label, summaries in cluster_summaries.items():
    print(f"Cluster {label} Summaries:")
    for i, summary in enumerate(summaries):
        print(f"  Summary {i + 1}: {summary}")


## Step 3.4: Re-embed the Summarized Texts

In [None]:
def reembed_texts(texts, model):
    """Embed ``texts`` with the given model.

    Args:
        texts: The texts to embed.
        model: Any object exposing ``encode(texts)``.

    Returns:
        The value returned by ``model.encode(texts)``, unchanged.
    """
    encoded = model.encode(texts)
    return encoded

# Build one text per cluster by space-joining that cluster's summaries, then embed
# those texts with `model`, the SentenceTransformer loaded in Step 2.
summary_texts = list(map(" ".join, cluster_summaries.values()))
summary_embeddings = reembed_texts(summary_texts, model)

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 8f1b23c4e2b7484f8150940a9b734983
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: llamacollection
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: llamacollection


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

## Step 3.5: Recursive Clustering and Summarization

In [None]:
def recursive_clustering(embeddings, model, depth=3, n_components=5, texts=None):
    """Recursively cluster, summarize and re-embed a set of texts.

    At each level the embeddings are clustered with ``cluster_embeddings``; the
    texts in each non-empty cluster are summarized with ``summarize_texts``,
    the summaries are re-embedded with ``reembed_texts``, and the procedure
    recurses into that cluster with ``depth - 1``.

    Args:
        embeddings: Array of embeddings, one row per text.
        model: Embedding model passed to ``reembed_texts``.
        depth: Remaining recursion levels; ``0`` returns the input unchanged.
        n_components: Requested number of clusters per level. Capped at the
            number of embeddings at that level.
        texts: The texts behind ``embeddings``, in the same order. Required
            whenever clustering actually happens (``depth > 0`` and more than
            one embedding), since summaries are built from text and cannot be
            derived from the embedding vectors.

    Returns:
        ``embeddings`` unchanged at a base case; otherwise a list of the
        embeddings collected from every cluster's recursive call, in ascending
        cluster-label order.

    Raises:
        ValueError: If clustering is needed but ``texts`` is missing or its
            length differs from ``embeddings``.
    """
    if depth == 0 or len(embeddings) <= 1:
        return embeddings

    if texts is None:
        raise ValueError(
            "recursive_clustering needs `texts` aligned with `embeddings` "
            "to summarize each cluster"
        )
    texts = list(texts)
    if len(texts) != len(embeddings):
        raise ValueError(
            f"`texts` ({len(texts)}) and `embeddings` ({len(embeddings)}) "
            "must have the same length"
        )

    # Small clusters deeper in the recursion can hold fewer items than the
    # requested number of components; never request more clusters than items.
    level_components = min(n_components, len(embeddings))
    cluster_labels, _ = cluster_embeddings(embeddings, n_components=level_components)

    # Group the texts (not the vectors) by cluster so they can be summarized.
    cluster_texts = {}
    for text, label in zip(texts, cluster_labels):
        cluster_texts.setdefault(int(label), []).append(text)

    new_embeddings = []
    for label in sorted(cluster_texts):
        summarized = summarize_texts(cluster_texts[label])
        reembedded = reembed_texts(summarized, model)
        new_embeddings.extend(
            recursive_clustering(
                reembedded,
                model,
                depth=depth - 1,
                n_components=n_components,
                texts=summarized,
            )
        )

    return new_embeddings

# Start the recursive clustering process from the per-cluster summary texts and
# their embeddings (`summary_texts[i]` is the text behind `summary_embeddings[i]`).
final_embeddings = recursive_clustering(summary_embeddings, model, texts=summary_texts)