In [6]:
%%capture
!pip install qdrant_client
!pip install sentence_transformers
!pip install langchain-community
!pip install pandas
!pip install nltk

# generate test question

In [None]:
import random
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import os

QDRANT_API_KEY= os.getenv('QDRANT_API_KEY')
QDRANT_URL=os.getenv('QDRANT_URL')
COLLECTION_NAME = 'esa-data-indus'

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=200
)

In [2]:
MAX_POINTS = 100000
all_points = []
next_page = None

while len(all_points) < MAX_POINTS:
    points, next_page = client.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=None,
        limit=10000,
        with_payload=True,
        with_vectors=False,
        offset=next_page
    )
    all_points.extend(points)
    if next_page is None:
        break

print(f"Total points fetched: {len(all_points)}")

Total points fetched: 100000


In [3]:
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

def get_first_sentences(text, n=4):
    """Extract first n sentences from a text."""
    if not text or not text.strip():
        return ""
    sentences = sent_tokenize(text.strip())
    return " ".join(sentences[:n]).strip()

records = []
for idx, p in enumerate(all_points):
    doc_id = p.payload.get("id")  # document ID in payload
    text = p.payload.get("content", "")
    chunk_text = get_first_sentences(text, n=4)
    if chunk_text:
        records.append({
            "index": idx,          # index in the all_points list
            "point_id": p.id,      # unique point ID
            "doc_id": doc_id,      # document ID
            "chunk_text": chunk_text
        })

df_chunks = pd.DataFrame(records)
print(f"Total chunks prepared: {len(df_chunks)}")
print(df_chunks.head())

Total chunks prepared: 100000
   index        point_id  doc_id  \
0      0   4929342381722  294357   
1      1   6978102708919  133210   
2      2  10546386806096   29464   
3      3  13662255764141   31897   
4      4  15862757000551  234778   

                                          chunk_text  
0  ### Covariate layers  \nThe spatially explicit...  
1  The other is named as the DA experiment, which...  
2  Due to the intense time investment required fo...  
3  ## 7 Summary and conclusions  \nThis paper pre...  
4  ### Atmospheric chemistry satellites used for ...  


In [9]:
import json, random

def generate_multi_chunk_queries(df, out_file="multi_chunk_queries.jsonl", same_doc=True, n_queries=500):
    """
    Generate n_queries multi-chunk queries by sampling docs first (no for-loop).
    Each query = two chunks (from same doc if same_doc=True, else from different docs).
    Saves results directly to JSONL.
    """
    queries = []

    if same_doc:
        # Eligible docs with at least 2 chunks
        eligible_docs = df.groupby("doc_id").filter(lambda x: len(x) > 1)["doc_id"].unique()
        if len(eligible_docs) == 0:
            print(" No eligible documents with ≥2 chunks")
            return

        # Sample up to n_queries docs
        sampled_docs = random.sample(list(eligible_docs), min(n_queries, len(eligible_docs)))

        for doc_id in sampled_docs:
            doc_chunks = df[df["doc_id"] == doc_id]
            idx1, idx2 = random.sample(doc_chunks.index.tolist(), 2)

            chunk1 = df.at[idx1, "chunk_text"]
            chunk2 = df.at[idx2, "chunk_text"]
            doc1_id = df.at[idx1, "doc_id"]
            doc2_id = df.at[idx2, "doc_id"]
            p1_id = df.at[idx1, "point_id"]
            p2_id = df.at[idx2, "point_id"]

            queries.append({
                "query_text": chunk1 + "\n" + chunk2,
                "chunk1_text": chunk1,
                "chunk2_text": chunk2,
                "doc1_id": str(doc1_id),
                "doc2_id": str(doc2_id),
                "chunk1_id": str(p1_id),
                "chunk2_id": str(p2_id)
            })

    else:
        # Different-doc case: sample random pairs
        all_docs = df["doc_id"].unique()
        sampled_docs = random.sample(list(all_docs), min(n_queries * 2, len(all_docs)))  # grab enough docs
        doc_pairs = zip(sampled_docs[::2], sampled_docs[1::2])  # make pairs

        for doc1_id, doc2_id in list(doc_pairs)[:n_queries]:
            idx1 = random.choice(df[df["doc_id"] == doc1_id].index.tolist())
            idx2 = random.choice(df[df["doc_id"] == doc2_id].index.tolist())

            chunk1 = df.at[idx1, "chunk_text"]
            chunk2 = df.at[idx2, "chunk_text"]
            p1_id = df.at[idx1, "point_id"]
            p2_id = df.at[idx2, "point_id"]

            queries.append({
                "query_text": chunk1 + "\n" + chunk2,
                "chunk1_text": chunk1,
                "chunk2_text": chunk2,
                "doc1_id": str(doc1_id),
                "doc2_id": str(doc2_id),
                "chunk1_id": str(p1_id),
                "chunk2_id": str(p2_id)
            })

    # Save all queries
    with open(out_file, "w", encoding="utf-8") as f:
        for q in queries:
            f.write(json.dumps(q, ensure_ascii=False) + "\n")

    print(f" Generated & saved {len(queries)} queries to {out_file}")


In [27]:
generate_multi_chunk_queries(df_chunks, out_file="multi_chunk_queries_diff.jsonl", same_doc=False, n_queries=500)

 Generated & saved 500 queries to multi_chunk_queries_diff.jsonl


In [None]:
generate_multi_chunk_queries(df_chunks, out_file="multi_chunk_queries.jsonl", same_doc=True, n_queries=500)

# test

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

class qwen_embedder:
    def __init__(self, model_name="Qwen/Qwen3-Embedding-4B"):
        # Load the sentence-transformers model
        self.model = SentenceTransformer(
                                    model_name,
                                    model_kwargs={
                                        "torch_dtype": "auto",       # important: will use float16/bfloat16 automatically
                                        "device_map": "auto",
                                    },
                                    tokenizer_kwargs={"padding_side": "left",
                                                      "max_length": 2048,
                                                      "truncation": True
                                                      }
                                                      )

    def embed_documents(self,
                        texts,
                        batch_size=8,
                        padding=True,
                        truncation=True,
                        max_length=2048,
                        normalize=True):
        """
        Encodes a list of texts into embeddings.

        Args:
            texts (list[str]): Documents to embed
            padding (bool/str): True = dynamic padding, 'max_length' = fixed length
            truncation (bool): Whether to truncate texts beyond max_length
            max_length (int): Max tokens allowed
            normalize (bool): Whether to L2 normalize embeddings

        Returns:
            np.ndarray: Embeddings array (num_texts x embedding_dim)
        """
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            normalize_embeddings=normalize,
            convert_to_numpy=True,
            convert_to_tensor=False
        )
        embeddings = embeddings.tolist()
        return embeddings


    def embed_query(self,query):

        embeddings = self.model.encode( query,prompt_name="query")

        embeddings = embeddings.tolist()
        return embeddings

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
class IndusEmbedder:
    def __init__(self, model_name: str = "Tulsikumar/indus-sde-st-v0.2", device: str = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, documents: list[str], batch_size: int = 100) -> list[list[float]]:
        """Embed a batch of documents (list of strings) in smaller chunks and return a list of vectors."""
        all_embeddings = []

    
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            embeddings = self._mean_pooling(outputs, inputs["attention_mask"])
            all_embeddings.extend(embeddings.cpu().tolist())  # convert to list of lists

        return all_embeddings

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return a list of floats."""
        return self.embed_documents([query], batch_size=1)[0]

In [None]:
if COLLECTION_NAME == 'esa-data-indus' or COLLECTION_NAME == 'esa-data-indus-quant' or COLLECTION_NAME =='esa-data-indus-512-1024':
    model_name="nasa-impact/nasa-smd-ibm-st-v2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
elif COLLECTION_NAME == 'esa-data-qwen' or COLLECTION_NAME == 'esa-data-qwen-quant' or COLLECTION_NAME =='esa-data-qwen-1024':
    embedder=qwen_embedder(model_name="Qwen/Qwen3-Embedding-4B")

elif COLLECTION_NAME == "esa-data-indus-1024" or COLLECTION_NAME == "esa-data-indus-1024-quant":
    model_name="Tulsikumar/indus-sde-st-v0.2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    #embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
    embedder=IndusEmbedder()


In [8]:
import pandas as pd
from tqdm import tqdm
import json

K = 10
results = []
both_found = one_found = none_found = 0

# Load pre-generated queries
queries = []
with open("multi_chunk_queries_diff.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        queries.append(json.loads(line.strip()))

for q in tqdm(queries, desc="Multi-chunk retrieval"):
    query_text = q["query_text"]
    chunk1_id = q["chunk1_id"]
    chunk2_id = q["chunk2_id"]

    # Embed the query (can be original or paraphrased)
    query_vector = embedder.embed_query(query_text)

    # Perform search
    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=K,
        with_payload=True
    )

    retrieved_chunk_ids = [res.id for res in search_result]
    retrieved_doc_ids = [res.payload.get("id") for res in search_result]  # optional

    found1 = int(chunk1_id) in retrieved_chunk_ids
    found2 = int(chunk2_id) in retrieved_chunk_ids

    if found1 and found2:
        both_found += 1
    elif found1 or found2:
        one_found += 1
    else:
        none_found += 1

    # Save results with all IDs
    q.update({
        "retrieved_chunk_ids": retrieved_chunk_ids,
        "retrieved_doc_ids": retrieved_doc_ids,
        "found_chunk1": found1,
        "found_chunk2": found2
    })
    results.append(q)

# Optionally save results to CSV or JSONL
df_results = pd.DataFrame(results)
df_results.to_csv("multi_chunk_search_results_diff.csv", index=False)

print("\nSummary:")
total_queries = len(results)
print(f"Both found: {both_found}/{total_queries} ({both_found/total_queries*100:.2f}%)")
print(f"One found : {one_found}/{total_queries} ({one_found/total_queries*100:.2f}%)")
print(f"None found: {none_found}/{total_queries} ({none_found/total_queries*100:.2f}%)")


  search_result = client.search(
Multi-chunk retrieval: 100%|██████████| 500/500 [22:46<00:00,  2.73s/it]


Summary:
Both found: 43/500 (8.60%)
One found : 385/500 (77.00%)
None found: 72/500 (14.40%)





In [11]:
import pandas as pd
import ast

df = pd.read_csv("multi_chunk_search_results_diff.csv")

# Convert the string representation of list into an actual Python list
df["retrieved_chunk_ids"] = df["retrieved_chunk_ids"].apply(ast.literal_eval)

df["retrieved_chunk_ids"] = df["retrieved_chunk_ids"].apply(lambda ids: [str(i) for i in ids])
df["chunk1_id"] = df["chunk1_id"].astype(str)
df["chunk2_id"] = df["chunk2_id"].astype(str)

# Check presence
df["found_chunk1"] = df.apply(lambda x: x["chunk1_id"] in x["retrieved_chunk_ids"], axis=1)
df["found_chunk2"] = df.apply(lambda x: x["chunk2_id"] in x["retrieved_chunk_ids"], axis=1)

both_found = ((df["found_chunk1"]) & (df["found_chunk2"])).sum()
one_found  = ((df["found_chunk1"]) ^ (df["found_chunk2"])).sum()
none_found = (~(df["found_chunk1"] | df["found_chunk2"])).sum()

print("Summary:")
print(f"Both found: {both_found}")
print(f"One found : {one_found}")
print(f"None found: {none_found}")
print(f"Total score: {(both_found + one_found*0.5)/500}")

Summary:
Both found: 43
One found : 385
None found: 72
Total score: 0.471
