# Retrival test

In [5]:
%%capture
!pip install qdrant_client
!pip install sentence_transformers
!pip install langchain-community
!pip install replicate
!pip install pandas
!pip install nltk
!pip install langchain pydantic
!pip install accelerate

In [None]:
from huggingface_hub import login

HF_TOKEN = ""
login(token=HF_TOKEN)

In [None]:
from qdrant_client import QdrantClient, models
import random
from tqdm import tqdm
import pandas as pd
import nltk
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from transformers import AutoTokenizer, AutoModel
import os

cluster='llm4eo'
COLLECTION_NAME = "esa-data-qwen-1024"


if cluster=='llm4eo':
    QDRANT_API_KEY=os.getenv('QDRANT_API_KEY_1')
    QDRANT_URL=os.getenv('QDRANT_URL_1')        
else:
    QDRANT_API_KEY=os.getenv('QDRANT_API_KEY')
    QDRANT_URL=os.getenv('QDRANT_URL')    

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=120
)



In [10]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

class qwen_embedder:
    def __init__(self, model_name="Qwen/Qwen3-Embedding-4B"):
        # Load the sentence-transformers model
        self.model = SentenceTransformer(
                                    model_name,
                                    model_kwargs={
                                        "torch_dtype": "auto",       # important: will use float16/bfloat16 automatically
                                        "device_map": "auto",
                                    },
                                    tokenizer_kwargs={"padding_side": "left",
                                                      "max_length": 2048,
                                                      "truncation": True
                                                      }
                                                      )

    def embed_documents(self,
                        texts,
                        batch_size=8,
                        padding=True,
                        truncation=True,
                        max_length=2048,
                        normalize=True):
        """
        Encodes a list of texts into embeddings.

        Args:
            texts (list[str]): Documents to embed
            padding (bool/str): True = dynamic padding, 'max_length' = fixed length
            truncation (bool): Whether to truncate texts beyond max_length
            max_length (int): Max tokens allowed
            normalize (bool): Whether to L2 normalize embeddings

        Returns:
            np.ndarray: Embeddings array (num_texts x embedding_dim)
        """
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            normalize_embeddings=normalize,
            convert_to_numpy=True,
            convert_to_tensor=False
        )
        embeddings = embeddings.tolist()
        return embeddings


    def embed_query(self,query):

        embeddings = self.model.encode( query,prompt_name="query")

        embeddings = embeddings.tolist()
        return embeddings

In [15]:
import torch
from transformers import AutoTokenizer, AutoModel
class IndusEmbedder:
    def __init__(self, model_name: str = "Tulsikumar/indus-sde-st-v0.2", device: str = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, documents: list[str], batch_size: int = 100) -> list[list[float]]:
        """Embed a batch of documents (list of strings) in smaller chunks and return a list of vectors."""
        all_embeddings = []

    
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            embeddings = self._mean_pooling(outputs, inputs["attention_mask"])
            all_embeddings.extend(embeddings.cpu().tolist())  # convert to list of lists

        return all_embeddings

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return a list of floats."""
        return self.embed_documents([query], batch_size=1)[0]

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from qdrant_client.http.exceptions import UnexpectedResponse
import torch
from transformers import AutoTokenizer, AutoModel

MAX_RETRIES = 5
RETRY_DELAY = 2  # seconds


In [12]:
if COLLECTION_NAME == 'esa-data-indus' or COLLECTION_NAME == 'esa-data-indus-quant' or COLLECTION_NAME =='esa-data-indus-512-1024':
    model_name="nasa-impact/nasa-smd-ibm-st-v2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
elif COLLECTION_NAME == 'esa-data-qwen' or COLLECTION_NAME == 'esa-data-qwen-quant' or COLLECTION_NAME =='esa-data-qwen-1024':
    embedder=qwen_embedder(model_name="Qwen/Qwen3-Embedding-4B")

elif COLLECTION_NAME == "esa-data-indus-1024" or COLLECTION_NAME == "esa-data-indus-1024-quant":
    model_name="Tulsikumar/indus-sde-st-v0.2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    #embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
    embedder=IndusEmbedder()


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [None]:
import time
import pandas as pd
from tqdm import tqdm

K = 20
csv_path = "data_with_unique_questions.csv"
df_queries = pd.read_csv(csv_path)

In [14]:
results = []
rank_counts = {i: 0 for i in range(1, K+1)}
rank_counts["not_in_topk"] = 0

for _, row in tqdm(df_queries.iterrows(), total=len(df_queries), desc="Testing retrieval"):
    chunk_id = row["id"]
    query_text = row["question"]
    references = row["references"] if pd.notna(row["references"]) else []
#    references = row['']

    query_vector = embedder.embed_query(query_text)

    # Measure retrieval time
    start_time = time.time()
    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=K,
        with_payload=True,
        search_params=models.SearchParams(
                quantization=models.QuantizationSearchParams(
                    ignore=False,
                    rescore=True,
                    oversampling=4.0,
                )
            )
        )
    
    retrieval_time = time.time() - start_time  # in seconds

    retrieved_ids = [res.id for res in search_result]
    retrieved_scores = [res.score for res in search_result]

    if chunk_id in retrieved_ids:
        rank = retrieved_ids.index(chunk_id) + 1
        score_at_rank = retrieved_scores[rank - 1]
        rank_counts[rank] += 1
    else:
        rank = None
        score_at_rank = None
        rank_counts["not_in_topk"] += 1

    # Get top chunk content and compute coverage
    if search_result:
        retrieved_texts = [res.payload.get("content", "") for res in search_result]
        sep = "<DOC_SEP>"
        retrieved_text = sep.join(retrieved_texts)
    
        # Check each reference in all chunks
        found_refs = 0
        for ref in references:
            if any(ref in chunk for chunk in retrieved_texts):
                found_refs += 1
    
        coverage = found_refs / len(references) if references else 0
    else:
        retrieved_text = ""
        coverage = 0


    results.append({
        "doc_id": chunk_id,
        "query_text": query_text,
        "references":references,
        "retrieved_ids": retrieved_ids,
        "retrieved_scores": retrieved_scores,
        "retrieved_text": retrieved_text,
        "coverage": coverage,
        "retrieval_time": retrieval_time,
    })

# Convert to DataFrame
df_results = pd.DataFrame(results)

print("\nRank distribution:")
for r, c in rank_counts.items():
    pct = (c / len(df_queries))*100
    print(f"Rank {r}: {c} ({pct:.2f}%)")

# Save results
output_file = "llm_qa_qwen_1024_test_20.csv"
df_results.to_csv(output_file, index=False)
print(f"\nSaved results including retrieval time and retrieved text to {output_file}")

  search_result = client.search(
Testing retrieval: 100%|██████████| 1140/1140 [36:59<00:00,  1.95s/it]



Rank distribution:
Rank 1: 0 (0.00%)
Rank 2: 0 (0.00%)
Rank 3: 0 (0.00%)
Rank 4: 0 (0.00%)
Rank 5: 0 (0.00%)
Rank 6: 0 (0.00%)
Rank 7: 0 (0.00%)
Rank 8: 0 (0.00%)
Rank 9: 0 (0.00%)
Rank 10: 0 (0.00%)
Rank 11: 0 (0.00%)
Rank 12: 0 (0.00%)
Rank 13: 0 (0.00%)
Rank 14: 0 (0.00%)
Rank 15: 0 (0.00%)
Rank 16: 0 (0.00%)
Rank 17: 0 (0.00%)
Rank 18: 0 (0.00%)
Rank 19: 0 (0.00%)
Rank 20: 0 (0.00%)
Rank not_in_topk: 1140 (100.00%)

Saved results including retrieval time and retrieved text to llm_qa_qwen_1024_test_20.csv


# Test

In [None]:
import pandas as pd
from collections import Counter
import string
from typing import List, Dict

def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(text.split())
    return text

def is_reference_present_fuzzy(reference: str, document: str, threshold: float = 0.8) -> bool:
    ref_tokens = normalize_text(reference).split()
    doc_tokens = normalize_text(document).split()
    if not ref_tokens:
        return False
    matched_tokens = sum(1 for t in ref_tokens if t in doc_tokens)
    fraction_matched = matched_tokens / len(ref_tokens)
    return fraction_matched >= threshold

def compute_token_metrics_single_doc(
    references: List[str],
    retrieved_texts: List[str],
    threshold: float = 0.8
) -> Dict[str, float]:
    all_ref_tokens = []
    matched_tokens = []
    found_count = 0

    for ref in references:
        ref_tokens = normalize_text(ref).split()
        all_ref_tokens.extend(ref_tokens)

        matched_docs = [doc for doc in retrieved_texts if is_reference_present_fuzzy(ref, doc, threshold)]
        if matched_docs:
            found_count += 1
            for doc in matched_docs:
                matched_tokens.extend(normalize_text(doc).split())

    all_doc_tokens = []
    for doc in retrieved_texts:
        all_doc_tokens.extend(normalize_text(doc).split())

    if not all_ref_tokens:
        return {"iou": 0.0, "precision": 0.0, "recall": 0.0, "ref_found_ratio": 0.0}

    ref_counter = Counter(all_ref_tokens)
    match_counter = Counter(matched_tokens)
    doc_counter = Counter(all_doc_tokens)

    intersection_count = sum((ref_counter & match_counter).values())
    ref_count = sum(ref_counter.values())
    doc_count = sum(doc_counter.values())
    union_count = ref_count + doc_count - intersection_count

    iou = intersection_count / union_count if union_count > 0 else 0.0
    precision = intersection_count / doc_count if doc_count > 0 else 0.0
    recall = intersection_count / ref_count if ref_count > 0 else 0.0
    ref_found_ratio = found_count / len(references) if references else 0.0

    return {
        "iou": iou,
        "precision": precision,
        "recall": recall,
        "ref_found_ratio": ref_found_ratio,
    }

def compute_metrics_single_row(
    ref_str: str,
    ret_str: str,
    K: int,
    ref_sep: str = "|",
    doc_sep: str = "<DOC_SEP>",
    token_threshold: float = 0.95,
    rr_threshold: float = 1.0
) -> Dict[str, float]:
    references = [r.strip() for r in str(ref_str).split(ref_sep) if r.strip()]
    retrieved_texts = [doc.strip() for doc in str(ret_str).split(doc_sep) if doc.strip()]

    metrics = compute_token_metrics_single_doc(references, retrieved_texts[:K], threshold=token_threshold)

    rr = 0.0
    found_rank = None
    for rank, doc in enumerate(retrieved_texts[:K], start=1):
        if any(is_reference_present_fuzzy(ref, doc, threshold=rr_threshold) for ref in references):
            rr = 1.0 / rank
            found_rank = rank
            break

    metrics["reciprocal_rank"] = rr
    metrics["ref_rank"] = found_rank  # None if not found
    return metrics


In [None]:
from tqdm import tqdm

df = pd.read_csv(output_file)

top_k = 5
metrics_list = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Computing metrics"):
    metrics = compute_metrics_single_row(row["references"], row["retrieved_text"], top_k)
    metrics_list.append(metrics)

df_metrics = pd.DataFrame(metrics_list)
df = pd.concat([df, df_metrics], axis=1)

average_metrics = df_metrics.mean().to_dict()
print(f"\nAverage metrics across all rows @{top_k}:")
print(average_metrics)

ref_tok_10 = average_metrics['ref_found_ratio']
mrr_tok_10 = average_metrics['reciprocal_rank']

rank_counts = df_metrics["ref_rank"].value_counts().sort_index()
rank_percents = (rank_counts / len(df_metrics)) * 100

print("\nRank distribution:")
for r, c in rank_counts.items():
    pct = rank_percents[r]
    print(f"Rank {r}: {c} docs ({pct:.2f}%)")
