In [1]:
%%capture
!pip install qdrant_client
!pip install sentence_transformers
!pip install langchain-community
!pip install replicate
!pip install pandas
!pip install nltk
!pip install langchain pydantic
!pip install accelerate

# Generate test Q/A

In [None]:
import os
import random

import nltk
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from qdrant_client import QdrantClient, models
from tqdm import tqdm

# Qdrant connection settings are read from the environment so that no
# credential is stored in the notebook itself.
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

# Collection under evaluation. It is assigned exactly once so the value a
# reader sees here is the value every later cell uses. The embedder-selection
# cell further down recognises 'esa-data-indus' and 'esa-data-qwen'.
COLLECTION_NAME = "esa-data-qwen"

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=200
)

In [3]:
# Page through the collection with the scroll API, accumulating payload-only
# points until MAX_POINTS have been gathered or no further page is reported.
MAX_POINTS = 10_000
next_page = None
all_points = []

while len(all_points) < MAX_POINTS:
    points, next_page = client.scroll(
        collection_name=COLLECTION_NAME,
        limit=1000,           # page size per request
        with_payload=True,
        with_vectors=False,   # stored vectors are not needed here
        offset=next_page,
    )
    all_points += points
    if next_page is None:
        # No continuation offset came back, so stop paging.
        break

# The last page may overshoot the cap; drop anything beyond MAX_POINTS.
del all_points[MAX_POINTS:]
print(f"Total points fetched: {len(all_points)}")

Total points fetched: 10000


In [12]:
import os
import replicate
import json
import time
from typing import Optional
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser
import re

def flatten_output(output) -> str:
    """
    Convert a model output into a single string.

    A plain string is returned unchanged. A list, a tuple, or an
    iterator/generator of chunks is concatenated in order, with every chunk
    coerced through ``str()``. Anything else is returned as ``str(output)``.

    Args:
        output: The raw value returned by the model call.

    Returns:
        The flattened text.
    """
    if isinstance(output, str):
        return output
    # Streamed results can arrive as a generator rather than a list; calling
    # str() on those would yield the object's repr instead of the text.
    if isinstance(output, (list, tuple)) or hasattr(output, "__next__"):
        return "".join(str(chunk) for chunk in output)
    return str(output)



def remove_think_blocks(text: str) -> str:
    """Drop every ``<think>...</think>`` span from ``text`` and trim surrounding whitespace."""
    # DOTALL lets a single block span several lines; the lazy quantifier keeps
    # separate blocks from being merged into one match.
    think_block = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    without_reasoning = think_block.sub("", text)
    return without_reasoning.strip()

# Schema for a paraphrase result: the submitted text plus its rewrite.
# The field descriptions below are runtime strings handed to pydantic's Field().
# NOTE(review): documented with comments instead of a class docstring because a
# docstring would presumably be picked up as the schema description and change
# the format instructions sent to the model — confirm before adding one.
class ParaphraseResponse(BaseModel):
    # Echo of the text that was submitted for paraphrasing.
    original: str = Field(
        description="The original input text before paraphrasing."
    )
    # Rewritten version of `original`.
    paraphrased: str = Field(
        description="The paraphrased version of the input text, meaning preserved."
    )

def extract_original_paraphrased(text: str) -> dict:
    """
    Pull the "original" and "paraphrased" string values out of JSON-like text.

    The values are located with regular expressions rather than a JSON parser,
    so backslash sequences other than an escaped quote or an escaped backslash
    (for example LaTeX commands) are left exactly as written.

    Returns:
        A dict with the keys "original" and "paraphrased" (in that order);
        a key maps to None when its field was not found.
    """
    # One pattern per field: the key, a colon, then a double-quoted value in
    # which any backslash-escaped character is allowed.
    field_patterns = {
        "original": r'"original"\s*:\s*"((?:\\.|[^"\\])*)"',
        "paraphrased": r'"paraphrased"\s*:\s*"((?:\\.|[^"\\])*)"',
    }

    extracted = {}
    for field, pattern in field_patterns.items():
        found = re.search(pattern, text, flags=re.DOTALL)
        if found is None:
            extracted[field] = None
            continue
        raw_value = found.group(1)
        # Undo only escaped quotes, then escaped backslashes; no full decode.
        unquoted = raw_value.replace('\\"', '"')
        extracted[field] = unquoted.replace('\\\\', '\\')
    return extracted


# Parser for LangChain; only its format instructions are used, to describe the
# expected JSON layout inside the prompt.
parser = PydanticOutputParser(pydantic_object=ParaphraseResponse)


def paraphrase_with_replicate( prompt: str,model_name: str, api_token: str, max_retries: int = 5, delay: float = 2.0) -> str:
    """
    Paraphrase the given text using a Replicate model.

    The model is asked to answer in the JSON layout described by ``parser``;
    the "paraphrased" value is then pulled out of the reply with
    ``extract_original_paraphrased`` after any ``<think>`` blocks are removed.
    A reply without a usable "paraphrased" value counts as a failed attempt
    and is retried.

    Args:
        prompt: Text to paraphrase.
        model_name: Replicate model name.
        api_token: Replicate API token.
        max_retries: Maximum number of attempts.
        delay: Delay (s) between attempts.

    Returns:
        The paraphrased text as a non-empty string.

    Raises:
        ValueError: If ``api_token`` or ``model_name`` is empty.
        RuntimeError: If no attempt produced a usable paraphrase.
    """

    if not api_token:
        raise ValueError("REPLICATE_API_TOKEN not provided.")
    if not model_name:
        raise ValueError("Model name must be provided.")

    # Named distinctly so it cannot be confused with the notebook-level Qdrant `client`.
    replicate_client = replicate.Client(api_token=api_token)

    # The prompt is identical for every attempt, so build it once.
    formatted_prompt = f"""
Paraphrase following text without changing context. Dont just paraphrase words but complete sentences:

{prompt}

output formatting:
{parser.get_format_instructions()}
            """

    for attempt in range(max_retries):
        try:
            raw_output = replicate_client.run(
                model_name,
                input={"prompt": formatted_prompt}
            )

            cleaned = remove_think_blocks(flatten_output(raw_output))
            paraphrased = extract_original_paraphrased(cleaned).get("paraphrased")

            # The extractor reports a missing field as None; treat that (or an
            # empty value) as a failed attempt so the retry loop actually runs.
            if not paraphrased or not paraphrased.strip():
                raise ValueError("no 'paraphrased' field found in model output")

            return paraphrased

        except (ValidationError, ValueError) as e:
            print(f"[Retry {attempt+1}/{max_retries}] Invalid output: {e}. Waiting {delay}s...")
            # No point sleeping after the final attempt.
            if attempt + 1 < max_retries:
                time.sleep(delay)

    raise RuntimeError(f"Failed to get valid structured output after {max_retries} attempts.")



In [13]:
import re
import nltk
from nltk.tokenize import sent_tokenize

# Tokenizer resources requested for sent_tokenize; one request per resource.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# SECURITY: the Replicate token was previously hardcoded in this cell. It is
# now read from the environment; the token that was committed should be
# treated as compromised and revoked. If the variable is unset,
# paraphrase_with_replicate raises ValueError before any request is made.
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")

def get_first_sentences(text, n=3):
    """
    Extract the first n sentences using NLTK, without breaking LaTeX/math.

    Math spans (``$...$``, ``\\[...\\]`` and ``\\(...\\)`` on a single line)
    are swapped for placeholders before sentence splitting and swapped back
    afterwards, so punctuation inside a formula cannot end a sentence.

    Args:
        text: Source text; None, empty or whitespace-only input yields "".
        n: Number of leading sentences to keep.

    Returns:
        The selected sentences joined by single spaces.
    """
    if not text or not text.strip():
        return ""

    math_blocks = {}

    # Replace math blocks with placeholders
    def math_replacer(match):
        key = f"__MATH_{len(math_blocks)}__"
        math_blocks[key] = match.group(0)
        return key

    # Inline $...$ math
    text_protected = re.sub(r'\$(.+?)\$', math_replacer, text)
    # Display math \[...\] and \(...\)
    text_protected = re.sub(r'\\\[(.+?)\\\]', math_replacer, text_protected)
    text_protected = re.sub(r'\\\((.+?)\\\)', math_replacer, text_protected)

    # Only the first n sentences are needed, so slice before restoring.
    selected = sent_tokenize(text_protected)[:n]

    # A later pass can wrap a placeholder made by an earlier pass (e.g. $x$
    # inside \[...\]). Restoring newest-first unwraps the outer block before
    # the inner placeholder it contains, so nothing is left unresolved.
    newest_first = list(reversed(list(math_blocks.items())))

    restored_sentences = []
    for sentence in selected:
        for key, val in newest_first:
            sentence = sentence.replace(key, val)
        restored_sentences.append(sentence)

    return " ".join(restored_sentences).strip()


SAMPLE_SIZE = 500   # upper bound on the number of points turned into test queries
SAMPLE_SEED = 42    # fixed seed so the sampled evaluation set is reproducible

queries = []
# Keep only points whose "content" payload contains the "###" marker.
# A missing payload or a None content value is treated as empty text.
filtered_points = [
    p for p in all_points
    if "###" in ((p.payload or {}).get("content") or "")
]
# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the number of candidates. A dedicated Random instance
# keeps the draw repeatable without touching the global random state.
sampled_points = random.Random(SAMPLE_SEED).sample(
    filtered_points, min(SAMPLE_SIZE, len(filtered_points))
)

In [None]:
output_file = "single_queries.jsonl"

# Opened in append mode: re-running this cell adds records to an existing file
# rather than replacing it.
with open(output_file, "a", encoding="utf-8") as f:
    for p in tqdm(sampled_points, desc="Processing points"):
        text = p.payload.get("content", "") or ""
        if not text.strip():
            continue

        query_text = get_first_sentences(text, n=4)
        if not query_text:
            # Nothing to paraphrase: skip before spending an API call.
            continue

        try:
            paraphrased = paraphrase_with_replicate(
                query_text,
                model_name="openai/gpt-4o-mini",
                api_token=REPLICATE_API_TOKEN
            )
        except RuntimeError as e:
            # One point that cannot be paraphrased should not abort the whole run.
            print(f"Skipping point {p.id}: {e}")
            continue

        record = {
            "id": p.id,
            "query_text": query_text,
            "original_text": text,
            "paraphrased_query": paraphrased  # value returned by paraphrase_with_replicate
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

Processing points: 100%|██████████| 500/500 [43:39<00:00,  5.24s/it]  


# Test

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

class qwen_embedder:
    """
    Thin wrapper around a SentenceTransformer model exposing the
    ``embed_documents`` / ``embed_query`` pair used by the retrieval test.
    """

    def __init__(self, model_name="Qwen/Qwen3-Embedding-4B"):
        """Load ``model_name`` through sentence-transformers with the options below."""
        loader_options = {
            "torch_dtype": "auto",
            "device_map": "auto",
        }
        tokenizer_options = {
            "padding_side": "left",
            "max_length": 2048,
            "truncation": True,
        }
        self.model = SentenceTransformer(
            model_name,
            model_kwargs=loader_options,
            tokenizer_kwargs=tokenizer_options,
        )

    def embed_documents(self,
                        texts,
                        batch_size=8,
                        padding=True,
                        truncation=True,
                        max_length=2048,
                        normalize=True):
        """
        Encode a list of texts and return the embeddings as plain Python lists.

        Args:
            texts (list[str]): Documents to embed.
            batch_size (int): Forwarded to ``encode`` as ``batch_size``.
            padding (bool/str): Forwarded to ``encode`` unchanged.
            truncation (bool): Forwarded to ``encode`` unchanged.
            max_length (int): Forwarded to ``encode`` unchanged.
            normalize (bool): Forwarded to ``encode`` as ``normalize_embeddings``.

        Returns:
            The result of calling ``.tolist()`` on the encoded output.

        NOTE(review): padding/truncation/max_length are handed to
        ``SentenceTransformer.encode`` as-is — confirm that the installed
        sentence-transformers version accepts and honours them.
        """
        encode_options = {
            "batch_size": batch_size,
            "padding": padding,
            "truncation": truncation,
            "max_length": max_length,
            "normalize_embeddings": normalize,
            "convert_to_numpy": True,
            "convert_to_tensor": False,
        }
        vectors = self.model.encode(texts, **encode_options)
        return vectors.tolist()

    def embed_query(self,query):
        """
        Encode a single query with the model's "query" prompt and return it
        via ``.tolist()``.

        NOTE(review): unlike ``embed_documents``, no ``normalize_embeddings``
        flag is passed here — confirm the model normalises on its own if the
        collection's distance metric depends on it.
        """
        query_vector = self.model.encode(query, prompt_name="query")
        return query_vector.tolist()

In [5]:
# Pick the embedding model that matches the collection being queried; query
# vectors must come from the same model that produced the stored vectors.
if COLLECTION_NAME == 'esa-data-indus':
    model_name="nasa-impact/nasa-smd-ibm-st-v2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
elif COLLECTION_NAME == 'esa-data-qwen':
    embedder=qwen_embedder(model_name="Qwen/Qwen3-Embedding-4B")
else:
    # Fail here rather than with a NameError on `embedder` in a later cell.
    raise ValueError(
        f"No embedder configured for collection {COLLECTION_NAME!r}; "
        "expected 'esa-data-indus' or 'esa-data-qwen'."
    )

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [None]:
# Retrieval check with the paraphrased queries: each query is embedded, the
# top-K nearest points are fetched, and the rank at which the query's own
# source point appears is recorded.
use_paraphrased = True  # True: search with the paraphrase; False: search with the original sentences
K = 5

df_queries = pd.read_json("single_queries.jsonl", lines=True)



results = []
rank_counts = {i: 0 for i in range(1, K+1)}
rank_counts["not_in_topk"] = 0

for _, row in tqdm(df_queries.iterrows(), total=len(df_queries), desc="Testing retrieval"):
    query_id = row["id"]

    # A missing paraphrase may load as None or NaN, and NaN is truthy, so
    # require an actual non-blank string before using it.
    paraphrased_query = row.get("paraphrased_query")
    if use_paraphrased and isinstance(paraphrased_query, str) and paraphrased_query.strip():
        query_text = paraphrased_query
    else:
        query_text = row["query_text"]

    query_vector = embedder.embed_query(query_text)


    # NOTE(review): newer qdrant-client releases deprecate `search` in favour
    # of `query_points` — confirm against the installed client version.
    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=K,
        with_payload=True,
    )

    retrieved_ids = [res.id for res in search_result]
    retrieved_scores = [res.score for res in search_result]

    if query_id in retrieved_ids:
        rank = retrieved_ids.index(query_id) + 1   # 1-based rank
        score_at_rank = retrieved_scores[rank - 1]
        rank_counts[rank] += 1
    else:
        rank = None
        score_at_rank = None
        rank_counts["not_in_topk"] += 1

    results.append({
        "query_id": query_id,
        "query_text": query_text,
        "retrieved_ids": retrieved_ids,
        "retrieved_scores": retrieved_scores,
        "rank": rank,
        "score_at_rank": score_at_rank,
    })


df_results = pd.DataFrame(results)

total_queries = len(df_queries)
print("\nRank distribution:")
for r, c in rank_counts.items():
    # Scale to a real percentage (the value is printed with a '%' sign) and
    # avoid dividing by zero when the query file is empty.
    pct = 100 * c / total_queries if total_queries else 0.0
    print(f"Rank {r}: {c} ({pct:.2f}%)")

df_results.to_csv("single_chunk_qwen_para.csv", index=False)


In [None]:
# Retrieval check with the original (non-paraphrased) query sentences: each
# query is embedded, the top-K nearest points are fetched, and the rank at
# which the query's own source point appears is recorded.
use_paraphrased = False  # True: search with the paraphrase; False: search with the original sentences
K = 5

df_queries = pd.read_json("single_queries.jsonl", lines=True)



results = []
rank_counts = {i: 0 for i in range(1, K+1)}
rank_counts["not_in_topk"] = 0

for _, row in tqdm(df_queries.iterrows(), total=len(df_queries), desc="Testing retrieval"):
    query_id = row["id"]

    # A missing paraphrase may load as None or NaN, and NaN is truthy, so
    # require an actual non-blank string before using it.
    paraphrased_query = row.get("paraphrased_query")
    if use_paraphrased and isinstance(paraphrased_query, str) and paraphrased_query.strip():
        query_text = paraphrased_query
    else:
        query_text = row["query_text"]

    query_vector = embedder.embed_query(query_text)


    # NOTE(review): newer qdrant-client releases deprecate `search` in favour
    # of `query_points` — confirm against the installed client version.
    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=K,
        with_payload=True,
    )

    retrieved_ids = [res.id for res in search_result]
    retrieved_scores = [res.score for res in search_result]

    if query_id in retrieved_ids:
        rank = retrieved_ids.index(query_id) + 1   # 1-based rank
        score_at_rank = retrieved_scores[rank - 1]
        rank_counts[rank] += 1
    else:
        rank = None
        score_at_rank = None
        rank_counts["not_in_topk"] += 1

    results.append({
        "query_id": query_id,
        "query_text": query_text,
        "retrieved_ids": retrieved_ids,
        "retrieved_scores": retrieved_scores,
        "rank": rank,
        "score_at_rank": score_at_rank,
    })

df_results = pd.DataFrame(results)

total_queries = len(df_queries)
print("\nRank distribution:")
for r, c in rank_counts.items():
    # Scale to a real percentage (the value is printed with a '%' sign) and
    # avoid dividing by zero when the query file is empty.
    pct = 100 * c / total_queries if total_queries else 0.0
    print(f"Rank {r}: {c} ({pct:.2f}%)")

df_results.to_csv("single_chunk_qwen.csv", index=False)