In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [1]:
%%capture
!pip install qdrant-client
!pip install s3fs
!pip install pymysql
!pip install tqdm
!pip install replicate
!pip install sqlalchemy
!pip install sentence_transformers
!pip install langchain-community

# Get meta_data

In [None]:
DB_HOST=''
DB_PORT=0
DB_NAME=''
DB_USER=''
DB_PASSWORD=''
AWS_ACCESS_KEY_ID=''
AWS_SECRET_ACCESS_KEY=''
bucket_path = "s3://llm4eo-s3/oa-data-cleaned-spellchecked/"

In [None]:
import pandas as pd
import pymysql
import s3fs
import os
from tqdm import tqdm
from sqlalchemy import create_engine
import json
import re

def get_rds_metadata(DB_HOST=None, DB_PORT=None, DB_USER=None, DB_PASSWORD=None, DB_NAME=None, num_of_docs=None):
    engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

    meta_data_rds = pd.read_sql("SELECT * FROM metadata;", engine)

    meta_data_rds = meta_data_rds.drop(columns=[
        'created_at', 'updated_at', 'scraper', 'authors', 'volume', 'issue', 'page', 'month'
    ])
    meta_data_rds = meta_data_rds.dropna()

    if num_of_docs is not None:
        meta_data_rds = meta_data_rds[:num_of_docs]

    return meta_data_rds


In [None]:
df=get_rds_metadata(DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME, num_of_docs=-1)

In [None]:
df["folder"] = df["file_path"].str.split("/").str[0]
len(df["folder"].unique())

In [None]:
fs = s3fs.S3FileSystem(
    key=AWS_ACCESS_KEY_ID,
    secret=AWS_SECRET_ACCESS_KEY,
)

In [None]:
# Get all filenames in the bucket
all_s3_files = set()
for dirpath, dirs, files in fs.walk(bucket_path):
    for f in files:
        all_s3_files.add(f)  # store only the filename

skipped_files = []
for key in tqdm(df['file_path'], total=len(df), desc="Checking files"):
    filename = key.split("/")[-1]  # get filename after last "/"
    if filename not in all_s3_files:
        skipped_files.append(key)

print("Missing files:", len(skipped_files))


with open("/content/drive/MyDrive/llm_qa/skipped_files.json", "w") as f:
    json.dump(skipped_files, f, indent=2)

print("Saved skipped files to skipped_files.json")

Checking files: 100%|██████████| 276592/276592 [00:00<00:00, 1072386.50it/s]

Missing files: 21375
Saved skipped files to skipped_files.json





In [None]:
df = df[df['file_path'].str.count("/") == 1]

In [None]:
df = df[~df['file_path'].isin(skipped_files)].reset_index(drop=True)

In [None]:
len(df['folder'].unique())

23

In [None]:
import pandas as pd

total_samples = 2000

folder_counts = df["folder"].value_counts()

sample_sizes = pd.Series(2, index=folder_counts.index)

remaining = total_samples - sample_sizes.sum()

proportions = (folder_counts / folder_counts.sum()) * remaining
extra_samples = proportions.round().astype(int)

sample_sizes += extra_samples

diff = total_samples - sample_sizes.sum()

if diff != 0:
    adjust_folders = sample_sizes.nlargest(abs(diff)).index
    for folder in adjust_folders:
        sample_sizes[folder] += int(diff/abs(diff))  # +1 or -1
        diff = total_samples - sample_sizes.sum()
        if diff == 0:
            break

sampled_dfs = []
for folder, n_samples in sample_sizes.items():
    subset = df[df["folder"] == folder]
    sampled = subset.sample(n=min(n_samples, len(subset)), random_state=42)
    sampled_dfs.append(sampled)

final_df = pd.concat(sampled_dfs)

print("Final sample size:", len(final_df))

Final sample size: 2000


In [None]:
final_df['folder'].value_counts()

Unnamed: 0_level_0,count
folder,Unnamed: 1_level_1
mdpi,795
ncbi,392
copernicus,383
wiley,96
isprs,87
ieee,83
cambridge_university_press,45
elsevier,32
springer,22
taylor_and_francis,13


In [None]:
final_df = final_df.reset_index(drop=True)
final_df.to_csv("/content/drive/MyDrive/llm_qa/final_df.csv", index=False)

# Generate Q/A

In [None]:
import replicate
import time
def deepseek_request_replicate(prompt:str) -> str:
  # replicate = replicate.Client()

  #os.environ["REPLICATE_API_TOKEN"] = ''
  api = replicate.Client(api_token=os.environ["REPLICATE_API_TOKEN"])
  output = api.run(
      "deepseek-ai/deepseek-r1",
          input={"prompt": prompt}
      )
  return "".join(output)

def extract_think_content(text):
    """
    Extracts the content between <think> and </think> tags in a given string.

    Args:
        text (str): The input string.

    Returns:
        str: The extracted content, or an empty string if not found.
    """
    match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    return match.group(1) if match else ''

def remove_think_content(text):
    return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

def extract_and_parse_json(text):
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        json_content = match.group(1)
        try:
            return json.loads(json_content)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON in ```json block: {e}")

    match_braces = re.search(r'(\{.*\})', text, re.DOTALL)
    if match_braces:
        json_content = match_braces.group(1)
        json_content_fixed = json_content.replace("'", '"')
        try:
            return json.loads(json_content_fixed)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON in braces: {e}")

    print("No valid JSON content found.")
    return None

def process_prompt(index, prompt, max_retries=3, delay=2):
    """Processes a single prompt with retries and saves the result to a file.

    Args:
        index (int): The index of the prompt.
        prompt (str): The prompt to process.
        max_retries (int): Maximum number of retries for the API call.
        delay (int): Delay between retries in seconds.
    """
    print(f"\nRunning process: {index}")

    output_dir = "/content/drive/MyDrive/llm_qa/output_raw"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    output_filename = f"{output_dir}/output_{index}.json"

    #if filenam already exists return withouth doing anything
    if os.path.exists(output_filename):
        print(f"File already exists: {output_filename}")
        return

    attempts = 0
    while attempts < max_retries:
        try:
            res = deepseek_request_replicate(prompt)
            break  # If the function succeeds, break out of the loop
        except Exception as e:
            print(f"Attempt {attempts + 1} failed: {e}")
            attempts += 1
            if attempts >= max_retries:
                print(f"Max retries reached for process {index}.")
                return  # Optionally, handle this case further or raise an exception
            time.sleep(delay)

    with open(output_filename, "w") as file:
        file.write(res)

    print(f"Saved raw file: {output_filename}")

In [None]:
import re
def postprocess_raw_results_ints(df, filepath:str):
    #get index from file path
    index = filepath.split("_")[-1].split(".")[0]

    sample = df['file_path'].iloc[int(index)]

    #read text from txt file
    with open(filepath, "r") as f:
        text = f.read()

    text = remove_think_content(text)

    json_text = extract_and_parse_json(text)

    #print(json_text)
    #print(sample)

    if not os.path.exists("outupt_processed"):
      os.mkdir("outupt_processed")

    # Save output to a file
    index = filepath.split("_")[-1].split(".")[0]
    output_filename = f"/content/drive/MyDrive/llm_qa/outupt_processed/output_{index}.json"

    with open(output_filename, "w") as f:
        json.dump(json_text, f,indent=4)

    print(f"Saved: {output_filename}")


In [None]:
instructions="""You are an agent that generates questions from a provided research paper. Your job is to generate one specific question and provide the relevant sections from the paper as references.

Instructions:

Generate a question that can be answered solely by the facts in the provided paper.

Extract up to 5 significant sections from the paper that answer the question. These must be *exact copies* from the text and should be whole sentences where possible.

Focus on the most relevant information; avoid background or unrelated sections.

Format the response in JSON with three fields:

"oath": "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise."

"question": A concise question directly answerable using the references.

"references": A list of the extracted sections from the paper.

Notes:

Make the question specific; do not ask about multiple topics.

DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN.

Do not repeat a question that has already been used.

When the paper is long, scan all sections but only pick the most relevant ones to answer the question.

Example:

Paper Text:
"Section 1: Introduction: Climate change has accelerated glacier melt in the Himalayas, affecting water resources downstream.

Section 2: Methodology: Remote sensing data from 2000–2020 were analyzed to quantify changes in glacier area.

Section 3: Results: Glacier area decreased by 12% over 20 years, with the highest retreat in the eastern Himalayas. Streamflow measurements confirmed increased seasonal variability.

Section 4: Discussion: The retreat impacts hydropower generation and agriculture. Communities relying on glacier-fed rivers experience water stress during summer months.

Section 5: Conclusion: Urgent adaptation strategies are needed to mitigate the socioeconomic impact of glacier retreat."


Expected JSON Response:
```json
{
  "oath": "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise.",
  "question": "How has glacier retreat affected downstream water resources in the Himalayas?",
  "references": [
    "Section 3: Results: Glacier area decreased by 12% over 20 years, with the highest retreat in the eastern Himalayas. Streamflow measurements confirmed increased seasonal variability.",
    "Section 4: Discussion: The retreat impacts hydropower generation and agriculture. Communities relying on glacier-fed rivers experience water stress during summer months."
  ]
}
```
"""



In [None]:
final_df = pd.read_csv("/content/drive/MyDrive/llm_qa/final_df.csv")
fs = s3fs.S3FileSystem(
    key=AWS_ACCESS_KEY_ID,
    secret=AWS_SECRET_ACCESS_KEY,
)

In [None]:
from concurrent.futures import ThreadPoolExecutor

def worker(index, row):
    key = row['file_path']
    s3_path = os.path.join(bucket_path, key)

    try:
        with fs.open(s3_path, 'r', encoding='utf-8') as s3_file:
            content = s3_file.read()
    except Exception as e:
        print(f"Error reading file {key} from S3: {e}")
        return

    input_text = f"Context:\n{content}\n\nInstructions:\n{instructions}"

    try:
        process_prompt(index, input_text, max_retries=3, delay=2)
    except Exception as e:
        print(f"Error processing file {key}: {e}")

with ThreadPoolExecutor(max_workers=5) as executor:  # adjust workers if needed
    executor.map(lambda args: worker(*args), final_df.iterrows())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Running process: 337
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_336.json

Running process: 338
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_334.json

Running process: 339
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_335.json

Running process: 340
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_338.json

Running process: 341
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_340.json

Running process: 342
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_337.json

Running process: 343
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_339.json

Running process: 344
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_329.json

Running process: 345
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/output_341.json

Running process: 346
Saved raw file: /content/drive/MyDrive/llm_qa/output_raw/ou

In [None]:
import re
import json

def extract_and_parse_json(text):
    # 1. Try to extract fenced ```json blocks
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if not match:
        # 2. Fallback: try to find the first {...} block
        match = re.search(r'(\{.*\})', text, re.DOTALL)

    if not match:
        print("No JSON content found.")
        return None

    json_content = match.group(1).strip()

    # 3. Fix common issues: invalid backslashes
    json_content = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', json_content)

    try:
        return json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return None


In [None]:
for index in range(0,200):
  print(index)
  postprocess_raw_results_ints(final_df, f"/content/drive/MyDrive/llm_qa/output_raw/output_{index}.json".format(index))

0
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_0.json
1
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_1.json
2
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_2.json
3
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_3.json
4
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_4.json
5
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_5.json
6
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_6.json
7
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_7.json
8
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_8.json
9
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_9.json
10
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_10.json
11
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_11.json
12
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_12.json
13
Saved: /content/drive/MyDrive/llm_qa/outupt_processed/output_13.json
14
Saved: /c

In [None]:
import pandas as pd
import json
import os

csv_file = "/content/drive/MyDrive/llm_qa/final_df.csv"
json_folder = "/content/drive/MyDrive/llm_qa/outupt_processed"
output_csv = "data_with_questions.csv"

df = pd.read_csv(csv_file)

questions = []
references = []

for idx, row in df.iterrows():
    json_path = os.path.join(json_folder, f"output_{idx}.json")

    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                content = f.read()
                # Extract JSON string by finding first '{' and last '}'
                start = content.find("{")
                end = content.rfind("}") + 1
                json_str = content[start:end] if start != -1 and end != -1 else "{}"

                try:
                    data = json.loads(json_str)
                except json.JSONDecodeError:
                    print(f"Invalid JSON in {json_path}")
                    data = {}

            questions.append(data.get("question", ""))
            references.append(" | ".join(data.get("references", [])))
        except Exception as e:
            print(f"Error reading {json_path}: {e}")
            questions.append("")
            references.append("")
    else:
        questions.append("")
        references.append("")

df["question"] = questions
df["references"] = references
df.to_csv(output_csv, index=False)
print(f"Saved updated CSV to {output_csv}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/llm_qa/final_df.csv'

# Duplicate check

In [None]:
import pandas as pd
import ast

df = pd.read_excel("/content/Combined - Formatted - Data Delivery SatComm1 - 0827.xlsx",sheet_name=1)

results = []

for idx, row in df.iterrows():
    notes = str(row["Note(;)"]).split(";") if pd.notna(row["Note(;)"]) else []
    annotations = row["Hallucinations Bold Spans"]

    try:
        spans = ast.literal_eval(annotations)
    except Exception as e:
        results.append((idx, f"Invalid JSON: {e}"))
        continue

    note_count = len([n for n in notes if n.strip()])  # count non-empty notes
    span_count = len(spans)

    if note_count == span_count:
        #results.append((idx, f"Match | Notes={note_count} | Spans={span_count}"))
        pass
    else:
        results.append((idx, f"Mismatch | Notes={note_count} | Spans={span_count}"))

for r in results:
    print(f"Row {r[0]}: {r[1]}")



Row 1: Mismatch | Notes=3 | Spans=4
Row 3: Mismatch | Notes=3 | Spans=4
Row 152: Mismatch | Notes=4 | Spans=5
Row 255: Mismatch | Notes=4 | Spans=3
Row 259: Mismatch | Notes=4 | Spans=3
Row 440: Mismatch | Notes=3 | Spans=4
Row 442: Mismatch | Notes=3 | Spans=4
Row 523: Mismatch | Notes=4 | Spans=3
Row 1158: Mismatch | Notes=2 | Spans=1
Row 1621: Mismatch | Notes=6 | Spans=4
Row 1657: Mismatch | Notes=9 | Spans=10
Row 1802: Mismatch | Notes=1 | Spans=0


In [None]:
df = pd.read_excel("/content/Combined - Formatted - Data Delivery SatComm1 - 0827.xlsx",sheet_name=1)

In [None]:
print(df.columns.tolist())

['Question', 'Answer', 'Topic', 'Note(;)', 'Source 1', 'Source 2', 'Source 3', 'Hallucinations Bold Spans', 'Unnamed: 8']


In [3]:
import pandas as pd

# Path to your CSV file
csv_file = "/content/drive/MyDrive/llm_qa/data_with_questions.csv"
output_csv = "/content/drive/MyDrive/llm_qa/data_with_questions_cleaned.csv"

# Read CSV
df = pd.read_csv(csv_file)

# Remove rows with any empty (NaN) values
df_cleaned = df.dropna().reset_index(drop=True)

# Save the cleaned CSV
df_cleaned.to_csv(output_csv, index=False)

print(f"Saved cleaned CSV to {output_csv}")


Saved cleaned CSV to /content/drive/MyDrive/llm_qa/data_with_questions_cleaned.csv


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

# Paths
csv_file = "/content/drive/MyDrive/llm_qa/data_with_questions_cleaned.csv"
output_csv = "/content/drive/MyDrive/llm_qa/data_with_unique_questions.csv"

# Read CSV
df = pd.read_csv(csv_file)
questions = df['question'].tolist()

# Embeddings
model_name = "nasa-impact/nasa-smd-ibm-st-v2"
embedder = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs={"normalize_embeddings": True})
embeddings = embedder.embed_documents(questions)

  embedder = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs={"normalize_embeddings": True})
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
# Threshold for similarity
threshold = 0.6

# Keep unique questions
unique_indices = []
for i, emb_i in enumerate(embeddings):
    keep = True
    for j in unique_indices:
        sim = cosine_similarity([emb_i], [embeddings[j]])[0][0]
        if sim > threshold:
            keep = False
            break
    if keep:
        unique_indices.append(i)

# Filter dataframe
df_unique = df.iloc[unique_indices].reset_index(drop=True)

# Save
df_unique.to_csv(output_csv, index=False)
print(f"Saved {len(df_unique)} unique questions to {output_csv}")


Saved 1140 unique questions to /content/drive/MyDrive/llm_qa/data_with_unique_questions.csv


In [3]:
len(df_unique)

1140

# Retrival test

In [3]:
%%capture
!pip install qdrant_client
!pip install sentence_transformers
!pip install langchain-community
!pip install replicate
!pip install pandas
!pip install nltk
!pip install langchain pydantic
!pip install accelerate

In [4]:
from huggingface_hub import login

HF_TOKEN = "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"
login(token=HF_TOKEN)

In [13]:
from qdrant_client import QdrantClient, models
import random
from tqdm import tqdm
import pandas as pd
import nltk
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from transformers import AutoTokenizer, AutoModel

cluster='llm4eo'
COLLECTION_NAME = "esa-data-qwen-1024"


if cluster=='llm4eo':
    QDRANT_API_KEY='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.Ocds3v2-2Szf_4DYVbliUX8xv_fjpHq_PI_U-Qcm7sw'
    QDRANT_URL='https://82e69449-1678-4cdd-9b36-1daccdbc58e9.eu-central-1-0.aws.cloud.qdrant.io:6333'         
else:
    QDRANT_API_KEY='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.M5B43daH5yk3a32RzZ_4T_CFMWpr-jx1FOt6vd4tNEI'
    QDRANT_URL='https://1402380c-a401-4b5e-b041-f36b35ba63e8.eu-central-1-0.aws.cloud.qdrant.io:6333'

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=120
)



In [14]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

class qwen_embedder:
    def __init__(self, model_name="Qwen/Qwen3-Embedding-4B"):
        # Load the sentence-transformers model
        self.model = SentenceTransformer(
                                    model_name,
                                    model_kwargs={
                                        "torch_dtype": "auto",       # important: will use float16/bfloat16 automatically
                                        "device_map": "auto",
                                    },
                                    tokenizer_kwargs={"padding_side": "left",
                                                      "max_length": 2048,
                                                      "truncation": True
                                                      }
                                                      )

    def embed_documents(self,
                        texts,
                        batch_size=8,
                        padding=True,
                        truncation=True,
                        max_length=2048,
                        normalize=True):
        """
        Encodes a list of texts into embeddings.

        Args:
            texts (list[str]): Documents to embed
            padding (bool/str): True = dynamic padding, 'max_length' = fixed length
            truncation (bool): Whether to truncate texts beyond max_length
            max_length (int): Max tokens allowed
            normalize (bool): Whether to L2 normalize embeddings

        Returns:
            np.ndarray: Embeddings array (num_texts x embedding_dim)
        """
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            normalize_embeddings=normalize,
            convert_to_numpy=True,
            convert_to_tensor=False
        )
        embeddings = embeddings.tolist()
        return embeddings


    def embed_query(self,query):

        embeddings = self.model.encode( query,prompt_name="query")

        embeddings = embeddings.tolist()
        return embeddings

In [15]:
import torch
from transformers import AutoTokenizer, AutoModel
class IndusEmbedder:
    def __init__(self, model_name: str = "Tulsikumar/indus-sde-st-v0.2", device: str = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, documents: list[str], batch_size: int = 100) -> list[list[float]]:
        """Embed a batch of documents (list of strings) in smaller chunks and return a list of vectors."""
        all_embeddings = []

    
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            embeddings = self._mean_pooling(outputs, inputs["attention_mask"])
            all_embeddings.extend(embeddings.cpu().tolist())  # convert to list of lists

        return all_embeddings

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return a list of floats."""
        return self.embed_documents([query], batch_size=1)[0]

In [9]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from qdrant_client.http.exceptions import UnexpectedResponse
import torch
from transformers import AutoTokenizer, AutoModel

MAX_RETRIES = 5
RETRY_DELAY = 2  # seconds

from huggingface_hub import login

HF_TOKEN = "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"
login(token=HF_TOKEN)

from tokenizers import ByteLevelBPETokenizer
from transformers import AutoTokenizer, AutoModel
import torch

class IndusEmbedder_with_pooling:
    def __init__(self, model_name="Tulsikumar/indus-sde-st-v0.2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.max_len = self.tokenizer.model_max_length
 

        # Byte-level tokenizer for splitting
        self.splitter = ByteLevelBPETokenizer.from_file(
            vocab_filename="/workspace/tests/vocab.json",
            merges_filename="/workspace/tests/merges.txt"
        )

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _chunk_text(self, text, stride=20):
        words = text.split()
        chunks = []
        current_chunk = []
        current_len = 0
            
        for word in words:
            
            token_len = len(self.splitter.encode(word).ids)
            
            if current_len + token_len > self.max_len:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = current_chunk[-stride:] if stride < len(current_chunk) else current_chunk
                    current_len = sum(len(self.splitter.encode(w).ids) for w in current_chunk)
            current_chunk.append(word)
            current_len += token_len

        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks


    def embed_query(self, query, stride=20):
        return self._embed_text(query, stride=stride)

    def embed_documents(self, documents, batch_size=100, stride=100):
        results = []
        for i in range(0, len(documents), batch_size):
            batch = documents[i:i+batch_size]
            for doc in batch:
                results.append(self._embed_text(doc, stride=stride))
        return results
        
    def _embed_text(self, text, stride=20):
        chunks = self._chunk_text(text, stride=stride)
        embeddings = []
        chunk_weights = []
    
        for chunk in chunks:
            inputs = self.tokenizer(chunk, padding=True, truncation=True, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs, output_attentions=True)
    
            emb = self._mean_pooling(outputs, inputs['attention_mask'])
            embeddings.append(emb)
    
            # Compute attention-based weight for this chunk
            # Use last layer, average over heads, take [CLS] attention
            attn = outputs.attentions[-1]  # shape: [batch, heads, seq_len, seq_len]
            attn_cls = attn.mean(dim=1)[:, 0, :].mean(dim=1)  # shape: [batch]
            chunk_weights.append(attn_cls.squeeze().cpu())
    
        # Normalize weights to sum=1
        weights = torch.tensor(chunk_weights)
        weights = weights / weights.sum()
    
        # Weighted sum of chunk embeddings
        final_emb = sum(w * e for w, e in zip(weights, embeddings)).squeeze()
        return final_emb.cpu().tolist()


In [16]:
if COLLECTION_NAME == 'esa-data-indus' or COLLECTION_NAME == 'esa-data-indus-quant' or COLLECTION_NAME =='esa-data-indus-512-1024':
    model_name="nasa-impact/nasa-smd-ibm-st-v2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
elif COLLECTION_NAME == 'esa-data-qwen' or COLLECTION_NAME == 'esa-data-qwen-quant' or COLLECTION_NAME =='esa-data-qwen-1024':
    embedder=qwen_embedder(model_name="Qwen/Qwen3-Embedding-4B")

elif COLLECTION_NAME == "esa-data-indus-1024" or COLLECTION_NAME == "esa-data-indus-1024-quant":
    model_name="Tulsikumar/indus-sde-st-v0.2"
    normalize=True
    encode_kwargs = {"normalize_embeddings": normalize}
    embedder=HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

elif COLLECTION_NAME == "esa-data-indus-1024-auto":
    embedder=IndusEmbedder()

elif COLLECTION_NAME == "esa-data-indus-1024-1024":
    embedder=IndusEmbedder_with_pooling()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [17]:
import time
import pandas as pd
from tqdm import tqdm

K = 10
csv_path = "/workspace/tests/data_with_unique_questions.csv"
df_queries = pd.read_csv(csv_path)

results = []
rank_counts = {i: 0 for i in range(1, K+1)}
rank_counts["not_in_topk"] = 0

for _, row in tqdm(df_queries.iterrows(), total=len(df_queries), desc="Testing retrieval"):
    doc_id = row["id"]
    query_text = row["question"]
    references = row["references"].split(" | ") if pd.notna(row["references"]) else []

    query_vector = embedder.embed_query(query_text)

    # Measure retrieval time
    start_time = time.time()
    search_result = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=K,
        with_payload=True,
        search_params=models.SearchParams(
                quantization=models.QuantizationSearchParams(
                    ignore=False,
                    rescore=True,
                    oversampling=4.0,
                )
            )
        )
    
    retrieval_time = time.time() - start_time  # in seconds

    retrieved_ids = [res.payload.get("id") for res in search_result]
    retrieved_scores = [res.score for res in search_result]

    if doc_id in retrieved_ids:
        rank = retrieved_ids.index(doc_id) + 1
        score_at_rank = retrieved_scores[rank - 1]
        rank_counts[rank] += 1
    else:
        rank = None
        score_at_rank = None
        rank_counts["not_in_topk"] += 1

    # Get top chunk content and compute coverage
    if search_result:
        retrieved_texts = [res.payload.get("content", "") for res in search_result]
        retrieved_text = "|".join(retrieved_texts)  # optional: store all text
    
        # Check each reference in all chunks
        found_refs = 0
        for ref in references:
            if any(ref in chunk for chunk in retrieved_texts):
                found_refs += 1
    
        coverage = found_refs / len(references) if references else 0
    else:
        retrieved_text = ""
        coverage = 0


    results.append({
        "doc_id": doc_id,
        "query_text": query_text,
        "references":row["references"],
        "retrieved_ids": retrieved_ids,
        "retrieved_scores": retrieved_scores,
        "retrieved_text": retrieved_text,
        "rank": rank,
        "score_at_rank": score_at_rank,
        "coverage": coverage,
        "retrieval_time": retrieval_time,
    })

# Convert to DataFrame
df_results = pd.DataFrame(results)

print("\nRank distribution:")
for r, c in rank_counts.items():
    pct = c / len(df_queries)
    print(f"Rank {r}: {c} ({pct:.2f}%)")

# Save results
output_file = "llm_qa_qwen_1024.csv"
df_results.to_csv(output_file, index=False)
print(f"\nSaved results including retrieval time and retrieved text to {output_file}")

  search_result = client.search(
Testing retrieval: 100%|██████████| 1140/1140 [15:34<00:00,  1.22it/s]



Rank distribution:
Rank 1: 991 (0.87%)
Rank 2: 49 (0.04%)
Rank 3: 19 (0.02%)
Rank 4: 12 (0.01%)
Rank 5: 9 (0.01%)
Rank 6: 5 (0.00%)
Rank 7: 9 (0.01%)
Rank 8: 2 (0.00%)
Rank 9: 1 (0.00%)
Rank 10: 0 (0.00%)
Rank not_in_topk: 43 (0.04%)

Saved results including retrieval time and retrieved text to llm_qa_qwen_1024.csv


In [10]:
# Load results CSV
results_csv = "llm_qa_qwen_1024.csv"
df = pd.read_csv(results_csv)


# Compute stats
mean_time = df["retrieval_time"].mean()
median_time = df["retrieval_time"].median()
std_time = df["retrieval_time"].std()
min_time = df["retrieval_time"].min()
max_time = df["retrieval_time"].max()

# Compute average coverage
avg_coverage = df["coverage"].mean()

print(f"Average coverage: {avg_coverage:.4f}")

# Print results
print(f"Mean retrieval time   : {mean_time:.4f} seconds")
print(f"Median retrieval time : {median_time:.4f} seconds")
print(f"Std dev retrieval time: {std_time:.4f} seconds")
print(f"Min retrieval time    : {min_time:.4f} seconds")
print(f"Max retrieval time    : {max_time:.4f} seconds")

Average coverage: 0.2144
Mean retrieval time   : 1.1247 seconds
Median retrieval time : 1.0369 seconds
Std dev retrieval time: 0.5166 seconds
Min retrieval time    : 0.3860 seconds
Max retrieval time    : 7.1132 seconds


In [11]:
rank_counts = df["rank"].value_counts()
total=0
print("\nRank distribution:")
for r, c in rank_counts.items():
    pct = c / len(df)
    total+=c
    print(f"Rank {r}: {c} ({pct:.2%})")
print(total/len(df))


Rank distribution:
Rank 1.0: 686 (60.18%)
Rank 2.0: 89 (7.81%)
Rank 3.0: 47 (4.12%)
Rank 4.0: 35 (3.07%)
Rank 5.0: 20 (1.75%)
Rank 9.0: 20 (1.75%)
Rank 7.0: 15 (1.32%)
Rank 6.0: 13 (1.14%)
Rank 8.0: 10 (0.88%)
Rank 10.0: 3 (0.26%)
0.8228070175438597


In [10]:
!pip install tiktoken

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tiktoken
  Downloading tiktoken-0.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m210.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.11.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [8]:
import pandas as pd
from collections import Counter
import string
from typing import List, Dict

def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(text.split())
    return text

def is_reference_present_fuzzy(reference: str, document: str, threshold: float = 0.8) -> bool:
    ref_tokens = normalize_text(reference).split()
    doc_tokens = normalize_text(document).split()
    if not ref_tokens:
        return False
    matched_tokens = sum(1 for t in ref_tokens if t in doc_tokens)
    fraction_matched = matched_tokens / len(ref_tokens)
    return fraction_matched >= threshold

def compute_token_metrics_single_doc(references: List[str], retrieved_texts: List[str], threshold: float = 0.8) -> Dict[str, float]:
    all_ref_tokens = []
    all_doc_tokens = []

    # Track which references are found
    for ref in references:
        found = any(is_reference_present_fuzzy(ref, doc, threshold) for doc in retrieved_texts)
        ref_tokens = normalize_text(ref).split()
        all_ref_tokens.extend(ref_tokens)
        if found:
            # Add tokens from retrieved documents
            for doc in retrieved_texts:
                all_doc_tokens.extend(normalize_text(doc).split())
    
    if not all_ref_tokens:
        return {"iou": 0.0, "precision": 0.0, "recall": 0.0}

    ref_counter = Counter(all_ref_tokens)
    doc_counter = Counter(all_doc_tokens)

    intersection_tokens = ref_counter & doc_counter
    intersection_count = sum(intersection_tokens.values())

    ref_count = sum(ref_counter.values())
    doc_count = sum(doc_counter.values())

    union_count = ref_count + doc_count - intersection_count

    iou = intersection_count / union_count if union_count > 0 else 0.0
    precision = intersection_count / doc_count if doc_count > 0 else 0.0
    recall = intersection_count / ref_count if ref_count > 0 else 0.0

    return {"iou": iou, "precision": precision, "recall": recall}





results_csv = "llm_qa_qwen_quant.csv"
df = pd.read_csv(results_csv)
df = df.head(250)

def compute_metrics_single_row(ref_str: str, ret_str: str) -> Dict[str, float]:
    references = [r.strip() for r in ref_str.split("|") if r.strip()]
    retrieved_texts = [r.strip() for r in ret_str.split("|") if r.strip()]
    return compute_token_metrics_single_doc(references, retrieved_texts, threshold=0.8)

df_metrics = df.apply(
    lambda row: pd.Series(compute_metrics_single_row(row["references"], row["retrieved_text"])),
    axis=1
)

df = pd.concat([df, df_metrics], axis=1)
average_metrics = df_metrics.mean().to_dict()
print("Average metrics across all rows:")
print(average_metrics)


Average metrics across all rows:
{'iou': 0.010371321357710085, 'precision': 0.010391856961105216, 'recall': 0.884027091784911}


In [None]:
llm_qa_qwen_quant

In [22]:
results_csv = "llm_qa_qwen_quant.csv"
df = pd.read_csv(results_csv)
df = df.head(250)



df_metrics = df.apply(
    lambda row: pd.Series(compute_metrics_single_row(row["references"], row["retrieved_text"])),
    axis=1
)

df = pd.concat([df, df_metrics], axis=1)
average_metrics = df_metrics.mean().to_dict()
print("Average metrics across all rows:")
print(average_metrics)

Average metrics across all rows:
{'iou': 0.11848559090897277, 'precision': 0.1491738053709684, 'recall': 0.42037997667789784}


In [34]:
from typing import List
import string

def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(text.split())
    return text

def is_reference_present_fuzzy(reference: str, document: str, threshold: float = 0.7) -> bool:
    """
    Returns True if enough of the reference tokens appear in the document.
    threshold: fraction of tokens that must match
    """
    ref_tokens = normalize_text(reference).split()
    doc_tokens = normalize_text(document).split()
    
    if not ref_tokens:
        return False
    
    matched_tokens = sum(1 for t in ref_tokens if t in doc_tokens)
    fraction_matched = matched_tokens / len(ref_tokens)
    
    return fraction_matched >= threshold

# Example usage
references = ["This is a /b reference without issues .", "Another one!"]
retrieved_text = "This is a reference /n ## without issues."
for ref in references:
    present = is_reference_present_fuzzy(ref, retrieved_text, threshold=0.7)
    print(f"Reference '{ref}' present? {present}")


Reference 'This is a /b reference without issues .' present? True
Reference 'Another one!' present? False


In [2]:
from typing import List, Dict, Tuple, Union

def sum_of_ranges(ranges: List[Tuple[int,int]]) -> int:
    return sum(end - start for start, end in ranges)

def union_ranges(ranges: List[Tuple[int,int]]) -> List[Tuple[int,int]]:
    if not ranges:
        return []
    sorted_ranges = sorted(ranges, key=lambda x: x[0])
    merged_ranges = [sorted_ranges[0]]
    for start, end in sorted_ranges[1:]:
        last_start, last_end = merged_ranges[-1]
        if start <= last_end:
            merged_ranges[-1] = (last_start, max(last_end, end))
        else:
            merged_ranges.append((start, end))
    return merged_ranges

def intersect_two_ranges(range1: Tuple[int,int], range2: Tuple[int,int]) -> Union[Tuple[int,int], None]:
    start1, end1 = range1
    start2, end2 = range2
    intersect_start = max(start1, start2)
    intersect_end = min(end1, end2)
    if intersect_start <= intersect_end:
        return (intersect_start, intersect_end)
    return None

def difference(ranges: List[Tuple[int,int]], target: Tuple[int,int]) -> List[Tuple[int,int]]:
    result = []
    t_start, t_end = target
    for start, end in ranges:
        if end < t_start or start > t_end:
            result.append((start, end))
        elif start < t_start and end > t_end:
            result.append((start, t_start))
            result.append((t_end, end))
        elif start < t_start:
            result.append((start, t_start))
        elif end > t_end:
            result.append((t_end, end))
    return result

def find_target_in_text(text: str, target: str) -> Union[Tuple[int,int], None]:
    start = text.find(target)
    if start == -1:
        return None
    return start, start + len(target)

def compute_metrics_multiple_docs(
    references: List[str],
    retrieved_texts: List[str]
) -> Dict[str, Union[List[Dict[str,float]], Dict[str,float]]]:
    """
    references: List of reference strings
    retrieved_texts: List of retrieved documents as strings
    Returns:
        - "per_doc_scores": list of dicts with IoU, precision, recall per document
        - "average_scores": dict with mean IoU, precision, recall
    """
    per_doc_scores = []

    for doc in retrieved_texts:
        reference_ranges = []
        for ref in references:
            rng = find_target_in_text(doc, ref)
            if rng:
                reference_ranges.append(rng)

        retrieved_range = [(0, len(doc))]

        numerator_sets = []
        unused_refs = reference_ranges.copy()
        for ref_rng in reference_ranges:
            intersection = intersect_two_ranges(retrieved_range[0], ref_rng)
            if intersection:
                numerator_sets = union_ranges([intersection] + numerator_sets)
                unused_refs = difference(unused_refs, intersection)

        denominator_iou = union_ranges(retrieved_range + unused_refs)
        iou_score = sum_of_ranges(numerator_sets) / sum_of_ranges(denominator_iou) if denominator_iou else 0.0
        recall_score = sum_of_ranges(numerator_sets) / sum_of_ranges(reference_ranges) if reference_ranges else 0.0
        precision_score = sum_of_ranges(numerator_sets) / sum_of_ranges(retrieved_range) if retrieved_range else 0.0

        per_doc_scores.append({
            "iou": iou_score,
            "recall": recall_score,
            "precision": precision_score
        })

    # Compute average metrics
    avg_iou = sum(d['iou'] for d in per_doc_scores) / len(per_doc_scores) if per_doc_scores else 0.0
    avg_recall = sum(d['recall'] for d in per_doc_scores) / len(per_doc_scores) if per_doc_scores else 0.0
    avg_precision = sum(d['precision'] for d in per_doc_scores) / len(per_doc_scores) if per_doc_scores else 0.0

    return {
        "per_doc_scores": per_doc_scores,
        "average_scores": {
            "iou": avg_iou,
            "recall": avg_recall,
            "precision": avg_precision
        }
    }

# Example usage
references = ["quick brown fox", "lazy dog"]
retrieved_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown fox is very fast.",
    "Nothing matches here."
]

scores = compute_metrics_multiple_docs(references, retrieved_texts)
print(scores['average_scores'])


{'iou': 0.3355327468230694, 'recall': 0.6666666666666666, 'precision': 0.3355327468230694}


In [12]:
import pandas as pd


results_csv = "llm_qa_indus_quant.csv"
df = pd.read_csv(results_csv)
df = df.head(250)

# Function to compute IoU, precision, and recall for a single row
def compute_metrics_single_row(ref_str: str, ret_str: str) -> Dict[str, float]:
    """
    ref_str: string of references separated by "|"
    retrieved_text: string of retrieved document
    Returns a dict with IoU, precision, and recall
    """
    references = [r.strip() for r in ref_str.split("|")]
    #retrieved_text=[r.strip() for r in ret_str.split("|")]
    metrics = compute_metrics_multiple_docs(references, [ret_str])
    # metrics["per_doc_scores"] has only one element since we passed a single document
    return metrics["per_doc_scores"][0]

# Apply metrics computation to each row
df_metrics = df.apply(
    lambda row: pd.Series(compute_metrics_single_row(row["references"], row["retrieved_text"])),
    axis=1
)

# Add the computed metrics to the original dataframe
df = pd.concat([df, df_metrics], axis=1)

# Compute averages
average_metrics = df_metrics.mean().to_dict()

print("Average metrics across all rows:")
print(average_metrics)


Average metrics across all rows:
{'iou': 0.009660970202262194, 'recall': 0.6043178135966732, 'precision': 0.009660970202262194}
