<a href="https://colab.research.google.com/github/funCode9/semantic-search-engine/blob/main/IEEE_ML_R2_2025A7PS0030H.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/ieee_ml
!tar -xvzf collection.tar.gz
!tar -xvzf queries.tar.gz

/content/drive/MyDrive/ieee_ml
collection.tsv
queries.dev.tsv
queries.eval.tsv
queries.train.tsv


In [5]:
import pandas as pd
import random
import csv

queries_df = pd.read_csv('queries.dev.small.tsv', sep='\t', names=['qid', 'text'], dtype={'qid': str})
qrels_df = pd.read_csv('qrels.dev.small.tsv', sep='\t', names=['qid', 'fixed', 'pid', 'rel'], dtype={'qid': str, 'pid': str})

all_qids = list(queries_df['qid'].unique())

random.seed(42)

# 0.20 becuase in realistic dataset not all queries have the correct answer at the top, it is to ensure recall is not too high artificially.
answerable_qids = set(random.sample(all_qids, int(len(all_qids) * 0.20)))
answerable_pids = set(qrels_df[qrels_df['qid'].isin(answerable_qids)]['pid'].astype(str))

print(f"building realistic system where {len(answerable_qids)} queries out of {len(all_qids)} will have an answer.")


building realistic system where 1396 queries out of 6980 will have an answer.


In [6]:
import random
import csv
import pandas as pd

subset_passages = []
target_size = 100000

# probability = approx 0.012 = 100k/8.8M

sampling_probability = 0.012

with open('/content/drive/MyDrive/ieee_ml/collection.tsv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    grab_next_count = 0

    for row in reader:
        if len(row) < 2:
            continue
        pid = str(row[0])

        # including gold answers
        if pid in answerable_pids:
            subset_passages.append({'pid': pid, 'text': row[1]})
            grab_next_count = 3 # Tell the loop to grab the next 3 rows
            continue

        # including neighbor negatives (Hard negatives)
        if grab_next_count > 0 and len(subset_passages) < target_size:
            subset_passages.append({'pid': pid, 'text': row[1]})
            grab_next_count -= 1
            continue

        # Random sampling (filling up to reach 100k subset)
        if len(subset_passages) < target_size:
            if random.random() < sampling_probability:
                subset_passages.append({'pid': pid, 'text': row[1]})

# trimming subset to 100k if it went beyond it
if len(subset_passages) > target_size:
    subset_passages = subset_passages[:target_size]

subset_df = pd.DataFrame(subset_passages)
subset_df['pid'] = subset_df['pid'].astype(str)
subset_df.to_csv('subset_collection.csv', index=False)
print(f"Subset Created: {len(subset_df)} passages.")

Subset Created: 100000 passages.


In [7]:
!pip install -U sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print(f"Model loaded on: {device}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded on: cuda


In [8]:
# Loading and encoding 100k subset
subset_df = pd.read_csv('subset_collection.csv')

print("Encoding 100k passages")
passage_embeddings = model.encode(subset_df['text'].tolist(), batch_size=128, show_progress_bar=True, convert_to_numpy=True)

# Initializing FAISS Index
# MiniLM uses 384 integers to code sentence into a vector
dimension = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Normalizing vectors and adding to Index
faiss.normalize_L2(passage_embeddings)
index.add(passage_embeddings.astype('float32'))

print(f"Total passages indexed: {index.ntotal}")

Encoding 100k passages


Total passages indexed: 100000


In [9]:
# evaluation metrics (BASELINE MODEL = dense embedding)

import math

metrics = {
    'mrr': 0,
    'ndcg_10': 0,
    'recall_10': 0,
    'precision_10': 0
}

# Creating the Answer Key (mapping Query ID to a list of correct Passage IDs)
relevant_map = qrels_df.groupby('qid')['pid'].apply(list).to_dict()

total_queries = len(queries_df)

print(f"Evaluating all {total_queries} queries...")

for _, row in queries_df.iterrows():
    qid, q_text = row['qid'], row['text']

    # pids of gold answers
    target_pids = relevant_map.get(qid, [])

    # Searching top 10 results
    q_emb = model.encode([q_text]).astype('float32')
    faiss.normalize_L2(q_emb)
    distances, indices = index.search(q_emb, k=10)

    found_at_rank = -1
    num_relevant_found = 0

    # Calculating metrics for top 10 results
    for rank, idx in enumerate(indices[0]):
        retrieved_pid = str(subset_df.iloc[idx]['pid'])

        if retrieved_pid in target_pids:
            num_relevant_found += 1
            if found_at_rank == -1:
                found_at_rank = rank

                # MRR calculation
                metrics['mrr'] += 1 / (rank + 1)

                # NDCG calculation (using log2 for position discount)
                metrics['ndcg_10'] += 1 / math.log2(rank + 2)

    # Recall
    if found_at_rank != -1:
        metrics['recall_10'] += 1

    # Precision - proportion of top 10 that were relevant
    metrics['precision_10'] += num_relevant_found / 10

# Final Average over ALL queries
print(f"\n--- FINAL BASELINE METRICS ---")
print(f"MRR@10:       {metrics['mrr'] / total_queries:.4f}")
print(f"NDCG@10:      {metrics['ndcg_10'] / total_queries:.4f}")
print(f"Recall@10:    {metrics['recall_10'] / total_queries:.4f}")
print(f"Precision@10: {metrics['precision_10'] / total_queries:.4f}")

Evaluating all 6980 queries...

--- FINAL BASELINE METRICS ---
MRR@10:       0.1661
NDCG@10:      0.1769
Recall@10:    0.2096
Precision@10: 0.0218


In [10]:
all_gold_pids = set(qrels_df['pid'].astype(str))

subset_passages = []
target_size = 100000

# probability 100k / 8.8M lines ~ 0.0113.
sampling_probability = 0.012

with open('/content/drive/MyDrive/ieee_ml/collection.tsv', 'r', encoding='utf-8') as f:

    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

    for row in reader:
        if len(row) < 2:
          continue
        pid = str(row[0])

        # including gold answers
        if pid in all_gold_pids:
            subset_passages.append({'pid': pid, 'text': row[1]})

        # random sampling for the rest
        # We check probability FIRST to ensure we scan the whole file roughly evenly
        elif len(subset_passages) < target_size and random.random() < sampling_probability:
            subset_passages.append({'pid': pid, 'text': row[1]})


subset_df = pd.DataFrame(subset_passages)
subset_df = subset_df.drop_duplicates(subset='pid')

# if subset is over 100k (over sampled), trimming it to 100k again
if len(subset_df) > target_size:
    subset_df = subset_df.sample(n=target_size, random_state=42)

subset_df.to_csv('subset_collection.csv', index=False)
print(f"Subset built: {len(subset_df)} passages")

Subset built: 100000 passages


In [11]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import faiss

bi_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')
cross_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cuda')

# Half-precision (from 32-bit to 16-int) increases speed to 2x
cross_model.model.half()

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, e

In [12]:
import faiss

# Building FAISS Index
passages = subset_df['text'].astype(str).tolist()

print("Encoding passages")
expert_embeddings = bi_model.encode(passages, batch_size=128, show_progress_bar=True, convert_to_numpy=True)
dimension = expert_embeddings.shape[1]

# Normalizing for Cosine Similarity
faiss.normalize_L2(expert_embeddings)

expert_index = faiss.IndexFlatIP(dimension)
expert_index.add(expert_embeddings.astype('float32'))

# Creating the fast lookup dictionary
pid_to_text = dict(zip(subset_df['pid'].astype(str), subset_df['text']))

print(f"FAISS Index initialized successfully, Vector Dimension: {dimension}")

Encoding passages


FAISS Index initialized successfully, Vector Dimension: 768


In [14]:
# Query Expansion
def expand_query_pseudo_relevance(query, bi_model, index, subset_df, k=3):
    q_emb = bi_model.encode([query]).astype('float32')
    faiss.normalize_L2(q_emb)
    _, indices = index.search(q_emb, k=k)
    expansion_terms = []
    for idx in indices[0]:
        if 0 <= idx < len(subset_df):
            text = subset_df.iloc[idx]['text']
            expansion_terms.extend(text.split()[:5])
    return query + " " + " ".join(list(set(expansion_terms)))

In [15]:
from tqdm.auto import tqdm

print("Pre-computing lookups and optimizing models")
pid_to_text = dict(zip(subset_df['pid'].astype(str), subset_df['text']))

cross_model.model.half()

K_LIST = [1, 5, 10, 20, 100]
metrics = {'mrr': 0}
for k in K_LIST:
    metrics[f'ndcg@{k}'] = 0
    metrics[f'recall@{k}'] = 0
    metrics[f'precision@{k}'] = 0

total_queries = len(queries_df)
valid_queries_count = 0
print(f"Starting evaluation on {total_queries} queries")

for _, row in tqdm(queries_df.iterrows(), total=total_queries):
    qid, q_text = row['qid'], row['text']
    target_pids = set(relevant_map.get(qid, []))

    if not target_pids:
        continue

    valid_queries_count += 1

    # using expanded query
    expanded_q_text = expand_query_pseudo_relevance(q_text, bi_model, expert_index, subset_df)

    # STAGE 1: Dense Retrieval
    q_emb = bi_model.encode([expanded_q_text], convert_to_numpy=True, show_progress_bar=False).astype('float32')
    faiss.normalize_L2(q_emb)


    # We retrieve 40 results
    _, d_indices = expert_index.search(q_emb, k=40)

    # Fast Dictionary Lookup
    candidates = []
    for idx in d_indices[0]:
        if 0 <= idx < len(subset_df):
            pid = str(subset_df.iloc[idx]['pid'])
            if pid in pid_to_text:
                candidates.append({'pid': pid, 'text': pid_to_text[pid]})

    # STAGE 2: Re-ranking (Cross-Encoder)
    if candidates:
        model_inputs = [[q_text, c['text']] for c in candidates]
        scores = cross_model.predict(model_inputs, batch_size=64, show_progress_bar=False)

        for i in range(len(candidates)):
            candidates[i]['score'] = scores[i]

        reranked_candidates = sorted(candidates, key=lambda x: x['score'], reverse=True)
    else:
        reranked_candidates = []

    # metric calculations

    # 1. MRR
    for rank, res in enumerate(reranked_candidates):
        if res['pid'] in target_pids:
            metrics['mrr'] += 1.0 / (rank + 1)
            break

    # 2. NDCG, Recall, Precision for all K
    for k in K_LIST:
        top_k = reranked_candidates[:k]
        num_rel = sum(1 for res in top_k if res['pid'] in target_pids)

        metrics[f'recall@{k}'] += num_rel / len(target_pids)
        metrics[f'precision@{k}'] += num_rel / k

        # NDCG Logic
        dcg = 0
        for rank, res in enumerate(top_k):
            if res['pid'] in target_pids:
                dcg += 1.0 / math.log2(rank + 2)

        idcg = 0
        for i in range(min(len(target_pids), k)):
            idcg += 1.0 / math.log2(i + 2)

        if idcg > 0:
            metrics[f'ndcg@{k}'] += dcg / idcg

# --- FINAL RESULTS ---
print("\n" + "="*50)
print(f"FINAL RESULTS (Evaluated on {valid_queries_count} queries)")
print("-"*50)
print(f"MRR: {metrics['mrr'] / valid_queries_count:.4f}")
print("-" * 50)
print(f"{'k':<5} | {'NDCG':<10} | {'Recall':<10} | {'Precision':<10}")
print("-" * 50)

for k in K_LIST:
    ndcg = metrics[f'ndcg@{k}'] / valid_queries_count
    rec = metrics[f'recall@{k}'] / valid_queries_count
    prec = metrics[f'precision@{k}'] / valid_queries_count
    print(f"{k:<5} | {ndcg:<10.4f} | {rec:<10.4f} | {prec:<10.4f}")
print("-"*50)

Pre-computing lookups and optimizing models
Starting evaluation on 6980 queries



FINAL RESULTS (Evaluated on 6980 queries)
--------------------------------------------------
MRR: 0.8654
--------------------------------------------------
k     | NDCG       | Recall     | Precision 
--------------------------------------------------
1     | 0.8284     | 0.8074     | 0.8284    
5     | 0.8722     | 0.9047     | 0.1899    
10    | 0.8741     | 0.9099     | 0.0956    
20    | 0.8744     | 0.9114     | 0.0479    
100   | 0.8744     | 0.9114     | 0.0096    
--------------------------------------------------


In [16]:
import time
import numpy as np
from tqdm.auto import tqdm

# Settings
num_bench_queries = 100
latencies = {'total': [], 'retrieval': [], 'rerank': []}

print(f"Starting Efficiency Analysis on {num_bench_queries} queries")

for _ in tqdm(range(num_bench_queries)):

    # Randomly sampling a query from the dataset
    query_text = queries_df.sample(1)['text'].values[0]
    start_total = time.time()

    # STAGE 1: RETRIEVAL (Bi-Encoder + FAISS)
    t1_start = time.time()
    q_emb = bi_model.encode([query_text], convert_to_tensor=True, show_progress_bar=False).cpu().numpy().astype('float32')
    faiss.normalize_L2(q_emb)
    _, d_indices = expert_index.search(q_emb, k=40)

    # Fast Dictionary Lookup
    candidates_text = []

    for idx in d_indices[0]:
        if 0 <= idx < len(subset_df):
            pid = str(subset_df.iloc[idx]['pid'])
            text = pid_to_text[pid]
            candidates_text.append(text)

    latencies['retrieval'].append((time.time() - t1_start) * 1000)

    # STAGE 2: RE-RANKING (Cross-Encoder)
    t2_start = time.time()

    if candidates_text:
        model_inputs = [[query_text, text] for text in candidates_text]
        _ = cross_model.predict(model_inputs, batch_size=64, show_progress_bar=False)

    latencies['rerank'].append((time.time() - t2_start) * 1000)

    latencies['total'].append((time.time() - start_total) * 1000)

# CALCULATING METRICS
def get_metrics(data):
    return {
        'mean': np.mean(data),
        'p50': np.percentile(data, 50),
        'p90': np.percentile(data, 90),
        'p95': np.percentile(data, 95),
        'p99': np.percentile(data, 99),
        'std': np.std(data)
    }

total_m = get_metrics(latencies['total'])
retrieval_m = get_metrics(latencies['retrieval'])
rerank_m = get_metrics(latencies['rerank'])



print("\nSYSTEM EFFICIENCY & LATENCY REPORT\n")
print(f"{'Metric':<15} | {'Value':<12}")
print("-" * 30)
print(f"{'Mean Latency':<15} | {total_m['mean']:.2f} ms")
print(f"{'Median (P50)':<15} | {total_m['p50']:.2f} ms")
print(f"{'P90 Latency':<15} | {total_m['p90']:.2f} ms")
print(f"{'P95 Latency':<15} | {total_m['p95']:.2f} ms")
print(f"{'P99 Latency':<15} | {total_m['p99']:.2f} ms")
print(f"{'Std Dev':<15} | {total_m['std']:.2f} ms")
print(f"{'Throughput':<15} | {1000/total_m['p50']:.2f} QPS")

print("\nCOMPONENT BREAKDOWN\n")
print("-"*50)
print(f"Retrieval (Stage 1): {retrieval_m['mean']:.2f} ms ({ (retrieval_m['mean']/total_m['mean'])*100:.1f}%)")
print(f"Re-ranking (Stage 2): {rerank_m['mean']:.2f} ms ({ (rerank_m['mean']/total_m['mean'])*100:.1f}%)")
print("-"*50)

Starting Efficiency Analysis on 100 queries



SYSTEM EFFICIENCY & LATENCY REPORT

Metric          | Value       
------------------------------
Mean Latency    | 77.80 ms
Median (P50)    | 76.89 ms
P90 Latency     | 88.24 ms
P95 Latency     | 91.50 ms
P99 Latency     | 100.92 ms
Std Dev         | 8.13 ms
Throughput      | 13.01 QPS

COMPONENT BREAKDOWN

--------------------------------------------------
Retrieval (Stage 1): 38.29 ms (49.2%)
Re-ranking (Stage 2): 39.50 ms (50.8%)
--------------------------------------------------


In [20]:
import sys
import os
import torch

# calculating FAISS index size on disk
index_file = 'expert_index.faiss'
faiss.write_index(expert_index, index_file)
faiss_size_mb = os.path.getsize(index_file) / (1024 * 1024)

# 2. Memory usage
# Embeddings: 100,000 rows * 768 dims * 4 bytes per float
emb_memory = (len(subset_df) * 768 * 4) / (1024 * 1024)

# Convert bytes to GB, 1e9  = 10^9
used_vram = torch.cuda.max_memory_allocated() / 1e9


print("STORAGE & MEMORY ANALYSIS\n")
print(f"Embedding Index Size:  {faiss_size_mb:.2f} MB")
print(f"RAM for Embeddings:     {emb_memory:.2f} MB")
print(f"Subset Text Size:       {sys.getsizeof(subset_df) / (1024*1024):.2f} MB")
print(f"Total Model Memory (Runtime):    {used_vram:.2f} GB (GPU VRAM)")


STORAGE & MEMORY ANALYSIS

Embedding Index Size:  292.97 MB
RAM for Embeddings:    292.97 MB
Subset Text Size:      42.54 MB
Total Model Memory (Runtime):    2.39 GB (GPU VRAM)


In [19]:
def run_simple_cli():
    print("-"*50)
    print("MS MARCO NEURAL SEARCH SYSTEM")
    print("Mode: Bi-Encoder Retrieval -> Cross-Encoder Re-Rank")
    print("-"*50)

    if 'bi_model' not in globals() or 'expert_index' not in globals():
        print("ERROR: Models or Index not found. Please run the indexing cell above.")
        return

    while True:
        print("\n" + "-"*30)
        query = input("Enter Search Query (or 'exit' to quit): ").strip()

        if query.lower() in ['exit', 'quit', 'q']:
            print("Exited from search system")
            break

        if not query:
            print("Empty query.")
            continue

        k_input = input("Enter Top-K results to return (default 10): ").strip()
        try:
            k = int(k_input) if k_input else 10
        except ValueError:
            k = 10


        # QUERY EXPANSION
        print(f"Original Query: {query}")
        # expanding query
        expanded_query = expand_query_pseudo_relevance(query, bi_model, expert_index, subset_df)
        print(f"Expanded Query: {expanded_query}")

        # RETRIEVAL
        # using expanded query
        q_emb = bi_model.encode([expanded_query], convert_to_numpy=True, show_progress_bar=False).astype('float32')
        faiss.normalize_L2(q_emb)

        # Searching FAISS and retrieving top 40 for re-ranking
        _, d_indices = expert_index.search(q_emb, k=40)

        # Mapping Indices to Candidates
        candidates = []
        for idx in d_indices[0]:
            if 0 <= idx < len(subset_df):
                pid = str(subset_df.iloc[idx]['pid'])
                if pid in pid_to_text:
                    candidates.append({
                        'pid': pid,
                        'text': pid_to_text[pid]
                    })

        if not candidates:
            print("No results found in the subset.")
            continue

        # RE-RANKING
        model_inputs = [[query, c['text']] for c in candidates]
        scores = cross_model.predict(model_inputs, batch_size=64, show_progress_bar=False)

        for i in range(len(candidates)):
            candidates[i]['score'] = float(scores[i])

        # Sorting by high-precision Cross-Encoder score
        results = sorted(candidates, key=lambda x: x['score'], reverse=True)[:k]


        # printing results
        print(f"\nTop {len(results)} Results for: '{query}'")
        print("-"*50)

        for i, res in enumerate(results):
            print(f"Rank {i+1} | [PID: {res['pid']}] | Score: {res['score']:.4f}")

            clean_text = res['text'].replace('\n', ' ').strip()
            if len(clean_text) > 150:
                snippet = clean_text[:150] + "..."
            else:
                snippet = clean_text

            print(f"Text:  {snippet}")
            print("-" * 50)

run_simple_cli()

--------------------------------------------------
MS MARCO NEURAL SEARCH SYSTEM
Mode: Bi-Encoder Retrieval -> Cross-Encoder Re-Rank
--------------------------------------------------

------------------------------
Enter Search Query (or 'exit' to quit): us population
Enter Top-K results to return (default 10): 7
Original Query: us population
Expanded Query: us population is 31, It In 1790, population the Aug year of Current 2010 2011 from

Top 7 Results for: 'us population'
--------------------------------------------------
Rank 1 | [PID: 6110584] | Score: 8.8281
Text:  Aug 31, 2011 It is not difficult to find the information about how many people there are in the United States, because the U.S. Census Bureau1 regular...
--------------------------------------------------
Rank 2 | [PID: 1170798] | Score: 7.6523
Text:  In 1790, the year of the first census of the U.S. population, there were 3,929,214 Americans. By 1900, the U.S.A. population jumped to 75,994,575. In ...
---------------