In [2]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

# Root directory of the attached arXiv dataset.
file_path = "/kaggle/input/arxiv"
import os

# Print the full path of every file found under the dataset directory.
for folder, _, names in os.walk('/kaggle/input/arxiv'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)



/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


**CLEANING, DROPPING UNNECESSARY ROWS AND COLUMNS**

In [3]:
from collections import defaultdict
from tqdm import tqdm

# --------- Config ---------
input_json_path = "/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"

# --------- Columns to Analyze ---------
target_columns = [
    "id", "submitter", "authors", "title", "comments",
    "journal-ref", "doi", "report-no", "categories",
    "license", "abstract", "update_date", "versions", "authors_parsed"
]

# --------- Initialize Counters ---------
null_counts = defaultdict(int)  # column name -> number of records where it is missing/blank
total_count = 0                 # number of lines that parsed as JSON

# --------- Line-by-Line Parsing ---------
# Each line is parsed as one JSON document, so the file is streamed instead of
# being loaded whole. The encoding is stated explicitly so the result does not
# depend on the platform's default locale.
with open(input_json_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Checking nulls"):
        # Only the parse itself is guarded; anything else going wrong should surface.
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip malformed lines

        total_count += 1
        for col in target_columns:
            value = paper.get(col, None)
            # A field counts as missing when absent, null, or a whitespace-only string.
            if value is None or (isinstance(value, str) and value.strip() == ""):
                null_counts[col] += 1

# --------- Output Report ---------
print(f"\nTotal records: {total_count}\n")
print("Missing value summary (null or empty):\n")
for col in target_columns:
    missing = null_counts[col]
    # Guard against an empty/unparseable input file, which would otherwise
    # raise ZeroDivisionError here.
    percent = (missing / total_count) * 100 if total_count else 0.0
    print(f"{col:15}: {missing:6} missing ({percent:.2f}%)")


Checking nulls: 2735264it [01:31, 29900.67it/s]


Total records: 2735264

Missing value summary (null or empty):

id             :      0 missing (0.00%)
submitter      :  15189 missing (0.56%)
authors        :      0 missing (0.00%)
title          :      0 missing (0.00%)
comments       : 713227 missing (26.08%)
journal-ref    : 1842970 missing (67.38%)
doi            : 1494843 missing (54.65%)
report-no      : 2550013 missing (93.23%)
categories     :      0 missing (0.00%)
license        : 452782 missing (16.55%)
abstract       :      0 missing (0.00%)
update_date    :      0 missing (0.00%)
versions       :      0 missing (0.00%)
authors_parsed :      0 missing (0.00%)





**DROPPING PAPERS WITHOUT DOI**

In [4]:
# Destination for the subset of records that carry a DOI.
output_json_path = "/kaggle/working/papers_with_doi_only.jsonl"

# --------- Filter and Write ---------
total = 0  # records that parsed as JSON
kept = 0   # records written to the output file

with open(input_json_path, "r") as infile, open(output_json_path, "w") as outfile:
    for line in tqdm(infile, desc="Filtering papers with DOI"):
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip bad lines

        total += 1
        if not paper.get("doi"):
            # No DOI (missing, null or empty): drop the record.
            continue

        json.dump(paper, outfile)
        outfile.write("\n")
        kept += 1

# --------- Summary ---------
print(f"\n✅ Total papers processed: {total}")
print(f"📌 Papers with DOI retained: {kept}")
print(f"📝 Filtered output saved to: {output_json_path}")

Filtering papers with DOI: 2735264it [02:20, 19428.93it/s]


✅ Total papers processed: 2735264
📌 Papers with DOI retained: 1240421
📝 Filtered output saved to: /kaggle/working/papers_with_doi_only.jsonl





**CONVERTING DOI TO URL**

In [5]:
import json
from tqdm import tqdm

# --------- Paths ---------
input_path = "/kaggle/working/papers_with_doi_only.jsonl"
output_path = "/kaggle/working/papers_with_urls.jsonl"

# --------- Counter for Papers with URL ---------
papers_with_url_count = 0

# --------- Transform and Save ---------
# Each record has its "doi" field removed and, when the DOI is non-empty,
# replaced by a "url" field pointing at doi.org. Records without a DOI are
# not written.
with open(input_path, "r") as infile, open(output_path, "w") as outfile:
    for line in tqdm(infile, desc="Converting DOI to URLs"):
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip malformed lines

        doi_value = paper.pop("doi", None)
        if not doi_value:
            continue

        paper["url"] = f"https://doi.org/{doi_value}"
        papers_with_url_count += 1
        json.dump(paper, outfile)
        outfile.write("\n")

# --------- Summary ---------
print(f"\n✅ Done! Converted DOI to URLs.")
print(f"📁 Output saved to: {output_path}")
print(f"📊 Total papers with URLs: {papers_with_url_count}")


Converting DOI to URLs: 1240421it [01:36, 12871.65it/s]


✅ Done! Converted DOI to URLs.
📁 Output saved to: /kaggle/working/papers_with_urls.jsonl
📊 Total papers with URLs: 1240421





**DELETIONS IF NEEDED**

In [6]:
# import os

# # Specify the path of the file you want to delete
# file_path = ""

# # Check if the file exists before deleting it
# if os.path.exists(file_path):
#     os.remove(file_path)
#     print(f"File {file_path} has been deleted.")
# else:
#     print(f"File {file_path} does not exist.")


In [7]:
# import os
# import shutil

# # Specify the path of the directory you want to delete
# dir_path = ""

# # Check if the directory exists before deleting it
# if os.path.exists(dir_path) and os.path.isdir(dir_path):
#     shutil.rmtree(dir_path)
#     print(f"Directory {dir_path} has been deleted.")
# else:
#     print(f"Directory {dir_path} does not exist.")



**EXTRACT EQUATIONS**

In [8]:
import json
import re
import pandas as pd

# Read every parseable record of the URL-annotated JSONL file into memory.
file_path = "/kaggle/working/papers_with_urls.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            # Lines that are not valid JSON are ignored.
            continue
        data.append(parsed)

# Function to extract LaTeX equations
def extract_equations(text):
    """Collect LaTeX-looking fragments from ``text``.

    Four kinds of fragments are gathered, in this order:
      1. inline math between single dollar signs (``$...$``),
      2. display math between ``\\[`` and ``\\]``,
      3. backslash commands with any directly attached ``{...}`` groups,
      4. the body of ``\\begin{...} ... \\end{...}`` environments.

    Each fragment is stripped; it is kept only if it has at least two
    characters and contains one of ``\\ { } _ ^``.

    Args:
        text: The string to scan. Any non-string value yields an empty list.

    Returns:
        A list of unique fragments in order of first appearance.
    """
    if not isinstance(text, str):
        return []

    inline_eqs = re.findall(r'\$([^\$]+)\$', text)
    # DOTALL so display math spanning several lines is captured too, matching
    # how the \begin...\end pattern below already behaves.
    display_eqs = re.findall(r'\\\[(.*?)\\\]', text, re.DOTALL)
    commands = re.findall(r'\\[a-zA-Z]+(?:\{.*?\})*', text)
    begin_envs = re.findall(r'\\begin\{.*?\}(.*?)\\end\{.*?\}', text, re.DOTALL)

    all_eqs = inline_eqs + display_eqs + commands + begin_envs

    # Filter out too short or meaningless ones
    filtered_eqs = []
    for eq in all_eqs:
        eq = eq.strip()
        if len(eq) >= 2 and (any(c in eq for c in "\\{}_^")):
            filtered_eqs.append(eq)

    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the output order differ from one interpreter run to the next.
    return list(dict.fromkeys(filtered_eqs))


# Keep only the papers in which at least one equation-like fragment is found,
# reducing each to id / title / abstract / equations / url.
papers = []
for paper in data:
    title = paper.get("title", "")
    abstract = paper.get("abstract", "")
    equations = extract_equations(abstract + " " + title)
    if not equations:
        continue

    # Keys are inserted in the order they should appear in the output JSON.
    record = {"id": paper.get("id", "")}
    record["title"] = title.replace("\n", " ").strip()
    record["abstract"] = abstract.replace("\n", " ").strip()
    record["equations"] = equations
    record["url"] = paper.get("url", "")
    papers.append(record)

# Write one JSON object per line.
output_path = "/kaggle/working/extracted_equations.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for paper in papers:
        json.dump(paper, f)
        f.write("\n")

print(f"✅ Extracted equations saved to '{output_path}'. Total papers: {len(papers)}")


✅ Extracted equations saved to '/kaggle/working/extracted_equations.jsonl'. Total papers: 421249


**TO CONVERT TO ZIP**

In [9]:
# import shutil

# # Zip the extracted_equations.jsonl file
# shutil.make_archive(
#     base_name="/kaggle/working/extracted_equations",  # Output path (without .zip)
#     format='zip',
#     root_dir="/kaggle/working",                   # Directory containing the file
#     base_dir="/kaggle/working/extracted_equations.jsonl"             # File to zip
# )


In [10]:
!pip install faiss-cpu --no-deps

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [11]:
# import json
# import gc
# import faiss
# import torch
# import numpy as np
# import pandas as pd
# from tqdm import tqdm
# from transformers import AutoTokenizer, AutoModel

# # --- CONFIG ---
# input_path = "/kaggle/working/extracted_equations.jsonl"
# output_text_faiss = "/kaggle/working/text_faiss_index.bin"
# output_eq_faiss = "/kaggle/working/eq_faiss_index.bin"
# output_text_mapping = "/kaggle/working/text_id_title_mapping.csv"
# output_eq_mapping = "/kaggle/working/eq_id_eq_mapping.csv"

# batch_size = 5000  # Good for Kaggle RAM

# # --- Load SciBERT ---
# model_name = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
# model.eval()

# # --- Helper ---
# def generate_embedding(text):
#     if not text.strip():
#         return None
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# # --- Initialize FAISS ---
# embedding_dim = 768
# text_index = faiss.IndexFlatL2(embedding_dim)
# eq_index = faiss.IndexFlatL2(embedding_dim)

# # --- Mappings ---
# text_paper_ids = []
# text_titles = []

# eq_paper_ids = []
# eq_strings = []

# # --- Buffers ---
# text_embeddings_buffer = []
# eq_embeddings_buffer = []

# # --- Start ---
# with open(input_path, 'r', encoding='utf-8') as infile:
#     for idx, line in tqdm(enumerate(infile), desc="Processing Papers"):
#         try:
#             paper = json.loads(line)
#             paper_id = paper.get("id", "")
#             title = paper.get("title", "")
#             abstract = paper.get("abstract", "")
#             equations = paper.get("equations", [])

#             # --- Process Text ---
#             combined_text = (title + " " + abstract).strip()
#             text_emb = generate_embedding(combined_text)

#             if text_emb is not None:
#                 text_embeddings_buffer.append(text_emb)
#                 text_paper_ids.append(paper_id)
#                 text_titles.append(title)

#             # --- Process Equations ---
#             for eq in equations:
#                 if not eq.strip():
#                     continue
#                 eq_emb = generate_embedding(eq)
#                 if eq_emb is not None:
#                     eq_embeddings_buffer.append(eq_emb)
#                     eq_paper_ids.append(paper_id)
#                     eq_strings.append(eq)

#             # --- When batch full ---
#             if (idx + 1) % batch_size == 0:
#                 if text_embeddings_buffer:
#                     text_index.add(np.vstack(text_embeddings_buffer))
#                     text_embeddings_buffer.clear()

#                 if eq_embeddings_buffer:
#                     eq_index.add(np.vstack(eq_embeddings_buffer))
#                     eq_embeddings_buffer.clear()

#                 gc.collect()

#         except Exception as e:
#             print(f"⚠️ Error at paper {paper.get('id', '')}: {e}")
#             continue

# # --- After last batch ---
# if text_embeddings_buffer:
#     text_index.add(np.vstack(text_embeddings_buffer))
# if eq_embeddings_buffer:
#     eq_index.add(np.vstack(eq_embeddings_buffer))
# gc.collect()

# # --- Save Outputs ---
# faiss.write_index(text_index, output_text_faiss)
# print(f"✅ Text FAISS index saved to: {output_text_faiss}")

# faiss.write_index(eq_index, output_eq_faiss)
# print(f"✅ Equation FAISS index saved to: {output_eq_faiss}")

# pd.DataFrame({
#     "id": text_paper_ids,
#     "title": text_titles
# }).to_csv(output_text_mapping, index=False)
# print(f"✅ Text ID-Title mapping saved to: {output_text_mapping}")

# pd.DataFrame({
#     "id": eq_paper_ids,
#     "equation": eq_strings
# }).to_csv(output_eq_mapping, index=False)
# print(f"✅ Equation ID-Equation mapping saved to: {output_eq_mapping}")



**EMBEDDING GENERATION USING SCIBERT AND ADDITION TO FAISS**

In [12]:
import json
import gc
import faiss
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# --- CONFIG ---
# Input: the JSONL written by the equation-extraction step.
input_path = "/kaggle/working/extracted_equations.jsonl"
# Outputs: two FAISS indexes plus the CSVs that map index rows back to papers.
output_text_faiss = "/kaggle/working/text_faiss_index_1000.bin"
output_eq_faiss = "/kaggle/working/eq_faiss_index_1000.bin"
output_text_mapping = "/kaggle/working/text_id_title_mapping_1000.csv"
output_eq_mapping = "/kaggle/working/eq_id_eq_mapping_1000.csv"

num_papers_to_process = 3000  # cap on how many input lines are embedded in this run

# --- Load SciBERT ---
model_name = "allenai/scibert_scivocab_uncased"
device = torch.device("cpu")  # inference is pinned to the CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

# --- Helper ---
def generate_embedding(text):
    """Embed ``text`` with the module-level tokenizer and model.

    Returns None for empty or whitespace-only input. Otherwise the text is
    tokenized (truncated to at most 512 tokens), run through the model without
    gradient tracking, and the last hidden state is averaged over dim 1,
    squeezed, and returned as a NumPy array.
    """
    if text.strip() == "":
        return None

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()

# --- Initialize FAISS ---
embedding_dim = 768  # presumably the hidden size of the loaded model — confirm
text_index = faiss.IndexFlatL2(embedding_dim)
eq_index = faiss.IndexFlatL2(embedding_dim)

# --- Mappings ---
# Row i of each mapping describes vector i of the corresponding index, because
# the embedding and its metadata are always appended together below.
text_paper_ids = []
text_titles = []

eq_paper_ids = []
eq_strings = []

# --- Embedding Buffers ---
text_embeddings = []
eq_embeddings = []

# --- Start ---
with open(input_path, 'r', encoding='utf-8') as infile:
    # The progress label is fixed text; the real cap is num_papers_to_process.
    for idx, line in tqdm(enumerate(infile), desc="Processing 1000 Papers"):
        if idx >= num_papers_to_process:
            break

        # Fallback label for the error message. Without it, a line that fails
        # to parse would be reported under the id of a previously parsed paper.
        paper_id = f"<line {idx + 1}>"
        try:
            paper = json.loads(line)
            paper_id = paper.get("id", "")
            title = paper.get("title", "")
            abstract = paper.get("abstract", "")
            equations = paper.get("equations", [])

            # Text Embedding
            combined_text = (title + " " + abstract).strip()
            text_emb = generate_embedding(combined_text)

            if text_emb is not None:
                text_embeddings.append(text_emb)
                text_paper_ids.append(paper_id)
                text_titles.append(title)

            # Equation Embeddings: one vector per extracted equation string
            for eq in equations:
                if not eq.strip():
                    continue
                eq_emb = generate_embedding(eq)
                if eq_emb is not None:
                    eq_embeddings.append(eq_emb)
                    eq_paper_ids.append(paper_id)
                    eq_strings.append(eq)

        except Exception as e:
            # Best-effort run: report the failing record and carry on.
            print(f"⚠️ Error at paper {paper_id}: {e}")
            continue

# --- Add to FAISS ---
if text_embeddings:
    text_index.add(np.vstack(text_embeddings))

if eq_embeddings:
    eq_index.add(np.vstack(eq_embeddings))

gc.collect()

# --- Save Outputs ---
faiss.write_index(text_index, output_text_faiss)
print(f"✅ Text FAISS index saved to: {output_text_faiss}")

faiss.write_index(eq_index, output_eq_faiss)
print(f"✅ Equation FAISS index saved to: {output_eq_faiss}")

pd.DataFrame({
    "id": text_paper_ids,
    "title": text_titles
}).to_csv(output_text_mapping, index=False)
print(f"✅ Text ID-Title mapping saved to: {output_text_mapping}")

pd.DataFrame({
    "id": eq_paper_ids,
    "equation": eq_strings
}).to_csv(output_eq_mapping, index=False)
print(f"✅ Equation ID-Equation mapping saved to: {output_eq_mapping}")


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

2025-05-20 16:11:19.139677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747757479.505850      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747757479.605611      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Processing 1000 Papers: 6it [00:03,  1.76it/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Processing 1000 Papers: 3000it [31:31,  1.59it/s]


✅ Text FAISS index saved to: /kaggle/working/text_faiss_index_1000.bin
✅ Equation FAISS index saved to: /kaggle/working/eq_faiss_index_1000.bin
✅ Text ID-Title mapping saved to: /kaggle/working/text_id_title_mapping_1000.csv
✅ Equation ID-Equation mapping saved to: /kaggle/working/eq_id_eq_mapping_1000.csv


**SEARCH USING ONLY SCIBERT AND FAISS**

In [13]:
# import faiss
# import numpy as np
# import pandas as pd
# from transformers import AutoTokenizer, AutoModel
# import torch

# # CONFIGS
# text_faiss_path = "/kaggle/working/text_faiss_index_1000.bin"
# eq_faiss_path = "/kaggle/working/eq_faiss_index_1000.bin"
# text_mapping_path = "/kaggle/working/text_id_title_mapping_1000.csv"
# eq_mapping_path = "/kaggle/working/eq_id_eq_mapping_1000.csv"

# top_k = 5  # How many results to show

# # Load SciBERT
# model_name = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# device = torch.device("cpu")  # Force CPU
# model = model.to(device)
# model.eval()

# # Embedding Function
# def generate_embedding(text):
#     if not text.strip():
#         return None
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# # Latex cleaner for equations
# def clean_equation_to_latex(eq_text):
#     # Basic: Add surrounding "$" if not already
#     eq_text = eq_text.strip()
#     if not (eq_text.startswith("$") and eq_text.endswith("$")):
#         eq_text = f"${eq_text}$"
#     return eq_text

# # Load FAISS Indexes
# text_index = faiss.read_index(text_faiss_path)
# eq_index = faiss.read_index(eq_faiss_path)

# # Load Mapping CSVs
# text_mapping = pd.read_csv(text_mapping_path)
# eq_mapping = pd.read_csv(eq_mapping_path)

# # --- User Input ---
# query_type = input("Enter query type (text / equation): ").strip().lower()
# query = input("Enter your query: ").strip()

# # --- Preprocess Query ---
# if query_type == "equation":
#     query = clean_equation_to_latex(query)  # Add LaTeX formatting

# # --- Embed Query ---
# query_emb = generate_embedding(query)
# query_emb = np.expand_dims(query_emb, axis=0)  # FAISS expects 2D

# # --- Search ---
# if query_type == "text":
#     distances, indices = text_index.search(query_emb, top_k)
#     print("\n🔎 Top matches for your TEXT query:")
#     for dist, idx in zip(distances[0], indices[0]):
#         paper_id = text_mapping.iloc[idx]["id"]
#         title = text_mapping.iloc[idx]["title"]
#         print(f"Paper ID: {paper_id} | Title: {title} | Distance: {dist:.4f}")

# elif query_type == "equation":
#     distances, indices = eq_index.search(query_emb, top_k)
#     print("\n🔎 Top matches for your EQUATION query:")
#     for dist, idx in zip(distances[0], indices[0]):
#         paper_id = eq_mapping.iloc[idx]["id"]
#         equation = eq_mapping.iloc[idx]["equation"]
#         print(f"Paper ID: {paper_id} | Equation: {equation} | Distance: {dist:.4f}")

# else:
#     print("❌ Invalid query type! Please type 'text' or 'equation'.")


In [14]:
import nltk
# Download the NLTK "punkt" package; presumably needed by the word_tokenize
# calls used for the BM25 search — confirm for the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
!pip install rank_bm25


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


**SEARCH USING ONLY BM25**

In [16]:
# import json
# import pandas as pd
# from rank_bm25 import BM25Okapi
# from nltk.tokenize import word_tokenize

# # --- CONFIG ---
# input_jsonl = "/kaggle/working/extracted_equations.jsonl"
# top_k = 5  # How many results to show
# max_papers = 1000  # Limit to 1000 papers for faster processing

# # --- Load Papers ---
# papers = []
# with open(input_jsonl, 'r', encoding='utf-8') as f:
#     for idx, line in enumerate(f):
#         if idx >= max_papers:
#             break
#         papers.append(json.loads(line))

# # --- Preprocess ---
# text_corpus = []
# text_id_title = []

# eq_corpus = []
# eq_id_eq = []

# for paper in papers:
#     paper_id = paper.get("id", "")
#     title = paper.get("title", "")
#     abstract = paper.get("abstract", "")
#     equations = paper.get("equations", [])

#     # Text corpus: title + abstract
#     combined_text = (title + " " + abstract).strip()
#     tokens = word_tokenize(combined_text.lower())
#     text_corpus.append(tokens)
#     text_id_title.append((paper_id, title))

#     # Equation corpus: each equation separately
#     for eq in equations:
#         if eq.strip():
#             eq_tokens = word_tokenize(eq.lower())
#             eq_corpus.append(eq_tokens)
#             eq_id_eq.append((paper_id, eq))

# # --- Build BM25 Indexes ---
# text_bm25 = BM25Okapi(text_corpus)
# eq_bm25 = BM25Okapi(eq_corpus)

# # --- Query Time ---
# query_type = input("Enter query type (text / equation): ").strip().lower()
# query = input("Enter your query: ").strip()

# query_tokens = word_tokenize(query.lower())

# # --- Search ---
# if query_type == "text":
#     scores = text_bm25.get_scores(query_tokens)
#     top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    
#     print("\n🔎 Top matches for your TEXT query:")
#     for idx in top_indices:
#         paper_id, title = text_id_title[idx]
#         print(f"Paper ID: {paper_id} | Title: {title} | BM25 Score: {scores[idx]:.4f}")

# elif query_type == "equation":
#     scores = eq_bm25.get_scores(query_tokens)
#     top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

#     print("\n🔎 Top matches for your EQUATION query:")
#     for idx in top_indices:
#         paper_id, equation = eq_id_eq[idx]
#         print(f"Paper ID: {paper_id} | Equation: {equation} | BM25 Score: {scores[idx]:.4f}")

# else:
#     print("❌ Invalid query type! Please type 'text' or 'equation'.")


**HYBRID MODEL USING BOTH LEXICAL AND SEMANTIC SEARCH**

In [None]:
import json
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
import torch
from nltk.tokenize import word_tokenize

# --- CONFIG ---
# Paper records plus the FAISS indexes / mapping CSVs built by the embedding step.
input_jsonl = "/kaggle/working/extracted_equations.jsonl"
text_faiss_path = "/kaggle/working/text_faiss_index_1000.bin"
eq_faiss_path = "/kaggle/working/eq_faiss_index_1000.bin"
text_mapping_path = "/kaggle/working/text_id_title_mapping_1000.csv"
eq_mapping_path = "/kaggle/working/eq_id_eq_mapping_1000.csv"

max_papers = 200   # number of leading records used for the BM25 corpus
top_k_bm25 = 100   # BM25 candidates passed on to the vector re-ranking
top_k_final = 5    # results shown to the user

# --- Load Papers ---
# Pairing the file with range(max_papers) stops reading after max_papers lines.
with open(input_jsonl, 'r', encoding='utf-8') as f:
    papers = [json.loads(line) for _, line in zip(range(max_papers), f)]

# --- Preprocess for BM25 ---
# text_corpus[i] holds the lower-cased tokens of paper i (title + abstract);
# text_id_title[i] holds that paper's (id, title) for display.
text_corpus, text_id_title = [], []

for paper in papers:
    title = paper.get("title", "")
    searchable_text = (title + " " + paper.get("abstract", "")).strip()
    text_corpus.append(word_tokenize(searchable_text.lower()))
    text_id_title.append((paper.get("id", ""), title))

bm25_model = BM25Okapi(text_corpus)

# --- Load FAISS Indexes ---
text_index = faiss.read_index(text_faiss_path)
eq_index = faiss.read_index(eq_faiss_path)

# --- Load Mappings ---
# Read the id column as text. With type inference, an id such as "0705.0931"
# is parsed as the float 705.0931 and loses its leading zero.
text_mapping = pd.read_csv(text_mapping_path, dtype={"id": str})
eq_mapping = pd.read_csv(eq_mapping_path, dtype={"id": str})

# --- Load SciBERT (for query embedding) ---
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cpu")  # Force CPU
model = model.to(device)
model.eval()

def generate_embedding(text):
    """Return an embedding of ``text``, or None when it is blank.

    The text is tokenized with the module-level tokenizer (truncated to at
    most 512 tokens) and passed through the module-level model with gradients
    disabled. The last hidden state is averaged over dim 1, squeezed, and
    converted to a NumPy array.
    """
    if text.strip() == "":
        return None

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()

def clean_equation_to_latex(eq_text):
    """Strip ``eq_text`` and wrap it in ``$...$`` unless it already both
    starts and ends with a dollar sign."""
    stripped = eq_text.strip()
    already_delimited = stripped.startswith("$") and stripped.endswith("$")
    return stripped if already_delimited else "$" + stripped + "$"

# --- User Input ---
query_type = input("Enter query type (text / equation): ").strip().lower()
query = input("Enter your query: ").strip()

if query_type not in ("text", "equation"):
    # Reject an unknown type before spending a model forward pass on the query.
    print("❌ Invalid query type! Please type 'text' or 'equation'.")
elif not query:
    # A blank query has no embedding (generate_embedding returns None), and
    # that None must not be handed to FAISS.
    print("❌ Empty query! Please enter some text or an equation.")
else:
    if query_type == "equation":
        query = clean_equation_to_latex(query)

    # --- Embed Query ---
    query_emb = generate_embedding(query)
    query_emb = np.expand_dims(query_emb, axis=0)  # search() takes a 2-D batch

    # --- Hybrid Search ---
    if query_type == "text":
        # BM25 phase: rank the in-memory corpus lexically and keep the best candidates.
        query_tokens = word_tokenize(query.lower())
        bm25_scores = bm25_model.get_scores(query_tokens)
        bm25_top_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k_bm25]

        # FAISS phase: Build mini index of top BM25 results.
        # NOTE(review): assumes vector i of text_index belongs to line i of the
        # JSONL used for the BM25 corpus — verify against the indexing step.
        selected_vectors = np.vstack([text_index.reconstruct(idx) for idx in bm25_top_indices])

        # Take the dimension from the stored vectors rather than hard-coding it.
        mini_index = faiss.IndexFlatL2(selected_vectors.shape[1])
        mini_index.add(selected_vectors)

        # Never ask for more neighbours than there are candidates; FAISS pads
        # the shortfall with index -1, which would wrap around as a list index.
        k_final = min(top_k_final, len(bm25_top_indices))
        distances, faiss_indices = mini_index.search(query_emb, k_final)

        threshold = 0.6  # You can tune it

        if np.all(distances[0] <= threshold):
            print("\n🔎 Top matches for your TEXT query (BM25 + FAISS Hybrid):")
            for dist, idx in zip(distances[0], faiss_indices[0]):
                # Positions in the mini index map back through the BM25 candidate list.
                real_idx = bm25_top_indices[idx]
                paper_id, title = text_id_title[real_idx]
                print(f"Paper ID: {paper_id} | Title: {title} | Distance: {dist:.4f}")
        else:
            print("\n⚠️ FAISS matches were not strong. Falling back to pure BM25 matches:")
            for idx in bm25_top_indices[:top_k_final]:
                paper_id, title = text_id_title[idx]
                print(f"Paper ID: {paper_id} | Title: {title}")

    else:
        # Direct FAISS on equation index
        distances, indices = eq_index.search(query_emb, top_k_final)
        print("\n🔎 Top matches for your EQUATION query (FAISS only):")
        for dist, idx in zip(distances[0], indices[0]):
            if idx < 0:
                # -1 marks "no neighbour" when the index holds fewer than
                # top_k_final vectors; iloc[-1] would silently show the last row.
                continue
            paper_id = eq_mapping.iloc[idx]["id"]
            equation = eq_mapping.iloc[idx]["equation"]
            print(f"Paper ID: {paper_id} | Equation: {equation} | Distance: {dist:.4f}")


Enter query type (text / equation):  planet
Enter your query:  planet


❌ Invalid query type! Please type 'text' or 'equation'.


In [18]:
# import json
# import numpy as np
# import pandas as pd
# import faiss
# from transformers import AutoTokenizer, AutoModel
# from rank_bm25 import BM25Okapi
# import torch
# from nltk.tokenize import word_tokenize

# # --- CONFIG ---
# input_jsonl = "/kaggle/working/extracted_equations.jsonl"
# text_faiss_path = "/kaggle/working/text_faiss_index_1000.bin"
# eq_faiss_path = "/kaggle/working/eq_faiss_index_1000.bin"
# text_mapping_path = "/kaggle/working/text_id_title_mapping_1000.csv"
# eq_mapping_path = "/kaggle/working/eq_id_eq_mapping_1000.csv"

# max_papers = 1000
# top_k_bm25 = 50
# top_k_final = 5

# # --- Load Papers into Memory ---
# def load_extracted_equations(file_path):
#     """
#     Loads the extracted equations file into memory as a dictionary for fast lookups.
#     """
#     data = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             paper = json.loads(line)
#             data[paper["id"]] = paper  # Use paper ID as the key
#     return data

# extracted_data = load_extracted_equations(input_jsonl)

# # --- Preprocess for BM25 ---
# papers = list(extracted_data.values())[:max_papers]  # Limit number of papers to process
# text_corpus = []
# text_id_title = []

# for paper in papers:
#     paper_id = paper.get("id", "")
#     title = paper.get("title", "")
#     abstract = paper.get("abstract", "")
#     combined_text = (title + " " + abstract).strip()
#     tokens = word_tokenize(combined_text.lower())
#     text_corpus.append(tokens)
#     text_id_title.append((paper_id, title))

# bm25_model = BM25Okapi(text_corpus)

# # --- Load FAISS Indexes ---
# text_index = faiss.read_index(text_faiss_path)
# eq_index = faiss.read_index(eq_faiss_path)

# # --- Load Mappings ---
# text_mapping = pd.read_csv(text_mapping_path)
# eq_mapping = pd.read_csv(eq_mapping_path, dtype={"id": str})

# # --- Load SciBERT (for query embedding) ---
# model_name = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# device = torch.device("cpu")  # Force CPU
# model = model.to(device)
# model.eval()

# def generate_embedding(text):
#     if not text.strip():
#         return None
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# def clean_equation_to_latex(eq_text):
#     eq_text = eq_text.strip()
#     if not (eq_text.startswith("$") and eq_text.endswith("$")):
#         eq_text = f"${eq_text}$"
#     return eq_text

# # --- User Input ---
# query_type = input("Enter query type (text / equation): ").strip().lower()
# query = input("Enter your query: ").strip()

# if query_type == "equation":
#    query = clean_equation_to_latex(query)

# # --- Embed Query ---
# query_emb = generate_embedding(query)
# query_emb = np.expand_dims(query_emb, axis=0)

# # --- Hybrid Search ---
# if query_type == "text":
#     # BM25 phase
#     query_tokens = word_tokenize(query.lower())
#     bm25_scores = bm25_model.get_scores(query_tokens)
#     bm25_top_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k_bm25]

#     # FAISS phase: Build mini index of top BM25 results
#     selected_vectors = np.vstack([text_index.reconstruct(idx) for idx in bm25_top_indices])

#     mini_index = faiss.IndexFlatL2(768)
#     mini_index.add(selected_vectors)

#     distances, faiss_indices = mini_index.search(query_emb, top_k_final)

#     threshold = 0.6  # You can tune it

#     if np.all(distances[0] <= threshold):
#         print("\n🔎 Top matches for your TEXT query (BM25 + FAISS Hybrid):")
#         for dist, idx in zip(distances[0], faiss_indices[0]):
#             real_idx = bm25_top_indices[idx]
#             paper_id = text_id_title[real_idx][0]

#             # Fetch additional details from preloaded extracted_data
#             paper_details = extracted_data.get(paper_id, {})
#             url = paper_details.get("url", "Unavailable")
#             title = paper_details.get("title", "No Title")
#             authors = paper_details.get("authors", "Unknown")
#             abstract = paper_details.get("abstract", "No Abstract")

#             # Display results
#             print(f"Paper ID: {paper_id}")
#             print(f"Title: {title}")
#             print(f"Abstract: {abstract}")
#             print(f"URL: {url}")
#             print(f"Distance: {dist:.4f}")
#             print()

#     else:
#         print("\n⚠️ FAISS matches were not strong. Falling back to pure BM25 matches:")
#         for idx in bm25_top_indices[:top_k_final]:
#             paper_id = text_id_title[idx][0]

#             # Fetch additional details from preloaded extracted_data
#             paper_details = extracted_data.get(paper_id, {})
#             url = paper_details.get("url", "Unavailable")
#             title = paper_details.get("title", "No Title")
#             authors = paper_details.get("authors", "Unknown")
#             abstract = paper_details.get("abstract", "No Abstract")

#             # Display results
#             print(f"Paper ID: {paper_id}")
#             print(f"Title: {title}")
#             print(f"Abstract: {abstract}")
#             print(f"URL: {url}")
#             print()
# elif query_type == "equation":
#     distances, indices = eq_index.search(query_emb, top_k_final)
#     print("\n🔎 Top matches for your EQUATION query (FAISS only):")

#     for dist, idx in zip(distances[0], indices[0]):
#         # Get paper ID from mapping
#         raw_paper_id = str(eq_mapping.iloc[idx]["id"]).strip()  # <-- Fix here

#         # Correct padding if needed
#         if len(raw_paper_id.split(".")[0]) == 3:  # e.g., "705.0931"
#             paper_id = "0" + raw_paper_id         # make it "0705.0931"
#         else:
#             paper_id = raw_paper_id               # already correct

#         # Lookup full extracted paper
#         paper_details = extracted_data.get(paper_id, {})
#         title = paper_details.get("title", "No Title")
#         authors = paper_details.get("authors", "Unknown")
#         abstract = paper_details.get("abstract", "No Abstract")
#         url = paper_details.get("url", "Unavailable")

#         print(f"Paper ID: {paper_id}")
#         print(f"Title: {title}")
#         print(f"Authors: {authors}")
#         print(f"Abstract: {abstract}")
#         print(f"URL: {url}")
#         print(f"Distance: {dist:.4f}")
#         print()

# else:
#     print("❌ Invalid query type! Please type 'text' or 'equation'.")


In [None]:
# Install the tunnelling and UI dependencies into the kernel's own environment.
# Versions are pinned to the ones recorded in this cell's last run.
# pyngrok is installed before the `ngrok authtoken` call below, as in the original cell order.
%pip install -q pyngrok==7.2.8 streamlit==1.45.1

# Register the ngrok auth token without writing the credential into the notebook:
# it is read from the NGROK_AUTHTOKEN environment variable, which must be set
# before this cell runs (a missing variable raises KeyError).
ngrok_token = os.environ["NGROK_AUTHTOKEN"]
!ngrok authtoken {ngrok_token}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Downloading pyngrok-7.2.8-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml                                


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.45.1


In [20]:
# %%writefile app.py
# import streamlit as st
# import json
# import numpy as np
# import faiss
# from transformers import AutoTokenizer, AutoModel
# import torch
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# import pandas as pd
# from rank_bm25 import BM25Okapi

# # --- CONFIG ---
# input_jsonl = "/kaggle/working/extracted_equations.jsonl"
# text_faiss_path = "/kaggle/working/text_faiss_index_1000.bin"
# eq_faiss_path = "/kaggle/working/eq_faiss_index_1000.bin"
# text_mapping_path = "//kaggle/working/text_id_title_mapping_1000.csv"
# eq_mapping_path = "/kaggle/working/eq_id_eq_mapping_1000.csv"

# max_papers = 1000
# top_k_bm25 = 10
# top_k_final = 5

# # --- Load Extracted Data ---
# def load_extracted_equations(file_path):
#     data = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             paper = json.loads(line)
#             data[paper["id"]] = paper
#     return data

# extracted_data = load_extracted_equations(input_jsonl)

# # --- Preprocess for BM25 ---
# papers = list(extracted_data.values())[:max_papers]  # Limit number of papers to process
# text_corpus = []
# text_id_title = []

# for paper in papers:
#     paper_id = paper.get("id", "")
#     title = paper.get("title", "")
#     abstract = paper.get("abstract", "")
#     combined_text = (title + " " + abstract).strip()
#     tokens = word_tokenize(combined_text.lower())
#     text_corpus.append(tokens)
#     text_id_title.append((paper_id, title))

# bm25_model = BM25Okapi(text_corpus)


# # --- Load FAISS Indexes ---
# text_index = faiss.read_index(text_faiss_path)
# eq_index = faiss.read_index(eq_faiss_path)
# # --- Load Mappings ---
# text_mapping = pd.read_csv(text_mapping_path)
# eq_mapping = pd.read_csv(eq_mapping_path, dtype={"id": str})

# # --- Load SciBERT Model ---
# model_name = "allenai/scibert_scivocab_uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# device = torch.device("cpu")
# model = model.to(device)
# model.eval()

# def generate_embedding(text):
#     """Generate embeddings for the query."""
#     if not text.strip():
#         return None
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# def clean_equation_to_latex(eq_text):
#     eq_text = eq_text.strip()
#     if not (eq_text.startswith("$") and eq_text.endswith("$")):
#         eq_text = f"${eq_text}$"
#     return eq_text

# # --- Streamlit UI ---
# st.title("Hybrid Search Engine")
# st.sidebar.header("Query Options")
# st.sidebar.info("**Note:** You can enter either a text query or an equation query, but not both!")

# # Input fields
# query_type = None
# text_query = st.text_input("Text Query")
# equation_query = st.text_input("Equation Query")

# if text_query.strip() and equation_query.strip():
#     st.error("⚠️ Please use only one input box. Clear one before submitting.")

# elif text_query.strip():
#     query_type = "text"
#     query = text_query.strip()
# elif equation_query.strip():
#     query_type = "equation"
#     query = clean_equation_to_latex(equation_query)  # Ensure LaTeX formatting for equations
# else:
#     st.info("Awaiting your input...")
# # --- Embed Query ---
# query_emb = generate_embedding(query)
# query_emb = np.expand_dims(query_emb, axis=0)

# # Submit button
# if st.button("Search"):
#     if query_type == "text":
#         # --- Text Query ---
#         query_tokens = word_tokenize(query.lower())
#         bm25_scores = bm25_model.get_scores(query_tokens)
#         bm25_top_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k_bm25]

#         # FAISS phase: Build mini index of top BM25 results
#         selected_vectors = np.vstack([text_index.reconstruct(idx) for idx in bm25_top_indices])

#         mini_index = faiss.IndexFlatL2(768)
#         mini_index.add(selected_vectors)

#         distances, faiss_indices = mini_index.search(query_emb, top_k_final)

#         threshold = 0.6
#         if np.all(distances[0] <= threshold):
#           st.success("\n🔎Top matches for your TEXT query (BM25 + FAISS Hybrid):")
#           for dist, idx in zip(distances[0], faiss_indices[0]):
#             real_idx = bm25_top_indices[idx]
#             paper_id = text_id_title[real_idx][0]

#             # Fetch additional details from preloaded extracted_data
#             paper_details = extracted_data.get(paper_id, {})
#             url = paper_details.get("url", "Unavailable")
#             title = paper_details.get("title", "No Title")
#             authors = paper_details.get("authors", "Unknown")
#             abstract = paper_details.get("abstract", "No Abstract")

            
#             st.write(f"**Paper ID**: {paper_id}")
#             st.write(f"**Title**: {paper_details.get('title', 'No Title')}")
#             st.write(f"**Abstract**: {paper_details.get('abstract', 'No Abstract')}")
#             st.write(f"[Link to Paper]({paper_details.get('url', '#')})")
#             st.write("---")
#         else:
#           st.success("\n⚠️FAISS matches were not strong. Falling back to pure BM25 matches:")
#           for idx in bm25_top_indices[:top_k_final]:
#              paper_id = text_id_title[idx][0]

#              # Fetch additional details from preloaded extracted_data
#              paper_details = extracted_data.get(paper_id, {})
#              url = paper_details.get("url", "Unavailable")
#              title = paper_details.get("title", "No Title")
#              authors = paper_details.get("authors", "Unknown")
#              abstract = paper_details.get("abstract", "No Abstract")

             
#              st.write(f"**Paper ID**: {paper_id}")
#              st.write(f"**Title**: {paper_details.get('title', 'No Title')}")
#              st.write(f"**Abstract**: {paper_details.get('abstract', 'No Abstract')}")
#              st.write(f"[Link to Paper]({paper_details.get('url', '#')})")
#              st.write("---")

#     elif query_type == "equation":
#         # --- Equation Query ---
#         distances, indices = eq_index.search(query_emb, top_k_final)
#         st.success("\n🔎Top matches for your EQUATION query (FAISS only):")

#         for dist, idx in zip(distances[0], indices[0]):
#             # Get paper ID from mapping
#             raw_paper_id = str(eq_mapping.iloc[idx]["id"]).strip()
#             # Correct padding if needed
#             if len(raw_paper_id.split(".")[0]) == 3:  # e.g., "705.0931"
#               paper_id = "0" + raw_paper_id         # make it "0705.0931"
#             else:
#               paper_id = raw_paper_id

#             paper_details = extracted_data.get(paper_id, {})
#             url = paper_details.get("url", "Unavailable")
#             title = paper_details.get("title", "No Title")
#             authors = paper_details.get("authors", "Unknown")
#             abstract = paper_details.get("abstract", "No Abstract")

#             st.write(f"**Paper ID**: {paper_id}")
#             st.write(f"**Title**: {paper_details.get('title', 'No Title')}")
#             st.write(f"**Abstract**: {paper_details.get('abstract', 'No Abstract')}")
#             st.write(f"[Link to Paper]({paper_details.get('url', '#')})")
#             st.write("---")
#     else:
#         st.error("⚠️ Please provide a valid query.")


**FINAL STREAMLIT CODE**

In [21]:
%%writefile app.py
import streamlit as st
import json
import numpy as np
import faiss
import torch
import nltk
import pandas as pd

from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Streamlit re-executes this script on every interaction, so only contact the
# NLTK download server when the tokenizer data is actually missing locally.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# --- CONFIG ---
INPUT_JSONL          = "/kaggle/working/extracted_equations.jsonl"
TEXT_FAISS_PATH      = "/kaggle/working/text_faiss_index_1000.bin"
EQ_FAISS_PATH        = "/kaggle/working/eq_faiss_index_1000.bin"
TEXT_MAPPING_CSV     = "/kaggle/working/text_id_title_mapping_1000.csv"
EQ_MAPPING_CSV       = "/kaggle/working/eq_id_eq_mapping_1000.csv"

MAX_PAPERS           = 1000
TOP_K_BM25           = 10
TOP_K_FINAL          = 5
FAISS_DIM            = 768
TEXT_MODEL_NAME      = "allenai/scibert_scivocab_uncased"

# --- Load extracted_data into dict for fast lookup ---
@st.cache_data
def load_extracted_data(path):
    """Load a JSON Lines file of papers into a dict keyed by paper id.

    Args:
        path: Path to a UTF-8 file holding one JSON object per line. Every
            object must contain an "id" key.

    Returns:
        dict mapping each record's "id" to the full decoded record. When an id
        appears more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no "id" key.
    """
    records = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline at EOF) holds no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            paper = json.loads(line)
            records[paper["id"]] = paper
    return records

extracted_data = load_extracted_data(INPUT_JSONL)

# --- Build BM25 corpora ---
# BM25Okapi is a model-like object that is only ever read after construction, so
# it is cached as a shared resource instead of being pickled and copied by
# st.cache_data on every script run.
@st.cache_resource
def build_bm25():
    """Build the BM25 model over the first MAX_PAPERS entries of extracted_data.

    Each document is the lower-cased, word-tokenized concatenation of a paper's
    title and abstract (a missing field contributes an empty string).

    Returns:
        (bm25, id_title): the fitted BM25Okapi model and a list of
        (paper_id, title) tuples in the same order as the BM25 corpus, so a
        BM25 document position can be mapped back to its paper.

    Raises:
        KeyError: if a paper record has no "id" key.
    """
    papers = list(extracted_data.values())[:MAX_PAPERS]
    corpus, id_title = [], []
    for p in papers:
        pid = p["id"]
        txt = (p.get("title","") + " " + p.get("abstract","")).strip()
        tokens = word_tokenize(txt.lower())
        corpus.append(tokens)
        id_title.append((pid, p.get("title","No Title")))
    return BM25Okapi(corpus), id_title

bm25_model, text_id_title = build_bm25()

# --- Load FAISS indexes & mappings ---
@st.cache_resource
def load_faiss_and_maps():
    """Read both FAISS indexes and both id-mapping CSVs from disk.

    Returns:
        (text_index, eq_index, text_map, eq_map): the indexes read from
        TEXT_FAISS_PATH and EQ_FAISS_PATH, followed by the DataFrames read from
        TEXT_MAPPING_CSV and EQ_MAPPING_CSV. The "id" column of each mapping is
        forced to str.
    """
    indexes = [faiss.read_index(index_path)
               for index_path in (TEXT_FAISS_PATH, EQ_FAISS_PATH)]
    mappings = [pd.read_csv(csv_path, dtype={"id": str})
                for csv_path in (TEXT_MAPPING_CSV, EQ_MAPPING_CSV)]
    return (*indexes, *mappings)

text_index, eq_index, text_map, eq_map = load_faiss_and_maps()

# --- Load SciBERT ---
@st.cache_resource(show_spinner=False)
def load_scibert():
    """Load the tokenizer and encoder identified by TEXT_MODEL_NAME.

    Returns:
        (tokenizer, model): the model has been moved to the CPU and switched to
        eval mode.
    """
    scibert_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
    scibert_model = AutoModel.from_pretrained(TEXT_MODEL_NAME)
    scibert_model = scibert_model.to("cpu")
    scibert_model = scibert_model.eval()
    return scibert_tokenizer, scibert_model

tokenizer, model = load_scibert()

def embed(text: str):
    """Embed a query string with the module-level tokenizer and model.

    The input is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged over
    dimension 1 to give a single vector.

    Args:
        text: The query to embed.

    Returns:
        A NumPy array holding the pooled embedding, or None when ``text`` is
        None, empty, or contains only whitespace (nothing meaningful to embed).
    """
    # Whitespace-only input would otherwise be embedded as an "empty" sentence
    # and searched as if it were a real query.
    if not text or not text.strip():
        return None
    inps = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        out = model(**inps)
    return out.last_hidden_state.mean(1).squeeze().cpu().numpy()

def to_latex(eq: str):
    """Return ``eq`` trimmed and wrapped in ``$...$`` delimiters.

    A trimmed string that already both starts and ends with ``$`` is returned
    as-is; anything else gets one ``$`` prepended and one appended.
    """
    trimmed = eq.strip()
    already_delimited = trimmed.startswith("$") and trimmed.endswith("$")
    return trimmed if already_delimited else f"${trimmed}$"

# --- UI ---
def _render_result(title, pid, meta, dist):
    """Render one search hit: title/ID heading, abstract, paper link and distance."""
    st.subheader(f"{title}  (ID: {pid})")
    st.write(meta.get("abstract",""))
    st.markdown(f"[View Paper]({meta.get('url','#')})  —  Distance: **{dist:.3f}**")
    st.write("---")


st.title("🔍 Hybrid Search Engine")
st.sidebar.header("Enter *either* a Text Query *or* an Equation Query")

# Strip once, up front, so whitespace-only input counts as empty in every check
# below instead of being embedded and searched as if it were a real query.
text_query      = st.sidebar.text_input("Text Query").strip()
equation_query  = st.sidebar.text_input("Equation Query").strip()

if st.sidebar.button("Search"):
    # Enforce single input
    if text_query and equation_query:
        st.sidebar.error("❗️ Please clear one box before searching.")
        st.stop()
    if not text_query and not equation_query:
        st.sidebar.info("Awaiting input…")
        st.stop()

    # Decide mode & prepare query
    if text_query:
        mode, q = "text", text_query
    else:
        mode, q = "equation", to_latex(equation_query)

    emb = embed(q)
    if emb is None:
        st.error("Could not embed your input.")
        st.stop()
    # FAISS search takes a batch of query vectors, so add a leading batch axis.
    emb = np.expand_dims(emb, 0)

    if mode == "text":
        # 1) BM25 top candidates (positions into the BM25 corpus / text_id_title)
        toks   = word_tokenize(q.lower())
        scores = bm25_model.get_scores(toks)
        top_bm = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:TOP_K_BM25]

        # 2) FAISS re-ranking restricted to the BM25 candidates.
        # NOTE(review): reconstruct(i) assumes row i of text_index is the same
        # paper as BM25 corpus position i — confirm against the index-building cell.
        small_idx = faiss.IndexFlatL2(FAISS_DIM)
        mats = np.vstack([ text_index.reconstruct(i) for i in top_bm ])
        small_idx.add(mats)
        # Never request more neighbours than there are candidates: FAISS pads
        # missing results with label -1.
        dists, idxs = small_idx.search(emb, min(TOP_K_FINAL, len(top_bm)))

        st.header("Top Results (Hybrid BM25 + FAISS)")
        for dist, i in zip(dists[0], idxs[0]):
            if i < 0:
                # -1 means "no neighbour"; top_bm[-1] would silently show the
                # wrong paper.
                continue
            real = top_bm[i]
            pid, title = text_id_title[real]
            meta = extracted_data.get(pid, {})
            _render_result(title, pid, meta, dist)

    else:  # equation
        dists, idxs = eq_index.search(emb, TOP_K_FINAL)
        st.header("Top Results (Equation FAISS)")
        # Drop the -1 padding FAISS returns when the index holds fewer than
        # TOP_K_FINAL vectors; eq_map.iloc[-1] would silently pick the last row.
        hits = [(dist, i) for dist, i in zip(dists[0], idxs[0]) if i >= 0]
        if not hits:
            st.info("No matching equations found.")
        for dist, i in hits:
            pid = eq_map.iloc[i]["id"]
            # normalize 3-digit IDs (e.g. "705.0931" -> "0705.0931")
            if len(pid.split(".")[0]) == 3:
                pid = "0" + pid
            meta = extracted_data.get(pid, {})
            _render_result(meta.get('title','No Title'), pid, meta, dist)
else:
    st.info("Enter a query and press **Search**.")



Writing app.py


In [22]:
# Install the ngrok npm package globally, pinned to version 5.0.0-beta.2.
!npm install -g ngrok@5.0.0-beta.2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K
added 44 packages in 8s
[1G[0K⠋[1G[0K
[1G[0K⠋[1G[0K9 packages are looking for funding
[1G[0K⠋[1G[0K  run `npm fund` for details
[1G[0K⠋[1G[0K[1mnpm[22m [96mnotice[39m
[1mnpm[22m [96mnotice[39m New [31mmajor[39m version of npm available! [31m10.8.2[39m -> [34m11.4.0[39m
[1mnpm[22m [96mnotice[39m Changelog: [34mhttps://github.

In [None]:
# Start the Streamlit app (app.py) on port 8501 in the background (`&`), then run
# an ngrok HTTP tunnel to that same port in the foreground via npx.
# NOTE(review): this cell appears to keep running until interrupted (its
# execution count is unset and ngrok prints "Ctrl+C to quit") — confirm.
!streamlit run app.py --server.port 8501 & npx ngrok http 8501

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1G[0K⠙[1G[0K7[?47h[?1h=[H[2J[m[38;5;6m[48;5;16m[1m[1;1Hngrok[m[38;5;16m[48;5;16m                                                           [m[38;5;7m[48;5;16m(Ctrl+C to quit)[m[38;5;16m[48;5;16m[2;1H                                                                                [m[38;5;6m[48;5;16m[3;1HSession Status                connecting[m[38;5;16m[48;5;16m                                        [m[38;5;7m[48;5;16m[4;1HVersion                       3.22.1[m[38;5;16m[48;5;16m                                            [m[38;5;7m[48;5;16m[5;1HWeb Interface                 http://127.0.0.1:4040[m[38;5;16m[48;5;16m                             [6;1H                                                                                [m[38;5;7m[48;5;16m[7;1HConnections                   ttl     opn     rt1     rt5     p50     p90     [m[38;5;16m[48;5;16m  [m[38;5;7m[48;5;16m[8;1H                              0       0       0.00    0