In [6]:
# Mount Drive so files persist across Colab sessions
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Set the project root to the shared Drive folder
project_root = '/content/drive/MyDrive/AMS 560 PROJECT'

# Create the standard project folders
os.makedirs(project_root + '/data/raw', exist_ok=True)
os.makedirs(project_root + '/data/processed', exist_ok=True)
os.makedirs(project_root + '/data/embeddings', exist_ok=True)
os.makedirs(project_root + '/indices', exist_ok=True)
os.makedirs(project_root + '/scripts', exist_ok=True)

- data/raw/ → original dumps/downloads; untouched.
- data/processed/ → cleaned, deduped, chunked text + metadata (Parquet).
- data/embeddings/ → vectors + mapping (id, domain, maybe chunk_id) stored as .parquet or .npy.
- indices/ → FAISS/Chroma index files (binary); heavy, not tracked in git.
- scripts/ → reusable Python scripts (ETL/embedding/index build/query).

In [None]:
!cd "/content/drive/MyDrive/AMS 560 PROJECT"
!pwd

/content


In [None]:
!pip install pandas numpy pyarrow tqdm sentence-transformers faiss-cpu chromadb

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting chromadb
  Downloading chromadb-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import uuid

# **TESTING WITH SAMPLE DATASET**

Step 1: Sample dataset

In [None]:
import pandas as pd
import uuid

# Minimal sample docs to smoke-test the pipeline end-to-end
sample_docs = [
    "PageRank is an algorithm used by Google Search to rank web pages in their search results.",
    "The mitochondrion is an organelle that produces energy in the form of ATP.",
    "Stack Overflow is a question and answer site for professional and enthusiast programmers."
]

# Create a small dataframe; UUID makes stable unique IDs for each document
df = pd.DataFrame({
    "id": [str(uuid.uuid4()) for _ in sample_docs],
    "domain": ["wikipedia", "pubmed", "stackexchange"],
    "text": sample_docs
})

# Store processed text as columnar parquet (fast IO, typed, splittable)
df.to_parquet(f"{project_root}/data/processed/sample.parquet")
print("Saved sample dataset:", f"{project_root}/data/processed/sample.parquet")

Saved sample dataset: /content/drive/MyDrive/AMS 560 PROJECT/data/processed/sample.parquet


Step 2: Generating Embeddings for That Sample

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a light, fast embedding model; good quality vs speed tradeoff for prototyping
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
texts = df["text"].tolist()

# Create normalized embeddings (cosine similarity)
embeddings = model.encode(texts, normalize_embeddings=True)

np.save(f"{project_root}/data/embeddings/sample_vectors.npy", embeddings)
print("Embeddings saved!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings saved!


Step 3: Building a Mini FAISS Index

In [None]:
import faiss

# Build an ANN index for fast nearest-neighbor search over the vectors
vectors = embeddings.astype("float32")
index = faiss.IndexHNSWFlat(vectors.shape[1], 32)
index.hnsw.efConstruction = 200
index.add(vectors)
faiss.write_index(index, f"{project_root}/indices/sample_hnsw.faiss")

print("FAISS index built and saved!")

FAISS index built and saved!


Step 4: Testing a query for retrieval

In [None]:
# Query: encode the question, search top-k in the index, print retrieved texts
query = "What algorithm ranks pages on Google?"
qvec = model.encode([query], normalize_embeddings=True).astype("float32")
D, I = index.search(qvec, 3)

print("Top retrieved chunks:")
for i in I[0]:
    print("-", df.iloc[i]["text"])

Top retrieved chunks:
- PageRank is an algorithm used by Google Search to rank web pages in their search results.
- Stack Overflow is a question and answer site for professional and enthusiast programmers.
- The mitochondrion is an organelle that produces energy in the form of ATP.


# **PREPROCESSING**

**Step 1: Data collection (Wikipedia - Raw)**

Purpose:
- Stream a manageable slice of English Wikipedia (default: 50k articles)
- Save as Parquet in Drive: data/raw/wikipedia/raw_wiki_50k.parquet
- Keep only fields we need for later steps (id, domain, source_id, title, url, text)

In [None]:
# 2) Create the raw folder for wikipedia
import os
os.makedirs(f"{project_root}/data/raw/wikipedia", exist_ok=True)

# Install dependencies for streaming datasets
!pip -q install datasets wikipedia pyarrow pandas

# Stream English Wikipedia and take a slice
from datasets import load_dataset
import pandas as pd
import uuid

# How many articles to pull
N_DOCS = 5000

# Load a simple Wikipedia dataset (intro/text).
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [None]:
# Collect the first N_DOCS items into a DataFrame with a uniform raw schema
records = []
for i, row in enumerate(ds):
    if i >= N_DOCS:
        break
    title = row.get("title", "")
    text  = row.get("text", "")
    url   = row.get("url", None)  # some versions include url; if None, we keep it as None
    # Standardized raw schema (no cleaning yet)
    records.append({
        "id": str(uuid.uuid4()),         # globally unique id we control
        "domain": "wikipedia",           # keep domain for later merges
        "source_id": title,              # use title as a human-readable key (no strict guarantee)
        "title": title,
        "url": url,
        "published_at": None,            # not available here, keep column for schema consistency
        "text": text                     # raw/unprocessed
    })

raw_df = pd.DataFrame.from_records(records)

In [None]:
# Save to Parquet in data/raw/wikipedia/
raw_path = f"{project_root}/data/raw/wikipedia/raw_wiki_{N_DOCS//1000}k.parquet"
raw_df.to_parquet(raw_path)
print(f"Saved {len(raw_df):,} raw Wikipedia articles to:\n{raw_path}")

Saved 5,000 raw Wikipedia articles to:
/content/drive/MyDrive/AMS 560 PROJECT/data/raw/wikipedia/raw_wiki_5k.parquet


In [None]:
# Quick peek (sanity check)
display(raw_df.head(5)[["domain","source_id","title","url"]])
print("Columns:", list(raw_df.columns))

Unnamed: 0,domain,source_id,title,url
0,wikipedia,Anarchism,Anarchism,https://en.wikipedia.org/wiki/Anarchism
1,wikipedia,Albedo,Albedo,https://en.wikipedia.org/wiki/Albedo
2,wikipedia,A,A,https://en.wikipedia.org/wiki/A
3,wikipedia,Alabama,Alabama,https://en.wikipedia.org/wiki/Alabama
4,wikipedia,Achilles,Achilles,https://en.wikipedia.org/wiki/Achilles


Columns: ['id', 'domain', 'source_id', 'title', 'url', 'published_at', 'text']


**Step 2: Cleaning and Parsing (Wikipedia)**

Purpose:
- Clean raw Wikipedia text by removing HTML, citations, and extra spaces
- Keep only meaningful articles (text length > 100 chars)
- Save cleaned dataset for chunking

Input : data/raw/wikipedia/raw_wiki_50k.parquet

Output: data/processed/wikipedia/wiki_cleaned.parquet

In [None]:
import pandas as pd, re, html

# Define paths
project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
raw_path = f"{project_root}/data/raw/wikipedia/raw_wiki_5k.parquet"
clean_path = f"{project_root}/data/processed/wikipedia/wiki_cleaned.parquet"

import os
os.makedirs(f"{project_root}/data/processed/wikipedia", exist_ok=True)

# Load raw data
df = pd.read_parquet(raw_path)
print(f"Loaded {len(df):,} raw records")

Loaded 5,000 raw records


In [None]:
# cleaning function
def clean_text(txt):
    if not isinstance(txt, str):
        return ""
    # decode HTML entities
    txt = html.unescape(txt)
    # remove brackets/citations like [1], [edit], etc.
    txt = re.sub(r"\[\d+\]|\[edit\]", " ", txt)
    # remove excessive whitespace/newlines
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

In [None]:
# Apply cleaning
df["clean_text"] = df["text"].apply(clean_text)

# Drop original uncleaned column to save space
df = df.drop(columns=["text"])

# Keep only articles with at least 100 characters (avoid stubs)
df = df[df["clean_text"].str.len() > 100]  # keep meaningful docs

# Save cleaned version
df.to_parquet(clean_path)
print(f"Cleaned {len(df):,} Wikipedia records saved to:\n{clean_path}")

# Quick preview of cleaned output
display(df.head(3)[["title","clean_text"]])

Cleaned 4,994 Wikipedia records saved to:
/content/drive/MyDrive/AMS 560 PROJECT/data/processed/wikipedia/wiki_cleaned.parquet


Unnamed: 0,title,clean_text
0,Anarchism,Anarchism is a political philosophy and moveme...
1,Albedo,Albedo (; ) is the fraction of sunlight that i...
2,A,"A, or a, is the first letter and the first vow..."


**Step 3: Chunking (Wikipedia)**

Chunking improves retrieval granularity and recall, and overlapping windows keep context across boundaries.

Goal:
- Split cleaned articles into ~500-token chunks (150-token overlap)
- Keep metadata (id, domain, source_id, title, url)

Notes:
- "tokens" here ≈ words (fast, good enough for pipeline)


In [None]:
import os, uuid
import pandas as pd

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
clean_path   = f"{project_root}/data/processed/wikipedia/wiki_cleaned.parquet"
chunks_path  = f"{project_root}/data/processed/wikipedia/wiki_chunks.parquet"

os.makedirs(f"{project_root}/data/processed/wikipedia", exist_ok=True)

# Load cleaned articles
df = pd.read_parquet(clean_path)

In [None]:
def chunk_text(text, max_len=500, stride=150, min_len=120):
    """Sliding window over words: returns list of chunk strings."""
    words = text.split()
    out = []
    i = 0
    while i < len(words):
        j = min(i + max_len, len(words))
        piece = words[i:j]
        if len(piece) >= min_len:
            out.append(" ".join(piece))
        if j == len(words):  # done
            break
        i += stride
    return out

In [None]:
# Build chunk rows with metadata
rows = []
for _, r in df.iterrows():
    chunks = chunk_text(r["clean_text"], max_len=500, stride=150, min_len=120)
    for cid, ch in enumerate(chunks):
        rows.append({
            "id": r["id"],                 # original doc id
            "domain": r["domain"],
            "source_id": r["source_id"],
            "title": r["title"],
            "url": r["url"],
            "chunk_id": cid,               # chunk index within the doc
            "chunk_text": ch,
            "chunk_tokens": len(ch.split())
        })

chunks_df = pd.DataFrame(rows)

In [None]:
# Simple quality filter: keep 120–800 tokens
chunks_df = chunks_df[(chunks_df["chunk_tokens"] >= 120) & (chunks_df["chunk_tokens"] <= 800)]

# Save chunks
chunks_df.to_parquet(chunks_path)
print(f"Wrote {len(chunks_df):,} chunks to:\n{chunks_path}")

# Peek
display(chunks_df.head(5)[["title","chunk_id","chunk_tokens","chunk_text"]])

Wrote 66,337 chunks to:
/content/drive/MyDrive/AMS 560 PROJECT/data/processed/wikipedia/wiki_chunks.parquet


Unnamed: 0,title,chunk_id,chunk_tokens,chunk_text
0,Anarchism,0,500,Anarchism is a political philosophy and moveme...
1,Anarchism,1,500,had a significant role in workers' struggles f...
2,Anarchism,2,500,The etymological origin of anarchism is from t...
3,Anarchism,3,500,often been used as a synonym for anarchism and...
4,Anarchism,4,500,dichotomies between the two. Some scholars des...


**Step 4: Embedding Generation (Wikipedia)**

Goal:
- Encode each chunk into a 384-dim vector using BGE-small
- Normalize (cosine-ready), batch on GPU if available
- Save: vectors (.npy) + metadata (.parquet)

Inputs : data/processed/wikipedia/wiki_chunks.parquet

Outputs:
- data/embeddings/wikipedia/wiki_vectors.npy
- data/embeddings/wikipedia/wiki_meta.parquet

In [None]:
import os, numpy as np, pandas as pd, torch
from tqdm import trange
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
chunks_path  = f"{project_root}/data/processed/wikipedia/wiki_chunks.parquet"
emb_dir      = f"{project_root}/data/embeddings/wikipedia"
os.makedirs(emb_dir, exist_ok=True)

In [None]:
# 1) Load chunks
df = pd.read_parquet(chunks_path)
texts = df["chunk_text"].tolist()
print(f"Chunks: {len(texts):,}")

# 2) Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=device)
model.max_seq_length = 512

# 3) Encode in batches
BATCH = 256 if device == "cuda" else 64
vecs = []
for i in trange(0, len(texts), BATCH, desc="Encoding"):
    batch = texts[i:i+BATCH]
    v = model.encode(batch, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
    vecs.append(v)
vectors = np.vstack(vecs).astype("float32")

# 4) Save vectors and metadata (ids to map back)
np.save(f"{emb_dir}/wiki_vectors.npy", vectors)
meta = df[["id","domain","source_id","title","url","chunk_id","chunk_tokens"]].reset_index(drop=True)
meta.to_parquet(f"{emb_dir}/wiki_meta.parquet")

print("Saved:")
print(f"- Vectors: {emb_dir}/wiki_vectors.npy  (shape {vectors.shape})")
print(f"- Meta   : {emb_dir}/wiki_meta.parquet")

Chunks: 66,337


Encoding: 100%|██████████| 260/260 [03:11<00:00,  1.35it/s]


Saved:
- Vectors: /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/wikipedia/wiki_vectors.npy  (shape (66337, 384))
- Meta   : /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/wikipedia/wiki_meta.parquet


**Step 5: Build FAISS Index (Wikipedia)**

Goal:
- Create an HNSW index for fast nearest-neighbor search
- Store index on Drive for reuse

Inputs : wiki_vectors.npy

Outputs: indices/wikipedia/wiki_hnsw.faiss

In [None]:
import faiss, os, numpy as np

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
vec_path = f"{project_root}/data/embeddings/wikipedia/wiki_vectors.npy"
index_dir = f"{project_root}/indices/wikipedia"
os.makedirs(index_dir, exist_ok=True)

# 1) Load vectors
vectors = np.load(vec_path).astype("float32")
print(f"Loaded vectors: {vectors.shape}")

# 2) Build FAISS HNSW index (fast, memory-efficient)
dim = vectors.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)   # 32 = neighbors per node
index.hnsw.efConstruction = 200        # build accuracy
index.add(vectors)

# 3) Save index
faiss.write_index(index, f"{index_dir}/wiki_hnsw.faiss")
print(f"Index built & saved to:\n{index_dir}/wiki_hnsw.faiss")

Loaded vectors: (66337, 384)
Index built & saved to:
/content/drive/MyDrive/AMS 560 PROJECT/indices/wikipedia/wiki_hnsw.faiss


**Step 6: Query & Test Retrieval**

 Goal:
- Test the FAISS index with a few natural-language queries
- Retrieve top-k matching chunks to verify relevance

Inputs : wiki_hnsw.faiss, wiki_vectors.npy, wiki_meta.parquet

Output : printed top results

In [None]:
import faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
index_path  = f"{project_root}/indices/wikipedia/wiki_hnsw.faiss"
meta_path   = f"{project_root}/data/embeddings/wikipedia/wiki_meta.parquet"

# 1) Load index and metadata
index = faiss.read_index(index_path)
meta  = pd.read_parquet(meta_path)

print(f"Index: {index.ntotal:,} vectors loaded")
print(f"Meta:  {len(meta):,} entries")

# 2) Load embedding model
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# 3) Encode a test query
query = "What is the theory of relativity?"
qvec = model.encode([query], normalize_embeddings=True).astype("float32")

# 4) Search in FAISS index
k = 5  # top results
D, I = index.search(qvec, k)

# 5) Show retrieved results
print(f"\n Query: {query}\n")
for rank, idx in enumerate(I[0]):
    title = meta.iloc[idx]["title"]
    preview = meta.iloc[idx]["source_id"]
    print(f"{rank+1}. {title}")
    print(meta.iloc[idx].get("url", ""))
    print()

Index: 66,337 vectors loaded
Meta:  66,337 entries

 Query: What is the theory of relativity?

1. Albert Einstein
https://en.wikipedia.org/wiki/Albert%20Einstein

2. Albert Einstein
https://en.wikipedia.org/wiki/Albert%20Einstein

3. Albert Einstein
https://en.wikipedia.org/wiki/Albert%20Einstein

4. Albert Einstein
https://en.wikipedia.org/wiki/Albert%20Einstein

5. Albert Einstein
https://en.wikipedia.org/wiki/Albert%20Einstein



###**PUBMED**

**Step 1: Data Collection (NCBI E-utilities)**

Grabs N_DOCS PubMed abstracts via NCBI Entrez API in batches.

Output: data/raw/pubmed/raw_pubmed_<N>.parquet

In [None]:
!pip -q install biopython pandas pyarrow tqdm
from Bio import Entrez
from tqdm import trange
import pandas as pd, time, uuid, os

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
raw_dir = f"{project_root}/data/raw/pubmed"
os.makedirs(raw_dir, exist_ok=True)

EMAIL = "shrutijms@gmail.com"       # REQUIRED by NCBI
API_KEY = None                  # speeds up
N_DOCS  = 5000
BATCH   = 500

Entrez.email = EMAIL
if API_KEY: Entrez.api_key = API_KEY

# 1) Search PMIDs (e.g., all with abstracts, last 20 yrs)
term = '(hasabstract[text]) AND (2005:2025[pdat])'
search = Entrez.esearch(db="pubmed", term=term, retmax=N_DOCS)
pmids = Entrez.read(search)["IdList"][:N_DOCS]

# 2) Fetch in batches → JSON records
recs = []
for i in trange(0, len(pmids), BATCH, desc="Fetching"):
    chunk = pmids[i:i+BATCH]
    handle = Entrez.efetch(db="pubmed", id=",".join(chunk), rettype="abstract", retmode="xml")
    data = Entrez.read(handle)
    for art in data["PubmedArticle"]:
        pmid = str(art["MedlineCitation"]["PMID"])
        ti   = art["MedlineCitation"]["Article"].get("ArticleTitle","")
        absps= art["MedlineCitation"]["Article"].get("Abstract",{}).get("AbstractText",[])
        abstract = " ".join(str(x) for x in absps)
        date = art["MedlineCitation"]["Article"].get("Journal",{}).get("JournalIssue",{}).get("PubDate",{})
        url  = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        recs.append({
            "id": str(uuid.uuid4()),
            "domain": "pubmed",
            "source_id": pmid,
            "title": ti,
            "url": url,
            "published_at": str(date),
            "text": abstract
        })
    # polite rate limit
    time.sleep(0.35 if API_KEY else 0.5)

df = pd.DataFrame(recs)
out = f"{raw_dir}/raw_pubmed_{len(df)}.parquet"
df.to_parquet(out)
print(f"Saved {len(df):,} abstracts → {out}")
df.head(3)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[?25h

Fetching: 100%|██████████| 10/10 [00:45<00:00,  4.57s/it]


Saved 4,999 abstracts → /content/drive/MyDrive/AMS 560 PROJECT/data/raw/pubmed/raw_pubmed_4999.parquet


Unnamed: 0,id,domain,source_id,title,url,published_at,text
0,6d50053d-6842-452b-82ed-52cc61d9a92d,pubmed,41175411,The role of decompressive craniectomy in the m...,https://pubmed.ncbi.nlm.nih.gov/41175411/,"{'Year': '2025', 'Month': 'Nov', 'Day': '01'}",Cerebral infection syndromes are life-threaten...
1,a0131b02-23be-4975-91ad-1ce34b2fd462,pubmed,41175410,Ear and nose-related intracranial empyema in c...,https://pubmed.ncbi.nlm.nih.gov/41175410/,"{'Year': '2025', 'Month': 'Nov', 'Day': '01'}",Intracranial empyemas (IEs) are rare but sever...
2,9a5d6ec2-2fe5-4bad-8bb1-1d13929b97f2,pubmed,41175409,Use of bilateral occipital horn ventricular la...,https://pubmed.ncbi.nlm.nih.gov/41175409/,"{'Year': '2025', 'Month': 'Nov', 'Day': '01'}",The aim of this study was to describe a bilate...


**Step 2: Cleaning**

In [None]:
import pandas as pd, re, html, os

in_p  = out
stg_dir = f"{project_root}/data/processed/pubmed"
os.makedirs(stg_dir, exist_ok=True)
clean_p = f"{stg_dir}/pubmed_cleaned.parquet"

df = pd.read_parquet(in_p)

def clean_txt(t):
    if not isinstance(t, str): return ""
    t = html.unescape(t)
    t = re.sub(r"\[[0-9,; ]+\]", " ", t)   # citation brackets
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["clean_text"] = df["text"].apply(clean_txt)
df = df.drop(columns=["text"])
df = df[df["clean_text"].str.len() > 100]
df.to_parquet(clean_p)
print(f"Cleaned {len(df):,} → {clean_p}")

Cleaned 4,966 → /content/drive/MyDrive/AMS 560 PROJECT/data/processed/pubmed/pubmed_cleaned.parquet


**Step 3: Chunking**

Input : data/processed/pubmed/pubmed_cleaned.parquet

Output: data/processed/pubmed/pubmed_chunks.parquet

In [None]:
import os, pandas as pd

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
clean_p  = f"{project_root}/data/processed/pubmed/pubmed_cleaned.parquet"
chunks_p = f"{project_root}/data/processed/pubmed/pubmed_chunks.parquet"
os.makedirs(f"{project_root}/data/processed/pubmed", exist_ok=True)

# Load cleaned abstracts
df = pd.read_parquet(clean_p)

def chunk_text(text, max_len=500, stride=150, min_len=120):
    """Word-based sliding window; fast & good enough."""
    words, out, i = text.split(), [], 0
    while i < len(words):
        j = min(i + max_len, len(words))
        piece = words[i:j]
        if len(piece) >= min_len: out.append(" ".join(piece))
        if j == len(words): break
        i += stride
    return out

In [None]:
# Build chunk rows
rows = []
for _, r in df.iterrows():
    for cid, ch in enumerate(chunk_text(r["clean_text"])):
        rows.append({
            "id": r["id"],
            "domain": r["domain"],          # "pubmed"
            "source_id": r["source_id"],    # PMID
            "title": r["title"],
            "url": r["url"],
            "chunk_id": cid,
            "chunk_text": ch,
            "chunk_tokens": len(ch.split())
        })

chunks_df = pd.DataFrame(rows)

In [None]:
# Quality bounds
chunks_df = chunks_df[(chunks_df["chunk_tokens"] >= 120) & (chunks_df["chunk_tokens"] <= 800)]

# Save
chunks_df.to_parquet(chunks_p)
print(f"PubMed chunks: {len(chunks_df):,} → {chunks_p}")
display(chunks_df.head(3)[["title","chunk_id","chunk_tokens","chunk_text"]])

PubMed chunks: 4,583 → /content/drive/MyDrive/AMS 560 PROJECT/data/processed/pubmed/pubmed_chunks.parquet


Unnamed: 0,title,chunk_id,chunk_tokens,chunk_text
0,The role of decompressive craniectomy in the m...,0,330,Cerebral infection syndromes are life-threaten...
1,Ear and nose-related intracranial empyema in c...,0,420,Intracranial empyemas (IEs) are rare but sever...
2,Use of bilateral occipital horn ventricular la...,0,356,The aim of this study was to describe a bilate...


**Step 4: EMBEDDINGS

Input : data/processed/pubmed/pubmed_chunks.parquet

Output: data/embeddings/pubmed/pubmed_vectors.npy + pubmed_meta.parquet**

In [None]:
import os, numpy as np, pandas as pd, torch
from tqdm import trange
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
chunks_p = f"{project_root}/data/processed/pubmed/pubmed_chunks.parquet"
emb_dir  = f"{project_root}/data/embeddings/pubmed"
os.makedirs(emb_dir, exist_ok=True)

# 1) Load chunks
df = pd.read_parquet(chunks_p)
texts = df["chunk_text"].tolist()
print(f"Chunks: {len(texts):,}")

# 2) Model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=device)
model.max_seq_length = 512

# 3) Batch encode (normalized to cosine-ready)
BATCH = 256 if device == "cuda" else 64
vecs = []
for i in trange(0, len(texts), BATCH, desc="Encoding"):
    batch = texts[i:i+BATCH]
    v = model.encode(batch, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
    vecs.append(v)
vectors = np.vstack(vecs).astype("float32")

# 4) Save vectors and meta
np.save(f"{emb_dir}/pubmed_vectors.npy", vectors)
meta = df[["id","domain","source_id","title","url","chunk_id","chunk_tokens"]].reset_index(drop=True)
meta.to_parquet(f"{emb_dir}/pubmed_meta.parquet")

print("Saved")
print(f"- {emb_dir}/pubmed_vectors.npy  shape={vectors.shape}")
print(f"- {emb_dir}/pubmed_meta.parquet")

Chunks: 4,583


Encoding: 100%|██████████| 18/18 [00:11<00:00,  1.58it/s]


Saved
- /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/pubmed/pubmed_vectors.npy  shape=(4583, 384)
- /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/pubmed/pubmed_meta.parquet


**Step 5: Build FAISS Index**

Inputs : pubmed_vectors.npy

Output : indices/pubmed/pubmed_hnsw.faiss

In [None]:
import os, numpy as np, faiss

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
vec_path  = f"{project_root}/data/embeddings/pubmed/pubmed_vectors.npy"
index_dir = f"{project_root}/indices/pubmed"
os.makedirs(index_dir, exist_ok=True)

# Load vectors
xb = np.load(vec_path).astype("float32")
print("Vectors:", xb.shape)  # (N, 384)

# Build HNSW (cosine via inner product on normalized vecs)
dim = xb.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)   # M=32 good default
index.hnsw.efConstruction = 200
index.add(xb)

# Save
faiss.write_index(index, f"{index_dir}/pubmed_hnsw.faiss")
print(f"Saved index → {index_dir}/pubmed_hnsw.faiss")

Vectors: (4583, 384)
Saved index → /content/drive/MyDrive/AMS 560 PROJECT/indices/pubmed/pubmed_hnsw.faiss


**Step 6: Query and Test**

Inputs : indices/pubmed/pubmed_hnsw.faiss, pubmed_meta.parquet

In [None]:
import faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
index_path = f"{project_root}/indices/pubmed/pubmed_hnsw.faiss"
meta_path  = f"{project_root}/data/embeddings/pubmed/pubmed_meta.parquet"

# 1) Load index + metadata
index = faiss.read_index(index_path)
meta  = pd.read_parquet(meta_path)
print(f"Index size: {index.ntotal:,}  |  Meta rows: {len(meta):,}")

# 2) Load same embedding model
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# 3) Encode query
query = "role of IL-6 in inflammation"
qvec  = model.encode([query], normalize_embeddings=True).astype("float32")

# 4) Search top-k
k = 5
D, I = index.search(qvec, k)

# 5) Display hits
print(f"\n Query: {query}\n")
for r, idx in enumerate(I[0], 1):
    row = meta.iloc[idx]
    print(f"{r}. PMID {row['source_id']} — {row['title']}")
    print(row['url'])
    print()


Index size: 4,583  |  Meta rows: 4,583

 Query: role of IL-6 in inflammation

1. PMID 41173373 — Acute injection of IL-6, but not hepcidin, results in hypozincemia but does not inhibit dietary zinc absorption in mice.
https://pubmed.ncbi.nlm.nih.gov/41173373/

2. PMID 41174156 — The involvement of microglia and the CXCL16-CXCR6 axis in the recruitment of CD8<sup>+</sup> T cells to an amyloidogenic mouse brain.
https://pubmed.ncbi.nlm.nih.gov/41174156/

3. PMID 41173155 — M<sup>6</sup>A methylation in tumor immune microenvironment: Multidimensional mechanism and targeted therapy strategies.
https://pubmed.ncbi.nlm.nih.gov/41173155/

4. PMID 41173111 — Insights into transcriptomic changes in blood of a mouse model of LPS-induced peritonitis.
https://pubmed.ncbi.nlm.nih.gov/41173111/

5. PMID 41170308 — Relative expression of pro-inflammatory cytokine genes in Holstein dairy cows naturally affected by <i>Escherichia coli</i> mastitis.
https://pubmed.ncbi.nlm.nih.gov/41170308/



### **STACKEXCHANGE**



**STEP 1: COLLECT (STACKEXCHANGE: Stack Overflow)**

Output: data/raw/stackexchange/raw_so_<N>.parquet

Notes:
- Uses Stack Exchange API (no key needed for small pulls; key speeds up)
- Grabs Q&A with bodies; highest-voted first; paginated

In [None]:
# import os, time, uuid, requests, pandas as pd

# project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
# raw_dir = f"{project_root}/data/raw/stackexchange"
# os.makedirs(raw_dir, exist_ok=True)

# N_DOCS   = 5000          # start smaller; raise after it works
# PAGESIZE = 100
# SITE     = "stackoverflow"
# API_KEY  = "rl_7mSjfTsnfgybSNBckBaoqrpDU"          # for higher quota

# def get_json(url, params, tries=5, sleep=1.0):
#     """GET with JSON-only check + exponential backoff."""
#     for t in range(tries):
#         r = requests.get(url, params=params, timeout=30)
#         ct = r.headers.get("Content-Type","")
#         if r.status_code == 200 and "application/json" in ct:
#             try: return r.json()
#             except Exception: pass
#         time.sleep(sleep * (2**t))
#     return None

In [None]:
# records, page = [], 1
# BASE_Q = "https://api.stackexchange.com/2.3/questions"
# BASE_A = "https://api.stackexchange.com/2.3/answers/"

# while len(records) < N_DOCS:
#     q_params = {
#         "order":"desc","sort":"votes","site":SITE,
#         "pagesize":PAGESIZE,"page":page,
#         "filter":"withbody","key":API_KEY
#     }
#     q_data = get_json(BASE_Q, q_params)
#     if not q_data:
#         print("skip page (bad response)"); page += 1; continue
#     items = q_data.get("items", [])
#     if not items: break

#     for q in items:
#         aid = q.get("accepted_answer_id")
#         if not aid: continue
#         a_params = {"order":"desc","sort":"votes","site":SITE,"filter":"withbody","key":API_KEY}
#         a_data = get_json(BASE_A + str(aid), a_params)
#         if not a_data: continue
#         ans_items = a_data.get("items", [])
#         if not ans_items: continue
#         a = ans_items[0]

#         records.append({
#             "id": str(uuid.uuid4()),
#             "domain": "stackexchange",
#             "source_id": str(q["question_id"]),
#             "title": q.get("title",""),
#             "url": q.get("link",""),
#             "published_at": q.get("creation_date", None),
#             "text": q.get("body","") + "\n\n[ANSWER]\n\n" + a.get("body","")
#         })
#         if len(records) >= N_DOCS: break

#     page += 1
#     time.sleep(1.0)  # polite pacing

In [None]:
import os, time, uuid, requests, pandas as pd

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
raw_dir = f"{project_root}/data/raw/stackexchange"
os.makedirs(raw_dir, exist_ok=True)

N_DOCS   = 5000
PAGESIZE = 100
SITE     = "stackoverflow"
API_KEY  = "rl_7mSjfTsnfgybSNBckBaoqrpDU"

BASE_Q = "https://api.stackexchange.com/2.3/questions"
BASE_A = "https://api.stackexchange.com/2.3/answers/"

def get_json(url, params, tries=5, sleep=1.0):
    for t in range(tries):
        r = requests.get(url, params=params, timeout=30)
        ct = r.headers.get("Content-Type", "")
        if r.status_code == 200 and "application/json" in ct:
            try:
                return r.json()
            except Exception:
                pass
        time.sleep(sleep * (2 ** t))
    return None

records = []
page = 1
bad_streak = 0
MAX_BAD_PAGES = 10   # to avoid infinite loop

while len(records) < N_DOCS:
    q_params = {
        "order": "desc",
        # "sort": "votes",          # this is heavy/top-heavy
        "sort": "creation",         # much healthier paging
        "site": SITE,
        "pagesize": PAGESIZE,
        "page": page,
        "filter": "withbody",
        "key": API_KEY,
    }

    q_data = get_json(BASE_Q, q_params)

    if not q_data:
        print("skip page (bad response)")
        bad_streak += 1
        if bad_streak >= MAX_BAD_PAGES:
            print("too many bad pages, stopping.")
            break
        page += 1
        continue

    # reset streak if we got something valid
    bad_streak = 0

    # handle API 'backoff'
    if "backoff" in q_data:
        wait_s = int(q_data["backoff"])
        print(f"API asked to back off for {wait_s} seconds...")
        time.sleep(wait_s)

    items = q_data.get("items", [])
    if not items:
        print("no items on this page, stopping.")
        break

    for q in items:
        aid = q.get("accepted_answer_id")
        if not aid:
            continue

        a_params = {
            "order": "desc",
            "sort": "votes",
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY,
        }
        a_data = get_json(BASE_A + str(aid), a_params)
        if not a_data:
            continue

        ans_items = a_data.get("items", [])
        if not ans_items:
            continue

        a = ans_items[0]

        records.append({
            "id": str(uuid.uuid4()),
            "domain": "stackexchange",
            "source_id": str(q["question_id"]),
            "title": q.get("title", ""),
            "url": q.get("link", ""),
            "published_at": q.get("creation_date", None),
            "text": q.get("body", "") + "\n\n[ANSWER]\n\n" + a.get("body", "")
        })

        if len(records) >= N_DOCS:
            break

    # stop if API says no more pages
    if not q_data.get("has_more", False):
        print("API reports no more pages.")
        break

    # stop if close to quota
    if q_data.get("quota_remaining", 9999) < 10:
        print("quota almost exhausted, stopping.")
        break

    page += 1
    time.sleep(0.5)   # polite

In [None]:
# convert to DataFrame and save
df = pd.DataFrame(records)
out = f"{raw_dir}/raw_so_{len(df)}.parquet"
df.to_parquet(out)
print(f"Saved {len(df):,} Stack Overflow Q&A → {out}")

# quick preview
df.head(2)

Saved 5,000 Stack Overflow Q&A → /content/drive/MyDrive/AMS 560 PROJECT/data/raw/stackexchange/raw_so_5000.parquet


Unnamed: 0,id,domain,source_id,title,url,published_at,text
0,6135e175-a65f-4f5e-a18a-ea6b18c15981,stackexchange,79806848,Why does the compiler give strncpy &#39;string...,https://stackoverflow.com/questions/79806848/w...,1762036357,"<p>With avr-gcc 14, the compiler gives this wa..."
1,b2771d0d-d3fc-47b2-809d-a6622eec2ca4,stackexchange,79806807,How to iterate over an array instead using for...,https://stackoverflow.com/questions/79806807/h...,1762028878,"<p>I have to iterate over an array, there to g..."


**Step 2: Cleaning**

Input : data/raw/stackexchange/raw_so_<N>.parquet

Output: data/processed/stackexchange/so_cleaned.parquet

In [None]:
import pandas as pd, re, html, os

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
raw_p  = f"{project_root}/data/raw/stackexchange/raw_so_5000.parquet"   # update if different
clean_dir = f"{project_root}/data/processed/stackexchange"
os.makedirs(clean_dir, exist_ok=True)
clean_p = f"{clean_dir}/so_cleaned.parquet"

# Load data
df = pd.read_parquet(raw_p)

def clean_html(text):
    """Strip HTML tags, decode entities, and clean spacing."""
    if not isinstance(text, str): return ""
    text = html.unescape(text)
    text = re.sub(r"<[^>]+>", " ", text)            # remove all HTML tags
    text = re.sub(r"http\S+|www\S+", " ", text)     # remove URLs
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean
df["clean_text"] = df["text"].apply(clean_html)
df = df.drop(columns=["text"])
df = df[df["clean_text"].str.len() > 100]

# Save cleaned
df.to_parquet(clean_p)
print(f"Cleaned {len(df):,} StackExchange records → {clean_p}")
display(df.head(3)[["title","clean_text"]])

Cleaned 5,000 StackExchange records → /content/drive/MyDrive/AMS 560 PROJECT/data/processed/stackexchange/so_cleaned.parquet


Unnamed: 0,title,clean_text
0,Why does the compiler give strncpy &#39;string...,"With avr-gcc 14, the compiler gives this warni..."
1,How to iterate over an array instead using for...,"I have to iterate over an array, there to get ..."
2,Python dictionary key error when using nested ...,I’m trying to count occurrences of items in ne...


**Step 3: Chunking**

Input : data/processed/stackexchange/so_cleaned.parquet

Output: data/processed/stackexchange/so_chunks.parquet

In [None]:
import os, pandas as pd

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
clean_p  = f"{project_root}/data/processed/stackexchange/so_cleaned.parquet"
chunks_p = f"{project_root}/data/processed/stackexchange/so_chunks.parquet"
os.makedirs(f"{project_root}/data/processed/stackexchange", exist_ok=True)

# Load cleaned Q&A
df = pd.read_parquet(clean_p)

def chunk_text(text, max_len=500, stride=150, min_len=120):
    """Fast word-based sliding window chunker."""
    words, out, i = text.split(), [], 0
    while i < len(words):
        j = min(i + max_len, len(words))
        piece = words[i:j]
        if len(piece) >= min_len:
            out.append(" ".join(piece))
        if j == len(words): break
        i += stride
    return out

In [None]:
# Build chunk rows with metadata
rows = []
for _, r in df.iterrows():
    for cid, ch in enumerate(chunk_text(r["clean_text"])):
        rows.append({
            "id": r["id"],
            "domain": r["domain"],        # "stackexchange"
            "source_id": r["source_id"],  # question_id
            "title": r["title"],
            "url": r["url"],
            "chunk_id": cid,
            "chunk_text": ch,
            "chunk_tokens": len(ch.split())
        })

chunks_df = pd.DataFrame(rows)

In [None]:
# Quality bounds
chunks_df = chunks_df[(chunks_df["chunk_tokens"] >= 120) & (chunks_df["chunk_tokens"] <= 800)]

# Save
chunks_df.to_parquet(chunks_p)
print(f"StackExchange chunks: {len(chunks_df):,} → {chunks_p}")
display(chunks_df.head(3)[["title","chunk_id","chunk_tokens","chunk_text"]])

StackExchange chunks: 8,244 → /content/drive/MyDrive/AMS 560 PROJECT/data/processed/stackexchange/so_chunks.parquet


Unnamed: 0,title,chunk_id,chunk_tokens,chunk_text
0,Why does the compiler give strncpy &#39;string...,0,346,"With avr-gcc 14, the compiler gives this warni..."
1,How to iterate over an array instead using for...,0,177,"I have to iterate over an array, there to get ..."
2,Python dictionary key error when using nested ...,0,165,I’m trying to count occurrences of items in ne...


**Step 4: Embeddings**

Input : data/processed/stackexchange/so_chunks.parquet

Output: data/embeddings/stackexchange/so_vectors.npy + so_meta.parquet

In [None]:
import os, numpy as np, pandas as pd, torch
from tqdm import trange
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
chunks_p = f"{project_root}/data/processed/stackexchange/so_chunks.parquet"
emb_dir  = f"{project_root}/data/embeddings/stackexchange"
os.makedirs(emb_dir, exist_ok=True)

# 1) Load chunks
df = pd.read_parquet(chunks_p)
texts = df["chunk_text"].tolist()
print(f"Chunks: {len(texts):,}")

Chunks: 8,244


In [None]:
# 2) Model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=device)
model.max_seq_length = 512

# 3) Batch encode (normalized for cosine/IP search)
BATCH = 256 if device == "cuda" else 64
vecs = []
for i in trange(0, len(texts), BATCH, desc="Encoding"):
    batch = texts[i:i+BATCH]
    v = model.encode(batch, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
    vecs.append(v)
vectors = np.vstack(vecs).astype("float32")

# 4) Save vectors + metadata
np.save(f"{emb_dir}/so_vectors.npy", vectors)
meta = df[["id","domain","source_id","title","url","chunk_id","chunk_tokens"]].reset_index(drop=True)
meta.to_parquet(f"{emb_dir}/so_meta.parquet")

print("Saved")
print(f"- {emb_dir}/so_vectors.npy  shape={vectors.shape}")
print(f"- {emb_dir}/so_meta.parquet")

Encoding: 100%|██████████| 33/33 [00:24<00:00,  1.35it/s]


Saved
- /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/stackexchange/so_vectors.npy  shape=(8244, 384)
- /content/drive/MyDrive/AMS 560 PROJECT/data/embeddings/stackexchange/so_meta.parquet


**Step 5: Build FAISS Index**

Input : data/embeddings/stackexchange/so_vectors.npy

Output: indices/stackexchange/so_hnsw.faiss

In [None]:
import os, numpy as np, faiss

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
vec_path  = f"{project_root}/data/embeddings/stackexchange/so_vectors.npy"
index_dir = f"{project_root}/indices/stackexchange"
os.makedirs(index_dir, exist_ok=True)

# 1) Load vectors
xb = np.load(vec_path).astype("float32")
print("Vectors:", xb.shape)

# 2) Build HNSW index (inner product = cosine similarity)
dim = xb.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)   # 32 = neighbors per node
index.hnsw.efConstruction = 200
index.add(xb)

# 3) Save index
faiss.write_index(index, f"{index_dir}/so_hnsw.faiss")
print(f"Index saved → {index_dir}/so_hnsw.faiss")

Vectors: (8244, 384)
Index saved → /content/drive/MyDrive/AMS 560 PROJECT/indices/stackexchange/so_hnsw.faiss


**Step 6: Query and Testing**

In [None]:
# Test the FAISS index with a sample query
import faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"
index_path = f"{project_root}/indices/stackexchange/so_hnsw.faiss"
meta_path  = f"{project_root}/data/embeddings/stackexchange/so_meta.parquet"

# Load index + metadata
index = faiss.read_index(index_path)
meta  = pd.read_parquet(meta_path)
print(f"Index: {index.ntotal:,}  |  Meta: {len(meta):,}")

# Same embedding model
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Encode query and search top-k
query = "How to speed up a Python loop?"
qvec  = model.encode([query], normalize_embeddings=True).astype("float32")
D, I  = index.search(qvec, 5)

# Show hits
print(f"\n {query}\n")
for r, idx in enumerate(I[0], 1):
    row = meta.iloc[idx]
    print(f"{r}. {row['title']}")
    print(row['url'], "\n")

Index: 8,244  |  Meta: 8,244

 How to speed up a Python loop?

1. Poker all existing flops generation using Python
https://stackoverflow.com/questions/79781979/poker-all-existing-flops-generation-using-python 

2. Cartesian product for both keys and values of a dictionary?
https://stackoverflow.com/questions/79727981/cartesian-product-for-both-keys-and-values-of-a-dictionary 

3. Improve implementation of a &quot;widget tree structure&quot; in Qt6
https://stackoverflow.com/questions/79736543/improve-implementation-of-a-widget-tree-structure-in-qt6 

4. Sequential compilation times of a jax-jitted recursive function
https://stackoverflow.com/questions/79769647/sequential-compilation-times-of-a-jax-jitted-recursive-function 

5. Why my code is becoming slower over long loop?
https://stackoverflow.com/questions/79745188/why-my-code-is-becoming-slower-over-long-loop 



# **MULTI DOMAIN MERGE**

Output:
- data/embeddings/multi_default/multi_meta.parquet
- data/embeddings/multi_default/multi_vectors.npy
- indices/multi_default/multi_hnsw.faiss

In [None]:
import os, numpy as np, pandas as pd, faiss

project_root = "/content/drive/MyDrive/AMS 560 PROJECT"

# 1) Paths
w_meta = f"{project_root}/data/embeddings/wikipedia/wiki_meta.parquet"
w_vecs = f"{project_root}/data/embeddings/wikipedia/wiki_vectors.npy"
p_meta = f"{project_root}/data/embeddings/pubmed/pubmed_meta.parquet"
p_vecs = f"{project_root}/data/embeddings/pubmed/pubmed_vectors.npy"
s_meta = f"{project_root}/data/embeddings/stackexchange/so_meta.parquet"
s_vecs = f"{project_root}/data/embeddings/stackexchange/so_vectors.npy"

out_dir = f"{project_root}/data/embeddings/multi_default"
os.makedirs(out_dir, exist_ok=True)

# 2) Load metas & vectors
m_w, v_w = pd.read_parquet(w_meta), np.load(w_vecs)
m_p, v_p = pd.read_parquet(p_meta), np.load(p_vecs)
m_s, v_s = pd.read_parquet(s_meta), np.load(s_vecs)

# 3) Concatenate (keeping domain column for filtering later)
multi_meta   = pd.concat([m_w, m_p, m_s], ignore_index=True)
multi_vectors= np.vstack([v_w, v_p, v_s]).astype("float32")

# 4) Shuffle to mix domains
perm = np.random.permutation(len(multi_meta))
multi_meta    = multi_meta.iloc[perm].reset_index(drop=True)
multi_vectors = multi_vectors[perm]

# 5) Save merged artifacts
multi_meta.to_parquet(f"{out_dir}/multi_meta.parquet")
np.save(f"{out_dir}/multi_vectors.npy", multi_vectors)
print("Merged:", multi_meta.shape, multi_vectors.shape)

# 6) Build FAISS index (HNSW)
index_dir = f"{project_root}/indices/multi_default"
os.makedirs(index_dir, exist_ok=True)
dim = multi_vectors.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)
index.hnsw.efConstruction = 200
index.add(multi_vectors)
faiss.write_index(index, f"{index_dir}/multi_hnsw.faiss")
print(f"Multi index saved → {index_dir}/multi_hnsw.faiss")

Merged: (79164, 7) (79164, 384)
Multi index saved → /content/drive/MyDrive/AMS 560 PROJECT/indices/multi_default/multi_hnsw.faiss


In [3]:
!pwd

/content


In [4]:
!find /content/drive -name "$(basename *.ipynb)"


find: ‘/content/drive’: No such file or directory


In [7]:
!ls /content/drive/MyDrive

 116664310_project_topic.gdoc
 515_ppt.pdf
'515_proj.ipynb - Colab.pdf'
 560_assignment_screenshots.gdoc
 560_Paper_Summary_Group6.gdoc
'560 QUIZ 3'
'all_marksheets (1).pdf'
'all_marksheets (2).pdf'
'all_marksheets (3).pdf'
'all_marksheets (4).pdf'
 all_marksheets.pdf
'AMS 560 PROJECT_old'
'AMS 560 PROJECT VIZ'
 ASSIGNMENT_2.gdoc
 Chapin-H-1117A-2_Shruti_Jagdale_116664310.mp4
 chapter_5_3.gdoc
'chapter 6.gdoc'
'Colab Notebooks'
'Course eval.gdoc'
 current_build.drawio
'Deep Learning On Binance Order Book Updates.zip'
'Deeter interview prep.gdoc'
 DRAW.IO
 Extra_Credit_Assignment.gdoc
'Green and Grey Modern Analysis of Results Presentation.gslides'
'HCI PROJECT'
'In class assignment.gdoc'
'indus ppt.gdoc'
 internship_application
'interview prep.gdoc'
'Introduction to PSG (1).ipynb'
 jobs.gdoc
 jobs_top_companies.gsheet
 job_tracker_9sept.gsheet
 leetcode_progress.gsheet
 LLM_Security_Tool_Capabilities.gdoc
'LLMs prep.gdoc'
 model_comparisons.gsheet
'NEW BIOTECH PROJECT'
'new_build_ (1).

In [8]:
!find /content/drive/MyDrive -name "*.ipynb"

/content/drive/MyDrive/Introduction to PSG (1).ipynb
/content/drive/MyDrive/Colab Notebooks/scaling_rag_pipeline (2).ipynb
/content/drive/MyDrive/Colab Notebooks/Untitled0.ipynb
/content/drive/MyDrive/Colab Notebooks/scaling_rag_pipeline.ipynb
/content/drive/MyDrive/Colab Notebooks/Copy of Retrieval.ipynb
/content/drive/MyDrive/Colab Notebooks/Copy of Step2_Retrieval.ipynb
/content/drive/MyDrive/HCI PROJECT/SoniDash.ipynb
/content/drive/MyDrive/AMS 560 PROJECT_old/Copy of Step2_Retrieval.ipynb
/content/drive/MyDrive/AMS 560 PROJECT_old/Copy_of_Step2_Retrieval.ipynb
/content/drive/MyDrive/AMS 560 PROJECT_old/Copy of Preprocessing.ipynb
/content/drive/MyDrive/AMS 560 PROJECT VIZ/Visualizations.ipynb


In [9]:
!cp "/content/drive/MyDrive/AMS 560 PROJECT_old/Copy of Preprocessing.ipynb" "/content/Scaling-Retrieval-System/"

cp: cannot create regular file '/content/Scaling-Retrieval-System/': Not a directory
