In [1]:
# =============================
# STEP 0: Install dependencies
# =============================
!pip install -q sentence-transformers faiss-cpu tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# prompt: connect with google drive

# =============================
# STEP 0b: Connect with Google Drive
# =============================
# Colab-only: google.colab is not importable outside a Colab runtime.
from google.colab import drive
# Expose the user's Drive under /content/drive so the dataset and the
# generated artifacts live on persistent storage.
drive.mount('/content/drive')

# Navigate to the appropriate directory if needed
# All relative paths used later in the notebook ("./...") resolve against this folder.
%cd /content/drive/My Drive/Colab Notebooks/rag


Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/rag


In [3]:
# prompt: pwd

# Sanity check: print the current working directory so the reader can confirm
# where the relative paths used in the following cells will resolve.
!pwd

/content/drive/My Drive/Colab Notebooks/rag


In [7]:
# =============================
# STEP 1: Setup paths & config
# =============================
import os

# --- Input ---
# arXiv metadata snapshot, resolved relative to the current working directory.
# Place your file here (adjust the path for your Colab layout).
JSON_PATH = "./arxiv-metadata-oai-snapshot.json"

# --- Embedding ---
# Model name handed to SentenceTransformer in the embedding step.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Upper bound on the number of abstracts to load; set to None to load the
# entire dataset.
MAX_DOCS = 1_000_000

# --- Output ---
# Directory that receives the embeddings, metadata and FAISS index.
SAVE_DIR = "./embeddings/all"

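# Optional: Download small sample if needed
# NOTE: the command below writes to /content/, whereas JSON_PATH above is
# relative to the current working directory; after downloading, either move
# the file next to this notebook or point JSON_PATH at the downloaded location.
# !wget https://huggingface.co/datasets/PaulMougel/arxiv-oa/resolve/main/arxiv-metadata-oai-snapshot.json -O /content/arxiv-metadata-oai-snapshot.json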

In [8]:
# =============================
# STEP 2: Load JSON abstracts
# =============================
import json
from tqdm import tqdm

# Parallel lists: abstracts[i] is the text that gets embedded and
# paper_meta[i] is the metadata for that same paper, so row i of the
# embedding matrix can later be mapped back to its paper.
abstracts = []
paper_meta = []

# The file is read as JSON Lines (one record per line). The encoding is given
# explicitly so decoding does not depend on the runtime's locale default.
# NOTE(review): assumes the snapshot is UTF-8 encoded -- confirm for your copy.
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Loading abstracts"):
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of crashing in json.loads.
        if not line.strip():
            continue
        record = json.loads(line)

        # Normalise first, then test: an abstract made only of whitespace
        # would otherwise pass the check and be embedded as an empty string.
        clean_abstract = (record.get("abstract") or "").strip().replace('\n', ' ')
        if not clean_abstract:
            continue

        abstracts.append(clean_abstract)
        # Direct indexing is intentional: a record missing one of these
        # fields raises KeyError rather than silently storing None.
        paper_meta.append({
            "id": record["id"],
            "title": record["title"],
            "authors": record["authors"],
            "categories": record["categories"],
            "abstract": clean_abstract,
        })

        # A falsy MAX_DOCS (None or 0) disables the cap.
        if MAX_DOCS and len(abstracts) >= MAX_DOCS:
            break

print(f"✅ Loaded {len(abstracts)} abstracts.")

Loading abstracts: 999999it [00:33, 30302.78it/s]

✅ Loaded 1000000 abstracts.





In [9]:
# =============================
# STEP 3: Generate embeddings
# =============================
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model selected in the config cell.
model = SentenceTransformer(EMBEDDING_MODEL)

# Encode all abstracts in batches of 64, showing a progress bar.
# Row i of the result corresponds to abstracts[i].
embeddings = model.encode(
    abstracts,
    batch_size=64,
    show_progress_bar=True,
)

print(f"✅ Embeddings shape: {embeddings.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/15625 [00:00<?, ?it/s]

✅ Embeddings shape: (1000000, 384)


In [10]:
# =============================
# STEP 4: Save artifacts
# =============================
import numpy as np
import os

# Make sure the output directory exists; a no-op when it is already there.
os.makedirs(SAVE_DIR, exist_ok=True)

# Output locations. Despite its file name, abstracts.npy stores the
# embedding matrix, not the abstract texts.
embeddings_path = f"{SAVE_DIR}/abstracts.npy"
meta_path = f"{SAVE_DIR}/meta.json"

# Persist the embedding matrix in NumPy's binary format.
np.save(embeddings_path, embeddings)

# Persist the per-paper metadata list as a single JSON document.
with open(meta_path, "w") as meta_file:
    json.dump(paper_meta, meta_file)


In [11]:
# =============================
# STEP 5: Build FAISS index
# =============================
import faiss
import numpy as np

# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dimension).
# Coerce defensively so the cell also works if `embeddings` arrives in another
# dtype or memory layout; no copy is made when the array already conforms.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

# IndexFlatL2 stores the raw vectors and performs exact (brute-force) L2
# search, so it needs no training step before vectors are added.
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Persist the index next to the other artifacts in SAVE_DIR.
faiss.write_index(index, f"{SAVE_DIR}/faiss_index.index")

# Report the count held by the index itself rather than the input length, so
# the message reflects what was actually stored.
print(f"✅ FAISS index built with {index.ntotal} entries.")

✅ FAISS index built with 1000000 entries.
