<a href="https://colab.research.google.com/github/daoyul1/DL-LLMScience-RAG/blob/main/Generate_FAISS_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.2.1
    Uninstalling sentence-transformers-3.2.1:
      Successfully uninstalled sentence-transformers-3.2.1
Successfully installed sentence-transformers-3.3.1


In [2]:
pip install --no-cache-dir faiss-gpu==1.7.2

Collecting faiss-gpu==1.7.2
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m184.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

### Load Model and Libraries

This section uses the `BAAI/bge-small-en-v1.5` model to generate the FAISS index.

In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from datasets import load_from_disk

# Instantiate the bge-small-en-v1.5 sentence encoder on the CUDA device.
BGE_MODEL_NAME = "BAAI/bge-small-en-v1.5"
model = SentenceTransformer(BGE_MODEL_NAME, device="cuda")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Download Preprocessed Wikipedia Dataset from Kaggle

In [4]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub. `path` holds the
# value returned by dataset_download and is later passed to load_from_disk.
KAGGLE_DATASET = "mbanaei/all-paraphs-parsed-expanded"
path = kagglehub.dataset_download(KAGGLE_DATASET)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mbanaei/all-paraphs-parsed-expanded?dataset_version_number=1...


100%|██████████| 524M/524M [00:05<00:00, 108MB/s]

Extracting files...





### Load and Prepare Dataset

In [6]:
# Load the preprocessed Wikipedia dataset from the directory downloaded via kagglehub
dataset = load_from_disk(path)

# Extract text chunks by reading the whole "text" column in one call. Iterating
# the dataset row by row materialises a dict per record, which is much slower at
# this size. list(...) keeps `texts` a plain Python list whatever column
# container the installed `datasets` version returns.
texts = list(dataset["text"])

In [13]:
import torch
# Housekeeping before the embedding cell that follows: ask PyTorch to release
# cached GPU memory it is not currently using.
torch.cuda.empty_cache()  # Clear unused GPU memory

### Generate Embeddings

In [14]:
# Generate embeddings with bge-1.5: encode every text chunk on the GPU in batches of 64.
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, device="cuda")
# FAISS requires C-contiguous float32 input. np.ascontiguousarray only copies when
# the array is not already in that layout/dtype, whereas np.array(...) always
# duplicates the full embedding matrix in memory.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

Batches:   0%|          | 0/32833 [00:00<?, ?it/s]

### Build & Save FAISS Index

In [15]:
# Create a flat L2 index whose dimensionality matches the embedding width,
# then add every embedding vector to it.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Write the populated index to disk and report completion.
faiss.write_index(index, "bge-small-faiss.index")
print("FAISS index created with bge-small-en-v1.5.")

FAISS index created with bge-small-en-v1.5.


### Test the FAISS Index

In [16]:
# Embed a sample question with the same model that produced the indexed vectors.
query = ["What is the theory of relativity?"]
query_embedding = np.array(
    model.encode(query, device="cuda"),
    dtype=np.float32,  # same dtype as the vectors stored in the index
)

# Retrieve the five nearest stored vectors; `indices` holds their positions in the index.
distances, indices = index.search(query_embedding, k=5)
print("Top-5 closest matches:", indices)

Top-5 closest matches: [[1486907 1312782 1573156  916173  907733]]


### Generate FAISS Index Using all-mpnet-base-v2

In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from datasets import load_from_disk

# Load the all-mpnet-base-v2 sentence encoder on the CUDA device.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="cuda")

# Reload the dataset here so this cell only needs `path` (from the Kaggle
# download cell), not `dataset`/`texts` from the bge-small section.
dataset = load_from_disk(path)

# Extract text chunks by reading the whole "text" column in one call instead of
# building a dict per row; list(...) keeps `texts` a plain Python list.
texts = list(dataset["text"])

# Generate embeddings with the new model (GPU, batches of 64).
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, device="cuda")
# FAISS requires C-contiguous float32; this avoids the unconditional full copy
# that np.array(...) would make of the embedding matrix.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Initialize an exact (flat) L2 index sized to the embedding width.
# NOTE(review): the original comment assumed the embeddings are normalized;
# nothing in this cell normalizes them explicitly — confirm against the model config.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Fail before saving if the index does not hold exactly one vector per text chunk.
assert index.ntotal == len(texts), "FAISS index size does not match number of text chunks"

# Save the FAISS index
faiss.write_index(index, "all-mpnet-base-v2-faiss.index")
print("FAISS index created with all-mpnet-base-v2.")

# Inference-like retrieval process: embed a sample query with the same model.
query = ["What is the theory of relativity?"]
query_embedding = model.encode(query, device="cuda")
query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

# Search the FAISS index for the five nearest stored vectors.
distances, indices = index.search(query_embedding, k=5)
print("Top-5 closest matches:", indices)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32833 [00:00<?, ?it/s]

FAISS index created with all-mpnet-base-v2.
Top-5 closest matches: [[1486907  907733  907729  916173 1273101]]
