# Vector Search 101 - Hands On
This notebook demonstrates embedding generation, similarity calculation, and vector indexing with FAISS.

In [2]:
# Install required packages
!pip install -q sentence-transformers faiss-cpu

In [3]:
# Imports
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Step 1: Define your document corpus

In [25]:
# Toy corpus: nine short sentences on unrelated topics.
# Note the two "Python" sentences (programming language vs. snake): they share
# a keyword but not a meaning.
# The list order matters: later cells look documents up by their position.
documents = [
    "Cats are small, carnivorous mammals that are often kept as pets.",
    "Dogs are loyal animals and are known for their companionship.",
    "The sun is the star at the center of the solar system.",
    "Artificial Intelligence is a branch of computer science focused on building smart machines.",
    "Python is a popular programming language known for its simplicity.",
    "Python is a breed of snakes",
    "SpaceX was founded by Elon Musk to revolutionize space transportation.",
    "Machine Learning is a subset of AI that allows systems to learn from data.",
    "The Great Wall of China is one of the largest man-made structures in the world."
]

In [26]:
# Check size of documents (sanity check: the corpus defined above has nine entries)
print(len(documents))

9


# Step 2: Load a pre-trained sentence transformer model

In [27]:
# Load a pre-trained sentence-embedding model by name.
# Browse other available models at:
# https://huggingface.co/sentence-transformers?sort_models=likes#models
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Create embeddings for documents

In [28]:
# Encode every document into a dense vector. convert_to_numpy=True makes the
# result a single NumPy array with one row per document.
doc_embeddings = model.encode(documents, convert_to_numpy=True)

### Check embeddings

In [29]:
# Display the raw embedding matrix (NumPy abbreviates the middle rows and columns)
doc_embeddings

array([[ 0.10721366,  0.05871374, -0.00794898, ...,  0.18576948,
         0.12180948,  0.05453058],
       [-0.03625739, -0.03113479,  0.06340972, ...,  0.11209203,
         0.119016  , -0.0097626 ],
       [ 0.00140531,  0.08775607, -0.01453394, ...,  0.03924378,
        -0.01461147,  0.07231349],
       ...,
       [ 0.00848975,  0.01577356,  0.08695587, ...,  0.00490149,
         0.02572214,  0.04021992],
       [-0.02949186,  0.00848798,  0.03031458, ...,  0.0846801 ,
         0.04997958, -0.05509678],
       [ 0.02989549,  0.06764077, -0.00119817, ..., -0.05169354,
         0.01318319,  0.05810451]], dtype=float32)

In [30]:
# Confirm the embeddings come back as a NumPy array
type(doc_embeddings)

numpy.ndarray

In [31]:
# Embeddings shape for your documents: (number of documents, embedding dimension)
doc_embeddings.shape

(9, 384)

### Check Similarity using Cosine Similarity

In [32]:
# Calculate cosine similarity between every pair of document embeddings.
# With a single argument the rows are compared against each other, giving a
# square matrix whose entry [i][j] is the similarity of document i and document j.
cosine_similarity_mat = cosine_similarity(doc_embeddings)

In [33]:
# Show the similarity matrix as a table, labelling rows and columns with the
# document text so each cell reads as "row document vs. column document".
pd.DataFrame(cosine_similarity_mat, index=documents, columns=documents)

Unnamed: 0,"Cats are small, carnivorous mammals that are often kept as pets.",Dogs are loyal animals and are known for their companionship.,The sun is the star at the center of the solar system.,Artificial Intelligence is a branch of computer science focused on building smart machines.,Python is a popular programming language known for its simplicity.,Python is a breed of snakes,SpaceX was founded by Elon Musk to revolutionize space transportation.,Machine Learning is a subset of AI that allows systems to learn from data.,The Great Wall of China is one of the largest man-made structures in the world.
"Cats are small, carnivorous mammals that are often kept as pets.",1.0,0.443965,0.10659,-0.033844,0.152346,0.245133,-0.014112,0.024458,0.03427
Dogs are loyal animals and are known for their companionship.,0.443965,1.0,0.101228,0.070075,0.167551,0.226379,-0.046525,0.156391,-0.018565
The sun is the star at the center of the solar system.,0.10659,0.101228,1.0,0.094035,0.041704,-0.049736,0.209648,0.124073,0.029088
Artificial Intelligence is a branch of computer science focused on building smart machines.,-0.033844,0.070075,0.094035,1.0,0.251918,0.077701,0.140664,0.628817,0.184527
Python is a popular programming language known for its simplicity.,0.152346,0.167551,0.041704,0.251918,1.0,0.577643,0.136825,0.217712,0.092387
Python is a breed of snakes,0.245133,0.226379,-0.049736,0.077701,0.577643,1.0,0.041895,0.030214,0.090764
SpaceX was founded by Elon Musk to revolutionize space transportation.,-0.014112,-0.046525,0.209648,0.140664,0.136825,0.041895,1.0,0.055413,0.11692
Machine Learning is a subset of AI that allows systems to learn from data.,0.024458,0.156391,0.124073,0.628817,0.217712,0.030214,0.055413,1.0,0.006263
The Great Wall of China is one of the largest man-made structures in the world.,0.03427,-0.018565,0.029088,0.184527,0.092387,0.090764,0.11692,0.006263,1.0


# Step 4: Create a FAISS index and add document vectors

In [34]:
# The index must be created with the dimensionality of the vectors it will hold.
dimension = doc_embeddings.shape[1]

# IndexFlatL2 is for Euclidean distance metric
vector_index = faiss.IndexFlatL2(dimension)

# Vectors are added in corpus order, so position i in the index corresponds to
# documents[i]; the result lookup in the display step relies on that mapping.
vector_index.add(doc_embeddings)

# Fail fast if the index and the corpus ever get out of step (e.g. after
# editing `documents` without re-encoding), instead of returning wrong texts.
assert vector_index.ntotal == len(documents), "index size does not match corpus size"

# Step 5: Define user query and embed it

In [38]:
# Alternative queries to try (swap one in for the active line below):
# query = "Tell me about Programming Languages."
# query = "Tell me about Python."
query = "Tell me about Donald Trump."

# Embed the query with the same model used for the documents so both live in
# the same vector space. The query is wrapped in a list, so the result is a
# 2-D array with a single row.
query_embed = model.encode([query], convert_to_numpy=True)

# Step 6: Perform similarity search

In [39]:
# Number of nearest neighbours to retrieve
top_k = 3

# Returns two parallel arrays with one row per query vector: the distances to
# the top_k nearest stored vectors, and the positions of those vectors in the index.
distances, indices = vector_index.search(query_embed, top_k)

# Step 7: Display results

In [40]:
print(f"Query - {query}")
print("\nTop Results:")
# The index uses the Euclidean (L2) metric, so the value printed after "Scores"
# is a distance: lower means closer to the query.
for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    # FAISS pads the result with index -1 when it finds fewer than top_k
    # neighbours; documents[-1] would silently print the last document as if
    # it were a hit, so skip those placeholder entries.
    if idx < 0:
        continue
    print(f"{rank}. {documents[idx]} | Scores: {dist:.4f}")

Query - Tell me about Donald Trump.

Top Results:
1. The Great Wall of China is one of the largest man-made structures in the world. | Scores: 1.7032
2. SpaceX was founded by Elon Musk to revolutionize space transportation. | Scores: 1.7071
3. Artificial Intelligence is a branch of computer science focused on building smart machines. | Scores: 1.7746


# Limitations and further use cases



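*   Vector search struggles when the corpus contains no relevant documents at all: it still returns the nearest neighbours, however far away they are (as with the query above).
*   The returned items may not all be relevant, so it is better to keep top_k small.
*   Use case: Retrieval-Augmented Generation (RAG) is a very popular application that uses vector search internally.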

