In [1]:
!pip install sentence-transformers --quiet


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [3]:
print("Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print(f"Model produces {model.get_sentence_embedding_dimension()} dimensional embeddings")

Loading embedding model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model produces 384 dimensional embeddings


In [5]:
text = "The cat sat on the mat"
embedding = model.encode(text)
print(f"Original text {text}")
print(f"Embedding shape: {embedding.shape}")
print(f"First ten values: {embedding[:10]}")

Original text The cat sat on the mat
Embedding shape: (384,)
First ten values: [ 0.13040183 -0.01187012 -0.02811707  0.05123867 -0.05597445  0.03019156
  0.03016133  0.02469836 -0.01837058  0.0587668 ]


In [None]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    vec1_scalar = np.linalg.norm(vec1)
    vec2_scalar = np.linalg.norm(vec2)
    return dot_product / (vec1_scalar * vec2_scalar)

✅ Similarity function ready!


In [8]:
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",      # Similar meaning, different words
    "Dogs are loyal animals",          # Different topic
    "Python is a programming language" # Completely unrelated
]

embeddings = model.encode(sentences)
embeddings.shape

(4, 384)

In [11]:
for i, sentence in enumerate(sentences):
    similarity = cosine_similarity(embeddings[0], embeddings[i])
    print(f"Similarity to {sentence}")
    print(f"Score: {similarity}\n")

Similarity to The cat sat on the mat
Score: 1.0000001192092896

Similarity to A feline rested on the rug
Score: 0.5643377304077148

Similarity to Dogs are loyal animals
Score: 0.16523928940296173

Similarity to Python is a programming language
Score: 0.0308724083006382



### Simple Semantic Search

In [13]:
# Sample knowledge base
documents = [
    "Python is a high-level programming language known for simplicity",
    "Machine learning enables computers to learn from data",
    "Neural networks are inspired by biological brains",
    "Dogs are loyal and friendly pets that need exercise",
    "Cats are independent animals that make great companions",
    "JavaScript is used for web development and runs in browsers",
    "Deep learning uses multi-layered neural networks",
    "Puppies require training and socialization from an early age"
]

print(f"Knowledge base: {len(documents)} documents")

Knowledge base: 8 documents


In [14]:
#Create embeddings
embeddings = model.encode(documents)

print(f"Created {len(embeddings)} embeddings")
print(f"Each embedding has {embeddings[0].shape[0]} dimensions")

Created 8 embeddings
Each embedding has 384 dimensions


In [19]:
def search(query, documents, doc_embeddings, top_k):
    query_embedding = model.encode(query)
    similarities = []
    for index, embedding in enumerate(doc_embeddings):
        similarity = cosine_similarity(embedding, query_embedding)
        similarities.append((documents[index], similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]



In [20]:
# Test different queries
queries = [
    "What is artificial intelligence?",
    "Tell me about pet dogs",
    "How do I code in Python?"
]

for query in queries:
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print(f"{'='*80}")
    
    results = search(query, documents, embeddings, top_k=3)
    
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n{i}. (Score: {score:.3f})")
        print(f"   {doc}")


QUERY: What is artificial intelligence?

1. (Score: 0.408)
   Machine learning enables computers to learn from data

2. (Score: 0.395)
   Neural networks are inspired by biological brains

3. (Score: 0.326)
   Python is a high-level programming language known for simplicity

QUERY: Tell me about pet dogs

1. (Score: 0.548)
   Dogs are loyal and friendly pets that need exercise

2. (Score: 0.437)
   Puppies require training and socialization from an early age

3. (Score: 0.413)
   Cats are independent animals that make great companions

QUERY: How do I code in Python?

1. (Score: 0.554)
   Python is a high-level programming language known for simplicity

2. (Score: 0.148)
   Puppies require training and socialization from an early age

3. (Score: 0.138)
   JavaScript is used for web development and runs in browsers


In [18]:
results

('Deep learning uses multi-layered neural networks', np.float32(0.31909025))

In [21]:
# Load two different models for comparison
print("Loading models...\n")

model_small = SentenceTransformer('all-MiniLM-L6-v2')      # 384 dimensions
model_large = SentenceTransformer('all-mpnet-base-v2')     # 768 dimensions

print("✅ Both models loaded!")
print(f"Small model: {model_small.get_sentence_embedding_dimension()} dimensions")
print(f"Large model: {model_large.get_sentence_embedding_dimension()} dimensions")

Loading models...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Both models loaded!
Small model: 384 dimensions
Large model: 768 dimensions


In [22]:
# Compare on a similarity task
test_pairs = [
    ("The dog is running", "A canine is jogging"),           # Similar
    ("I love pizza", "Pizza is delicious"),                  # Related
    ("Python programming", "Cooking pasta")                  # Unrelated
]

print("Comparing model performance:\n")
for text1, text2 in test_pairs:
    # Small model
    emb1_small = model_small.encode([text1, text2])
    sim_small = cosine_similarity(emb1_small[0], emb1_small[1])
    
    # Large model  
    emb1_large = model_large.encode([text1, text2])
    sim_large = cosine_similarity(emb1_large[0], emb1_large[1])
    
    print(f"Pair: '{text1}' vs '{text2}'")
    print(f"  Small model: {sim_small:.3f}")
    print(f"  Large model: {sim_large:.3f}")
    print()

Comparing model performance:

Pair: 'The dog is running' vs 'A canine is jogging'
  Small model: 0.818
  Large model: 0.827

Pair: 'I love pizza' vs 'Pizza is delicious'
  Small model: 0.801
  Large model: 0.785

Pair: 'Python programming' vs 'Cooking pasta'
  Small model: 0.142
  Large model: 0.120

