# Embedding

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

texts = [
    "How to enable cruise control in a car?",
    "How to turn on the speed keeping system?",
    "What is the capital of Poland?",
]

# Sentence-embedding model: turns each text into a numeric vector.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every sentence; `emb` holds one embedding per entry of `texts`.
emb = model.encode(texts)

# Cosine similarity compares the *direction* of two vectors and ignores
# their magnitude:
#
#     cos(φ) = (A · B) / (||A|| * ||B||)
#
#     A · B            dot product of A and B
#     ||A||, ||B||     norms (lengths) of the vectors
#
# How to read the score:
#     ~ 1.0   vectors point the same way     → very similar meaning
#     ~ 0.0   vectors are nearly orthogonal  → little or no semantic similarity
#     < 0.0   vectors point in opposing directions (rare for sentence
#             embeddings)                    → opposite meaning

# Use the first sentence as the query. `emb[:1]` keeps it as a one-row
# 2-D input, so the result has a single row; `[0]` extracts that row,
# giving one score per sentence (the query itself included).
sim = cosine_similarity(emb[:1], emb)[0]

# Report each sentence next to its similarity to the query.
for idx, (score, sentence) in enumerate(zip(sim, texts)):
    print(f"{idx}: {score:.3f} | {sentence}")


0: 1.000 | How to enable cruise control in a car?
1: 0.522 | How to turn on the speed keeping system?
2: 0.055 | What is the capital of Poland?


In [None]:
# Inspect the embedding matrix shape: rows correspond to the input texts,
# columns to the components of each embedding vector.
n_texts, n_dims = emb.shape
print(f"Embedding matrics dim: {emb.shape}")
print(f"Number of texts: {n_texts}")
print(f"Embedding dimension: {n_dims}")

Embedding matrics dim: (3, 384)
Number of texts: 3
Embedding dimension: 384
