In [None]:
!pip install sentence-transformers scikit-learn numpy

# mixedbread-ai/mxbai-embed-large-v1

In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model under test in this cell.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Prefix prepended to every word before it is embedded.
instruction = "Represent this sentence for searching relevant passages: "

# Vocabulary that is ranked against the analogy vector.
words = [
    "king", "man", "woman", "queen", "princess", "prince", "clown",
    "horse", "castle", "person", "banana", "apple", "football", "basketball",
]

# word -> embedding, encoding one prefixed word per call.
embeddings_dict = {}
for term in words:
    embeddings_dict[term] = model.encode(instruction + term)

# Analogy target: king - man + woman.
analogy_vector = embeddings_dict["king"] - embeddings_dict["man"] + embeddings_dict["woman"]

# cosine_similarity works on 2-D inputs, so build the single-row view once.
analogy_row = analogy_vector.reshape(1, -1)

# Score every word (the analogy's own inputs included) under each metric.
dot_product_scores = [
    (term, np.dot(analogy_vector, vector))
    for term, vector in embeddings_dict.items()
]
cosine_similarity_scores = [
    (term, cosine_similarity(analogy_row, vector.reshape(1, -1))[0][0])
    for term, vector in embeddings_dict.items()
]
# Distances are stored negated so that "larger is better" holds for all three
# lists and they can share the same descending sort.
euclidean_distance_scores = [
    (term, -np.linalg.norm(analogy_vector - vector))
    for term, vector in embeddings_dict.items()
]

# Rank each list from best to worst match.
sorted_dot_product_scores = list(dot_product_scores)
sorted_dot_product_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_cosine_similarity_scores = list(cosine_similarity_scores)
sorted_cosine_similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_euclidean_distance_scores = list(euclidean_distance_scores)
sorted_euclidean_distance_scores.sort(key=lambda pair: pair[1], reverse=True)

# Visual divider printed between the three reports.
divider = "\n" + "-" * 80 + "\n"

# Report 1: dot product, highest first.
print("Dot Product:")
for term, value in sorted_dot_product_scores:
    print(f"Dot product between '{term}' and analogy vector: {value}")

print(divider)

# Report 2: cosine similarity, highest first.
print("Cosine Similarity:")
for term, value in sorted_cosine_similarity_scores:
    print(f"Cosine similarity between '{term}' and analogy vector: {value}")

print(divider)

# Report 3: Euclidean distance, closest first.
print("Euclidean Distance (sorted by smallest distance, which indicates highest similarity):")
for term, negated_distance in sorted_euclidean_distance_scores:
    # Flip the sign back so the true (positive) distance is shown.
    print(f"Euclidean distance between '{term}' and analogy vector: {-negated_distance}")

Dot Product:
Dot product between 'king' and analogy vector: 206.02333068847656
Dot product between 'woman' and analogy vector: 179.44287109375
Dot product between 'princess' and analogy vector: 177.8961181640625
Dot product between 'queen' and analogy vector: 177.6486053466797
Dot product between 'castle' and analogy vector: 116.86325073242188
Dot product between 'prince' and analogy vector: 113.52368927001953
Dot product between 'horse' and analogy vector: 113.3372802734375
Dot product between 'person' and analogy vector: 110.6568374633789
Dot product between 'apple' and analogy vector: 107.81037139892578
Dot product between 'banana' and analogy vector: 103.31510925292969
Dot product between 'basketball' and analogy vector: 101.27586364746094
Dot product between 'clown' and analogy vector: 97.28660583496094
Dot product between 'football' and analogy vector: 96.44972229003906
Dot product between 'man' and analogy vector: 47.41835021972656

----------------------------------------------

# BAAI/bge-base-en-v1.5

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model under test in this cell.
model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Prefix prepended to every word before it is embedded.
instruction = "Represent this sentence for searching relevant passages: "

# Vocabulary that is ranked against the analogy vector.
words = [
    "king", "man", "woman", "queen", "princess", "prince", "clown",
    "horse", "castle", "person", "banana", "apple", "football", "basketball",
]

# word -> embedding, encoding one prefixed word per call.
embeddings_dict = {}
for term in words:
    embeddings_dict[term] = model.encode(instruction + term)

# Analogy target: king - man + woman.
analogy_vector = embeddings_dict["king"] - embeddings_dict["man"] + embeddings_dict["woman"]

# cosine_similarity works on 2-D inputs, so build the single-row view once.
analogy_row = analogy_vector.reshape(1, -1)

# Score every word (the analogy's own inputs included) under each metric.
dot_product_scores = [
    (term, np.dot(analogy_vector, vector))
    for term, vector in embeddings_dict.items()
]
cosine_similarity_scores = [
    (term, cosine_similarity(analogy_row, vector.reshape(1, -1))[0][0])
    for term, vector in embeddings_dict.items()
]
# Distances are stored negated so that "larger is better" holds for all three
# lists and they can share the same descending sort.
euclidean_distance_scores = [
    (term, -np.linalg.norm(analogy_vector - vector))
    for term, vector in embeddings_dict.items()
]

# Rank each list from best to worst match.
sorted_dot_product_scores = list(dot_product_scores)
sorted_dot_product_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_cosine_similarity_scores = list(cosine_similarity_scores)
sorted_cosine_similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_euclidean_distance_scores = list(euclidean_distance_scores)
sorted_euclidean_distance_scores.sort(key=lambda pair: pair[1], reverse=True)

# Visual divider printed between the three reports.
divider = "\n" + "-" * 80 + "\n"

# Report 1: dot product, highest first.
print("Dot Product:")
for term, value in sorted_dot_product_scores:
    print(f"Dot product between '{term}' and analogy vector: {value}")

print(divider)

# Report 2: cosine similarity, highest first.
print("Cosine Similarity:")
for term, value in sorted_cosine_similarity_scores:
    print(f"Cosine similarity between '{term}' and analogy vector: {value}")

print(divider)

# Report 3: Euclidean distance, closest first.
print("Euclidean Distance (sorted by smallest distance, which indicates highest similarity):")
for term, negated_distance in sorted_euclidean_distance_scores:
    # Flip the sign back so the true (positive) distance is shown.
    print(f"Euclidean distance between '{term}' and analogy vector: {-negated_distance}")

Dot Product:
Dot product between 'king' and analogy vector: 0.8582403659820557
Dot product between 'woman' and analogy vector: 0.7982355356216431
Dot product between 'queen' and analogy vector: 0.7474974393844604
Dot product between 'princess' and analogy vector: 0.6489111185073853
Dot product between 'castle' and analogy vector: 0.4740004539489746
Dot product between 'person' and analogy vector: 0.4641091525554657
Dot product between 'banana' and analogy vector: 0.4288800358772278
Dot product between 'basketball' and analogy vector: 0.42559313774108887
Dot product between 'apple' and analogy vector: 0.4131776690483093
Dot product between 'clown' and analogy vector: 0.3922470510005951
Dot product between 'horse' and analogy vector: 0.3739059567451477
Dot product between 'prince' and analogy vector: 0.34641796350479126
Dot product between 'football' and analogy vector: 0.296387255191803
Dot product between 'man' and analogy vector: 0.16995564103126526

----------------------------------

# all-MiniLM-L6-v2 (without instruction)

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model under test in this cell.
model = SentenceTransformer("all-MiniLM-L6-v2")

# No prefix in this run: each word is embedded exactly as written.
instruction = ""

# Vocabulary that is ranked against the analogy vector.
words = [
    "king", "man", "woman", "queen", "princess", "prince", "clown",
    "horse", "castle", "person", "banana", "apple", "football", "basketball",
]

# word -> embedding, encoding one word per call.
embeddings_dict = {}
for term in words:
    embeddings_dict[term] = model.encode(instruction + term)

# Analogy target: king - man + woman.
analogy_vector = embeddings_dict["king"] - embeddings_dict["man"] + embeddings_dict["woman"]

# cosine_similarity works on 2-D inputs, so build the single-row view once.
analogy_row = analogy_vector.reshape(1, -1)

# Score every word (the analogy's own inputs included) under each metric.
dot_product_scores = [
    (term, np.dot(analogy_vector, vector))
    for term, vector in embeddings_dict.items()
]
cosine_similarity_scores = [
    (term, cosine_similarity(analogy_row, vector.reshape(1, -1))[0][0])
    for term, vector in embeddings_dict.items()
]
# Distances are stored negated so that "larger is better" holds for all three
# lists and they can share the same descending sort.
euclidean_distance_scores = [
    (term, -np.linalg.norm(analogy_vector - vector))
    for term, vector in embeddings_dict.items()
]

# Rank each list from best to worst match.
sorted_dot_product_scores = list(dot_product_scores)
sorted_dot_product_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_cosine_similarity_scores = list(cosine_similarity_scores)
sorted_cosine_similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_euclidean_distance_scores = list(euclidean_distance_scores)
sorted_euclidean_distance_scores.sort(key=lambda pair: pair[1], reverse=True)

# Visual divider printed between the three reports.
divider = "\n" + "-" * 80 + "\n"

# Report 1: dot product, highest first.
print("Dot Product:")
for term, value in sorted_dot_product_scores:
    print(f"Dot product between '{term}' and analogy vector: {value}")

print(divider)

# Report 2: cosine similarity, highest first.
print("Cosine Similarity:")
for term, value in sorted_cosine_similarity_scores:
    print(f"Cosine similarity between '{term}' and analogy vector: {value}")

print(divider)

# Report 3: Euclidean distance, closest first.
print("Euclidean Distance (sorted by smallest distance, which indicates highest similarity):")
for term, negated_distance in sorted_euclidean_distance_scores:
    # Flip the sign back so the true (positive) distance is shown.
    print(f"Euclidean distance between '{term}' and analogy vector: {-negated_distance}")

Dot Product:
Dot product between 'king' and analogy vector: 0.9423492550849915
Dot product between 'woman' and analogy vector: 0.9383166432380676
Dot product between 'queen' and analogy vector: 0.8659945130348206
Dot product between 'princess' and analogy vector: 0.660201370716095
Dot product between 'prince' and analogy vector: 0.5800817012786865
Dot product between 'person' and analogy vector: 0.5365455150604248
Dot product between 'castle' and analogy vector: 0.5108327865600586
Dot product between 'clown' and analogy vector: 0.4597344994544983
Dot product between 'banana' and analogy vector: 0.45373284816741943
Dot product between 'horse' and analogy vector: 0.3839447498321533
Dot product between 'basketball' and analogy vector: 0.33809804916381836
Dot product between 'apple' and analogy vector: 0.3302672207355499
Dot product between 'football' and analogy vector: 0.2978869080543518
Dot product between 'man' and analogy vector: -0.3526756763458252

----------------------------------

# all-MiniLM-L6-v2 (with instruction)

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model under test in this cell.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prefix prepended to every word before it is embedded.
instruction = "Represent this sentence for searching relevant passages: "

# Vocabulary that is ranked against the analogy vector.
words = [
    "king", "man", "woman", "queen", "princess", "prince", "clown",
    "horse", "castle", "person", "banana", "apple", "football", "basketball",
]

# word -> embedding, encoding one prefixed word per call.
embeddings_dict = {}
for term in words:
    embeddings_dict[term] = model.encode(instruction + term)

# Analogy target: king - man + woman.
analogy_vector = embeddings_dict["king"] - embeddings_dict["man"] + embeddings_dict["woman"]

# cosine_similarity works on 2-D inputs, so build the single-row view once.
analogy_row = analogy_vector.reshape(1, -1)

# Score every word (the analogy's own inputs included) under each metric.
dot_product_scores = [
    (term, np.dot(analogy_vector, vector))
    for term, vector in embeddings_dict.items()
]
cosine_similarity_scores = [
    (term, cosine_similarity(analogy_row, vector.reshape(1, -1))[0][0])
    for term, vector in embeddings_dict.items()
]
# Distances are stored negated so that "larger is better" holds for all three
# lists and they can share the same descending sort.
euclidean_distance_scores = [
    (term, -np.linalg.norm(analogy_vector - vector))
    for term, vector in embeddings_dict.items()
]

# Rank each list from best to worst match.
sorted_dot_product_scores = list(dot_product_scores)
sorted_dot_product_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_cosine_similarity_scores = list(cosine_similarity_scores)
sorted_cosine_similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
sorted_euclidean_distance_scores = list(euclidean_distance_scores)
sorted_euclidean_distance_scores.sort(key=lambda pair: pair[1], reverse=True)

# Visual divider printed between the three reports.
divider = "\n" + "-" * 80 + "\n"

# Report 1: dot product, highest first.
print("Dot Product:")
for term, value in sorted_dot_product_scores:
    print(f"Dot product between '{term}' and analogy vector: {value}")

print(divider)

# Report 2: cosine similarity, highest first.
print("Cosine Similarity:")
for term, value in sorted_cosine_similarity_scores:
    print(f"Cosine similarity between '{term}' and analogy vector: {value}")

print(divider)

# Report 3: Euclidean distance, closest first.
print("Euclidean Distance (sorted by smallest distance, which indicates highest similarity):")
for term, negated_distance in sorted_euclidean_distance_scores:
    # Flip the sign back so the true (positive) distance is shown.
    print(f"Euclidean distance between '{term}' and analogy vector: {-negated_distance}")

Dot Product:
Dot product between 'king' and analogy vector: 0.9228507876396179
Dot product between 'queen' and analogy vector: 0.8901626467704773
Dot product between 'woman' and analogy vector: 0.8280522227287292
Dot product between 'princess' and analogy vector: 0.8019042015075684
Dot product between 'prince' and analogy vector: 0.6872838139533997
Dot product between 'castle' and analogy vector: 0.6797710657119751
Dot product between 'person' and analogy vector: 0.5742331147193909
Dot product between 'horse' and analogy vector: 0.5541369915008545
Dot product between 'banana' and analogy vector: 0.5159429311752319
Dot product between 'clown' and analogy vector: 0.510412871837616
Dot product between 'basketball' and analogy vector: 0.5029580593109131
Dot product between 'apple' and analogy vector: 0.47977399826049805
Dot product between 'football' and analogy vector: 0.47085604071617126
Dot product between 'man' and analogy vector: 0.4366154074668884

-----------------------------------