In [1]:
from langchain.embeddings import BedrockEmbeddings
from numpy import dot
from numpy.linalg import norm


In [3]:
#create an Amazon Titan Embeddings client
# This module-level `client` is the object EmbedItem.__init__ calls
# `embed_query` on, so this cell has to run before any EmbedItem is built.
# NOTE(review): constructed with no arguments, so the model, region and
# credentials all come from library/environment defaults - confirm these
# resolve to the intended Titan embeddings model.
client = BedrockEmbeddings()

In [22]:
#Define the classes to store embeddings and the comparison results.
class EmbedItem:
    """A piece of text together with the embedding computed for it.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, text):
        """Embed ``text`` with the module-level ``client`` and echo the result.

        Args:
            text: The string to embed; stored unchanged on ``self.text``.
        """
        # the original text, kept so results can be labelled later
        self.text = text
        # whatever client.embed_query returns for this text
        vector = client.embed_query(text)
        self.embedding = vector
        # echo the text and its full embedding as each item is created
        print(f"{text}:", vector)

class ComparisonResult:
    """Pairs a text with the similarity score computed for it."""

    def __init__(self, text, similarity):
        """Store the compared text and its score.

        Args:
            text: The text that was compared against.
            similarity: The similarity score obtained for that text.
        """
        self.text, self.similarity = text, similarity

In [23]:
#Define the function to compare the similarity of two vectors.
def calculate_similarity(a, b):
    """Return the cosine similarity of two vectors.

    See Cosine Similarity: https://en.wikipedia.org/wiki/Cosine_similarity

    Args:
        a: First vector (any sequence accepted by ``numpy.dot`` / ``norm``).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        magnitude the cosine is undefined; ``0.0`` is returned in that case
        instead of NaN so that callers can still sort results by score.
    """
    denominator = norm(a) * norm(b)
    # A zero-magnitude vector would make this a 0/0 division, which yields NaN
    # (plus a NumPy RuntimeWarning) and NaN keys make sort order unreliable.
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator


In [24]:
#Build a list of embeddings from the items.txt file.
# The file is decoded explicitly as UTF-8: it holds multilingual text (for
# example Japanese, Korean and Chinese), and relying on the platform default
# encoding can fail or garble characters where that default is not UTF-8.
with open("items.txt", "r", encoding="utf-8") as f:
    text_items = f.read().splitlines()

# One EmbedItem per non-blank line. Constructing an EmbedItem calls the
# embeddings client, so blank lines (which carry no text to embed) are skipped.
items = [EmbedItem(text) for text in text_items if text.strip()]

Felines, canines, and rodents: [0.26953125, -0.0703125, 0.375, 0.20410156, 0.036376953, 0.31054688, 0.0859375, -0.00092315674, 0.015380859, -0.515625, 0.41601562, -0.13378906, -0.14453125, -0.022460938, 0.1015625, -0.09765625, 0.671875, 0.02734375, -0.10546875, 0.68359375, -0.26757812, -0.35742188, -0.12060547, -0.02319336, -0.06347656, 0.79296875, 0.03930664, -0.23339844, 0.39453125, -0.106933594, 0.82421875, -0.671875, 0.06347656, -1.0859375, 0.4140625, -0.49804688, 0.33203125, 0.13476562, 0.75390625, 0.003112793, -0.31835938, 0.6953125, 1.109375, -0.29492188, -0.890625, 0.30273438, -0.37109375, 0.4609375, 0.28710938, -0.20507812, 0.33398438, 0.47460938, -0.22265625, -0.8359375, -0.7265625, -0.018676758, -0.22167969, 0.14941406, -0.57421875, -0.3671875, 0.26757812, 0.35546875, 0.62890625, -0.6171875, -0.096191406, 0.30859375, 0.62890625, -0.12109375, -1.078125, 0.06591797, -0.54296875, -0.74609375, 0.640625, 0.7265625, 1.1171875, -0.0234375, -0.01574707, 0.15332031, -0.16113281, 0.28

In [18]:
#Rank every text against all texts (itself included) by cosine similarity and print each ranking.

for anchor in items:
    print(f"Closest matches for '{anchor.text}'")
    print ("----------------")

    # score the anchor against every item and keep the results, closest matches first
    cosine_comparisons = sorted(
        [
            ComparisonResult(other.text, calculate_similarity(anchor.embedding, other.embedding))
            for other in items
        ],
        key=lambda result: result.similarity,
        reverse=True,
    )

    for match in cosine_comparisons:
        print("%.6f" % match.similarity, "\t", match.text)

    print()


Closest matches for 'Felines, canines, and rodents'
----------------
1.000000 	 Felines, canines, and rodents
0.872856 	 Cats, dogs, and mice
0.599730 	 Chats, chiens et souris
0.518696 	 고양이, 개, 생쥐
0.516598 	 Lions, tigers, and bears
0.488377 	 ネコ科、イヌ科、げっ歯類
0.484819 	 猫科动物、犬科动物和啮齿动物
0.457718 	 猫、狗和老鼠
0.455923 	 猫、犬、ネズミ
0.452451 	 고양이과, 개과, 설치류
0.068916 	 パン屋への道順を知りたい
0.061314 	 パン屋への行き方を教えてください
0.002239 	 Can you please tell me how to get to the stadium?
-0.003159 	 Kannst du mir bitte sagen, wie ich zur Bäckerei komme?
-0.007595 	 Can you please tell me how to get to the bakery?
-0.019469 	 Pouvez-vous s'il vous plaît me dire comment me rendre à la boulangerie?
-0.020840 	 I need directions to the bread shop

Closest matches for 'Can you please tell me how to get to the bakery?'
----------------
1.000000 	 Can you please tell me how to get to the bakery?
0.712236 	 I need directions to the bread shop
0.541959 	 Pouvez-vous s'il vous plaît me dire comment me rendre à la boulangerie?
0