##### Installing libraries

In [None]:
# Install the packages this notebook imports; %pip targets the running kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install sentence-transformers torch

##### Importing libraries

In [3]:
import torch
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


##### Loading a Hugging Face (HF) embedding model

In [2]:
# Build the sentence embedder from the "all-MiniLM-L6-v2" model name.
# NOTE(review): presumably fetched from the Hugging Face Hub on first use — confirm.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

##### Defining a corpus of example statements

In [8]:
# Example statements that the queries below are matched against.
corpus = [
    "A man is eating food.",
    "A man is eating pasta",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

##### Use "convert_to_tensor = True" to keep the tensors on GPU (if present)

In [9]:
# Embed every corpus sentence once, up front, so each query can reuse the result.
corpus_embeddings = embedder.encode(
    corpus,
    convert_to_tensor=True,
)

In [10]:
# Sentences to look up in the corpus.
queries = [
    "A man is eating pasta.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field.",
]

##### Finding the (up to) 5 closest corpus sentences for each query, based on cosine similarity

In [11]:
# Number of hits to report per query, capped at the corpus size so that
# torch.topk is never asked for more entries than exist.
top_k = min(5, len(corpus))

for query in queries:
    # Embed the query as a tensor so it can be compared with corpus_embeddings.
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Only one query is passed in, so take the first (and only) row of scores,
    # which holds one score per corpus sentence.
    similarity_scores = embedder.similarity(query_embedding, corpus_embeddings)[0]

    # Best top_k scores together with the positions of the matching corpus entries.
    scores, indices = torch.topk(similarity_scores, k=top_k)

    print("\nQuery:", query)
    # Report the actual cut-off instead of a hard-coded "5", which would be
    # wrong whenever the corpus holds fewer than five sentences.
    print(f"Top {top_k} similar sentences in corpus:")

    # Convert to plain Python floats/ints so list indexing and formatting do
    # not rely on implicit tensor conversion.
    for score, idx in zip(scores.tolist(), indices.tolist()):
        print(corpus[idx], f"(Score: {score: .4f})")


Query: A man is eating pasta.
Top 5 similar sentences in corpus:
A man is eating pasta (Score:  0.9979)
A man is eating food. (Score:  0.7035)
A man is riding a horse. (Score:  0.1889)
A man is riding a white horse on an enclosed ground. (Score:  0.1047)
A cheetah is running behind its prey. (Score:  0.0980)

Query: Someone in a gorilla costume is playing a set of drums.
Top 5 similar sentences in corpus:
A monkey is playing drums. (Score:  0.6433)
A woman is playing violin. (Score:  0.2564)
A man is riding a horse. (Score:  0.1389)
A man is riding a white horse on an enclosed ground. (Score:  0.1191)
A cheetah is running behind its prey. (Score:  0.1080)

Query: A cheetah chases prey on across a field.
Top 5 similar sentences in corpus:
A cheetah is running behind its prey. (Score:  0.8253)
A man is eating food. (Score:  0.1399)
A monkey is playing drums. (Score:  0.1292)
A man is riding a white horse on an enclosed ground. (Score:  0.1097)
A man is riding a horse. (Score:  0.0650)


##### Another Example

In [1]:
# First group of sentences for the pairwise comparison.
sentences1 = [
    "The dog barked loudly.",
    "She smiled brightly.",
    "Rain poured down.",
]

In [2]:
# Second group of sentences, compared against every entry of sentences1.
sentences2 = [
    "Birds chirped nearby.",
    "The phone rang.",
    "He laughed quietly.",
]

In [4]:
# Re-create the embedder with the same model name as the first example, so
# this section also works when it is run on its own.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode both sentence groups with the embedder's default output settings.
embeddings1, embeddings2 = (
    embedder.encode(sentences1),
    embedder.encode(sentences2),
)

In [5]:
# Score every sentence in sentences1 against every sentence in sentences2.
# Row i belongs to sentences1[i] and column j to sentences2[j].
similarities = embedder.similarity(embeddings1, embeddings2)
# Bare last expression so the notebook displays the score matrix.
similarities

tensor([[0.2309, 0.1762, 0.3190],
        [0.0642, 0.2115, 0.5462],
        [0.1487, 0.2985, 0.1155]])

In [6]:
# Walk the score matrix row by row: each row pairs one sentence from
# sentences1 with its scores against all of sentences2.
for sentence1, row in zip(sentences1, similarities):
    print(sentence1)
    for sentence2, score in zip(sentences2, row):
        # Left-align the sentence in a 30-character column so scores line up.
        print(f" - {sentence2: <30}: {score:.4f}")

The dog barked loudly.
 - Birds chirped nearby.         : 0.2309
 - The phone rang.               : 0.1762
 - He laughed quietly.           : 0.3190
She smiled brightly.
 - Birds chirped nearby.         : 0.0642
 - The phone rang.               : 0.2115
 - He laughed quietly.           : 0.5462
Rain poured down.
 - Birds chirped nearby.         : 0.1487
 - The phone rang.               : 0.2985
 - He laughed quietly.           : 0.1155
