In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
docs = [
    "The dog jumped over the cat", 
    "The cat jumped over the dog",
    "It is very warm today",
    "The cat is yellow and the dog is red",
]

In [4]:
documents = []
for i, x in enumerate(docs):
    row = {
       "index": i,
       "data": x
    }
    documents.append(row)

documents

[{'index': 0, 'data': 'The dog jumped over the cat'},
 {'index': 1, 'data': 'The cat jumped over the dog'},
 {'index': 2, 'data': 'It is very warm today'},
 {'index': 3, 'data': 'The cat is yellow and the dog is red'}]

In [5]:
dataset = [
    {'id': 1, 'text': 'The dog jumped over the cat'},
    {'id': 2, 'text': 'The cat jumped over the dog'},
    {'id': 3, 'text': 'It is very warm today'},
    {'id': 4, 'text': 'The cat is yellow and the dog is red'},
    {'id': 5, 'text': 'The dog jumped over the purple cow'}
]

In [6]:
document_embeddings = []
for x in dataset:
    embed = model.encode(x.get('text'))
    final_data = (x.get('id'), embed)
    document_embeddings.append(final_data)

# document_embeddings[0][1]

In [7]:
# embeddings = model.encode(dataset)

In [8]:
query = "The dog jumped over the green cow"
query_embedding = model.encode([query])

In [9]:
query_embedding.shape

(1, 384)

In [74]:
# model.similarity(embeddings, model.encode([query]))

In [75]:
results = []
for doc in document_embeddings:
    doc_id = doc[0]
    doc_embedding = doc[1]
    rank = model.similarity(doc_embedding, query_embedding)
    results.append(
        (doc_id, rank)
    )
results

[(1, tensor([[0.5991]])),
 (2, tensor([[0.5876]])),
 (3, tensor([[-0.0063]])),
 (4, tensor([[0.2445]])),
 (5, tensor([[0.8564]]))]

In [76]:
results.sort(key=lambda x: x[1], reverse=True)

In [77]:
for result in results:
    dataset_id = result[0]
    rank = result[1]
    doc = next(doc for doc in dataset if doc['id'] == dataset_id)
    print(dataset_id, rank, doc)

5 tensor([[0.8564]]) {'id': 5, 'text': 'The dog jumped over the purple cow'}
1 tensor([[0.5991]]) {'id': 1, 'text': 'The dog jumped over the cat'}
2 tensor([[0.5876]]) {'id': 2, 'text': 'The cat jumped over the dog'}
4 tensor([[0.2445]]) {'id': 4, 'text': 'The cat is yellow and the dog is red'}
3 tensor([[-0.0063]]) {'id': 3, 'text': 'It is very warm today'}
