In [1]:
from transformers import BertTokenizer, BertModel

In [2]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Checkpoint identifier handed to both from_pretrained() calls (tokenizer and model).
model_name = "bert-base-uncased"

In [4]:
# Tokenizer and encoder are both loaded from the same `model_name` checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [6]:
# Small in-memory corpus that the query is ranked against.
# The last sentence is unrelated to the ML/AI topics of the others
# (presumably a low-similarity control — confirm intent).
documents = [
    "Machine Learning is a field of artificial intelligence",
    "Natural Language Processing involves understanding human language",
    "Artificial Intelligence encompasses machine learning and natural language processing",
    "Deep Learning is a subset of machine learning",
    "Data science combines statistics, data analysis and machine learning",
    "I go to shop"
]

In [7]:
# Free-text query; it is embedded and compared against every entry in `documents`.
query = "What is the machine learning ? "

In [8]:
def get_embedding(text):
    """Embed text with the module-level BERT `tokenizer` and `model`.

    The text is tokenized (truncated to the model's limit, padded to the
    longest item when several texts are given), run through the encoder, and
    the final-layer token vectors are averaged into one vector per text.
    Padding positions are excluded from the average via the attention mask,
    so a batch of texts yields the same vectors as embedding each text alone.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        # Broadcast the (batch, seq) mask over the hidden dimension so padded
        # tokens contribute zero to the sum and are not counted in the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).type_as(last_hidden_state)
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for a fully masked row.
        token_count = mask.sum(dim=1).clamp(min=1)
        embedding = token_sum / token_count

    return embedding.detach().numpy()

In [9]:
# Embed the query, then embed each document separately and stack the
# per-document rows into a single matrix (one row per document).
query_embedding = get_embedding(query)
doc_embedding = np.vstack(list(map(get_embedding, documents)))

In [10]:
# Cosine similarity of the query vector against every document vector;
# the result has one row per query row and one column per document.
similarities = cosine_similarity(query_embedding, doc_embedding)

In [12]:
# Show the raw similarity matrix as the cell output.
similarities

array([[0.74845916, 0.6767968 , 0.7113043 , 0.72433925, 0.70604473,
        0.5185727 ]], dtype=float32)

In [27]:
# `similarities` holds one row per query. Only one query was embedded, so work
# on row 0 explicitly; arg-max over the whole 2-D matrix would return a
# flattened index that is only a valid document index for a single-row matrix.
query_scores = similarities[0]

for doc, score in zip(documents, query_scores):
    print(f"Documents: {doc}, Score: {score:.2f}")

# Position (within `documents`) of the highest-scoring document for this query.
max_similarity = query_scores.argmax()
print(f"\n Most Similar Document: {documents[max_similarity]}, Score: {query_scores[max_similarity]:.2f}")

Documents: Machine Learning is a field of artificial intelligence, Score: 0.75
Documents: Natural Language Processing involves understanding human language, Score: 0.68
Documents: Artificial Intelligence encompasses machine learning and natural language processing, Score: 0.71
Documents: Deep Learning is a subset of machine learning, Score: 0.72
Documents: Data science combines statistics, data analysis and machine learning, Score: 0.71
Documents: I go to shop, Score: 0.52

 Most Similar Document: Machine Learning is a field of artificial intelligence, Score: 0.75
