In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# One checkpoint name drives both loaders so the tokenizer vocabulary always
# matches the encoder weights.
model_name = "bert-base-uncased"  # TODO: Try different models
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

def get_embedding(text):
    """Return a mean-pooled embedding of the model's last hidden state.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A ``torch.Tensor``. Singleton dimensions are squeezed away, so a
        single text yields a 1-D vector and a batch of several texts yields
        one row per text.

    Notes:
        Uses the module-level ``tokenizer`` and ``model``.
    """
    # padding lets a list of texts be packed into one rectangular tensor;
    # truncation keeps over-long inputs within the tokenizer's maximum length
    # instead of failing inside the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # we are not training the model, so we don't need gradients
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Weight every token by its attention mask so that padded positions do not
    # dilute the average. For an unpadded input the mask is all ones and this
    # reduces to a plain mean over the token axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    embedding = (token_sum / token_count).squeeze()
    return embedding




In [None]:
def calculate_cosine_similarity(title1, title2):
    """Score how similar two titles are using their ``get_embedding`` vectors.

    Args:
        title1: First text to embed.
        title2: Second text to embed.

    Returns:
        The single cosine-similarity value between the two embeddings.
    """
    # cosine_similarity works on 2-D inputs, so each embedding becomes one row.
    row_vectors = [get_embedding(title).reshape(1, -1) for title in (title1, title2)]
    score_matrix = cosine_similarity(*row_vectors)
    # One row against one row gives a 1x1 matrix; pull out the lone entry.
    return score_matrix[0][0]

# Quick check of the BERT-based helper on two English titles.
title1, title2 = "Understanding Machine Learning", "Introduction to Deep Learning"
similarity = calculate_cosine_similarity(title1, title2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.7934294939041138


In [3]:
# Score a pair of underscore-separated article titles with the BERT helper.
article1, article2 = "14th_century", "African_slave_trade"
similarity = calculate_cosine_similarity(article1, article2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.6303218603134155


In [7]:
# Two titles that differ only in the century number, scored with the BERT helper.
article1, article2 = "14th_century", "15th_century"
similarity = calculate_cosine_similarity(article1, article2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.9655939936637878


In [8]:
# Another title pair scored with the BERT helper.
article1_2, article2_2 = "14th_century", "Ottoman_Empire"
similarity = calculate_cosine_similarity(article1_2, article2_2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.7063847780227661


In [None]:
%pip install sentence_transformers

In [10]:
from sentence_transformers import SentenceTransformer, util

# The SBERT encoder gets its own name. get_embedding() reads the global
# `model` (the BERT AutoModel) at call time, so rebinding `model` here would
# break every BERT cell that is re-run after this one.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_sbert_similarity(title1, title2):
    """Return the cosine similarity of two texts using the SBERT encoder.

    Args:
        title1: First text to encode.
        title2: Second text to encode.

    Returns:
        The similarity score as a plain Python number (extracted with
        ``.item()`` from the 1x1 result).
    """
    # Get embeddings
    embedding1 = sbert_model.encode(title1, convert_to_tensor=True)
    embedding2 = sbert_model.encode(title2, convert_to_tensor=True)
    # Calculate cosine similarity using SBERT's util function
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    return similarity



Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
   ---------------------------------------- 0.0/255.8 kB ? eta -:--:--
   - -------------------------------------- 10.2/255.8 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/255.8 kB 330.3 kB/s eta 0:00:01
   --------- ----------------------------- 61.4/255.8 kB 409.6 kB/s eta 0:00:01
   ------------------ ------------------- 122.9/255.8 kB 654.9 kB/s eta 0:00:01
   ------------------------------ ------- 204.8/255.8 kB 888.4 kB/s eta 0:00:01
   --------------------------------- ---- 225.3/255.8 kB 860.2 kB/s eta 0:00:01
   -------------------------------------- 255.8/255.8 kB 826.7 kB/s eta 0:00:00
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Example usage
# Pass the variables defined in this cell rather than `title1`/`title2` left
# over from an earlier cell, so the cell does not depend on hidden kernel state.
title1_1 = "Understanding Machine Learning"
title2_1 = "Introduction to Deep Learning"
similarity = calculate_sbert_similarity(title1_1, title2_1)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.5268170833587646


In [12]:
# Two titles that differ only in the century number, scored with the SBERT helper.
article1_1, article2_1 = "14th_century", "15th_century"
similarity = calculate_sbert_similarity(article1_1, article2_1)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.9229943156242371


In [13]:
# Another title pair scored with the SBERT helper.
article1_3, article2_3 = "14th_century", "African_slave_trade"
similarity = calculate_sbert_similarity(article1_3, article2_3)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.30906590819358826
