In [1]:
# we use the Hugging Face transformers library to load the pre-trained BERT model and compute embeddings
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

model_name = "bert-base-uncased"  # checkpoint name passed to both from_pretrained() calls below
# tokenizer and model are module-level names that get_embedding() reads
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Return a single mean-pooled BERT vector for ``text``.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over dim 1 (the token axis of Hugging Face encoder
    outputs).

    Args:
        text: String to embed.

    Returns:
        A torch tensor holding the averaged last hidden state, with size-1
        dimensions removed by ``squeeze()``.
    """
    # truncation=True caps the input at the tokenizer's maximum length so an
    # over-long text is shortened instead of overflowing the model's
    # position embeddings and raising an error.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, so gradients are not needed
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# calculate the cosine similarity between two embeddings
def calculate_cosine_similarity(emb_title1, emb_title2):
    """Return the cosine similarity between two embedding vectors.

    Each embedding is reshaped to a single row (1, n) before being handed to
    ``cosine_similarity``, which returns a 1x1 matrix; the lone entry of that
    matrix is the result.
    """
    row1, row2 = (emb.reshape(1, -1) for emb in (emb_title1, emb_title2))
    return cosine_similarity(row1, row2)[0][0]


In [17]:
import ast
def _read_literal(path):
    """Parse the Python literal stored in the text file at ``path``."""
    with open(path, 'r') as handle:
        return ast.literal_eval(handle.read())


all_distance_pairs = _read_literal('all_distance_pairs.txt')
all_articles = _read_literal('all_articles.txt')


In [None]:
# embed every article once and cache the vectors by article name
article_embeddings = {name: get_embedding(name) for name in all_articles}

# cosine similarity for each listed pair, keyed by (title1, title2)
similarities = {
    (first, second): calculate_cosine_similarity(
        article_embeddings[first], article_embeddings[second]
    )
    for first, second in ((entry[0], entry[1]) for entry in all_distance_pairs)
}

In [15]:
# build a dataframe with one row per article pair and its BERT cosine similarity
import pandas as pd
df = pd.DataFrame(
    list(similarities.items()),  # one (pair, score) record per row
    columns=['pair', 'cosine_similarity'],
)

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SBERT model under its own name. Binding it to `model`
# would overwrite the BERT model that get_embedding() reads, and a later
# rebinding of `model` would in turn break calculate_sbert_similarity().
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_sbert_similarity(title1, title2):
    """Return the SBERT cosine similarity between two titles as a float.

    Args:
        title1: First text to encode.
        title2: Second text to encode.

    Returns:
        The cosine similarity of the two SBERT embeddings, converted to a
        Python number with ``.item()``.
    """
    # Encode each title separately as tensors
    embedding1 = sbert_model.encode(title1, convert_to_tensor=True)
    embedding2 = sbert_model.encode(title2, convert_to_tensor=True)
    # Cosine similarity via SBERT's util helper; the result holds one value
    return util.pytorch_cos_sim(embedding1, embedding2).item()




  from tqdm.autonotebook import tqdm, trange


Cosine Similarity with SBERT: 0.5268170833587646


In [None]:
# SBERT cosine similarity for each listed pair, keyed by (title1, title2)
sbert_similarities = {
    (entry[0], entry[1]): calculate_sbert_similarity(entry[0], entry[1])
    for entry in all_distance_pairs
}

# tabulate the scores: one row per pair
df_sbert = pd.DataFrame(
    list(sbert_similarities.items()),
    columns=['pair', 'cosine_similarity'],
)

In [None]:
# attach the SBERT scores to df as a new "sbert_cosine_similarity" column
sbert_scores = df_sbert['cosine_similarity']
df['sbert_cosine_similarity'] = sbert_scores
# write the combined table to article_similarity.csv without the index column
df.to_csv(path_or_buf='article_similarity.csv', index=False)


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import euclidean

# Load pre-trained model and tokenizer.
# This rebinds the module-level `tokenizer` and `model` to the BERT checkpoint,
# so the get_embedding() defined below uses BERT regardless of what `model`
# referred to before this cell ran.
model_name = "bert-base-uncased"  # You can use any suitable model here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Return a single mean-pooled BERT vector for ``text``.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over dim 1 (the token axis of Hugging Face encoder
    outputs).

    Args:
        text: String to embed.

    Returns:
        A torch tensor holding the averaged last hidden state, with size-1
        dimensions removed by ``squeeze()``.
    """
    # truncation=True caps the input at the tokenizer's maximum length so an
    # over-long text is shortened instead of overflowing the model's
    # position embeddings and raising an error.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, so gradients are not needed
        outputs = model(**inputs)
    # Pool the token-level outputs into one vector representing the text
    return outputs.last_hidden_state.mean(dim=1).squeeze()

def calculate_euclidean_distance(title1, title2):
    """Return the Euclidean distance between the embeddings of two titles.

    Both titles are embedded with ``get_embedding`` (first ``title1``, then
    ``title2``), converted to NumPy arrays, and compared with SciPy's
    ``euclidean``.
    """
    vec1, vec2 = [get_embedding(title).numpy() for title in (title1, title2)]
    return euclidean(vec1, vec2)




Euclidean Distance: 6.148644924163818


In [25]:
# add an "euclidean_distance" column holding, for each row, the distance
# between the embeddings of the two articles named in its "pair" entry
def _pair_distance(pair):
    """Distance between the embeddings of the first two items of ``pair``."""
    first, second = pair[0], pair[1]
    return calculate_euclidean_distance(first, second)


df['euclidean_distance'] = df['pair'].apply(_pair_distance)

In [26]:
# save the dataframe to a csv file; this writes to the same path as the earlier
# to_csv call, so the file is replaced and now also contains euclidean_distance
df.to_csv('article_similarity.csv', index=False)