In [None]:
# we use the transformers library to load the pre-trained BERT model and extract embeddings
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenizer and the encoder from the same checkpoint name.
# `tokenizer` and `model` are module-level globals that get_embedding() reads.
model_name = "bert-base-uncased"  # checkpoint name passed to both from_pretrained calls
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Embed ``text`` with the global ``tokenizer``/``model`` via mean pooling.

    The text is tokenized, run through the model without gradient tracking,
    and ``last_hidden_state`` is averaged over dim 1 to give one vector.

    Args:
        text: The string to embed.

    Returns:
        A torch tensor holding the pooled embedding, with the leading batch
        axis removed.
    """
    # truncation=True clips over-long inputs to the model's maximum length
    # instead of letting the forward pass fail on them.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, so no gradients are needed
        outputs = model(**inputs)
    # Average over dim 1, then drop only the leading (batch) axis.
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embedding




In [None]:
# calculate the cosine similarity between two embeddings
def calculate_cosine_similarity(emb_title1, emb_title2):
    """Return the cosine similarity of two embedding vectors as a scalar.

    Each embedding is reshaped into a single row before being handed to
    ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row1 = emb_title1.reshape(1, -1)
    row2 = emb_title2.reshape(1, -1)
    pairwise = cosine_similarity(row1, row2)
    return pairwise[0][0]


In [None]:
import ast
# Both files contain a Python literal; ast.literal_eval parses it without
# executing arbitrary code.
with open('all_distance_pairs.txt', 'r') as pairs_file:
    pairs_text = pairs_file.read()
all_distance_pairs = ast.literal_eval(pairs_text)

with open('all_articles.txt', 'r') as articles_file:
    articles_text = articles_file.read()
all_articles = ast.literal_eval(articles_text)

# embed every article once and keep the vectors in a dictionary keyed by article
article_embeddings = {
    article: get_embedding(article) for article in all_articles
}

# cosine similarity for every listed pair, keyed by (title1, title2)
similarities = {}
for pair in all_distance_pairs:
    # Only the first two items of each entry are used as the article titles.
    title1, title2 = pair[0], pair[1]
    similarities[(title1, title2)] = calculate_cosine_similarity(
        article_embeddings[title1],
        article_embeddings[title2],
    )


In [15]:
# build the results table: one row per article pair with its cosine similarity
import pandas as pd
df = pd.DataFrame(
    data=list(similarities.items()),
    columns=['pair', 'cosine_similarity'],
)

In [None]:
# we try another model to calculate the similarity between two articles
from sentence_transformers import SentenceTransformer, util

# NOTE: this rebinds the global `model` that get_embedding() calls, so
# get_embedding() must not be used again until `model` is pointed back at the
# AutoModel instance.
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_sbert_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python number.

    The score is computed by ``util.pytorch_cos_sim`` and unwrapped from the
    resulting tensor with ``.item()``.
    """
    score = util.pytorch_cos_sim(embedding1, embedding2)
    return score.item()



In [18]:
# get the SBERT embeddings of all articles and store them in a dictionary
# (this replaces the embeddings previously held in `article_embeddings`)
article_embeddings = {}
for article in all_articles:
    # Encode the article being iterated over. The previous code encoded
    # `title1`, a stale variable left by an earlier loop, so every article
    # received the same embedding.
    article_embeddings[article] = model.encode(article, convert_to_tensor=True)

Project_Vanguard tensor([-1.0273e-01,  7.1749e-02, -5.3288e-03,  3.6209e-02, -5.3479e-02,
        -5.2362e-02,  7.1550e-02,  3.4960e-02,  1.1479e-02,  1.2465e-02,
        -4.8389e-02, -4.2698e-02, -5.7880e-02,  9.3618e-03, -8.0853e-02,
         1.8328e-02,  6.7661e-03, -5.4574e-03,  3.4570e-02, -7.2937e-03,
        -4.6051e-03,  8.2941e-02, -1.0945e-02,  5.8853e-02, -6.7523e-03,
         2.4892e-02, -1.7441e-03, -6.3383e-03, -6.7185e-03, -6.6327e-02,
        -9.4857e-03, -2.0342e-02,  3.1306e-02, -5.6769e-02,  5.5010e-03,
         1.5944e-02, -5.6350e-04, -2.7787e-02,  3.2054e-02, -1.9055e-02,
        -1.8506e-02, -1.4812e-02, -1.8068e-03,  1.0126e-02, -4.7513e-02,
         5.0780e-03,  1.6216e-02, -2.9788e-03,  3.2424e-02,  8.2895e-03,
        -5.3970e-03, -5.4724e-02, -5.3926e-02, -2.5364e-02, -4.0863e-02,
         2.2838e-02, -3.3259e-03, -1.2418e-02,  3.4026e-03, -7.9497e-02,
         5.9214e-02, -3.1312e-02,  8.4631e-03,  5.1549e-02, -5.7946e-03,
         2.7987e-02, -7.6765e-03, 

In [22]:
# SBERT cosine similarity for every listed pair, keyed by (title1, title2)
similarities_sbert = {}
for pair in all_distance_pairs:
    # Only the first two items of each entry are used as the article titles.
    title1, title2 = pair[0], pair[1]
    similarities_sbert[(title1, title2)] = calculate_sbert_similarity(
        article_embeddings[title1],
        article_embeddings[title2],
    )

In [23]:
# look up each row's pair in similarities_sbert and store the score in a new column
# (a callable is used rather than the dict itself so a missing pair raises KeyError)
sbert_scores = df['pair'].map(lambda pair_key: similarities_sbert[pair_key])
df['sbert_cosine_similarity'] = sbert_scores

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import euclidean

# Load pre-trained model and tokenizer.
# This reload is required, not redundant: the global `model` was rebound to a
# SentenceTransformer earlier in the notebook, and get_embedding() below calls
# `model(**inputs)`.
model_name = "bert-base-uncased"  # You can use any suitable model here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Embed ``text`` with the global ``tokenizer``/``model`` via mean pooling.

    Args:
        text: The string to embed.

    Returns:
        A torch tensor holding ``last_hidden_state`` averaged over dim 1,
        with the leading batch axis removed.
    """
    # Tokenize the text; truncation=True clips over-long inputs to the model's
    # maximum length instead of letting the forward pass fail on them.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, so no gradients are needed
        outputs = model(**inputs)
    # Pool the output to get a single vector representation of the text,
    # then drop only the leading (batch) axis.
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embedding

def calculate_euclidean_distance(title1, title2):
    """Return the Euclidean distance between the embeddings of two titles.

    Both titles are embedded with ``get_embedding`` on every call; the
    resulting tensors are converted to NumPy arrays and passed to
    ``euclidean``.
    """
    vector1 = get_embedding(title1).numpy()
    vector2 = get_embedding(title2).numpy()
    return euclidean(vector1, vector2)




Euclidean Distance: 6.148644924163818


In [25]:
# add a column with the Euclidean distance between the embeddings of the two
# articles in each pair
#
# Embed every distinct title once and reuse the vectors. Calling
# calculate_euclidean_distance per row would run two model forward passes per
# pair, re-embedding the same titles many times. The distances are unchanged
# provided get_embedding() is deterministic for a given title.
title_vectors = {}
for pair in df['pair']:
    for title in (pair[0], pair[1]):
        if title not in title_vectors:
            title_vectors[title] = get_embedding(title).numpy()

df['euclidean_distance'] = df['pair'].apply(
    lambda x: euclidean(title_vectors[x[0]], title_vectors[x[1]])
)

In [26]:
# Save the dataframe to 'article_similarity.csv' (relative to the working
# directory). index=False leaves the row index out of the file.
# NOTE(review): the 'pair' column holds tuples, so it is presumably written as
# their string form. Confirm that downstream readers parse it back.
df.to_csv('article_similarity.csv', index=False)