In [3]:
from urllib.parse import unquote

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

def decode_url(s):
    """Return ``s`` with its percent-encoded (``%XX``) sequences decoded.

    Thin wrapper around ``urllib.parse.unquote`` so it can be handed to
    ``Series.apply``. Text without escapes is returned unchanged.
    """
    decoded = unquote(s)
    return decoded

In [4]:
# Number of article titles to include in the similarity matrix.
# Set to None to build the matrix for every article in the file.
N_TITLES = 10

# The first 12 lines of the file are skipped (presumably its comment header);
# every remaining line is read as one URL-encoded article title.
articles = pd.read_csv(
    '../datasets/wikispeedia_paths-and-graph/articles.tsv',
    sep='\t',               # it is a .tsv: never split a title on a comma
    skiprows=12,
    header=None,
    names=['articles'],
    dtype=str,              # keep purely numeric titles as text
    keep_default_na=False,  # keep titles such as "NA" or "null" as text, not NaN
)

# Underscores stand for spaces; percent-escapes are decoded afterwards.
titles = (
    articles['articles']
    .iloc[:N_TITLES]
    .str.replace('_', ' ', regex=False)
    .apply(decode_url)
    .tolist()
)

# Empty square frame labelled by title on both axes, filled in later.
df = pd.DataFrame(index=titles, columns=titles)

In [5]:
# Preview the title-by-title frame; no similarities have been filled in yet.
df

Unnamed: 0,Áedán mac Gabráin,Åland,Édouard Manet,Éire,Óengus I of the Picts,€2 commemorative coins,10th century,11th century,12th century,13th century
Áedán mac Gabráin,,,,,,,,,,
Åland,,,,,,,,,,
Édouard Manet,,,,,,,,,,
Éire,,,,,,,,,,
Óengus I of the Picts,,,,,,,,,,
€2 commemorative coins,,,,,,,,,,
10th century,,,,,,,,,,
11th century,,,,,,,,,,
12th century,,,,,,,,,,
13th century,,,,,,,,,,


In [51]:
# Pretrained BERT checkpoint: the tokenizer turns text into model inputs,
# the model produces the hidden states used as embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get embeddings
def get_embedding(text):
    """Embed ``text`` with BERT by averaging its final-layer hidden states.

    Args:
        text: The string to embed. Pass one string at a time: the average
            is taken over every token position without an attention mask,
            so padding from a batched call would be averaged in as well.

    Returns:
        torch.Tensor: ``last_hidden_state`` averaged over dim 1 (the token
        axis). The tensor carries no autograd history, so callers may call
        ``.numpy()`` on it directly; ``.detach()`` remains a harmless no-op.
    """
    # truncation=True clips inputs that exceed the model's maximum length
    # instead of letting the forward pass fail on them.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Embed every title exactly once: one BERT forward pass per title rather
# than one per member of every pair.
title_vectors = []
for i, title in enumerate(titles):
    title_vectors.append(get_embedding(title).detach().numpy())
    print(i)  # progress: index of the title just embedded

if title_vectors:
    # Each embedding is a single-row 2-D array; stacking gives one row per
    # title, and cosine_similarity then scores all pairs in a single call.
    pairwise = cosine_similarity(np.vstack(title_vectors))

    # Keep the upper triangle and mirror it so the matrix is exactly symmetric.
    upper = np.triu(pairwise, k=1)
    similarity_matrix = upper + upper.T

    # Fill diagonal with 1s as the similarity of a title with itself is 1
    np.fill_diagonal(similarity_matrix, 1)

    # Build the frame from the finished array instead of writing through
    # df.values, which pandas does not guarantee to be a writable view.
    df = pd.DataFrame(similarity_matrix, index=titles, columns=titles)


0
1
2
3
4
5
6
7
8
9


Export Results

In [10]:
# Write the title-by-title matrix to CSV. Despite the file name, the values
# are cosine similarities (higher means closer), not distances.
df.to_csv(path_or_buf='semantic_distances.csv')

Compare just two articles

In [55]:
def semantic_similarity(word1, word2):
    """Return the cosine similarity between the embeddings of two texts.

    Both arguments are embedded with ``get_embedding`` (``word1`` first),
    converted to NumPy arrays and compared with ``cosine_similarity``; the
    single entry of the resulting 1x1 matrix is returned.
    """
    vectors = [get_embedding(text).detach().numpy() for text in (word1, word2)]
    first, second = vectors
    return cosine_similarity(first, second)[0][0]

In [56]:
# Spot-check the helper on a single pair of words.
semantic_similarity('king', 'queen')

0.9388244