# Semantic diversity of a dataset

Calculates the semantic diversity of a dataset as 1 − *s*, where *s* is the mean pairwise cosine similarity between the sentence embeddings of the dataset. Each line of the input file is treated as one sentence.

Based on the multilingual sentence transformer: https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

In [15]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

dataset_pathname = "../../data/continual-pretraining/stories-en-balanced.txt"
# Build the results path by swapping only a trailing ".txt" for the results suffix.
# str.replace would rewrite every ".txt" occurrence in the path and, when the path
# contains no ".txt" at all, would return it unchanged -- the results file written
# at the end of the notebook would then overwrite the dataset itself.
_dataset_stem = dataset_pathname[:-len(".txt")] if dataset_pathname.endswith(".txt") else dataset_pathname
results_pathname = _dataset_stem + "_semantic_diversity.txt"

In [10]:
# Define average cosine similarity function
def avg_cosine_similarity(model, sentences):
    """Return the mean cosine similarity over all pairs of distinct sentences.

    Args:
        model: Object whose ``encode(sentences)`` returns one embedding per
            sentence (the notebook passes a ``SentenceTransformer``).
        sentences: The texts to embed; at least two are required.

    Returns:
        float: Mean of the off-diagonal entries of the pairwise
        cosine-similarity matrix of the embeddings.

    Raises:
        ValueError: If fewer than two embeddings are produced, because no
            pair of distinct sentences exists to average over.
    """
    embeddings = model.encode(sentences)
    n = len(embeddings)
    if n < 2:
        raise ValueError(
            f"Need at least two sentences to compute pairwise similarity, got {n}."
        )
    sim_matrix = cosine_similarity(embeddings)
    # The diagonal holds each sentence's similarity with itself. Remove it from
    # the total and divide by the n*(n-1) remaining entries; zeroing the diagonal
    # and taking np.mean would still divide by n*n and bias the result low.
    # Accumulate in float64 so summing n*n lower-precision values does not drift.
    off_diagonal_sum = sim_matrix.sum(dtype=np.float64) - np.trace(sim_matrix, dtype=np.float64)
    return float(off_diagonal_sum / (n * (n - 1)))

In [11]:
# Instantiate the multilingual sentence-embedding model used for all encodings
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

In [12]:
# Load the dataset, treating every non-blank line as one sentence
with open(dataset_pathname, "r", encoding="utf-8") as f:
    # Strip line terminators and surrounding whitespace, and skip blank lines:
    # readlines() would keep the trailing newlines and hand empty "sentences"
    # to the encoder, which would then take part in the pairwise similarities.
    stripped_lines = (line.strip() for line in f)
    sentences = [line for line in stripped_lines if line]

In [13]:
# Mean pairwise cosine similarity of the dataset's sentence embeddings
avg_sim = avg_cosine_similarity(model=model, sentences=sentences)

In [None]:
# Report the diversity score (1 - mean similarity) and persist the same line to disk
summary = f"Semantic diversity: {1 - avg_sim:.4f}"
print(summary)
with open(results_pathname, "w", encoding="utf-8") as results_file:
    # Same text that print() emits: the summary followed by a newline.
    results_file.write(summary + "\n")