# Semantic diversity of a dataset - batch processing

Calculates the semantic diversity as 1-*s*, where *s* is the mean of the pairwise cosine similarities between the sentence embeddings for the given dataset.

Based on the multilingual sentence transformer: https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

Optimized for batch processing the results produced by inference.ipynb.

In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Directory that holds the processed_*.txt dataset files; each result file is
# written next to its dataset. Replace the placeholder before running.
dataset_base_dir = "path-to-dataset"

In [None]:
# Evaluation grid: every combination of the values below names one dataset
# file, following the pattern
# processed_{model_arch}_{model_trn}_{lang}_{prompt_set}{id}.txt
model_archs = [
    "llama3.1-8B",
    "qwen2.5-7B",
    "llama3.2-3B",
]
model_trns = [
    "base",
    "biasM",
    "balanced",
    "biasF",
]
langs = [
    "es",
    "va",
    "en",
]
prompt_sets = [
    "stories_new",
]
# Numeric suffixes 1..5 appended directly after the prompt-set name.
ids = list(range(1, 6))

In [None]:
# Define average cosine similarity function
def avg_cosine_similarity(model, sentences):
    """Return the mean pairwise cosine similarity between sentence embeddings.

    Args:
        model: Object exposing ``encode(sentences)`` that returns one
            embedding row per sentence (here, a SentenceTransformer).
        sentences: Sequence of strings to compare with each other.

    Returns:
        The mean cosine similarity over all ordered pairs of *distinct*
        sentences, or NaN when fewer than two sentences are given, because
        no pair exists to average over.
    """
    n_sentences = len(sentences)
    if n_sentences < 2:
        return float("nan")

    embeddings = model.encode(sentences)
    sim_matrix = cosine_similarity(embeddings)

    # Average only the off-diagonal entries. Zeroing the diagonal and taking
    # the mean over all n*n cells would keep the n self-pairs in the
    # denominator and shrink the result by a factor of (n - 1) / n.
    off_diagonal = ~np.eye(n_sentences, dtype=bool)
    return np.mean(sim_matrix[off_diagonal])

In [None]:
# Load the Sentence Transformer model once; the same instance is passed to
# avg_cosine_similarity for every dataset file processed below.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
# Compute the semantic diversity of every dataset file in the evaluation grid
# and write each result to a "*_semantic_diversity.txt" file next to its dataset.
for lang in langs:
    print(f"lang: {lang}")
    for prompt_set in prompt_sets:
        print(f"    prompt_set: {prompt_set}")
        for model_arch in model_archs:
            print(f"        model_arch: {model_arch}")
            for model_trn in model_trns:
                print(f"            model_trn: {model_trn}")
                # Named `run_id` because `id` would shadow the builtin for
                # the rest of the notebook session.
                for run_id in ids:
                    filename = f"processed_{model_arch}_{model_trn}_{lang}_{prompt_set}{run_id}.txt"
                    dataset_pathname = os.path.join(dataset_base_dir, filename)
                    # Swap only the trailing extension; str.replace would also
                    # rewrite any ".txt" occurring earlier in the path.
                    stem, _ext = os.path.splitext(dataset_pathname)
                    results_pathname = f"{stem}_semantic_diversity.txt"

                    # Keep the try body limited to the file read so that only
                    # a missing dataset file is mapped to NaN.
                    try:
                        with open(dataset_pathname, "r", encoding="utf-8") as f:
                            sentences = f.readlines()
                    except FileNotFoundError:
                        # Missing runs are reported as NaN instead of aborting the batch.
                        avg_sim = float("nan")
                    else:
                        # Calculate the average cosine similarity
                        avg_sim = avg_cosine_similarity(model, sentences)

                    # Print and write the results to a file
                    diversity = 1 - avg_sim
                    summary = f"Semantic diversity: {diversity:.4f}"
                    print(f"{diversity:.4f}")
                    with open(results_pathname, "w", encoding="utf-8") as f:
                        print(summary, file=f)