In [None]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# === File paths and labels ===
# Each corpus is declared once as a (CSV path, display label) pair; the two
# parallel lists used by the rest of the notebook are derived from it, so a
# path and its label always stay at the same index.
_DATASETS = [
    ("/content/data/australian_parliament.csv", "Australian Parliament Debates"),
    ("/content/data/european_parliament.csv", "European Parliament Debates"),
    ("/content/data/ParlEE.csv", "ParlEE Corpus"),
    ("/content/data/scottish_parliament.csv", "Scottish Parliament Debates"),
    ("/content/data/uk_parliament.csv", "UK Parliament Debates"),
    ("/content/data/ungdc.csv", "UN General Debate (UNGDC)"),
    ("/content/data/unsc.csv", "UN Security Council (UNSC)"),
    ("/content/data/us_presidential_debates.csv", "US Presidential Debates"),
]

# CSV locations, in declaration order.
files = [csv_path for csv_path, _ in _DATASETS]

# Human-readable names, index-aligned with `files`.
labels = [display_name for _, display_name in _DATASETS]

In [None]:
# === Model setup ===
# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence encoder shared by every embedding call below, placed on `device`.
model = SentenceTransformer("all-mpnet-base-v2", device=device)


# === Cleaning function ===
def clean_text(text):
    """Normalise a phrase: lowercase it and delete punctuation.

    ``text`` is coerced with ``str()`` first, so non-string inputs are
    accepted. Every character that is neither a word character nor
    whitespace is removed (not replaced by a space), and leading/trailing
    whitespace is trimmed from the result.
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()


# === Embedding function ===
MAX_PHRASES = 20000  # default cap on the number of rows embedded per dataset
BATCH_SIZE = 128  # default batch size handed to the encoder


def get_dataset_embedding(file_path, encoder=None, max_phrases=None, batch_size=None):
    """Return the mean sentence embedding of the first column of a CSV file.

    Only the first column of the CSV is read. Missing values are dropped, the
    first ``max_phrases`` remaining rows are cleaned with ``clean_text``, and
    rows that are empty after cleaning are discarded so that blank strings
    are not embedded and averaged into the result.

    Parameters
    ----------
    file_path
        Passed straight to ``pandas.read_csv``.
    encoder : optional
        Object exposing ``encode(phrases, batch_size=..., show_progress_bar=...,
        convert_to_numpy=...)`` and ``get_sentence_embedding_dimension()``.
        Defaults to the notebook-level ``model``.
    max_phrases : int, optional
        Maximum number of non-null rows to use. Defaults to ``MAX_PHRASES``.
    batch_size : int, optional
        Batch size passed to ``encoder.encode``. Defaults to ``BATCH_SIZE``.

    Returns
    -------
    numpy.ndarray
        The mean of the phrase embeddings along axis 0. When no usable phrase
        remains, a zero vector of the encoder's embedding dimension is
        returned and a ``RuntimeWarning`` is emitted.
    """
    # Defaults are resolved at call time so that editing the notebook-level
    # constants or reloading `model` takes effect without redefining this.
    encoder = model if encoder is None else encoder
    max_phrases = MAX_PHRASES if max_phrases is None else max_phrases
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    column = pd.read_csv(file_path, usecols=[0]).iloc[:, 0].dropna()

    # Truncate BEFORE cleaning: cleaning is element-wise, so the same rows are
    # selected, but the regex no longer runs over rows that get discarded.
    cleaned = column.iloc[:max_phrases].astype(str).apply(clean_text)

    # Rows made only of punctuation/whitespace clean to "" and carry no text.
    phrases = [phrase for phrase in cleaned if phrase]

    if not phrases:
        # A zero vector has cosine similarity 0 with everything; make that
        # visible instead of letting it silently shape the clustering.
        warnings.warn(
            f"No usable phrases found in {file_path!r}; returning a zero vector.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.zeros(encoder.get_sentence_embedding_dimension())

    # No explicit `device=` here: the encoder is left on the device it was
    # created on, which for the default `model` is the notebook's `device`.
    embeddings = encoder.encode(
        phrases,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    return np.mean(embeddings, axis=0)


# === Embed datasets ===
# Collect one mean-embedding vector per corpus, in the order of `files`.
dataset_embeddings = []
for path, label in zip(files, labels):
    print(f"Encoding {label} ...")
    dataset_embeddings.append(get_dataset_embedding(path))

# === Compute cosine similarity ===
# Pairwise cosine similarity between the per-dataset embedding vectors.
similarity_matrix = cosine_similarity(dataset_embeddings)

# === Cluster datasets ===
# The clusterer is given precomputed distances, so convert similarity to
# cosine distance first.
distance_matrix = 1 - similarity_matrix

# No fixed cluster count: average-linkage merging is cut off by the distance
# threshold instead.
clustering = AgglomerativeClustering(
    linkage="average",
    metric="precomputed",
    distance_threshold=0.2,
    n_clusters=None,
)
cluster_labels = clustering.fit_predict(distance_matrix)

# === Print clusters ===
# Group dataset labels by cluster id, keeping the order in which each
# cluster id first appears.
clusters = {}
for label, cluster_id in zip(labels, cluster_labels):
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(label)

print("\n=== Dataset Clusters ===")
for cluster_id, group in clusters.items():
    print(f"Cluster {cluster_id}:")
    # Every group holds at least one label, so this prints one line per member.
    print("\n".join(f"  - {name}" for name in group))

In [None]:
# === Visualize similarity matrix ===
fig, ax = plt.subplots(figsize=(10, 8))

# Annotated heatmap: one cell per dataset pair, labelled on both axes.
heatmap_options = dict(
    xticklabels=labels,
    yticklabels=labels,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
)
sns.heatmap(similarity_matrix, ax=ax, **heatmap_options)

ax.set_title("Dataset Cosine Similarity")

# Slant the x tick labels and anchor them on their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig.tight_layout()
fig.savefig("cosine-similarity.pdf", dpi=1200)
plt.show()