In [1]:
import umap
import hdbscan

import pandas as pd
import numpy as np
import pandas as pd

from tqdm import tqdm
from urllib.parse import unquote
from embedding_extraction import extract_embeddings
from scipy.spatial import distance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the article names: a tab-separated file with one name per row.
# The first 12 lines are skipped (presumably a comment header -- TODO confirm).
pages = pd.read_csv("./articles.tsv", sep="\t", skiprows=12, names=["name"])

# Upper bound on the number of space-separated chunks kept from each article.
MAX_CHUNKS_PER_DOC = 4000

processed_docs = []
for doc in tqdm(pages["name"]):
    # Article titles/text contain accented characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's default locale encoding.
    with open(f"./data/plaintext_articles/{doc}.txt", "r", encoding="utf-8") as fp:
        raw_text = fp.read()

    # Drop the "#copyright" marker, then truncate to the first
    # MAX_CHUNKS_PER_DOC chunks (split on single spaces only, so newlines stay
    # inside chunks) and trim surrounding whitespace.
    body = raw_text.replace("#copyright\n\n", "")
    truncated = " ".join(body.split(" ")[:MAX_CHUNKS_PER_DOC]).strip()

    processed_docs.append({
        # Names appear to be percent-encoded in the TSV and on disk; the
        # human-readable title is the decoded form.
        "title": unquote(doc),
        "plain_text": truncated,
    })

100%|██████████| 4604/4604 [00:02<00:00, 1572.42it/s]


In [3]:
# Collect the per-article records ("title", "plain_text") into one DataFrame.
dataset = pd.DataFrame(processed_docs)
# Persist the table. With pandas' default index=True the row index is also
# written to the CSV as an extra, unnamed first column.
dataset.to_csv("./data/full_data.csv")

# Last expression of the cell: show the frame as the cell output.
dataset

Unnamed: 0,title,plain_text
0,Áedán_mac_Gabráin,Áedán mac Gabráin\n\n2007 Schools Wikipedia Se...
1,Åland,Åland\n\n2007 Schools Wikipedia Selection. Rel...
2,Édouard_Manet,Édouard Manet\n\n2007 Schools Wikipedia Select...
3,Éire,Éire\n\n2007 Schools Wikipedia Selection. Rela...
4,Óengus_I_of_the_Picts,Óengus I of the Picts\n\n2007 Schools Wikipedi...
...,...,...
4599,Zionism,Zionism\n\n2007 Schools Wikipedia Selection. R...
4600,Zirconium,Zirconium\n\n2007 Schools Wikipedia Selection....
4601,Zoroaster,Zoroaster\n\n2007 Schools Wikipedia Selection....
4602,Zuid-Gelders,Zuid-Gelders\n\n2007 Schools Wikipedia Selecti...


In [4]:
# Embed the text of every article.
# Per the recorded output of this cell, the helper looks for a cached pickle
# under `foldername` and regenerates the embeddings when it is missing (the
# recorded regeneration took roughly 16 minutes).
embeddings = extract_embeddings(
    text=dataset["plain_text"].tolist(),
    foldername="./data",  # plain literal: the f-string had no placeholders
)

Could not find file './data/embed_data-gpt4.pickle'.Regenerating the embeddings.


100%|██████████| 288/288 [15:51<00:00,  3.30s/it]


In [5]:
# --- Dimensionality reduction ------------------------------------------------
# Project the high-dimensional embeddings to 48 dimensions with UMAP using the
# cosine metric. The seed is fixed and a single job is used.
umap_model = umap.UMAP(
    metric="cosine",
    n_components=48,
    n_neighbors=32,
    min_dist=0,
    low_memory=True,
    n_jobs=1,
    random_state=42,
)
low_dim_mapper = umap_model.fit(embeddings)

# Shift the projected points so that every dimension has zero mean.
projected = low_dim_mapper.embedding_
low_dim_embeds = projected - projected.mean(axis=0)

# --- Clustering --------------------------------------------------------------
# Density-based clustering on the centred projection. `prediction_data=True`
# is requested because the membership-vector computation below relies on it.
clusterer = hdbscan.HDBSCAN(
    prediction_data=True,
    cluster_selection_method="eom",
    min_cluster_size=8,
)
hdbscan_model = clusterer.fit(low_dim_embeds)

# Soft assignment: one row per point, one column per cluster.
cluster_label_probs = hdbscan.prediction.all_points_membership_vectors(hdbscan_model)

# Hard label = index of the most probable cluster. Because argmax always
# returns an index, no point is labelled as noise here.
cluster_labels = np.argmax(cluster_label_probs, axis=1)

In [6]:
def compute_base_coherence_matrix(embeds, cluster_probs: np.ndarray):
    """Build a pairwise coherence matrix from embeddings and cluster memberships.

    For every pair of rows ``(i, j)`` the coherence is
    ``(angular_similarity(i, j) * topic_similarity(i, j)) ** 2`` where

    * angular similarity is ``1 - arccos(cosine_similarity) / pi``, with the
      diagonal forced to 0 (self-loops are not of interest), and
    * topic similarity is ``1 - Jensen-Shannon distance`` between the two
      rows of ``cluster_probs``.

    Args:
        embeds: 2-D array-like of shape ``(n, d)``, one embedding per row.
            Rows are L2-normalized internally, so the input does not have to
            be unit-length; all-zero rows are left as zeros.
        cluster_probs: 2-D array of shape ``(n, k)`` with one cluster
            membership vector per row.

    Returns:
        Tuple ``(coherence_matrix, ang_sim, topic_sim)``, each of shape
        ``(n, n)``.

    Raises:
        ValueError: if either input is not 2-D or the row counts differ.

    Side effects:
        Prints a ``WARNING: ...`` line for each intermediate matrix that
        contains NaN values.
    """
    embeds = np.asarray(embeds)
    cluster_probs = np.asarray(cluster_probs)

    if embeds.ndim != 2 or cluster_probs.ndim != 2:
        raise ValueError(
            "embeds and cluster_probs must both be 2-D, got shapes "
            f"{embeds.shape} and {cluster_probs.shape}"
        )
    if embeds.shape[0] != cluster_probs.shape[0]:
        raise ValueError(
            "embeds and cluster_probs must have the same number of rows, got "
            f"{embeds.shape[0]} and {cluster_probs.shape[0]}"
        )

    # The dot product only equals the cosine similarity for unit-length rows.
    # Normalize explicitly instead of trusting the caller; without this,
    # un-normalized input would be silently squashed by the clip below.
    norms = np.linalg.norm(embeds, axis=1, keepdims=True)
    norms[norms == 0] = 1  # keep all-zero rows as zeros instead of dividing by 0
    unit_embeds = embeds / norms

    # Cosine similarity; the clip removes rounding overshoot outside [-1, 1],
    # which would otherwise make arccos return NaN.
    cos_sim = np.clip(unit_embeds @ unit_embeds.T, -1, 1)

    # Angular similarity in [0, 1].
    ang_sim = 1 - np.arccos(cos_sim) / np.pi

    # We are not interested in self-loops, so the diagonal is zeroed.
    np.fill_diagonal(ang_sim, 0)

    # Topic similarity from the Jensen-Shannon distance between membership rows.
    # NOTE(review): SciPy's Jensen-Shannon distance presumably uses the natural
    # logarithm here, which bounds the distance by sqrt(ln 2) ~= 0.83, so
    # topic_sim would never reach 0 -- confirm this is intended.
    topic_sim = 1 - distance.cdist(
        XA=cluster_probs,
        XB=cluster_probs,
        metric='jensenshannon'
    )

    coherence_matrix = (ang_sim * topic_sim) ** 2

    # Warn user about NaN values, in the order the matrices were computed.
    for label, matrix in (
        ("Cosine Similarity", cos_sim),
        ("Angular Similarity", ang_sim),
        ("Topic Similarity", topic_sim),
        ("Coherence", coherence_matrix),
    ):
        if np.isnan(matrix).any():
            print(f"WARNING: {label} matrix contains NaN values.")

    return coherence_matrix, ang_sim, topic_sim

In [7]:
# Pairwise coherence for all articles, computed from the original
# high-dimensional embeddings (converted to a NumPy array) and the soft
# cluster memberships.
coherence_matrix, ang_sim, topic_sim = compute_base_coherence_matrix(
    embeds=embeddings.numpy(),
    cluster_probs=cluster_label_probs,
)

In [12]:
# Show the coherence matrix as the cell output (NumPy abbreviates large arrays).
# NOTE(review): this cell's execution count (12) does not follow the previous
# cell's (7); re-run the notebook top to bottom before trusting this output.
coherence_matrix

array([[0.        , 0.01205888, 0.01020721, ..., 0.01246944, 0.00919119,
        0.012418  ],
       [0.01205888, 0.        , 0.01024107, ..., 0.12595476, 0.01456592,
        0.19447013],
       [0.01020721, 0.01024107, 0.        , ..., 0.01112479, 0.00839769,
        0.01056464],
       ...,
       [0.01246944, 0.12595476, 0.01112479, ..., 0.        , 0.01020719,
        0.20622464],
       [0.00919119, 0.01456592, 0.00839769, ..., 0.01020719, 0.        ,
        0.01235051],
       [0.012418  , 0.19447013, 0.01056464, ..., 0.20622464, 0.01235051,
        0.        ]])