In [2]:
# import umap
from umap.umap_ import UMAP
import hdbscan
import torch
import os
from dotenv import load_dotenv
import pickle
import time

import pandas as pd
import numpy as np

from openai import OpenAI
from tqdm import tqdm
from urllib.parse import unquote
from scipy.spatial import distance

In [3]:
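# Disabled preprocessing step: reads every Wikispeedia plaintext article, strips the
# "#copyright" header, and keeps the first 4000 space-separated tokens of each one.
# pages = pd.read_csv("../data/Wikispeedia/articles.tsv", sep="\t", skiprows=12, names=["name"])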

# processed_docs = []
# for doc in tqdm(pages["name"]):
#     with open(f"../data/Wikispeedia/plaintext_articles/{doc}.txt", "r") as fp:
#         processed_docs.append({
#             "title": unquote(doc),
#             "plain_text": " ".join(fp.read().replace("#copyright\n\n", '').split(" ")[:4000]).strip()
#         })

In [4]:
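# Disabled: builds a DataFrame from the preprocessed articles and writes it to
# "../data/full_text_data.csv", the file that the following cell reads back.
# dataset = pd.DataFrame(processed_docs)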
# dataset.to_csv("../data/full_text_data.csv", index=False)

# dataset

In [5]:
# Load the preprocessed articles (columns shown in the output: `title`, `plain_text`).
dataset = pd.read_csv("../data/full_text_data.csv")
# Bare expression so the notebook renders the (truncated) frame.
dataset

Unnamed: 0,title,plain_text
0,Áedán_mac_Gabráin,Áedán mac Gabráin\n\n2007 Schools Wikipedia Se...
1,Åland,Åland\n\n2007 Schools Wikipedia Selection. Rel...
2,Édouard_Manet,Édouard Manet\n\n2007 Schools Wikipedia Select...
3,Éire,Éire\n\n2007 Schools Wikipedia Selection. Rela...
4,Óengus_I_of_the_Picts,Óengus I of the Picts\n\n2007 Schools Wikipedi...
...,...,...
4599,Zionism,Zionism\n\n2007 Schools Wikipedia Selection. R...
4600,Zirconium,Zirconium\n\n2007 Schools Wikipedia Selection....
4601,Zoroaster,Zoroaster\n\n2007 Schools Wikipedia Selection....
4602,Zuid-Gelders,Zuid-Gelders\n\n2007 Schools Wikipedia Selecti...


In [6]:
# Pull the variables declared in the .env file into the process environment,
# then build the OpenAI client from the OPENAI_API_KEY variable.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def get_OpenAI_embeddings(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with a single OpenAI embeddings request.

    Args:
        texts: Iterable of strings to embed. Newlines in each string are
            replaced by spaces before the request is sent.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with one embedding (as returned by the API) per input text, in
        the order of ``response.data``. An empty input yields an empty list
        without contacting the API.
    """
    cleaned = [text.replace("\n", " ") for text in texts]  # Clean up text

    # An empty batch is not a valid request; answer it locally instead of
    # spending a round-trip on a call that can only fail.
    if not cleaned:
        return []

    response = client.embeddings.create(input=cleaned, model=model)
    return [item.embedding for item in response.data]


def batched_embeddings(texts, batch_size=16, model="text-embedding-3-small"):
    """Embed ``texts`` in consecutive slices of ``batch_size`` items.

    Each slice is sent through ``get_OpenAI_embeddings``; a 0.75 s pause
    follows every request to keep the request rate down.

    Args:
        texts: Sliceable sequence of strings.
        batch_size: Number of texts sent per request.
        model: Embedding model name forwarded to ``get_OpenAI_embeddings``.

    Returns:
        A ``torch.Tensor`` built from the collected embeddings, in input order.
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts):
        stop = start + batch_size
        collected += get_OpenAI_embeddings(texts[start:stop], model=model)
        # Prevent making too many requests too fast
        time.sleep(0.75)

    return torch.tensor(collected)


def extract_embeddings(text, file):
    """Return embeddings for ``text``, using ``file`` as an on-disk cache.

    If ``file`` exists, the embeddings stored under its ``"embeddings"`` key
    are returned and ``text`` is not used. Otherwise the embeddings are
    generated with ``batched_embeddings`` and pickled to ``file``.

    Args:
        text: Sequence of strings to embed when the cache is missing.
        file: Path of the pickle cache file.

    Returns:
        The cached or freshly generated embeddings.

    Raises:
        KeyError: If the cache file exists but has no ``"embeddings"`` entry.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserializing. Only point this at cache files you created yourself.
        with open(file, 'rb') as handle:
            embeddings = pickle.load(handle)["embeddings"]

        print(f"File '{file}' loaded successfully.")
        return embeddings
    except FileNotFoundError:
        print(f"Could not find file '{file}'. Regenerating the embeddings.")

    # Create the cache directory *before* generating: otherwise a missing
    # directory would only surface at write time, after all the (slow, paid)
    # embedding requests, and their result would be thrown away.
    cache_dir = os.path.dirname(file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    embeddings = batched_embeddings(text, batch_size=16)

    with open(file, 'wb') as handle:
        pickle.dump(
            obj={"embeddings": embeddings},
            file=handle,
            protocol=pickle.HIGHEST_PROTOCOL
        )

    return embeddings


# Embed the article texts, or reuse the pickled cache when it exists.
# NOTE(review): the cache filename says "gpt4", but batched_embeddings defaults to
# the "text-embedding-3-small" model; the name is kept so the existing cache is found.
embeddings = extract_embeddings(dataset["plain_text"].tolist(), "../data/gpt4_embeddings.pkl")

File '../data/gpt4_embeddings.pkl' loaded successfully.


In [8]:
# Step 1: project the embeddings to 48 dimensions with UMAP (cosine metric, fixed seed).
umap_model = UMAP(
    n_components=48,
    n_neighbors=32,
    min_dist=0,
    metric="cosine",
    low_memory=True,
    n_jobs=1,
    random_state=42,
)
low_dim_mapper = umap_model.fit(embeddings)

# Step 2: center the projection so that every dimension has zero mean.
low_dim_embeds = low_dim_mapper.embedding_ - np.mean(low_dim_mapper.embedding_, axis=0)

# Step 3: cluster the centered projection; prediction data is required by the
# membership-vector call below.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=8,
    cluster_selection_method="eom",
    prediction_data=True,
)
hdbscan_model.fit(low_dim_embeds)

# Step 4: one membership vector per point, and the index of its strongest cluster.
cluster_label_probs = hdbscan.prediction.all_points_membership_vectors(hdbscan_model)
cluster_labels = cluster_label_probs.argmax(axis=1)

In [9]:
def compute_base_coherence_matrix(embeds, cluster_probs: np.ndarray):
    """Build the pairwise coherence matrix of a set of items.

    The coherence of two items is the geometric mean of
      * their angular similarity, ``1 - arccos(cos_sim) / pi``, and
      * their topic similarity, ``1 - JS distance`` between their rows of
        ``cluster_probs``.

    Args:
        embeds: 2-D array with one embedding per row. Rows are assumed to be
            unit-length, because the dot product is used as cosine similarity.
        cluster_probs: 2-D array with one cluster-membership row per item;
            it must have as many rows as ``embeds``.

    Returns:
        Tuple ``(coherence_matrix, ang_sim, topic_sim)`` of square matrices.
        The diagonal of ``ang_sim`` (and hence of ``coherence_matrix``) is 0.

    A warning line is printed for every intermediate matrix containing NaN.
    """
    # Dot products of normalized rows are cosine similarities; clipping
    # removes values that rounding pushed outside [-1, 1].
    cosine = np.clip(embeds @ embeds.T, -1, 1)

    # Map the angle between two embeddings to a similarity score.
    angular = 1 - np.arccos(cosine) / np.pi

    # Self-loops are of no interest, so the diagonal is zeroed (this also
    # removes any NaN that may sit there).
    np.fill_diagonal(angular, 0)

    # NOTE(review): SciPy's Jensen-Shannon distance presumably uses the natural
    # log, so unrelated rows would not reach a similarity of 0 - confirm.
    js_distance = distance.cdist(cluster_probs, cluster_probs, metric='jensenshannon')
    topical = 1 - js_distance

    # Geometric mean of the two similarities.
    coherence = (angular * topical) ** 0.5

    # Warn user about NaN values.
    nan_checks = (
        (cosine, "WARNING: Cosine Similarity matrix contains NaN values."),
        (angular, "WARNING: Angular Similarity matrix contains NaN values."),
        (topical, "WARNING: Topic Similarity matrix contains NaN values."),
        (coherence, "WARNING: Coherence matrix contains NaN values."),
    )
    for matrix, message in nan_checks:
        if np.isnan(matrix).any():
            print(message)

    return coherence, angular, topical

In [10]:
# `.numpy()` hands the embeddings to the function as a NumPy array; the
# membership vectors from HDBSCAN serve as the cluster probabilities.
coherence_matrix, ang_sim, topic_sim = compute_base_coherence_matrix(
    embeds=embeddings.numpy(),
    cluster_probs=cluster_label_probs,
)

In [11]:
# Inspect the result; the diagonal is 0 because compute_base_coherence_matrix
# zeroes the angular similarity of every item with itself.
coherence_matrix

array([[0.        , 0.33182796, 0.31785333, ..., 0.33567734, 0.3096299 ,
        0.31812011],
       [0.33182796, 0.        , 0.31861573, ..., 0.59975807, 0.34623951,
        0.33235212],
       [0.31785333, 0.31861573, 0.        , ..., 0.32629773, 0.30271915,
        0.30777674],
       ...,
       [0.33567734, 0.59975807, 0.32629773, ..., 0.        , 0.3185637 ,
        0.34137462],
       [0.3096299 , 0.34623951, 0.30271915, ..., 0.3185637 , 0.        ,
        0.31672138],
       [0.31812011, 0.33235212, 0.30777674, ..., 0.34137462, 0.31672138,
        0.        ]])

In [12]:
# Persist the coherence matrix to disk with pickle.
with open("../data/coherence_graph.pkl", "wb") as handle:
    pickle.dump(coherence_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)