Here we first reduce the dimensionality of the embeddings and cluster them; then, to speed things up, we build the trees only within each cluster, using the full 1.5k-dimensional embeddings.

In [116]:
from pgvector.psycopg import register_vector
import psycopg
import os

# Open the database connection. DATABASE_URL is read from the environment and
# parsed into keyword arguments for psycopg.connect.
conn = psycopg.connect(**psycopg.conninfo.conninfo_to_dict(os.environ["DATABASE_URL"]))
# Autocommit: every statement issued below takes effect without an explicit commit.
conn.autocommit = True

# Ensure the pgvector extension exists before registering its type adapters on
# this connection.
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

In [117]:
# Load every non-taxonomy document dated after 2023-07-01.
# Rows are indexed positionally by later cells:
#   0=id, 1=description, 2=date, 3=time, 4=is_taxonomy, 5=raw, 6=embedding
documents = conn.execute(
    """
    SELECT id, description, date, time, is_taxonomy, raw, embedding FROM documents 
    WHERE is_taxonomy = FALSE AND date > '2023-07-01'
    """
).fetchall()

In [118]:
# Start from an empty `edges` table on every run: any existing table (and its
# rows) is dropped and the table is recreated.
conn.execute("DROP TABLE IF EXISTS edges")
# Each row is one directed, weighted edge between two document ids.
conn.execute(
    """CREATE TABLE edges (
             id bigserial PRIMARY KEY, 
             parent_id bigint,
             child_id bigint,
             weight float
    )"""
)

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7fac3e365fd0>

In [119]:
# Split the fetched rows into parallel, index-aligned lists
# (row layout: 0=id, 2=date, 5=raw, 6=embedding).
embeddings = [row[6] for row in documents]
raw = [row[5] for row in documents]
date = [row[2] for row in documents]
ids = [row[0] for row in documents]

In [120]:
import numpy as np
import umap
import hdbscan

# Step 1: project the embeddings down to 100 dimensions with UMAP, using
# cosine distance between the original vectors.
# NOTE(review): no random seed is passed here, so the projection (and hence the
# clustering) may differ between runs — confirm whether that is acceptable.
umap_model = umap.UMAP(n_neighbors=15,
                       n_components=100, 
                       min_dist=0.1, 
                       metric='cosine') 
reduced_data = umap_model.fit_transform(embeddings)

# Step 2: cluster the reduced vectors with HDBSCAN. `cluster_labels` holds one
# label per input row, in the same order as `embeddings`; the label -1 is
# filtered out when the clusters are assembled later.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, # minimum size of clusters
                            gen_min_span_tree=True) # useful for visualization
cluster_labels = clusterer.fit_predict(reduced_data)



In [121]:
# Group (raw, date, id) triples by the cluster label assigned to each document.
# The loop variables are deliberately NOT called `date` / `id`: reusing those
# names would rebind the module-level `date` list to a single value (so this
# cell could not be re-run) and would shadow the `id` builtin.
clusters = {}
for label, raw_item, doc_date, doc_id in zip(cluster_labels, raw, date, ids):
    clusters.setdefault(label, []).append((raw_item, doc_date, doc_id))

# One array of (raw, date, id) rows per cluster; label -1 is skipped.
clustered_raw_items = [
    np.array(members) for label, members in clusters.items() if label != -1
]


In [122]:
# Within each cluster, order the rows by their date (column 1).
sorted_clusters = [
    sorted(members, key=lambda entry: entry[1])
    for members in clustered_raw_items
]

In [123]:
# Order the clusters from largest to smallest; the ten largest are selected
# later by slicing this list.
sorted_clusters = sorted(sorted_clusters, key=len, reverse=True)

In [124]:
def create_edges(cluster):
    """Insert one edge per document linking it to its nearest earlier document.

    For every document in ``cluster`` the query looks only at the other
    documents of the same cluster that come strictly earlier (by ``date``,
    then ``time``), picks the one with the smallest cosine distance on the
    full embedding, and inserts ``(document, earlier document, distance)``
    into ``edges``. Documents with no earlier document in the cluster get
    no edge.

    NOTE(review): ``parent_id`` receives the *later* document and ``child_id``
    the *earlier* one — confirm this is the intended orientation.

    Args:
        cluster: iterable of rows whose element at index 2 is a document id.

    Returns:
        The cursor returned by ``conn.execute`` for the INSERT statement.
    """
    doc_ids = [item[2] for item in cluster]
    return conn.execute(
        """
    WITH cluster_docs AS (
        -- Named differently from the `documents` table so the CTE does not
        -- shadow it in the rest of the statement.
        SELECT
            id,
            date,
            time,
            embedding
        FROM
            documents
        WHERE
            id = ANY(%(ids)s)
    ),
    nearest_earlier AS (
        SELECT
            ranked.doc_id,
            ranked.compared_doc_id,
            ranked.distance
        FROM (
            SELECT
                a.id AS doc_id,
                b.id AS compared_doc_id,
                (a.embedding <=> b.embedding) AS distance,
                -- Smallest cosine distance first (same as highest similarity);
                -- b.id breaks ties so the chosen edge is deterministic.
                ROW_NUMBER() OVER (
                    PARTITION BY a.id
                    ORDER BY (a.embedding <=> b.embedding), b.id
                ) AS rank
            FROM
                cluster_docs a
            JOIN
                cluster_docs b ON a.id != b.id
                AND (a.date > b.date OR (a.date = b.date AND a.time > b.time))
        ) AS ranked
        WHERE
            ranked.rank = 1
    )
    INSERT INTO edges (parent_id, child_id, weight)
    SELECT
        doc_id,
        compared_doc_id,
        distance
    FROM
        nearest_earlier;
""",
        {"ids": doc_ids},
    )

In [125]:
import networkx as nx
import matplotlib.pyplot as plt

# Directed graph that will hold the documents of the ten largest clusters.
dag = nx.DiGraph()

for members in sorted_clusters[:10]:
    # One node per document, keyed by id (column 2) and labelled with its raw
    # content (column 0).
    dag.add_nodes_from((entry[2], {"label": entry[0]}) for entry in members)

    # Persist this cluster's edges in the `edges` table.
    create_edges(members)

In [126]:
# Load every stored edge into the graph; each (parent_id, child_id, weight)
# row becomes a directed edge carrying a `weight` attribute.
edge_rows = conn.execute("SELECT parent_id, child_id, weight FROM edges")
dag.add_weighted_edges_from(edge_rows)

In [127]:
# Export the graph (nodes, labels and edge weights) as a GraphML file.
nx.write_graphml(dag, "../_data/dag_hdbscan.graphml") # for viewing in Graphia