In [1]:
from pgvector.psycopg import register_vector
import psycopg
import os
import pandas as pd
import plotly.express as px


# Turn the DATABASE_URL connection string into keyword arguments for psycopg.
connection_kwargs = psycopg.conninfo.conninfo_to_dict(os.environ["DATABASE_URL"])
conn = psycopg.connect(**connection_kwargs)
# Autocommit: statements issued on this connection are not wrapped in an open transaction.
conn.autocommit = True

# Make sure the pgvector extension exists, then register its types on this
# connection so `vector` columns can be read and written from Python.
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# visualize the embeddings using UMAP
import umap
import numpy as np
import matplotlib.pyplot as plt

# Fetch every stored embedding together with its description.
results = conn.execute("SELECT embedding, description FROM documents").fetchall()

# Split the (embedding, description) rows into a 2-D array and a parallel list.
embeddings = np.array([np.array(vector) for vector, _ in results])
descriptions = [text for _, text in results]

# Reduce the embeddings to 8 UMAP components; random_state is fixed so the
# projection is the same on every run.
umap_reducer = umap.UMAP(
    n_components=8,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
umap_embeddings = umap_reducer.fit_transform(embeddings)

# One row per document: UMAP components in columns 0..7 plus its description.
df = pd.DataFrame(umap_embeddings).assign(Description=descriptions)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [3]:
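# # Calculate the convex hull using scipy.
# # NOTE: this cell is commented out, so `hull` is never defined; re-enable it
# # before running any cell that reads `hull`.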
# from scipy.spatial import ConvexHull
# hull = ConvexHull(umap_embeddings)

# hull_info = {
#     'vertices': hull.vertices,
#     'number_of_vertices': len(hull.vertices),
#     'volume': hull.volume,
#     'area': hull.area
# }

# hull_info

In [4]:
import torch
from dgl.geometry import farthest_point_sampler
# Random tensor of shape (2, 10, 3) — presumably 2 batches of 10 three-dimensional
# points; confirm the expected layout against the DGL documentation.
x = torch.rand(2, 10, 3)

# Ask the farthest-point sampler for 1 point and show the returned indices.
point_idx = farthest_point_sampler(x, 1)
print(point_idx)

ModuleNotFoundError: No module named 'distutils'

In [None]:
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Greedy farthest-point selection over the UMAP embedding: start from a random
# point, repeatedly add the point that is farthest from everything selected so
# far, then print the descriptions of the selected documents.

# Full pairwise distance matrix (pdist's default metric is Euclidean).
distances = squareform(pdist(umap_embeddings))

# Seeded generator so a re-run picks the same points (same seed as the UMAP
# random_state used to build `umap_embeddings`).
rng = np.random.default_rng(42)

# The start point must be a row index of `distances`, i.e. an index into the
# embedding itself. (The previous version used len(hull.vertices), but `hull`
# is not defined unless the commented-out ConvexHull cell is re-enabled.)
selected_indices = [int(rng.integers(len(umap_embeddings)))]

# Distance from every point to its nearest already-selected point; updated
# incrementally instead of re-scanning all selected columns each iteration.
min_distances_to_selected = distances[:, selected_indices[0]].copy()

# One start point plus up to 10 more (capped so that a tiny dataset does not
# select the same point repeatedly).
for _ in range(min(10, len(umap_embeddings) - 1)):
    next_index = int(min_distances_to_selected.argmax())
    selected_indices.append(next_index)
    np.minimum(
        min_distances_to_selected,
        distances[:, next_index],
        out=min_distances_to_selected,
    )

# `df` has the default RangeIndex, so positional indices are valid labels.
for description in df.loc[selected_indices, 'Description']:
    print(description)


In [None]:
# Plot the first three UMAP components in an interactive 3D scatter and overlay
# the convex hull of the plotted points.
import plotly.graph_objects as go
from scipy.spatial import ConvexHull

# Compute the hull here, on exactly the three coordinates that are plotted.
# The previous version read a global `hull` that is only created by a
# commented-out cell, and that hull was built from the full 8-component
# embedding rather than from the 3-D points shown in the figure.
plot_points = umap_embeddings[:, :3]
hull_3d = ConvexHull(plot_points)

fig = px.scatter_3d(df, x=0, y=1, z=2, color='Description')
fig.update_traces(marker=dict(size=3))
fig.update_layout(
    title_text='UMAP projection of the documents',
    showlegend=False
)

# Pass the hull's triangular facets explicitly (i, j, k index into x/y/z).
# Without them Mesh3d falls back to a Delaunay triangulation of the vertices,
# which does not draw the convex hull.
fig.add_trace(
    go.Mesh3d(
        x=plot_points[:, 0],
        y=plot_points[:, 1],
        z=plot_points[:, 2],
        i=hull_3d.simplices[:, 0],
        j=hull_3d.simplices[:, 1],
        k=hull_3d.simplices[:, 2],
        opacity=0.5,
        color='yellow',
    )
)
fig.show()