In [1]:
import numpy as np
import pandas as pd
import os
from google.colab import files

In [2]:
uploaded=files.upload()

Saving clip_text_embeddings.csv to clip_text_embeddings.csv


In [3]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.6


In [4]:
#importing the packages for dim red techniques
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

In [5]:
#metrics for comparing dim red techniques
from sklearn.manifold import trustworthiness
from sklearn.metrics import pairwise_distances

In [6]:
df=pd.read_csv('clip_text_embeddings.csv')

In [7]:
X=df.drop(columns=['prompt']).values

In [9]:
#PCA
pca=PCA(n_components=2)
X_pca=pca.fit_transform(X)

In [11]:
#UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2)
X_umap = umap_model.fit_transform(X)

In [12]:
#t-SNE
tsne=TSNE(n_components=2)
X_tsne=tsne.fit_transform(X)

# Trustworthiness Formula

Trustworthiness measures how well the local neighborhood structure is preserved in the reduced-dimensional space compared to the original space. It is defined as:

$Tk = 1 - \frac{2}{nk(2n-3k-1)} \sum_{i=1}^{n} \sum_{j \in N_k^i} (r_{ij}^X - k)$

Where:

- **n** is the number of data points.
- **k** is the number of nearest neighbors.
- $( r_{ij}^X )$ is the rank of the $ j^{th} $ nearest neighbor of data point i in the original space.
- $( N_k^i $) represents the set of k nearest neighbors of data point i in the original space.


In [13]:
def compute_trustworthiness(X, X_embedded, n_neighbors=5):
    return trustworthiness(X, X_embedded, n_neighbors=n_neighbors)

# Continuity Formula

Continuity measures how well points that are close in the original space remain close in the reduced space. It is defined as:

$C_k = 1 - \frac{2}{nk(2n-3k-1)} \sum_{i=1}^{n} \sum_{j \in V_i} (r_{ij}^Y - k)$

Where:

- **n** is the number of data points.
- **k** is the number of nearest neighbors.
- $ r_{ij}^Y $ is the rank of the $ j^{th}$ nearest neighbor of data point i in the reduced space.
- V_i represents the set of data points in the original space that are nearest neighbors of data point i, but not in the reduced space.


In [31]:
def compute_continuity(X, X_embedded, n_neighbors=5):
    og_dist=pairwise_distances(X)
    emb_dist=pairwise_distances(X_embedded)

    n_samples = X.shape[0]

    #rank of each data pt's nearest neighbour in the og space and embedded space
    rank_og=np.argsort(np.argsort(og_dist,axis=1), axis=1)
    rank_emb=np.argsort(np.argsort(emb_dist,axis=1), axis=1)

    continuity_score = 0.0

    for i in range(n_samples):
        #indices of nearest neighbours in original space and embedded space
        original_neighbors=np.argsort(og_dist[i])[:n_neighbors]
        embedded_neighbors=np.argsort(emb_dist[i])[:n_neighbors]

        diff=np.setdiff1d(original_neighbors, embedded_neighbors)

        #updates continuity score based on rank diff
        for j in diff:
            continuity_score += (rank_emb[i, j] - n_neighbors)

    continuity_score = 1 - (2.0 / (n_samples * n_neighbors * (2 * n_samples - 3 * n_neighbors - 1))) * continuity_score

  #continuity score generated here is based on the same formula as above, except it is normalised to a range between 0 and 1.
    return continuity_score

In [32]:
trust_pca = compute_trustworthiness(X, X_pca)
cont_pca = compute_continuity(X, X_pca)
print(f"PCA Trustworthiness: {trust_pca}, Continuity: {cont_pca}")

PCA Trustworthiness: 0.9997002016129032, Continuity: 0.9997993951612903


In [33]:
trust_umap = compute_trustworthiness(X, X_umap)
cont_umap = compute_continuity(X, X_umap)
print(f"UMAP Trustworthiness: {trust_umap}, Continuity: {cont_umap}")

UMAP Trustworthiness: 0.9995566532258064, Continuity: 0.9997377016129032


In [34]:
trust_tsne = compute_trustworthiness(X, X_tsne)
cont_tsne = compute_continuity(X, X_tsne)
print(f"t-SNE Trustworthiness: {trust_tsne}, Continuity: {cont_tsne}")

t-SNE Trustworthiness: 0.9995864919354839, Continuity: 0.9997508064516129


# Geodesic Distance Preservation Formula

Geodesic distance preservation measures how well the geodesic distances (shortest path distances along the manifold) are preserved in the reduced-dimensional space compared to the original space. It is defined as:

$
\text{Stress} = \sum_{i=1}^{n} \sum_{j=1}^{n} (d_{ij} - \hat{d}_{ij})^2
$

$
\text{Original Sum} = \sum_{i=1}^{n} \sum_{j=1}^{n} d_{ij}^2
$

$
\text{Geodesic Distance Preservation} = \sqrt{\frac{\text{Stress}}{\text{Original Sum}}}
$

Where:
- n is the number of data points.
- $d_{ij}$ is the geodesic distance (shortest path distance) between data points i and j in the original space.
- $\hat{d}_{ij}$ is the distance between data points i and j in the reduced-dimensional space.


In [44]:
from sklearn.manifold import Isomap
def compute_geodesic_distances(X, n_neighbors=5):
    isomap = Isomap(n_neighbors=n_neighbors, n_components=2)
    isomap.fit(X)
    geodesic_distances = isomap.dist_matrix_
    return geodesic_distances

def compute_gdp(X, X_embedded, n_neighbors=5):
    geodesic_distances = compute_geodesic_distances(X, n_neighbors=n_neighbors)
    embedded_distances = pairwise_distances(X_embedded)

    stress = np.sum((geodesic_distances - embedded_distances)**2)
    original_sum = np.sum(geodesic_distances**2)

    return np.sqrt(stress / original_sum)

gdp_pca=compute_gdp(X, X_pca)
gdp_umap=compute_gdp(X, X_umap)
gdp_tsne=compute_gdp(X, X_tsne)

print(f"Geodesic Distance Preservation PCA: {gdp_pca}")
print(f"Geodesic Distance Preservation UMAP: {gdp_umap}")
print(f"Geodesic Distance Preservation t-SNE: {gdp_tsne}")

Geodesic Distance Preservation PCA: 0.572656858579311
Geodesic Distance Preservation UMAP: 0.9827383342867161
Geodesic Distance Preservation t-SNE: 0.9487485812802086


In [46]:
max_trustworthiness = max(trust_pca, trust_umap, trust_tsne)
max_continuity = max(cont_pca, cont_umap, cont_tsne)
min_gdp=min(gdp_pca, gdp_umap, gdp_tsne)

print(f"Max Trustworthiness: {max_trustworthiness}")
print(f"Max Continuity: {max_continuity}")
print(f"Best Geodesic Distance Preservation: {min_gdp}")

Max Trustworthiness: 0.9997002016129032
Max Continuity: 0.9997993951612903
Best Geodesic Distance Preservation: 0.572656858579311


trustworthiness for local structure preservation, **higher** the score better the preservation; ranges from 0 to 1


continuity for global structure preservation by embeddings, **higher** the score better the preservation; ranges from 0 to 1


gdp specifically focuses on preserving the intrinsic geometry of the manifold, **lower** the value, better the preservation.