In [1]:
import datasets
import numpy as np
import umap
import openTSNE
import sklearn.decomposition
import SQuaD_MDS
import phate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train split of the Cohere simple-English Wikipedia embeddings dataset.
# (Plain string literal: the name has no placeholders, so no f-prefix is needed.)
docs = datasets.load_dataset("Cohere/wikipedia-22-12-simple-embeddings", split="train")

In [3]:
# Materialise the dataset's "emb" column as a single NumPy array; every
# embedding method in this notebook takes this array as its input.
wikipedia_vectors = np.asarray(docs["emb"])

In [4]:
# Sanity check: display the array's shape (rows x columns).
wikipedia_vectors.shape

(485859, 768)

In [5]:
# Are the row vectors already unit length (L2 norm of every row close to 1.0)?
np.allclose(np.linalg.norm(wikipedia_vectors, axis=1), 1.0)

False

In [6]:
# Display the per-row L2 norms to see how far the vectors are from unit length.
np.linalg.norm(wikipedia_vectors, axis=1)

array([13.46731907, 13.41383776, 13.0557906 , ..., 14.24729346,
       14.5828553 , 15.07801916])

In [11]:
%%time
tsne_data_map = openTSNE.TSNE(metric="cosine", n_iter=1000, random_state=42).fit(wikipedia_vectors)

CPU times: user 42min 15s, sys: 36.7 s, total: 42min 51s
Wall time: 1h 1min 14s


In [12]:
%%time
pca_data_map = sklearn.decomposition.PCA(n_components=2).fit_transform(wikipedia_vectors)

CPU times: user 36.8 s, sys: 731 ms, total: 37.5 s
Wall time: 4.48 s


In [13]:
%%time
mds_data_map = SQuaD_MDS.run_SQuaD_MDS(wikipedia_vectors, {'in python':True})

running the python version...
LR :  1
CPU times: user 25min 56s, sys: 11.9 s, total: 26min 8s
Wall time: 25min 37s


In [14]:
# Persist the t-SNE coordinates so the hour-long fit need not be repeated.
np.save(file="wikipedia_tsne_data_map_1.npy", arr=tsne_data_map)

In [15]:
# Persist the PCA coordinates alongside the other saved layouts.
np.save(file="wikipedia_pca_data_map_1.npy", arr=pca_data_map)

In [16]:
# Persist the SQuaD-MDS coordinates alongside the other saved layouts.
np.save(file="wikipedia_mds_data_map_1.npy", arr=mds_data_map)

In [17]:
%%time
A = openTSNE.affinity.Uniform(wikipedia_vectors, symmetrize="max", k_neighbors=100)

CPU times: user 26min 6s, sys: 27.7 s, total: 26min 33s
Wall time: 26min 34s


In [18]:
%%time
# Spectral layout computed from the affinity matrix A.P built in the previous cell.
# ("le" presumably stands for Laplacian eigenmaps -- TODO confirm.)
le_data_map = openTSNE.initialization.spectral(A.P)

CPU times: user 8.86 s, sys: 421 ms, total: 9.28 s
Wall time: 8.01 s


In [19]:
%%time
phate_data_map = phate.PHATE(n_pca=10, n_landmark=10_000).fit_transform(wikipedia_vectors)

Calculating PHATE...
  Running PHATE on 485859 observations and 768 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 6.53 seconds.
    Calculating KNN search...
    Calculated KNN search in 1287.28 seconds.
    Calculating affinities...




    Calculated affinities in 1.02 seconds.
  Calculated graph and diffusion operator in 1295.00 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 30.57 seconds.
    Calculating KMeans...
    Calculated KMeans in 64.32 seconds.
  Calculated landmark operator in 125.02 seconds.
  Calculating optimal t...
    Automatically selected t = 26
  Calculated optimal t in 149.71 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 34.51 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 158.53 seconds.
Calculated PHATE in 1762.80 seconds.
CPU times: user 1h 8s, sys: 1min 14s, total: 1h 1min 22s
Wall time: 29min 22s


In [20]:
# Persist the spectral-layout coordinates alongside the other saved layouts.
np.save(file="wikipedia_le_data_map_1.npy", arr=le_data_map)

In [21]:
# Persist the PHATE coordinates alongside the other saved layouts.
np.save(file="wikipedia_phate_data_map_1.npy", arr=phate_data_map)

In [22]:
umap_data_map = umap.UMAP(metric="cosine", random_state=42, n_epochs=500).fit_transform(wikipedia_vectors)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [23]:
# Persist the UMAP coordinates alongside the other saved layouts.
np.save(file="wikipedia_umap_data_map_1.npy", arr=umap_data_map)