# UMAP Experiment

In [1]:
import os
os.environ["NUMBA_NUM_THREADS"] = "80"

import datasets

import warnings
warnings.filterwarnings("ignore")

import pickle

from umap import UMAP as UMAP_LEARN
from cuml.manifold import UMAP as UMAP_CUML


In [2]:
# Run this cell only if GPUMAP is installed
from gpumap import GPUMAP as UMAP_GPUMAP

In [3]:
import rmm

# Size of the RMM memory pool requested below, in GiB.
POOL_SIZE_GB=15

# Pickle file that the per-dataset benchmark results are loaded from and stored to.
RESULTS_FILE="results/results.pickle"

# Re-initialize RMM with a pooled allocator of POOL_SIZE_GB GiB.
rmm.reinitialize(
    pool_allocator=True, # default is False
    managed_memory=False, # default is False
    initial_pool_size=int(1024*1024*1024*POOL_SIZE_GB), # POOL_SIZE_GB GiB expressed in bytes. Default is 1/2 total GPU memory
    devices=0, # GPU device IDs to register. By default registers only GPU 0.
    logging=False, # default is False -- has perf overhead
)

0

In [4]:
import matplotlib.pyplot as plt

In [5]:
def draw_chart(model, X, y, dataset, model_name, classes=None):
    """Fit `model` on `(X, y)` and scatter-plot the resulting embedding.

    Parameters
    ----------
    model : object exposing ``fit_transform(X, y)``
        The embedding model; it is fitted inside this function.
    X : array-like
        Input samples handed to ``fit_transform``.
    y : array-like
        Per-sample labels. They are passed to ``fit_transform`` and also used
        to colour the points.
    dataset, model_name : str
        Interpolated into the plot title.
    classes : sequence of str, optional
        Tick labels for the colour bar, one per class.

    Notes
    -----
    The colour bar gets one band per class. The number of classes is
    ``len(classes)`` when `classes` is given, otherwise ``max(y) + 1``.
    Both the bands and the ticks assume labels are coded as 0..n_classes-1.
    The first two embedding columns are plotted (column 1 on the x axis,
    column 0 on the y axis).
    """
    embedding = model.fit_transform(X, y)

    # Derive the number of colour bands from the data instead of assuming
    # ten classes, so datasets with a different class count render correctly.
    if classes is not None:
        n_classes = len(classes)
    else:
        n_classes = int(np.max(y)) + 1

    fig, ax = plt.subplots(1, figsize=(14, 10))
    plt.scatter(embedding[:,1], embedding[:,0], s=0.3, c=y, cmap='Spectral', alpha=1.0)
    plt.setp(ax, xticks=[], yticks=[])
    # Band edges sit halfway between consecutive integer labels.
    cbar = plt.colorbar(boundaries=np.arange(n_classes + 1)-0.5)
    cbar.set_ticks(np.arange(n_classes))
    if classes is not None:
        cbar.set_ticklabels(classes)
    plt.title("%s Embedded via %s" % (dataset, model_name))

In [6]:
from cuml.metrics import trustworthiness
# from sklearn.manifold.t_sne import trustworthiness
from sklearn.model_selection import train_test_split
import time
import numpy as np

batchSize = 5000


def _timed(fn, *args):
    """Call ``fn(*args)`` and return ``(elapsed_seconds, result)``."""
    start = time.time()
    out = fn(*args)
    return time.time() - start, out


def _embed_unsupervised(model, X, y):
    """Time ``model.fit_transform(X)``; `y` is ignored."""
    return _timed(model.fit_transform, X)


def _embed_supervised(model, X, y):
    """Time ``model.fit_transform(X, y)``."""
    return _timed(model.fit_transform, X, y)


def _embed_transform(model, X, y):
    """Fit on `X` (untimed), then time only ``model.transform(X)``."""
    model.fit(X)
    return _timed(model.transform, X)


def _run_phase(label, embed, model_class, model_kwargs, X, y, verbose, eval_trust):
    """Build a fresh model, run one timed phase and optionally score it.

    Returns ``{"time": seconds, "trust": score_or_None}``, or ``None`` when
    the phase raised (the traceback is printed and the error is swallowed so
    the remaining phases still run).
    """
    if verbose:
        print(label)
    # Constructed outside the try block: a bad constructor argument is a
    # setup error and is allowed to propagate to the caller.
    model = model_class(**model_kwargs)

    try:
        elapsed, embeddings = embed(model, X, y)
        if verbose:
            print("Time: "+ str(elapsed))

        n_neighbors = model.n_neighbors
        # Drop our reference to the model before scoring (presumably so its
        # memory can be released while trustworthiness is evaluated).
        del model

        if eval_trust:
            if verbose:
                print("Done. Evaluating trustworthiness")
            trust = trustworthiness(X, embeddings, n_neighbors=n_neighbors, batch_size=batchSize)
        else:
            trust = None

        if verbose:
            print(str(trust))
        return {"time": elapsed, "trust": trust}
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops a long benchmark instead of being swallowed.
        import traceback
        traceback.print_exc()
        return None


def _run_build_and_train_once(model_class, X, y=None, kwargs=None, verbose=False, eval_trust=True):
    """Run one trial of the benchmark for `model_class` on `X`.

    Up to three phases are run, each on a freshly constructed model:

    * ``"unsupervised"`` - timed ``fit_transform(X)``
    * ``"supervised"``   - timed ``fit_transform(X, y)`` with
      ``target_metric="categorical"`` added to the constructor arguments;
      only run when `y` is not None
    * ``"xform"``        - untimed ``fit(X)`` followed by a timed
      ``transform(X)``

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(**kwargs)``; the instance must expose
        ``fit``, ``transform``, ``fit_transform`` and ``n_neighbors``.
    X, y : array-like
        Samples and optional labels.
    kwargs : dict, optional
        Constructor arguments. The dict is copied and never modified, so
        ``target_metric`` no longer leaks into the caller's dict, the
        Transform phase, or later trials.
    verbose : bool
        Print the phase name, elapsed time and trust score as they become
        available (time is printed before the score for every phase).
    eval_trust : bool
        When false, trustworthiness is not computed and ``"trust"`` is None.

    Returns
    -------
    dict
        Maps phase key to ``{"time": seconds, "trust": score_or_None}``.
        A phase that raised is omitted.
    """
    base_kwargs = dict(kwargs or {})

    phases = [("Unsupervised", "unsupervised", _embed_unsupervised, base_kwargs)]
    if y is not None:
        supervised_kwargs = dict(base_kwargs, target_metric="categorical")
        phases.append(("Supervised", "supervised", _embed_supervised, supervised_kwargs))
    phases.append(("Transform", "xform", _embed_transform, base_kwargs))

    results = {}
    for label, key, embed, model_kwargs in phases:
        outcome = _run_phase(label, embed, model_class, model_kwargs, X, y, verbose, eval_trust)
        if outcome is not None:
            results[key] = outcome

    return results


def build_and_train(model_class, X, y=None, kwargs=None, n_trials=4, verbose=False, eval_trust=True):
    """Repeat `_run_build_and_train_once` `n_trials` times.

    All arguments other than `n_trials` are forwarded unchanged. Returns a
    list with one result dict per trial, in execution order.
    """
    return [
        _run_build_and_train_once(model_class, X, y, kwargs, verbose, eval_trust)
        for _ in range(n_trials)
    ]

In [7]:
def store_results(results, filename):
    """Pickle `results` to `filename`, creating the parent directory if needed.

    Parameters
    ----------
    results : object
        Any picklable object.
    filename : str
        Destination path. An existing file is overwritten.
    """
    import os
    import pickle

    parent = os.path.dirname(filename)
    if parent:
        # open() does not create missing directories; without this the first
        # save into a directory that does not exist yet raises FileNotFoundError.
        os.makedirs(parent, exist_ok=True)

    with open(filename, 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Resume from previously stored results when the file exists; otherwise start empty.
# NOTE: pickle.load can execute arbitrary code - only load result files you produced yourself.
if os.path.exists(RESULTS_FILE):
    # Use a context manager so the file handle is closed deterministically.
    with open(RESULTS_FILE, "rb") as handle:
        final_results = pickle.load(handle)
else:
    final_results = {}

In [9]:
final_results

{'digits': {'umapcuml': [{'unsupervised': {'time': 0.4452664852142334,
     'trust': 0.9876636433389232},
    'supervised': {'time': 0.47392821311950684, 'trust': 0.9879378905549919},
    'xform': {'time': 0.15683841705322266, 'trust': 0.9847805133487961}},
   {'unsupervised': {'time': 0.18512916564941406, 'trust': 0.9874644094493787},
    'supervised': {'time': 0.2050032615661621, 'trust': 0.9864168160345743},
    'xform': {'time': 0.42592787742614746, 'trust': 0.9856609318173406}},
   {'unsupervised': {'time': 0.44254136085510254, 'trust': 0.9869448789027268},
    'supervised': {'time': 0.4717979431152344, 'trust': 0.9879864913274599},
    'xform': {'time': 0.14838576316833496, 'trust': 0.9858305953573715}},
   {'unsupervised': {'time': 0.36017560958862305, 'trust': 0.9863258882554476},
    'supervised': {'time': 0.474459171295166, 'trust': 0.9875988980757733},
    'xform': {'time': 0.15300321578979492, 'trust': 0.9862826829215756}}],
  'umaplearn': [{'unsupervised': {'time': 9.26321

## Pen Digits Dataset

In [17]:
from sklearn.datasets import load_digits

data = load_digits()

X = data.data
y = data.target

In [18]:
results_digits = final_results["digits"] if "digits" in final_results else {}

In [19]:
results_digits["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [25]:
results_digits["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

In [26]:
results_digits["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-19-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1990, in transform
    use_gpu=self.use_gpu
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 131, in smooth_knn_dist
    SMOOTH_K_TOLERANCE, MIN_K_DIST_SCALE
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/smooth_knn_dist_gpu.py", line 170, in smooth_knn_dist_gpu
    distances_mean, smooth_k_tolerance, min_k_dist_scale
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/numba/cuda/compiler.py", line 758, in __call__
    kernel = self.specialize(*args)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/numba/cuda/compiler.py", line 769, in specialize
    kernel = self.compile(argtypes)
  File "/raid/cnolet/miniconda3/envs/umap3/li

In [27]:
final_results["digits"] = results_digits

In [28]:
final_results["digits"]

{'umapcuml': [{'unsupervised': {'time': 0.7027411460876465,
    'trust': 0.9867100727610446},
   'supervised': {'time': 0.18091797828674316, 'trust': 0.9862312589544937},
   'xform': {'time': 0.31970643997192383, 'trust': 0.9826555470441466}},
  {'unsupervised': {'time': 0.16164946556091309, 'trust': 0.9871587306666064},
   'supervised': {'time': 0.1764993667602539, 'trust': 0.9879818696533138},
   'xform': {'time': 0.31267285346984863, 'trust': 0.9824522142942735}},
  {'unsupervised': {'time': 0.15775108337402344, 'trust': 0.987041348508318},
   'supervised': {'time': 0.17386841773986816, 'trust': 0.9879716852401503},
   'xform': {'time': 0.3037276268005371, 'trust': 0.9832519730470656}},
  {'unsupervised': {'time': 0.15770816802978516, 'trust': 0.9871846413193981},
   'supervised': {'time': 0.1699216365814209, 'trust': 0.9874626109698468},
   'xform': {'time': 0.3010995388031006, 'trust': 0.9808050579936455}}],
 'umaplearn': [{'unsupervised': {'time': 9.263211965560913,
    'trust': 

In [29]:
store_results(final_results, RESULTS_FILE)

## Fashion MNIST Dataset

In [23]:
# https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`

    Reads the gzip-compressed files ``<kind>-labels-idx1-ubyte.gz`` and
    ``<kind>-images-idx3-ubyte.gz`` from the directory `path`.

    Parameters
    ----------
    path : str
        Directory containing the two ``.gz`` files.
    kind : str
        File-name prefix selecting the split (the notebook uses ``'train'``
        and ``'t10k'``).

    Returns
    -------
    (images, labels)
        `labels` is a 1-D ``uint8`` array; `images` is a ``uint8`` array of
        shape ``(len(labels), 784)``.
    """
    # The docstring must be the first statement of the function; it was
    # previously placed after the imports and therefore never attached.
    import os
    import gzip
    import numpy as np

    labels_path = os.path.join(path,
                               '%s-labels-idx1-ubyte.gz'
                               % kind)
    images_path = os.path.join(path,
                               '%s-images-idx3-ubyte.gz'
                               % kind)

    with gzip.open(labels_path, 'rb') as lbpath:
        # Skip the first 8 bytes of the label file before reading values.
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8,
                               offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        # Skip the first 16 bytes of the image file; one 784-byte row per label.
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)

    return images, labels


In [24]:
train, train_labels = load_mnist('data/fashion', kind='train')
test, test_labels = load_mnist('data/fashion', kind='t10k')

# Stack train and test, keep the first 50000 rows, scale pixel values by
# 1/255 in float64 and only then narrow to float32.
X = (np.vstack([train, test]).astype(np.float64)[:50000] / 255.0).astype(np.float32)
# Labels follow the same stacking order and 50000-row cut, stored as float32.
y = np.hstack([train_labels, test_labels])[:50000].astype(np.float32)

In [25]:
results_fashion = final_results["fashion_mnist"] if "fashion_mnist" in final_results else {}

In [23]:
results_fashion["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [26]:
results_fashion["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

In [12]:
results_fashion["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._sea

In [27]:
final_results["fashion_mnist"] = results_fashion

In [28]:
store_results(final_results, RESULTS_FILE)

In [29]:
final_results["fashion_mnist"]

{'umapgpumap': [{'unsupervised': {'time': 6.479771852493286,
    'trust': 0.9745745792397836},
   'supervised': {'time': 6.876313924789429, 'trust': 0.9676141851918549}},
  {'unsupervised': {'time': 3.3777499198913574, 'trust': 0.9750927790516971},
   'supervised': {'time': 6.480828523635864, 'trust': 0.9696567894031254}},
  {'unsupervised': {'time': 3.3025641441345215, 'trust': 0.9748744066270485},
   'supervised': {'time': 6.351450681686401, 'trust': 0.9664509547458497}},
  {'unsupervised': {'time': 3.4731950759887695, 'trust': 0.9748966109477026},
   'supervised': {'time': 6.198164224624634, 'trust': 0.9668087698207842}}],
 'umapcuml': [{'unsupervised': {'time': 0.589625358581543,
    'trust': 0.97601825773189},
   'supervised': {'time': 1.0603547096252441, 'trust': 0.9757726834610587},
   'xform': {'time': 0.44457364082336426, 'trust': 0.9736058731016267}},
  {'unsupervised': {'time': 0.40508532524108887, 'trust': 0.9739971370030214},
   'supervised': {'time': 1.025468349456787, 't

In [39]:
classes = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot']

In [None]:
draw_chart(UMAP_LEARN(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "UMAP-learn", classes)

In [None]:
draw_chart(UMAP_CUML(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "cuML UMAP", classes)

In [None]:
draw_chart(UMAP_GPUMAP(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "GPUUMAP", classes)

## CIFAR-100 Dataset

In [27]:
import pickle
import os
def unpickle(file):
    """Return the object stored in the pickle file at path `file`.

    The file is read with ``encoding='bytes'``. Unpickling can execute
    arbitrary code, so only use this on files from a trusted source.
    """
    import pickle
    with open(file, 'rb') as stream:
        return pickle.load(stream, encoding='bytes')

def load_cifar100(path="data/cifar100/cifar-100-python"):
    """Unpickle the ``train`` and ``test`` files found in directory `path`.

    Returns a ``(train, test)`` tuple holding whatever each file unpickles to.
    """
    return tuple(unpickle(os.path.join(path, split)) for split in ("train", "test"))

In [28]:
train, test = load_cifar100()

train, train_labels = (train[b"data"], train[b"fine_labels"])
test, test_labels = (test[b"data"], test[b"fine_labels"])

In [29]:
X = (np.array(np.vstack([train, test]), dtype=np.float64) [:60000]/ 255.0).astype(np.float32)
y = np.array(np.hstack([train_labels, test_labels]))[:60000].astype(np.float32)

In [30]:
results_cifar100 = final_results["cifar100"] if "cifar100" in final_results else {}

In [44]:
results_cifar100["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

In [31]:
results_cifar100["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [13]:
results_cifar100["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-5-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._sea

In [32]:
final_results["cifar100"] = results_cifar100

In [33]:
store_results(final_results, RESULTS_FILE)

In [34]:
results_cifar100

{'umaplearn': [{'unsupervised': {'time': 106.26967883110046,
    'trust': 0.8441760104151036},
   'supervised': {'time': 99.81106400489807, 'trust': 0.8474044828110034},
   'xform': {'time': 99.87579226493835, 'trust': 0.8498454070541855}},
  {'unsupervised': {'time': 106.59346199035645, 'trust': 0.844329741901062},
   'supervised': {'time': 97.1928915977478, 'trust': 0.8490670250262601},
   'xform': {'time': 98.72296786308289, 'trust': 0.8435752178705541}},
  {'unsupervised': {'time': 107.33383107185364, 'trust': 0.8472284890319066},
   'supervised': {'time': 100.0251407623291, 'trust': 0.8463326112046655},
   'xform': {'time': 100.32673573493958, 'trust': 0.8508993518441328}},
  {'unsupervised': {'time': 103.20404744148254, 'trust': 0.8423912181521435},
   'supervised': {'time': 96.66665291786194, 'trust': 0.8450828778624399},
   'xform': {'time': 100.74139332771301, 'trust': 0.8453433263121233}}],
 'umapcuml': [{'unsupervised': {'time': 0.813828706741333,
    'trust': 0.827138247643

In [None]:
# Import trustworthiness from the public `sklearn.manifold` package; the
# `sklearn.manifold.t_sne` module path is private and no longer importable
# in newer scikit-learn releases.
from sklearn.manifold import trustworthiness as sktrust
from sklearn.datasets import load_digits

data = load_digits()

# Embed the digits data with cuML UMAP so the next cell can time
# scikit-learn's trustworthiness on it.
model = UMAP_CUML()
embeddings = model.fit_transform(data.data)

In [None]:
start = time.time()
sktrust(data.data, embeddings)
print(str(time.time() - start))

## Shuttle Dataset

In [37]:
import scipy.io

if not os.path.exists("data/shuttle.mat"):
    !wget -P data/ https://www.dropbox.com/s/mk8ozgisimfn3dw/shuttle.mat

mat = scipy.io.loadmat("data/shuttle.mat")

In [38]:
X = mat["X"].astype(np.float32)
y = mat["y"].astype(np.int32).ravel()

In [39]:
y

array([1, 0, 0, ..., 0, 1, 0], dtype=int32)

In [40]:
results_shuttle = final_results["shuttle"] if "shuttle" in final_results else {}

In [39]:
results_shuttle["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [15]:
results_shuttle["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

In [13]:
results_shuttle["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._sea

In [40]:
final_results["shuttle"] = results_shuttle

In [41]:
store_results(final_results, RESULTS_FILE)

In [42]:
results_shuttle

{'umapcuml': [{'unsupervised': {'time': 0.7837574481964111,
    'trust': 0.4907984035370634},
   'supervised': {'time': 0.5476815700531006, 'trust': 0.9999999959879131},
   'xform': {'time': 0.4578070640563965, 'trust': 0.9999987639175375}},
  {'unsupervised': {'time': 0.5676589012145996, 'trust': 0.9999980015103526},
   'supervised': {'time': 0.4594261646270752, 'trust': 0.9999976843618024},
   'xform': {'time': 0.2353208065032959, 'trust': 0.9999980647629775}},
  {'unsupervised': {'time': 0.34301042556762695, 'trust': 0.9999991344960172},
   'supervised': {'time': 0.4870455265045166, 'trust': 0.9999997347595531},
   'xform': {'time': 0.44645023345947266, 'trust': 0.9999988579663878}},
  {'unsupervised': {'time': 0.6358237266540527, 'trust': 0.9999971075067181},
   'supervised': {'time': 0.7296688556671143, 'trust': 0.9999981510643496},
   'xform': {'time': 0.45848917961120605, 'trust': 0.9999989773328898}}],
 'umaplearn': [{'unsupervised': {'time': 43.14485430717468,
    'trust': 0.9

## COIL-20 Dataset

In [17]:
# Prepare the COIL-20 data under `path`.
# NOTE(review): presumably `feed` downloads/converts the dataset into the
# .pa files read below - confirm against the `datasets` package.
from datasets.coil20.feed import feed
path = "data/coil-20"
feed(feed_path=path, dataset_type='processed')

# Load the processed images (X) and labels (Y) as arrays.
from datasets import pa2np
X, Y = pa2np("./data/coil-20X_processed.pa"), pa2np("./data/coil-20Y_processed.pa")

# X is indexed as a 4-D array; each image is flattened to
# shape[2] * shape[3] features after grayscale conversion.
features = X.shape[2]*X.shape[3]
new_X = np.zeros((X.shape[0], features))

from skimage import color
for i in range(X.shape[0]):
    img = X[i, :, :, :]
    shape = features
    # Move axis 0 to the end before rgb2gray.
    # NOTE(review): assumes axis 0 of each image is the colour channel - confirm.
    gray = color.rgb2gray(np.moveaxis(img, 0, 2)).reshape(shape)
    new_X[i] = gray
    
# Final benchmark inputs: float32 features and float32 labels.
X = new_X.astype(np.float32)
y = Y.astype(np.float32)

In [18]:
results_coil20 = final_results["coil20"] if "coil20" in final_results else {}

In [12]:
results_coil20["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [19]:
results_coil20["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-13-f338fa776278>", line 97, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1990, in transform
    use_gpu=self.use_gpu
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 131, in smooth_knn_dist
    SMOOTH_K_TOLERANCE, MIN_K_DIST_SCALE
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/smooth_knn_dist_gpu.py", line 170, in smooth_knn_dist_gpu
    distances_mean, smooth_k_tolerance, min_k_dist_scale
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/numba/cuda/compiler.py", line 758, in __call__
    kernel = self.specialize(*args)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/numba/cuda/compiler.py", line 769, in specialize
    kernel = self.compile(argtypes)
  File "/raid/cnolet/miniconda3/envs/umap3/li

In [18]:
results_coil20["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

NameError: name 'y' is not defined

In [19]:
final_results["coil20"] = results_coil20

NameError: name 'results_coil20' is not defined

In [17]:
store_results(final_results, RESULTS_FILE)

In [22]:
final_results["coil20"]

{'umapcuml': [{'unsupervised': {'time': 2.064183473587036,
    'trust': 0.9921860542617423},
   'supervised': {'time': 0.394045352935791, 'trust': 0.9866257285867378},
   'xform': {'time': 0.17340397834777832, 'trust': 0.9927199482474712}},
  {'unsupervised': {'time': 0.22162199020385742, 'trust': 0.9926057267571029},
   'supervised': {'time': 0.23158931732177734, 'trust': 0.9862040957683159},
   'xform': {'time': 0.37630367279052734, 'trust': 0.9909749666745079}},
  {'unsupervised': {'time': 0.4388558864593506, 'trust': 0.9909937857758959},
   'supervised': {'time': 0.43929457664489746, 'trust': 0.986986493374108},
   'xform': {'time': 0.1761476993560791, 'trust': 0.9907711910922921}},
  {'unsupervised': {'time': 0.30503273010253906, 'trust': 0.9927721908570533},
   'supervised': {'time': 0.4133152961730957, 'trust': 0.9864624010036854},
   'xform': {'time': 0.18416094779968262, 'trust': 0.9922615920437021}}],
 'umapgpumap': [{'unsupervised': {'time': 2.5237808227539062,
    'trust': 

## MNIST Dataset

In [16]:
from datasets.mnist.feed import feed
path = "data/mnist"
feed(feed_path=path)

from datasets import pa2np
X, Y = pa2np("./data/mnistX.pa"), pa2np("./data/mnistY.pa")

X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
y = Y

In [17]:
results_mnist = final_results["mnist"] if "mnist" in final_results else {}

In [18]:
results_mnist["umapcuml"] = build_and_train(UMAP_CUML, X, y, {})

In [22]:
results_mnist["umaplearn"] = build_and_train(UMAP_LEARN, X, y, {})

In [20]:
results_mnist["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, y, {})

Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-4-c64b9b523b44>", line 90, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._sea

In [22]:
final_results["mnist"] = results_mnist

In [23]:
store_results(final_results, RESULTS_FILE)

In [24]:
results_mnist

{'umapcuml': [{'unsupervised': {'time': 0.8702592849731445,
    'trust': 0.9559595178151625},
   'supervised': {'time': 1.011225700378418, 'trust': 0.9553705150307618},
   'xform': {'time': 0.4377431869506836, 'trust': 0.954108408575139}},
  {'unsupervised': {'time': 0.6537017822265625, 'trust': 0.9574214731202516},
   'supervised': {'time': 0.8979723453521729, 'trust': 0.9560532148990446},
   'xform': {'time': 0.4433917999267578, 'trust': 0.9531227299909409}},
  {'unsupervised': {'time': 0.6645529270172119, 'trust': 0.956424851544944},
   'supervised': {'time': 0.8808555603027344, 'trust': 0.9563548094547354},
   'xform': {'time': 0.4418752193450928, 'trust': 0.952863784209873}},
  {'unsupervised': {'time': 0.642711877822876, 'trust': 0.9567874069133908},
   'supervised': {'time': 0.879960298538208, 'trust': 0.9574419861872977},
   'xform': {'time': 0.44603633880615234, 'trust': 0.954058174077841}}],
 'umaplearn': [{'unsupervised': {'time': 52.32333850860596,
    'trust': 0.9575792140

## scRNA

This benchmark loads `data/scrna.pickle`, which must first be generated by the GPU notebook [here](https://github.com/clara-parabricks/rapids-single-cell-examples).

In [10]:
# Load the scRNA matrix; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open("data/scrna.pickle", "rb") as handle:
    X = pickle.load(handle)

In [11]:
X.shape

(65462, 5000)

In [12]:
results_scrna = final_results["scrna"] if "scrna" in final_results else {}

In [28]:
results_scrna["umapcuml"] = build_and_train(UMAP_CUML, X, None, {})

In [20]:
results_scrna["umaplearn"] = build_and_train(UMAP_LEARN, X, None, {})

In [13]:
results_scrna["umapgpumap"] = build_and_train(UMAP_GPUMAP, X, None, {})

Traceback (most recent call last):
  File "<ipython-input-6-f338fa776278>", line 97, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-6-f338fa776278>", line 97, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._search_graph.indptr,
AttributeError: 'GPUMAP' object has no attribute '_search_graph'
Traceback (most recent call last):
  File "<ipython-input-6-f338fa776278>", line 97, in _run_build_and_train_once
    embeddings = model.transform(X)
  File "/raid/cnolet/miniconda3/envs/umap3/lib/python3.7/site-packages/gpumap/gpumap_.py", line 1977, in transform
    self._sea

In [20]:
final_results["scrna"] = results_scrna

In [21]:
store_results(final_results, RESULTS_FILE)

In [22]:
results_scrna

{'umapcuml': [{'unsupervised': {'time': 3.8234424591064453,
    'trust': 0.6178009579665475},
   'xform': {'time': 3.4701781272888184, 'trust': 0.5337737424809548}},
  {'unsupervised': {'time': 3.8994381427764893, 'trust': 0.6201815623705682},
   'xform': {'time': 3.9311397075653076, 'trust': 0.9708745134888634}},
  {'unsupervised': {'time': 4.300644397735596, 'trust': 0.6187572904540646},
   'xform': {'time': 3.5282957553863525, 'trust': 0.8702917658612157}},
  {'unsupervised': {'time': 4.389519453048706, 'trust': 0.9781377849332045},
   'xform': {'time': 3.602308988571167, 'trust': 0.6201129287721822}}],
 'umapgpumap': [{'unsupervised': {'time': 13.082006216049194,
    'trust': 0.6168243354282248}},
  {'unsupervised': {'time': 10.061619758605957, 'trust': 0.9434946335833367}},
  {'unsupervised': {'time': 10.218225479125977, 'trust': 0.6203727995014007}},
  {'unsupervised': {'time': 10.199477195739746, 'trust': 0.9425912295012873}}],
 'umaplearn': [{'unsupervised': {'time': 222.381418

## Scale Benchmark

Test UMAP variants at different `n_samples` and `n_components`. This requires the "GoogleNews-vectors-negative300.bin.gz" word2vec dataset, downloaded and decompressed to `data/GoogleNews-vectors-negative300.bin`.

In [21]:
from gensim.models import KeyedVectors

vecs = KeyedVectors.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary=True)

X = vecs.vectors

In [22]:
scale_results = {}
from os import path
import pickle
if path.exists("results/scale_results.pickle"):
    with open("results/scale_results.pickle", 'rb') as handle:
        scale_results = pickle.load(handle)

In [23]:
scale_results

{'umapcuml': {1024.0: [{'unsupervised': {'time': 0.25798463821411133,
     'trust': None},
    'xform': {'time': 0.30170559883117676, 'trust': None}}],
  334243.55555555556: [{'unsupervised': {'time': 7.277303218841553,
     'trust': None},
    'xform': {'time': 13.905571937561035, 'trust': None}}],
  667463.1111111111: [{'unsupervised': {'time': 28.77500081062317,
     'trust': None},
    'xform': {'time': 56.95573329925537, 'trust': None}}],
  1000682.6666666667: [{'unsupervised': {'time': 64.54534125328064,
     'trust': None},
    'xform': {'time': 127.1585602760315, 'trust': None}}],
  1333902.2222222222: [{'unsupervised': {'time': 114.33410835266113,
     'trust': None},
    'xform': {'time': 225.36999034881592, 'trust': None}}],
  1667121.7777777778: [{'unsupervised': {'time': 176.93605589866638,
     'trust': None},
    'xform': {'time': 351.2978549003601, 'trust': None}}],
  2000341.3333333335: [{'unsupervised': {'time': 255.18308854103088,
     'trust': None},
    'xform': {'

In [34]:
from cuml.neighbors import NearestNeighbors
import cupy as cp

def perform_n_samples_test(model, X, start_samples=1024, n_indep=10, n_trials=1, n_components=2):
    """Benchmark `model` on random row subsets of `X` of increasing size.

    `n_indep` subset sizes are spaced evenly (``np.linspace``) between
    `start_samples` and the number of rows in `X`. For every size, rows are
    drawn with ``np.random.choice`` using its default settings (i.e. with
    replacement), a 15-nearest-neighbour graph of the subset is built as a
    CuPy COO matrix, and `build_and_train` is run without labels and without
    trustworthiness evaluation, passing ``n_components`` and the graph as
    ``knn_graph`` in the model kwargs.

    Returns a dict mapping each (float) subset size to the list of trial
    results returned by `build_and_train`.
    """
    import math

    k = 15
    total_rows = X.shape[0]
    results = {}

    for n_samples in np.linspace(start_samples, total_rows, n_indep):
        print("Testing " + str(n_samples))
        row_ids = np.random.choice(np.arange(total_rows), math.floor(n_samples))
        X_sampled = X[row_ids]

        knn = NearestNeighbors(n_neighbors=k).fit(X_sampled)
        d, i = knn.kneighbors(X_sampled)

        # One COO entry per (row, neighbour) pair, valued by the distance.
        vals = cp.asarray(d.ravel())
        rows = cp.repeat(cp.arange(d.shape[0]), k)
        cols = cp.asarray(i.ravel())
        coo = cp.sparse.coo_matrix((vals, (rows, cols)))

        results[n_samples] = build_and_train(
            model,
            X_sampled,
            y=None,
            kwargs={"n_components": n_components, "knn_graph":coo},
            verbose=True,
            n_trials=n_trials,
            eval_trust=False,
        )

    return results

In [None]:
%%time
start_components = 2
stop_components = 1024


def perform_n_components_test(model, X, model_name):
    """Run the sample-size sweep at three embedding dimensionalities.

    The dimensionalities are evenly spaced between the module-level
    `start_components` and `stop_components`, floored to integers. Each sweep
    is stored in the module-level `scale_results` under
    ``"<model_name>_<n>_components"`` and the whole dict is re-pickled to
    ``results/scale_results.pickle`` after every sweep.
    """
    import math

    n_components = np.linspace(start_components, stop_components, 3)
    print(n_components)

    for raw_components in n_components:
        dims = math.floor(raw_components)
        print("Testing " + str(dims) + " components")

        key = model_name + "_" + str(dims) + "_components"
        scale_results[key] = perform_n_samples_test(model, X, n_components=dims)
        # Save after each sweep so finished sweeps are already on disk.
        store_results(scale_results, "results/scale_results.pickle")

In [None]:
%%time
perform_n_components_test(UMAP_CUML, X, "umapcuml")

In [None]:
store_results(scale_results, "results/scale_results.pickle")

In [37]:
%%time
scale_results["umapcuml"] = perform_n_samples_test(UMAP_CUML, X)

Testing 1024.0
Unsupervised
Time: 1.2285232543945312
None
Transform
None
Time: 0.0038461685180664062
Testing 334243.55555555556
Unsupervised
Time: 0.767493486404419
None
Transform
None
Time: 0.11623358726501465
Testing 667463.1111111111
Unsupervised
Time: 1.6629040241241455
None
Transform
None
Time: 0.2637910842895508
Testing 1000682.6666666667
Unsupervised
Time: 2.7732017040252686
None
Transform
None
Time: 0.4247303009033203
Testing 1333902.2222222222
Unsupervised
Time: 3.8045833110809326
None
Transform
None
Time: 0.5741469860076904
Testing 1667121.7777777778
Unsupervised
Time: 5.041834831237793
None
Transform
None
Time: 0.7472741603851318
Testing 2000341.3333333335
Unsupervised
Time: 5.964860916137695
None
Transform
None
Time: 0.9174463748931885
Testing 2333560.888888889
Unsupervised
Time: 7.245739221572876
None
Transform
None
Time: 1.0399751663208008
Testing 2666780.4444444445
Unsupervised
Time: 8.139975309371948
None
Transform
None
Time: 1.2124700546264648
Testing 3000000.0
Unsuper

In [38]:
store_results(scale_results, "results/scale_results_precompute_knn.pickle")

## Evaluating Distributed UMAP

In [None]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask import array as da
import numpy as np

# Create a LocalCUDACluster with default settings and connect a Dask client to it.
cluster = LocalCUDACluster()
client = Client(cluster)

# Dask-based UMAP, aliased so it stays distinct from UMAP_CUML.
from cuml.dask.manifold import UMAP as DIST_UMAP

In [None]:
# Fit a local cuML UMAP model on a (optionally subsampled) portion of X / y.
n_neighbors = 10

local_model = UMAP_CUML(n_neighbors=n_neighbors)

# Probability that a row is kept for fitting; 1.0 keeps every row.
sampling_ratio = 1.0

# Seeded boolean mask over the rows of X: True with probability sampling_ratio.
selection = np.random.RandomState(42).choice(
    [True, False], len(X), replace=True,
    p=[sampling_ratio, 1.0 - sampling_ratio])

X_sampled = X[selection]
y_sampled = y[selection]

# Fit on the sampled rows (previously the full X, y were used, which made
# sampling_ratio a no-op). With sampling_ratio == 1.0 the result is unchanged.
local_model.fit(X_sampled, y_sampled)

In [None]:
import math

# Wrap the already-fitted local model with the Dask UMAP class.
dist_model = DIST_UMAP(local_model)

n_parts = 25
# Rows per partition, rounded up so that n_parts chunks cover every row of X.
chunk_size = math.ceil(len(X) / n_parts)
# chunks must be a tuple: a bare int would be applied to every dimension.
# -1 keeps all features of a row together in a single chunk.
X_dist = da.from_array(X, chunks=(chunk_size, -1))
dist_pred = dist_model.transform(X_dist)
# Materialise the lazy Dask result.
dist_pred = dist_pred.compute()

In [None]:
# trustworthiness expects the original data first and the embedding second;
# the arguments were previously passed in the opposite order.
trustworthiness(X, dist_pred, n_neighbors=n_neighbors)

In [None]:
# Scatter plot of the distributed embedding, coloured by the labels in y.
embedding = dist_pred

fig, ax = plt.subplots(1, figsize=(14, 10))
# Column 1 is drawn on the x-axis and column 0 on the y-axis.
plt.scatter(embedding[:,1], embedding[:,0], s=0.3, c=y, cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Ten colour bins centred on the integers 0..9.
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
if classes is not None:
    cbar.set_ticklabels(classes)
# Title typo fixed: "Fasion" -> "Fashion".
plt.title("%s Embedded via %s" % ("Fashion MNIST", "cuML"));

In [None]:
# Embed X with the locally fitted model; the following cells score and plot it the same way as dist_pred.
local_pred = local_model.transform(X)

In [None]:
# trustworthiness expects the original data first and the embedding second;
# the arguments were previously swapped. n_neighbors (set to 10 when the local
# model was built) replaces the hard-coded 10 so both scores use the same value.
trustworthiness(X, local_pred, n_neighbors=n_neighbors)

In [None]:
# Scatter plot of the local (non-distributed) embedding, coloured by the labels in y.
embedding = local_pred

fig, ax = plt.subplots(1, figsize=(14, 10))
# Column 1 is drawn on the x-axis and column 0 on the y-axis.
plt.scatter(embedding[:,1], embedding[:,0], s=0.3, c=y, cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Ten colour bins centred on the integers 0..9.
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
if classes is not None:
    cbar.set_ticklabels(classes)
# Title typo fixed: "Fasion" -> "Fashion".
plt.title("%s Embedded via %s" % ("Fashion MNIST", "cuML"));

In [None]:
Fashion  (n_neighbors=50, min_dist=0.01): {'umapcuml': {'unsupervised': {'time': 1.691455602645874,
   'trust': 0.9756608852027691},
  'supervised': {'time': 2.351792097091675, 'trust': 0.9720590243890228},
  'xform': {'time': 2.395686626434326, 'trust': 0.9721990809657708}},
 'umapgpumap': {'unsupervised': {'time': 8.455759525299072,
   'trust': 0.966088003226282},
  'supervised': {'time': 11.22615909576416, 'trust': 0.946432295621991},
  'xform': {'time': 12.111764907836914, 'trust': 0.9451657405457423}},
 'umaplearn': {'unsupervised': {'time': 129.88665342330933,
   'trust': 0.9785036958283618},
  'supervised': {'time': 167.90911030769348, 'trust': 0.9759003309442159},
  'xform': {'time': 169.7406485080719, 'trust': 0.9752538816065772}}}

Fashion default: {'umaplearn': {'unsupervised': {'time': 70.07299304008484,
   'trust': 0.977692662316851},
  'supervised': {'time': 89.2359881401062, 'trust': 0.9768497406042687},
  'xform': {'time': 88.11038374900818, 'trust': 0.9766910591282214}},
 'umapcuml': {'unsupervised': {'time': 1.0602912902832031,
   'trust': 0.9756579316886288},
  'supervised': {'time': 1.2528660297393799, 'trust': 0.9753220467475495},
  'xform': {'time': 1.2345595359802246, 'trust': 0.9758428593923967}},
 'umapgpumap': {'unsupervised': {'time': 8.032711505889893,
   'trust': 0.9745967157039829},
  'supervised': {'time': 11.146373987197876, 'trust': 0.9578011941244143},
  'xform': {'time': 11.902178287506104, 'trust': 0.9567292975640662}}}

Fashion (n_neighbors:5, min_dist:0.01):
{'umaplearn': {'unsupervised': {'time': 46.197495460510254,
   'trust': 0.9844126808574477},
  'supervised': {'time': 50.494110107421875, 'trust': 0.9824418088522914},
  'xform': {'time': 49.34948992729187, 'trust': 0.9834457064275237}},
 'umapcuml': {'unsupervised': {'time': 0.8439412117004395,
   'trust': 0.9790508081077477},
  'supervised': {'time': 0.9139008522033691, 'trust': 0.9791517042272303},
  'xform': {'time': 0.9139800071716309, 'trust': 0.9787407767702361}},
 'umapgpumap': {'unsupervised': {'time': 8.223836183547974,
   'trust': 0.9789080655198471},
  'supervised': {'time': 11.491980791091919, 'trust': 0.9687381260279149},
  'xform': {'time': 12.231558799743652, 'trust': 0.9701084106214162}}}

Cifar-100 default: {'umaplearn': {'unsupervised': {'time': 114.65315198898315,
   'trust': 0.846886604198276},
  'supervised': {'time': 107.9702365398407, 'trust': 0.843361072892757},
  'xform': {'time': 106.28120398521423, 'trust': 0.8480973411474398}},
 'umapcuml': {'unsupervised': {'time': 2.404963731765747,
   'trust': 0.8350443612829362},
  'supervised': {'time': 2.6520259380340576, 'trust': 0.8384520656992588},
  'xform': {'time': 2.6225061416625977, 'trust': 0.8362343168029226}},
 'umapgpumap': {'unsupervised': {'time': 7.503326892852783,
   'trust': 0.8334274633424295},
  'supervised': {'time': 10.095724105834961, 'trust': 0.8307568528676733},
  'xform': {'time': 10.978502750396729, 'trust': 0.8307642823082182}}}

Cifar-100 (n_neighbors:5): {'umaplearn': {'unsupervised': {'time': 67.82002830505371,
   'trust': 0.8581266525536738},
  'supervised': {'time': 76.06079483032227, 'trust': 0.8611072494110326},
  'xform': {'time': 82.10008549690247, 'trust': 0.8609734462928391}},
 'umapcuml': {'unsupervised': {'time': 2.640801429748535,
   'trust': 0.8384547320198249},
  'supervised': {'time': 2.2339065074920654, 'trust': 0.8409368907521003},
  'xform': {'time': 2.2544515132904053, 'trust': 0.8369009832977731}},
 'umapgpumap': {'unsupervised': {'time': 9.76174783706665,
   'trust': 0.8398398837067165}}}

Cifar-100 (n_neighbors:50): {'umaplearn': {'unsupervised': {'time': 247.36368441581726,
   'trust': 0.8406418578934047},
  'supervised': {'time': 216.87787103652954, 'trust': 0.8390893277040276},
  'xform': {'time': 218.298357963562, 'trust': 0.838630387632215}},
 'umapcuml': {'unsupervised': {'time': 3.350353240966797,
   'trust': 0.832504358512239},
  'supervised': {'time': 3.9427123069763184, 'trust': 0.8328878041702475},
  'xform': {'time': 3.9466609954833984, 'trust': 0.8346442010196163}},
 'umapgpumap': {'unsupervised': {'time': 11.36575698852539,
   'trust': 0.8350418430691954}}}


In [None]:
# Pipeline lives in the sklearn.pipeline submodule; it is not exported from the top-level sklearn package.
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.pipeline import Pipeline, FeatureUnion

In [None]:
from cuml import PCA, UMAP

In [None]:
# Two-step estimator chain: 'pp' (PCA) followed by 'viz' (UMAP), both with default settings.
pipeline = Pipeline(
    steps=[
        ('pp', PCA()),
        ('viz', UMAP()),
    ]
)

In [None]:
from cuml.datasets import make_blobs

In [None]:
# Generate data with make_blobs defaults; only the first of the two returned values is kept.
x,_ = make_blobs()

In [None]:
# Fit the PCA -> UMAP pipeline on the generated data.
pipeline.fit(x)

In [43]:
# Load previously stored scale results. The context manager guarantees the
# file handle is closed (the bare open(...) call leaked it).
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open("results/scale_results_with_knn.pickle", "rb") as results_file:
    scale_results = pickle.load(results_file)

In [44]:
# Display the scale_results dict as the cell output.
scale_results

{'umapcuml': {1024.0: [{'unsupervised': {'time': 0.25798463821411133,
     'trust': None},
    'xform': {'time': 0.30170559883117676, 'trust': None}}],
  334243.55555555556: [{'unsupervised': {'time': 7.277303218841553,
     'trust': None},
    'xform': {'time': 13.905571937561035, 'trust': None}}],
  667463.1111111111: [{'unsupervised': {'time': 28.77500081062317,
     'trust': None},
    'xform': {'time': 56.95573329925537, 'trust': None}}],
  1000682.6666666667: [{'unsupervised': {'time': 64.54534125328064,
     'trust': None},
    'xform': {'time': 127.1585602760315, 'trust': None}}],
  1333902.2222222222: [{'unsupervised': {'time': 114.33410835266113,
     'trust': None},
    'xform': {'time': 225.36999034881592, 'trust': None}}],
  1667121.7777777778: [{'unsupervised': {'time': 176.93605589866638,
     'trust': None},
    'xform': {'time': 351.2978549003601, 'trust': None}}],
  2000341.3333333335: [{'unsupervised': {'time': 255.18308854103088,
     'trust': None},
    'xform': {'