## UMAP MNMG 3D visualization of Google News word2vec embeddings

Before running this notebook, please read https://plotly.com/python/getting-started to install the Plotly library properly.

In [1]:
from cuml.dask.manifold import UMAP as UMAP_MNMG
from cuml.manifold import UMAP

from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask.array as da

import pandas as pd
import numpy as np

In [2]:
# Read the pre-trained GoogleNews word2vec model (binary format) from disk
from gensim.models import KeyedVectors

word2vec_path = 'GoogleNews-vectors-negative300.bin'
wv_from_bin = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Keep only the raw embedding matrix for dimensionality reduction
original_data = wv_from_bin.vectors

In [4]:
def prepare_data(original_data, sampling_ratio, n_parts):
    """Build a local training sample and a row-partitioned Dask array.

    Parameters
    ----------
    original_data : array-like
        Two-dimensional data whose first axis indexes the samples.
    sampling_ratio : float
        Fraction of the samples drawn for the local training set.
    n_parts : int
        Number of row-wise partitions of the distributed array.

    Returns
    -------
    lX
        Rows of ``original_data`` picked at random for local training.
    dX
        Dask array over all of ``original_data`` split into ``n_parts``
        row chunks, each chunk spanning every column.

    Raises
    ------
    ValueError
        If ``n_parts`` is not between 1 and the number of samples.
    """
    # Number of samples
    n_samples = original_data.shape[0]

    if n_parts < 1 or n_parts > n_samples:
        raise ValueError(
            "n_parts must be between 1 and the number of samples "
            "(%d), got %d" % (n_samples, n_parts)
        )

    # Number of samples for local train
    n_sampling = int(n_samples * sampling_ratio)

    # Generate local train data.
    # np.random.choice samples with replacement by default, so the local
    # training set may contain repeated rows.
    selection = np.random.choice(n_samples, n_sampling)
    lX = original_data[selection]

    # Number of samples per partition, plus the rows left over once every
    # partition has received its equal share.
    n_samples_per_part, remainder = divmod(n_samples, n_parts)

    # Generate partitioning of distributed data for inference.
    # The leftover rows go to the last partition so that the chunk sizes
    # add up to exactly n_samples.
    chunks = [n_samples_per_part] * n_parts
    chunks[-1] += remainder
    chunks = tuple(chunks)
    dX = da.from_array(original_data, chunks=(chunks, -1))
    return lX, dX

In [5]:
# Fraction of the samples used to train the local model (5%)
sampling_ratio = 0.05
# Number of partitions of the distributed data; also 8 workers/GPUs
n_parts = 8

# Build the local training sample (lX) and the distributed inference data (dX)
lX, dX = prepare_data(
    original_data,
    sampling_ratio=sampling_ratio,
    n_parts=n_parts,
)

In [None]:
# Train local model
# A non-distributed UMAP is fitted on the sampled subset lX only; it reduces
# the embeddings to 3 components so they can be shown in a 3D plot.
local_model = UMAP(n_components=3)
local_model.fit(lX)

In [7]:
# Start a local CUDA cluster and connect a Dask client to it.
# Both objects are used as context managers so that the workers and the
# client connection are shut down even if the distributed transform raises
# (previously the cluster itself was never closed).
with LocalCUDACluster(n_workers=8, threads_per_worker=1) as cluster:
    with Client(cluster) as client:
        # Pass trained model and compute distributed inference.
        # The model is given by keyword because recent cuML releases declare
        # it as a keyword-only argument.
        model = UMAP_MNMG(model=local_model)
        # .compute() gathers the transformed partitions into one local array
        transformed = model.transform(dX).compute()

In [10]:
# Create dataframe with words and associated embeddings
result = pd.DataFrame({'dim1': transformed[:, 0], 'dim2': transformed[:, 1], 'dim3': transformed[:, 2]})

# Attach the words in the same order as the rows of the embedding matrix.
# The index-ordered word list is `index_to_key` in gensim >= 4 and
# `index2word` in older releases; the `vocab` mapping used before was removed
# in gensim 4 and only worked through implicit iteration over its keys.
words = getattr(wv_from_bin, 'index_to_key', None)
if words is None:
    words = wv_from_bin.index2word
result['word'] = words

In [11]:
# Creating backup
import pickle

# Write through a context manager so the file is flushed and closed as soon
# as the dump finishes (or fails) instead of leaving the handle open.
with open("gnews-embedding.p", "wb") as backup_file:
    pickle.dump(result, backup_file)

In [12]:
# Loading backup
import pickle

# NOTE: unpickling can execute arbitrary code; only load a backup file that
# was produced by this notebook or comes from a trusted source.
# The context manager closes the file handle once the load is done.
with open("gnews-embedding.p", "rb") as backup_file:
    result = pickle.load(backup_file)

In [None]:
import plotly.express as px

# Restrict the plot to the first 500 rows of the result table
first_words = result.head(500)

# 3D scatter plot of the reduced embeddings, each point labelled with its word
fig = px.scatter_3d(
    first_words,
    x='dim1',
    y='dim2',
    z='dim3',
    text='word',
)
fig.show()