This converts fingerprints.npy to .tsv formatted t-SNE embeddings and plots of those embeddings in the tsne/ and plot/ folders respectively. If you add multiple values to perplexity and initial_dims then all combinations will be computed (in parallel). Good perplexities are in the range 1-200 with the best range around 30-100. Good initial_dims are in the range 30 and higher, with the dimensionality of your input data being the highest possible value (e.g., a 32x32 fingerprint would have a highest possible initial_dims value of 32x32=1024).

Change the "mode" to try different t-SNE variations.

"fingerprints" will only use fingerprints.npy
"predicted_labels" will only use predicted_labels.npy
"predicted_encoding" will only use predicted_encoding.npy
"combined" will use all of the above data

In [3]:
import os
from matplotlib import pyplot as plt
from time import time
from utils import normalize
from utils.bhtsne import bh_tsne
from os.path import join
from pathos.multiprocessing import ProcessingPool as Pool
import numpy as np
import itertools

In [4]:
def mkdir_p(path):
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        if error.errno != errno.EEXIST:
            raise

data_root = 'data/drums/'
initial_dims = [30]
perplexities = [30]
mode = 'fingerprints'
# mode = 'predicted_labels'
# mode = 'predicted_encoding'
# mode = 'combined'

In [5]:
def save_tsv(data, fn):
    np.savetxt(fn, data, fmt='%.5f', delimiter='\t')

def tsne(data, data_root, prefix, initial_dims=30, perplexity=30):
    mkdir_p(join(data_root, 'tsne'))
    mkdir_p(join(data_root, 'plot'))
    
    figsize = (32, 32)
    pointsize = 2

    X_2d = list(bh_tsne(data, initial_dims=initial_dims, perplexity=perplexity, no_dims=2))
    X_2d = normalize(np.array(X_2d))
    save_tsv(X_2d, join(data_root, 'tsne', f'{prefix}.{initial_dims}.{perplexity}.2d.tsv'))
    
    plt.figure(figsize=figsize)
    plt.scatter(X_2d[:, 0], X_2d[:, 1], edgecolor='none', s=pointsize)
    plt.tight_layout()
    plt.savefig(join(data_root, 'plot', f'{prefix}.{initial_dims}.{perplexity}.png'))
    plt.close()
    
    X_3d = list(bh_tsne(data, initial_dims=initial_dims, perplexity=perplexity, no_dims=3))
    X_3d = normalize(np.array(X_3d))
    save_tsv(X_3d, join(data_root, 'tsne', f'{prefix}.{initial_dims}.{perplexity}.3d.tsv'))
    
    plt.figure(figsize=figsize)
    plt.scatter(X_2d[:, 0], X_2d[:, 1], edgecolor='none', s=pointsize, c=X_3d)
    plt.tight_layout()
    plt.savefig(join(data_root, 'plot', f'{prefix}.{initial_dims}.{perplexity}.jpg'))
    plt.close()

if mode == 'fingerprints' or mode == 'combined':
    fingerprints = np.load(join(data_root, 'fingerprints.npy'))
    fingerprints = fingerprints.reshape(len(fingerprints), -1)
if mode == 'predicted_labels' or mode == 'combined':
    predicted_labels = np.load(join(data_root, 'predicted_labels.npy'))
    predicted_labels -= predicted_labels.min()
    predicted_labels /= predicted_labels.max()
if mode == 'predicted_encoding' or mode == 'combined':
    predicted_encoding = np.load(join(data_root, 'predicted_encoding.npy'))
    std = predicted_encoding.std(axis=0)
    predicted_encoding = predicted_encoding[:, std > 0] / std[std > 0]

if mode == 'fingerprints':
    data = fingerprints
elif mode == 'predicted_labels':
    data = predicted_labels
elif mode == 'predicted_encoding':
    data = predicted_encoding
elif mode == 'combined':
    data = np.hstack((fingerprints, predicted_labels, predicted_encoding))

print(data.shape)
data = data.astype(np.float64)

def job(params):
    start = time()
    tsne(data, data_root, mode, initial_dims=params[0], perplexity=10)
    print(f'initial_dims={params[0]}, perplexity={params[1]}, {time() - start} seconds')

(459, 1024)


In [6]:
params = list(itertools.product(initial_dims, perplexities))
with Pool() as pool:
    pool.map(job, params)

  data_file.write(pack(f'{len(sample)}d', *sample))


Read the 459 x 30 data matrix successfully!
Using current time as random seed...
Using no_dims = 2, perplexity = 10.000000, and theta = 0.500000
Computing input similarities...
Building tree...
 - point 0 of 459
Input similarities computed in 0.01 seconds (sparsity = 0.095196)!
Learning embedding...
Iteration 50: error is 66.239345 (50 iterations in 0.07 seconds)
Iteration 100: error is 63.745635 (50 iterations in 0.07 seconds)
Iteration 150: error is 63.434117 (50 iterations in 0.06 seconds)
Iteration 200: error is 63.348118 (50 iterations in 0.06 seconds)
Iteration 250: error is 2.792637 (50 iterations in 0.06 seconds)
Iteration 300: error is 0.820472 (50 iterations in 0.07 seconds)
Iteration 350: error is 0.713146 (50 iterations in 0.07 seconds)
Iteration 400: error is 0.689981 (50 iterations in 0.07 seconds)
Iteration 450: error is 0.673516 (50 iterations in 0.07 seconds)
Iteration 500: error is 0.666655 (50 iterations in 0.07 seconds)
Iteration 550: error is 0.660601 (50 iteration