# 785 cuML t-SNE

In [1]:
from cuml import TSNE
from collections import defaultdict
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
# Root directory for every .npy file this notebook reads or writes.
# Paths are built by plain string concatenation (data_dir + "name.npy"),
# so the trailing slash is required.
data_dir = "/home/gh/autosubs/data/"

In [2]:
def run_tsne(data, ncomponents=2, perplexity=25.0, lr=200., niter=5000, verbose=1, n_iter_without_progress=250):
    """Fit a cuML ``TSNE`` model on ``data`` and return the embedding.

    Only the arguments below are tunable per call; early exaggeration,
    gradient tolerance, metric, initialisation, seed, method and angle are
    pinned inside the function so every call shares the same setup.

    Parameters
    ----------
    data
        Input passed unchanged to ``TSNE.fit_transform``.
    ncomponents
        Forwarded as ``n_components``.
    perplexity
        Forwarded as ``perplexity``.
    lr
        Forwarded as ``learning_rate``.
    niter
        Forwarded as ``n_iter``.
    verbose
        Forwarded as ``verbose``.
    n_iter_without_progress
        Forwarded as ``n_iter_without_progress``.

    Returns
    -------
    The value returned by ``TSNE.fit_transform(data)``.
    """
    # Settings the caller may vary between experiments.
    tunable_settings = dict(
        n_components=ncomponents,
        perplexity=perplexity,
        learning_rate=lr,
        n_iter=niter,
        n_iter_without_progress=n_iter_without_progress,
        verbose=verbose,
    )
    # Settings held constant for every run in this notebook.
    fixed_settings = dict(
        early_exaggeration=12.0,
        min_grad_norm=1e-07,
        metric='euclidean',
        init='random',
        random_state=0,
        method='barnes_hut',
        angle=0.5,
    )

    model = TSNE(**tunable_settings, **fixed_settings)
    return model.fit_transform(data)

In [3]:
# Transcript CSVs, only referenced by the commented-out text t-SNE cells below.
# knnw_processed_comma = pd.read_csv(data_dir + "processed_comma.csv")
# knnw_processed = pd.read_csv(data_dir + "processed.csv")
# knnw_en_sub = pd.read_csv(data_dir + "knnw_en_sub.csv", sep=";")
# Audio features and WSJ data, all read from data_dir.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so these files must come from a trusted source.
knnw_spectrogram = np.load(data_dir + "knnw_en.log_spectrogram.npy", allow_pickle=True)
knnw_wav2vec = np.load(data_dir + "wav2vec.npy", allow_pickle=True)
wsj_spectro = np.load(data_dir + "wsj/train.npy", allow_pickle=True)
wsj_text = np.load(data_dir + "wsj/train_transcripts.npy", allow_pickle=True)

In [4]:
# Sanity-check the loaded KNNW data: the full spectrogram, the wav2vec
# container, and the first wav2vec entry, one shape per line.
for loaded in (knnw_spectrogram, knnw_wav2vec, knnw_wav2vec[0]):
    print(loaded.shape)

(129, 1370493)
(1285,)
(207, 512)


# t-SNE on Corpora

## WSJ

In [5]:
# wsj_sentences = []

# for sentence in wsj_text:
#     z = " ".join([word.decode("utf-8") for word in sentence])
#     wsj_sentences.append(z)

# wsj_sentences = np.array(wsj_sentences)
# count_vec = CountVectorizer()
# wsj_docmat = count_vec.fit_transform(wsj_sentences)
# svd_components = 100
# tsvd = TruncatedSVD(n_components=svd_components, n_iter=5, random_state=0, tol=0.0)
# wsj_reduced = tsvd.fit_transform(wsj_docmat)
# print(f"ncomp {svd_components} explained variance {tsvd.explained_variance_ratio_.sum()}")

In [6]:
# wsj_tsne = run_tsne(wsj_reduced, perplexity=5.0, lr=100., niter=5000, verbose=5)
# np.save(data_dir + "wsj_transcript_tsne.npy", wsj_tsne, allow_pickle=True)

## KNNW

In [7]:
# count_vec = CountVectorizer()
# knnw_docmat = count_vec.fit_transform(knnw_processed_comma['Text'].values)
# svd_components = 25
# tsvd = TruncatedSVD(n_components=svd_components, algorithm='randomized', n_iter=5, random_state=0, tol=0.0)
# tsvd.fit(knnw_docmat)
# print(f"ncomp {svd_components} explained variance {tsvd.explained_variance_ratio_.sum()}")
# knnw_reduced = tsvd.fit_transform(knnw_docmat)

In [8]:
# knnw_tsne = run_tsne(knnw_reduced, perplexity=25.0, lr=200., niter=5000, verbose=5)
# np.save(data_dir + "knnw_transcript_tsne.npy", knnw_tsne, allow_pickle=True)

# t-SNE on Spectrograms

## WSJ

In [9]:
# Stack the frames of every WSJ utterance into one (total_frames, n_features)
# matrix. np.concatenate joins the per-utterance arrays in a single pass
# instead of appending millions of frames one by one in a Python loop.
# Assumes each utterance is a 2-D (frames, features) array with a shared
# feature width -- TODO confirm if the data source changes.
flat_wsj_spectro = np.concatenate(list(wsj_spectro), axis=0)

In [10]:
# Quick visual check of the flattened frames (NumPy abbreviates the repr).
flat_wsj_spectro

array([[1.5781574e-04, 4.6676921e-04, 6.0391880e-04, ..., 8.4340958e-05,
        7.8386125e-05, 5.9076814e-05],
       [3.9359243e-06, 8.8821416e-06, 6.0467682e-05, ..., 1.1556587e-05,
        1.5931026e-05, 1.9300111e-05],
       [4.3008840e-06, 4.1246236e-05, 1.8424325e-04, ..., 1.7501028e-05,
        1.5845562e-05, 7.0566102e-06],
       ...,
       [1.3842387e-04, 2.4957865e-04, 0.0000000e+00, ..., 3.2638627e-06,
        2.6198034e-06, 1.7233031e-06],
       [2.9245995e-03, 1.4689696e-03, 0.0000000e+00, ..., 2.6954149e-06,
        3.8057726e-06, 2.8651132e-06],
       [1.4540179e-03, 2.6830547e-05, 0.0000000e+00, ..., 1.0375397e-06,
        9.4315465e-07, 1.6158091e-06]], dtype=float32)

In [11]:
# svd_components = 5
# tsvd = TruncatedSVD(n_components=svd_components, algorithm='randomized', n_iter=5, random_state=0, tol=0.0)
# wsj_spectral_reduced = tsvd.fit_transform(flat_wsj_spectro)
# print(f"ncomp {svd_components} explained variance {tsvd.explained_variance_ratio_.sum()}")

In [12]:
# Draw a reproducible random subset of WSJ frames to keep t-SNE tractable.
np.random.seed(23)
limit = 50000
n_wsj_frames = flat_wsj_spectro.shape[0]
# Sample WITHOUT replacement: np.random.choice defaults to replace=True, which
# would put duplicate frames into the t-SNE input. Cap the sample at the number
# of available rows, and pass the row count directly rather than building a
# Python list of every index.
idxs = np.random.choice(n_wsj_frames, size=min(limit, n_wsj_frames), replace=False)
# Alternative kept for reference (uses the SVD-reduced frames, currently disabled):
# wsj_spectral_reduced = wsj_spectral_reduced[:limit]
wsj_spectral_reduced = flat_wsj_spectro[idxs]

In [13]:
# Embed the sampled WSJ frames in 2-D, then persist the embedding in data_dir.
wsj_spectral_tsne = run_tsne(
    wsj_spectral_reduced,
    ncomponents=2,
    perplexity=50.0,
    lr=50.,
    niter=1000,
    verbose=6,
)
np.save(
    data_dir + "wsj_spectral_tsne.npy",
    wsj_spectral_tsne,
    allow_pickle=True,
)

[D] [23:36:33.571295] ../src/tsne/tsne_runner.cuh:88 Data size = (50000, 40) with dim = 2 perplexity = 50.000000
[W] [23:36:33.571547] # of Nearest Neighbors should be at least 3 * perplexity. Your results might be a bit strange...
[D] [23:36:33.571559] ../src/tsne/tsne_runner.cuh:129 Getting distances.
[D] [23:36:34.272106] ../src/tsne/tsne_runner.cuh:155 Now normalizing distances so exp(D) doesn't explode.
[D] [23:36:34.273349] ../src/tsne/tsne_runner.cuh:163 Searching for optimal perplexity via bisection search.
[D] [23:36:34.324744] ../src/tsne/barnes_hut.cuh:75 N_nodes = 99999 blocks = 82
[D] [23:36:34.325441] ../src/tsne/barnes_hut.cuh:169 Start gradient updates!
[D] [23:36:35.503511] ../src/tsne/barnes_hut.cuh:279 SymmetrizeTime = 2 (0)
DistancesTime = 701 (41)
NormalizeTime = 1 (0)
PerplexityTime = 49 (3)
BoundingBoxKernel_time = 50 (3)
ClearKernel1_time  = 310 (18)
TreeBuildingKernel_time  = 51 (3)
ClearKernel2_time  = 525 (30)
SummarizationKernel_time  = 6 (0)
SortKernel_time

## KNNW

In [17]:
# The spectrogram is loaded as (129, 1370493); the SVD below needs the long
# axis on the rows (one row per sample). Only transpose while the array is
# still in the wide layout, so re-running this cell does not flip it back.
if knnw_spectrogram.shape[0] < knnw_spectrogram.shape[1]:
    knnw_spectrogram = knnw_spectrogram.T

In [20]:
# Compress the KNNW spectrogram rows with truncated SVD before t-SNE and
# report how much variance the kept components explain.
svd_components = 32
tsvd = TruncatedSVD(
    n_components=svd_components,
    algorithm='randomized',
    n_iter=5,
    random_state=0,
    tol=0.0,
)
knnw_spectro_svd = tsvd.fit_transform(knnw_spectrogram)
print(f"ncomp {svd_components} explained variance {tsvd.explained_variance_ratio_.sum()}")

ncomp 32 explained variance 0.9327141046524048


In [50]:
# Draw a reproducible random subset of the SVD-reduced KNNW rows for t-SNE.
np.random.seed(27)
limit = 50000
# Size the index range from the array that is actually indexed below, so the
# two can never disagree.
n_knnw_frames = knnw_spectro_svd.shape[0]
# Sample WITHOUT replacement (np.random.choice defaults to replace=True, which
# would feed duplicate rows to t-SNE) and never request more rows than exist.
idxs = np.random.choice(n_knnw_frames, size=min(limit, n_knnw_frames), replace=False)
knnw_spectral_reduced = knnw_spectro_svd[idxs]

In [54]:
# Embed the sampled KNNW spectrogram rows in 2-D, then persist the embedding.
# n_iter_without_progress is left at run_tsne's default (250).
knnw_spectral_tsne = run_tsne(
    knnw_spectral_reduced,
    ncomponents=2,
    perplexity=50.0,
    lr=50.,
    niter=10000,
    verbose=6,
)
np.save(
    data_dir + "knnw_spectral_tsne.npy",
    knnw_spectral_tsne,
    allow_pickle=True,
)

[D] [23:43:28.839085] cuml/common/logger.cpp:3080 Learning rate is adaptive. In TSNE paper, it has been shown that as n->inf, Barnes Hut works well if n_neighbors->30, learning_rate->20000, early_exaggeration->24.
[D] [23:43:28.839238] cuml/common/logger.cpp:3080 cuML uses an adpative method.n_neighbors decreases to 30 as n->inf. Likewise for the other params.
[D] [23:43:28.839327] cuml/common/logger.cpp:3080 New n_neighbors = 42, learning_rate = 16666.666666666664, exaggeration = 24.0
[D] [23:43:28.839360] ../src/tsne/tsne_runner.cuh:88 Data size = (50000, 32) with dim = 2 perplexity = 50.000000
[W] [23:43:28.839373] # of Nearest Neighbors should be at least 3 * perplexity. Your results might be a bit strange...
[D] [23:43:28.839386] ../src/tsne/tsne_runner.cuh:129 Getting distances.
[D] [23:43:28.994620] ../src/tsne/tsne_runner.cuh:155 Now normalizing distances so exp(D) doesn't explode.
[D] [23:43:28.996724] ../src/tsne/tsne_runner.cuh:163 Searching for optimal perplexity via bisect

## KNNW - WAV2VEC

In [13]:
# Stack the wav2vec frames of every KNNW utterance into one
# (total_frames, n_features) matrix. np.concatenate joins the per-utterance
# arrays in a single pass instead of appending frame by frame in Python.
# Assumes each entry is a 2-D (frames, features) array with a shared feature
# width (the first entry is (207, 512)) -- TODO confirm for all entries.
flat_knnw_wav2vec = np.concatenate(list(knnw_wav2vec), axis=0)

In [None]:
# Compress the flattened wav2vec frames with truncated SVD before t-SNE and
# report how much variance the kept components explain.
svd_components = 64
tsvd = TruncatedSVD(
    n_components=svd_components,
    algorithm='randomized',
    n_iter=5,
    random_state=0,
    tol=0.0,
)
knnw_wav2vec_reduced = tsvd.fit_transform(flat_knnw_wav2vec)
print(f"ncomp {svd_components} explained variance {tsvd.explained_variance_ratio_.sum()}")

In [None]:
# knnw_spectrogram is deliberately left untouched here: transposing it again
# would undo the rows-as-samples layout set up in the KNNW spectrogram section,
# and this cell only concerns the wav2vec features.
np.random.seed(23)
limit = 10000
# Index against the wav2vec matrix itself (not the spectrogram), without
# replacement, and never request more rows than exist.
n_wav2vec_frames = knnw_wav2vec_reduced.shape[0]
idxs = np.random.choice(n_wav2vec_frames, size=min(limit, n_wav2vec_frames), replace=False)

# NOTE(review): idxs is computed but not applied; only the first 100 rows are
# kept, which looks like a smoke-test setting. Switch to
# knnw_wav2vec_reduced[idxs] for a random subsample -- confirm intent.
knnw_wav2vec_reduced = knnw_wav2vec_reduced[:100]

In [None]:
# Embed the selected wav2vec rows in 2-D, then persist the embedding.
knnw_wav2vec_tsne = run_tsne(
    knnw_wav2vec_reduced,
    ncomponents=2,
    perplexity=10.0,
    lr=100.,
    niter=1000,
    verbose=1,
    n_iter_without_progress=250,
)
np.save(
    data_dir + "knnw_wav2vec_tsne.npy",
    knnw_wav2vec_tsne,
    allow_pickle=True,
)