# t-SNE Explorations

In [None]:
%matplotlib notebook

import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from   sklearn.decomposition import PCA
from   sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
from   sklearn.preprocessing import StandardScaler

# Print floats in fixed-point notation, four digits after the decimal point.
np.set_printoptions(suppress=True, precision=4)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Use whichever spelling this install
# provides, falling back to the default style if neither is present.
plt_style = next(
    (name for name in ('seaborn-talk', 'seaborn-v0_8-talk')
     if name in plt.style.available),
    'default',
)

# Experimental: HDBSCAN is a clustering algorithm shipped in an optional
# package. Record whether it could be imported so later cells can skip it.
try:
    import hdbscan
except ImportError:
    hdbscan_available = False
else:
    hdbscan_available = True

## Plotting function
The plotting helper `do_plot` is defined further down, in the data-preparation cell.

## Generate the data
To create a simple test case, we generate some Gaussian point clouds in $\mathbb{R}^3$. Then we see how they look embedded in $\mathbb{R}^2$ and $\mathbb{R}^3$ using various algorithms, including t-SNE.

There is one point cloud centered at the origin and one on each of the three coordinate axes. They are well separated, as can be seen in the first visualization below.

In [2]:
# Synthetic data: one unit-variance Gaussian blob around each center in R^3.
num_dimensions = 3
num_clusters = 4
num_points = 100        # points per cluster
cluster_separation = 6  # distance of the off-origin centers from the origin
random_seed = 0         # fixed so Restart & Run All regenerates the same points

# One center at the origin and one on each coordinate axis.
centers = np.array([(0,0,0), (1,0,0), (0,1,0), (0,0,1)], dtype=float) * cluster_separation

# A private generator keeps the draw reproducible without touching the
# global NumPy random state.
rng = np.random.RandomState(random_seed)

# Rows are grouped by cluster: rows c*num_points .. (c+1)*num_points - 1
# belong to cluster c, so indexing `centers` by label shifts each row.
labels = np.repeat(np.arange(num_clusters), num_points)
data = rng.randn(num_clusters * num_points, num_dimensions) + centers[labels]

data_df = pd.DataFrame(data, columns=('x','y','z'))
data_df['label'] = labels
data_df.head()

Unnamed: 0,x,y,z,label
0,0.020857,0.048423,-0.572323,0
1,-0.851294,0.473435,1.289394,0
2,-1.917264,-0.244721,1.202384,0
3,-0.737913,-0.232346,1.386065,0
4,0.772299,-0.930009,0.878075,0


## Prep the data for fitting and visualizing

In [3]:
# Labels live in the 'label' column; every column before it is a feature.
y = data_df['label'].values
X = data_df.iloc[:, :-1]
# Standardized copy of the features (fit the scaler on X, then apply it to X).
X_std = StandardScaler().fit(X).transform(X)

def do_plot(X_fit, title=None, labels=None):
    """Scatter-plot a 2-D or 3-D point set, one color per distinct label.

    Parameters
    ----------
    X_fit : array-like of shape (n_samples, 2) or (n_samples, 3)
        Coordinates to plot; anything ``np.asarray`` accepts.
    title : str, optional
        Title placed on the axes. No title is set when omitted.
    labels : array-like of shape (n_samples,), optional
        Label of each row of ``X_fit``. When omitted, the notebook-level
        ``y`` is used; it is looked up at call time so that re-running the
        data cells is reflected here.

    Raises
    ------
    ValueError
        If ``X_fit`` is not a 2-D array with 2 or 3 columns, or if
        ``labels`` does not hold exactly one entry per row.
    """
    points = np.asarray(X_fit)
    if labels is None:
        labels = y
    # An ndarray is required so that `labels == lab` yields a boolean mask.
    labels = np.asarray(labels)

    # Validate everything up front so bad input never leaves an empty
    # figure behind.
    if points.ndim != 2:
        raise ValueError('X_fit must be a 2-D array, got %d-D' % points.ndim)
    dimension = points.shape[1]
    if dimension not in (2, 3):
        raise ValueError('Unknown dimension: %d' % dimension)
    if labels.shape != (points.shape[0],):
        raise ValueError('labels must have one entry per row of X_fit '
                         '(%d rows, labels shape %s)'
                         % (points.shape[0], labels.shape))

    label_types = np.unique(labels)  # sorted distinct labels
    colors = cm.Accent(np.linspace(0, 1, len(label_types)))
    with plt.style.context(plt_style):
        fig = plt.figure()
        if dimension == 3:
            ax = fig.add_subplot(111, projection='3d')
        else:
            ax = fig.add_subplot(111)
        for lab, col in zip(label_types, colors):
            # Unpack the coordinate columns so the same call serves both
            # 2-D and 3-D axes.
            ax.scatter(*points[labels == lab].T, c=[col])
        if title is not None:
            ax.set_title(title)
        plt.show()

## 3D view

In [4]:
# Baseline view: the raw 3-D feature matrix, before any embedding.
do_plot(X.to_numpy(), title='Original Data')

<IPython.core.display.Javascript object>

In [5]:
# Sanity check: dropping the last feature column leaves two columns per row.
X.iloc[:, :-1].shape

(400, 2)

## Dumb 2D projection
The easiest way to embed our data into two dimensions is to project onto the plane $x=0$.

In [6]:
# Dropping the x coordinate projects every point onto the plane x = 0.
do_plot(X[['y', 'z']].to_numpy(), title='Plane $x=0$')

<IPython.core.display.Javascript object>

## PCA (2D)

In [7]:
# Project the features onto their first two principal components.
pca2 = PCA(n_components=2)
do_plot(pca2.fit_transform(X), title='PCA')

<IPython.core.display.Javascript object>

## Isomap

In [8]:
# Two-component Isomap embedding with the estimator's default settings.
isomap2 = Isomap(n_components=2)
do_plot(isomap2.fit_transform(X), title='Isomap')

<IPython.core.display.Javascript object>

## Locally Linear Embedding

In [9]:
# Seed the estimator, as the t-SNE cells do, so the embedding is the same on
# every run (the ARPACK eigen-solver otherwise starts from a random vector).
lle2 = LocallyLinearEmbedding(n_components=2, random_state=0)
do_plot(lle2.fit_transform(X), 'Locally Linear Embedding')

<IPython.core.display.Javascript object>

## Spectral Embedding

In [10]:
# Seed the estimator, as the t-SNE cells do, so the embedding is the same on
# every run.
spectral2 = SpectralEmbedding(n_components=2, random_state=0)
do_plot(spectral2.fit_transform(X), 'Spectral Embedding')

<IPython.core.display.Javascript object>

## Multi-dimensional scaling (MDS)

In [11]:
# MDS starts from a random configuration; seed it, as the t-SNE cells do, so
# the layout is the same on every run.
mds2 = MDS(n_components=2, random_state=0)
do_plot(mds2.fit_transform(X), 'Multi-dimensional Scaling')

<IPython.core.display.Javascript object>

## t-SNE (2D)

In [12]:
# Two-component t-SNE with a fixed seed.
tsne2 = TSNE(random_state=0, n_components=2)
tsne2_embedding = tsne2.fit_transform(X)
do_plot(tsne2_embedding, title='t-SNE')

<IPython.core.display.Javascript object>

## PCA (3D)

In [13]:
# Keep all three principal components: the data re-expressed in PCA axes.
pca3 = PCA(n_components=3)
do_plot(pca3.fit_transform(X), title='PCA')

<IPython.core.display.Javascript object>

## t-SNE (3D)

In [14]:
# Three-component t-SNE with a fixed seed and an explicit learning rate.
tsne3 = TSNE(random_state=0, learning_rate=100, n_components=3)
tsne3_embedding = tsne3.fit_transform(X)
do_plot(tsne3_embedding, title='t-SNE')

<IPython.core.display.Javascript object>

## HDBSCAN
HDBSCAN is a fairly recent and well-regarded clustering algorithm. It is included here to see how well it does on this fairly simple data, with its cluster assignments visualized on the 3D t-SNE embedding.

In [15]:
if hdbscan_available:
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
    # HDBSCAN labels noise points -1 and clusters 0..k-1. Adding one makes
    # every label non-negative: 0 then means noise and clusters run 1..k, so
    # the largest label equals the number of clusters found.
    cluster_labels = clusterer.fit_predict(X_std) + 1
    print('Number of clusters: {}'.format(cluster_labels.max()))
    # Color the 3-D t-SNE embedding by the HDBSCAN assignment instead of the
    # true labels.
    do_plot(tsne3.fit_transform(X), 'HDBScan', labels=cluster_labels)
else:
    # Say so explicitly instead of leaving the cell with no output.
    print('hdbscan is not installed; skipping the HDBSCAN demo.')

Number of clusters: 4


<IPython.core.display.Javascript object>