# t-SNE Explorations

In [192]:
%matplotlib notebook

import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import sqlite3
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from   sklearn.decomposition import PCA
from   sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
from   sklearn.preprocessing import StandardScaler

np.set_printoptions(suppress=True)
np.set_printoptions(precision=4)
plt_style = 'seaborn-talk'

# Experimental: HDBScan is a state-of-the-art clustering algorithm
hdbscan_available = True
try:
    import hdbscan
except ImportError:
    hdbscan_available = False

In [193]:
conn = sqlite3.connect('../../livs.db')  # Create db and establish connection
conn.row_factory = sqlite3.Row
curs = conn.cursor()
result = []
rows = curs.execute('select * from livs where Huvudgrupp = "Grönsaker" OR Huvudgrupp = "Sås dressing majonnäs" OR Huvudgrupp = "Gröt"')
for row in rows:
        result.append(row)

db_contents = np.array(result)

conn.close()

In [194]:
data = db_contents[0:2000,4:7]
print(data.shape)
labels = db_contents[0:2000,60].reshape(len(labels),1)
print(labels.shape)
data=np.hstack((data,labels))

(189, 3)
(189, 1)


## Plotting function

## Generate the data
To create a simple test case, we generate some Gaussian point clouds in $\mathbb{R}^3$. Then we see how they look embedded in $\mathbb{R}^2$ and $\mathbb{R}^3$ using various algorithms, including t-SNE.

There's one point cloud centered at the origin and three on each of the three axes. They are well separated, as can be seen in the first visualization below.

In [195]:
data_df = pd.DataFrame(data, columns=('x','y','z'))
data_df['label'] = labels
data_df.head()

ValueError: Shape of passed values is (4, 189), indices imply (3, 189)

In [196]:
data.shape

(189, 4)

In [197]:
labels

array([['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Sås dressing majonnäs'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['Grönsaker'],
       ['G

## Prep the data for fitting and visualizing

In [198]:
X = data_df.iloc[:,0:-1]
X_std = StandardScaler().fit_transform(X)
y = data_df['label'].values

In [199]:
X.values.shape

(189, 3)

In [200]:
X_fit = X
labels = y
title=None
dimension = X_fit.shape[1]
dimension

3

In [201]:
labels

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2], dtype=object)

In [202]:
label_types = np.array(sorted(list(set(data.T[3]))))
print(label_types)

['Grönsaker' 'Gröt' 'Sås dressing majonnäs']


In [203]:
dummy = np.empty(len(data)).reshape(len(data),1)
data=np.hstack((data,dummy))
#Gör om text-labels till löpnummer för kategorin
for index, label in enumerate(data.T[3]):
    ind = np.where(label==label_types)
    data[index][4] = ind[0][0]

In [204]:
data.T[4]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2], dtype=object)

In [205]:
label_types = np.array(sorted(list(set(data.T[4]))))
num_labels = len(label_types)
colors = cm.Accent(np.linspace(0, 1, num_labels))
colors

array([[ 0.498 ,  0.7882,  0.498 ,  1.    ],
       [ 0.2196,  0.4235,  0.6902,  1.    ],
       [ 0.4   ,  0.4   ,  0.4   ,  1.    ]])

In [None]:
with plt.style.context(plt_style):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for lab, col in zip(label_types, colors):
        ax.scatter(X_fit[labels==lab, 0],
                   X_fit[labels==lab, 1],
                   X_fit[labels==lab, 2],
                   c=col)
    plt.title('title')
    plt.show()

## 3D view

In [None]:
do_plot(X.values, 'Original Data',labels)

In [None]:
X.values[:,0:-1].shape

## Dumb 2D projection
The easiest way to embed our data into two dimensions is to project onto the plane $x=0$.

In [None]:

X.values[:,1:]

In [None]:
do_plot(X.values[:,1:], 'Plane $x=0$')

## PCA (2D)

In [None]:
do_plot(PCA(n_components=2).fit_transform(X), 'PCA')

## Isomap

In [None]:
do_plot(Isomap(n_components=2).fit_transform(X), 'Isomap')

## Locally Linear Embedding

In [None]:
do_plot(LocallyLinearEmbedding(n_components=2).fit_transform(X), 'Locally Linear Embedding')

## Spectral Embedding

In [None]:
do_plot(SpectralEmbedding(n_components=2).fit_transform(X), 'Spectral Embedding')

## Multi-dimensional scaling (MDS)

In [None]:
do_plot(MDS(n_components=2).fit_transform(X), 'Multi-dimensional Scaling')

## t-SNE (2D)

In [None]:
tsne2 = TSNE(n_components=2, random_state=0)
do_plot(tsne2.fit_transform(X), 't-SNE')

## PCA (3D)

In [None]:
do_plot(PCA(n_components=3).fit_transform(X), 'PCA')

## t-SNE (3D)

In [None]:
tsne3 = TSNE(n_components=3, learning_rate=100, random_state=0)
do_plot(tsne3.fit_transform(X), 't-SNE')

## HDBScan
HDBScan is a fairly recent and well-regarded clustering algorithm. The reason it's here is to see how well it does on some fairly simple data and visualize its results via t-SNE.

In [None]:
if hdbscan_available:
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
    cluster_labels = clusterer.fit_predict(X_std) + 1
    print('Number of clusters: {}'.format(max(cluster_labels)))
    do_plot(tsne3.fit_transform(X), 'HDBScan', labels=cluster_labels)

Slask

In [None]:
num_dimensions = 3 #Gjort endast för 3D...
num_clusters = 4
num_points = 100
cluster_separation = 6
centers = np.array([(0,0,0), (1,0,0), (0,1,0), (0,0,1)], dtype=float) * cluster_separation
data = np.zeros((num_clusters * num_points, num_dimensions), dtype=float)
labels = np.zeros(num_clusters * num_points, dtype=int)
for c in range(num_clusters):
    start = c * num_points
    end = start + num_points
    data[start:end, :] = np.random.randn(num_points, num_dimensions) + centers[c]
    labels[start:end] = c

In [None]:
def do_plot(X_fit, labels, title=None):
    dimension = X_fit.shape[1]
    label_types = sorted(list(set(labels)))
    num_labels = len(label_types)
    colors = cm.Accent(np.linspace(0, 1, num_labels))
    with plt.style.context(plt_style):
        fig = plt.figure()
        if dimension == 2:
            ax = fig.add_subplot(111)
            for lab, col in zip(label_types, colors):
                ax.scatter(X_fit[labels==lab, 0],
                           X_fit[labels==lab, 1],
                           c=col)
        elif dimension == 3:
            ax = fig.add_subplot(111, projection='3d')
            for lab, col in zip(label_types, colors):
                ax.scatter(X_fit[labels==lab, 0],
                           X_fit[labels==lab, 1],
                           X_fit[labels==lab, 2],
                           c=col)
        else:
            raise Exception('Unknown dimension: %d' % dimension)
        plt.title(title)
        plt.show()

In [None]:
X.values.T[0]

In [None]:
for label in label_types:
    print(data[np.where(data.T[4]==label)].T[0])
    

In [None]:

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

n = 189

# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
for label_type in label_types:
    print(label_type)
    xs = data[np.where(data.T[4]==label_type)].T[0]
    ys = data[np.where(data.T[4]==label_type)].T[1]
    zs = data[np.where(data.T[4]==label_type)].T[2]
    ax.scatter(xs, ys, zs, c=colors[label_type], marker=m)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.title('title')
plt.show()

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)

n = 189

# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
for label_type in label_types:
    print(label_type)
    xs = data[np.where(data.T[4]==label_type)].T[0]
    ys = data[np.where(data.T[4]==label_type)].T[1]
    ax.scatter(xs, ys, c=colors[label_type], marker=m)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
plt.title('title')
ax.annotate('hej', (2,30))
plt.show()

#for i, _ in enumerate(data):
#ax.annotate('hej', (2,3,4))
