# Dimensionality Reduction

* Read about the data sets
  * Newsgroups: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
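  * MNIST: https://en.wikipedia.org/wiki/MNIST_database (note: the digits cell below loads scikit-learn's built-in `load_digits` handwritten-digit set, an MNIST-like data set, not MNIST itself)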
* Experiment with feature extraction on the newsgroup data. For example:
 * Change which newsgroups are included (beware of using too much data: the feature matrix is converted to a dense array)
 * Change the feature extraction: word counts vs tf-idf, ngrams for words and characters
* Dimensionality reduction
 * Try out the three different types (PCA, t-SNE and UMAP)
 * Change the distance metrics for t-SNE and UMAP

In [None]:
import numpy as np

import io
from tqdm import tqdm
import cv2
import ipywidgets

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

## Data

Here are two alternative data sets. Each cell overwrites `X`, `y` and `labels`, so the cell you run last decides which data is used. For the text data, experiment with different feature encodings, n-gram orders, and metrics. If the clusters after dimensionality reduction look better, see it as an indication that the model assumptions fit the data to a higher degree.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Newsgroups to include; add or remove entries to experiment.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

# Feature extraction: tf-idf is active, plain word counts are the alternative.
# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# The vectorizer produces a sparse matrix; turn it into a dense ndarray.
X = vectorizer.fit_transform(newsgroups_train.data).toarray()
y = newsgroups_train.target
labels = newsgroups_train.target_names

In [None]:
from sklearn.datasets import load_digits

# Load the digits data as a (features, targets) pair.
X, y = load_digits(return_X_y=True)
# One text label per class: "0" through "9".
labels = list(map(str, range(10)))

## Linear dimensionality reduction

Principal component analysis (PCA) finds a linear subspace. We can plot the cumulative variance explained for n components as follows.

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with at most 200 components, capped at half the number of features.
pca = PCA(n_components=min(200, X.shape[1]//2))
X_pca = pca.fit_transform(X)

# Entry i of the cumulative sum is the variance explained by the first i + 1
# components, so plot it against 1..n. Without explicit x values the curve
# would start at "0 components" and be shifted by one along the x axis.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)*100
plt.figure(figsize=(6, 4))
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.ylabel("Cumulative explained variance [%]")
plt.xlabel("n components")
plt.show()

Let's plot the original data (its first two feature dimensions) next to the data after dimensionality reduction (the first two principal components). Note how PCA finds more interesting directions.

In [None]:
def plot_2d(X, y, labels, ax=None):
  """Scatter-plot the first two columns of ``X``, coloured and labelled by class.

  Args:
    X: 2-D array; column 0 is used as the x coordinate and column 1 as y.
    y: 1-D array of integer class ids, one per row of ``X``. The ids are
      used as scatter colours and as indices into ``labels``.
    labels: Sequence mapping a class id to the text drawn at the mean
      position of that class.
    ax: Axes to draw on. If ``None``, a new 6x6 inch figure is created and,
      once drawn, laid out and shown by this function. When an axes is
      passed in, layout and display are left to the caller.
  """
  # Record ownership *before* ``ax`` is rebound: once the axes have been
  # created, a later ``ax is None`` test can never be true.
  owns_figure = ax is None
  if owns_figure:
    fig = plt.figure(figsize=(6, 6), dpi=100)
    ax = fig.subplots(1, 1)
  ax.scatter(X[:, 0], X[:, 1], c=y, s=15, cmap='tab10', alpha=.5)
  for label in np.unique(y):
    members = y == label  # boolean row mask for this class
    ax.text(np.mean(X[members, 0]),
            np.mean(X[members, 1]),
            labels[label],
            fontsize=16, zorder=1)
  if owns_figure:
    fig.tight_layout(pad=0)
    fig.show()

# Two panels side by side: the raw data (first two columns of X) on the left,
# the first two PCA components on the right.
fig, ax = plt.subplots(1, 2, figsize=(13, 6))
plot_2d(X, y, labels, ax=ax[0])
plot_2d(X_pca, y, labels, ax=ax[1])
fig.tight_layout(pad=0)
fig.show()

Let's try 3D.

In [None]:
def plot_3d(X, y, labels, angle=None, ax=None):
  """Scatter-plot the first three columns of ``X`` in 3D, labelled by class.

  Args:
    X: 2-D array; columns 0, 1 and 2 are used as the x, y and z coordinates.
    y: 1-D array of integer class ids, one per row of ``X``. The ids are
      used as scatter colours and as indices into ``labels``.
    labels: Sequence mapping a class id to the text drawn at the mean
      position of that class.
    angle: Optional view angle in degrees. It is taken modulo 360 and passed
      as the second argument of ``ax.view_init`` with a fixed first argument
      of 20. If ``None``, the view is left unchanged.
    ax: 3D axes to draw on. If ``None``, a new 8x6 inch figure with a 3D
      subplot is created and, once drawn, laid out and shown by this
      function. When an axes is passed in, layout and display are left to
      the caller.
  """
  # Record ownership *before* ``ax`` is rebound: once the axes have been
  # created, a later ``ax is None`` test can never be true.
  owns_figure = ax is None
  if owns_figure:
    fig = plt.figure(figsize=(8, 6), dpi=100)
    ax = fig.add_subplot(1, 1, 1, projection='3d')
  ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y,
             alpha=.5, cmap='tab10')
  for label in np.unique(y):
    members = y == label  # boolean row mask for this class
    ax.text(np.mean(X[members, 0]),
            np.mean(X[members, 1]),
            np.mean(X[members, 2]),
            labels[label],
            fontsize=10,
            horizontalalignment='center',
            verticalalignment='center')
  if angle is not None:
    ax.view_init(20, angle % 360)
  if owns_figure:
    fig.tight_layout(pad=0)
    fig.show()

# Draw the first three PCA components on a single 3D axes.
fig, ax = plt.subplots(figsize=(8, 6), dpi=100,
                       subplot_kw={'projection': '3d'})
plot_3d(X_pca, y, labels, ax=ax)
fig.show()

A widget for rotating the data.

In [None]:
# Slider for the view angle, in whole degrees from 0 to 359.
angle_slider = ipywidgets.IntSlider(value=0, min=0, max=359, step=1,
                                    description="View angle")
# Redraw the 3D PCA plot with the chosen angle whenever the slider moves.
interact_plot = ipywidgets.interact(
    lambda angle: plot_3d(X_pca, y, labels, angle=angle),
    angle=angle_slider)
# The last child of the interact container is its output area; giving it a
# fixed height should prevent flickering when the plot is redrawn.
output = interact_plot.widget.children[-1]
output.layout.height = '500px'

In [None]:
# fig = plt.figure(figsize=(8, 5), dpi=100)
# ax = fig.add_subplot(1, 1, 1, projection='3d')
# plot_3d(X_pca, y, labels, ax=ax)
# fig.tight_layout(pad=0)

# def init():
#     ax.view_init(20, 0)
#     return fig,

# def update(frame):
#     angle = frame % 360
#     ax.view_init(20, angle)
#     return fig,

# anim = FuncAnimation(fig, update, frames=tqdm(range(1, 360, 2), total=360, desc="Creating video"), interval=1000//20, init_func=init, blit=True, )
# anim.save('pca.mp4', fps=20, extra_args=['-vcodec', 'libx264'])
# HTML(anim.to_html5_video())
# # anim.save('pca.gif', writer='imagemagick', fps=20)

## Non-linear dimensionality reduction

### t-SNE

In [None]:
from sklearn.manifold import TSNE
import os

In [None]:
# Embed X in two dimensions with t-SNE; n_jobs is set to the CPU count.
# To experiment, swap the metric, e.g. metric='cosine'.
tsne = TSNE(
  n_components=2,
  metric='euclidean',
  n_jobs=os.cpu_count(),
)
X_tsne2 = tsne.fit_transform(X)

In [None]:
# Show the 2D t-SNE embedding in its own figure.
plot_2d(X=X_tsne2, y=y, labels=labels)

In [None]:
# Embed X in three dimensions with t-SNE; n_jobs is set to the CPU count.
# To experiment, swap the metric, e.g. metric='cosine'.
tsne = TSNE(
  n_components=3,
  metric='euclidean',
  n_jobs=os.cpu_count(),
)
X_tsne3 = tsne.fit_transform(X)

In [None]:
# Show the 3D t-SNE embedding in its own figure.
plot_3d(X=X_tsne3, y=y, labels=labels)

In [None]:
# fig = plt.figure(figsize=(8, 5), dpi=100)
# ax = fig.add_subplot(1, 1, 1, projection='3d')
# plot_3d(X_tsne3, y, labels, ax=ax)
# fig.tight_layout(pad=0)

# def init():
#     ax.view_init(20, 0)
#     return fig,

# def update(frame):
#     angle = frame % 360
#     ax.view_init(20, angle)
#     return fig,

# anim = FuncAnimation(fig, update, frames=tqdm(range(1, 360, 2), total=360, desc="Creating video"), interval=1000//20, init_func=init, blit=True, )
# anim.save('tsne.mp4', fps=20, extra_args=['-vcodec', 'libx264'])
# HTML(anim.to_html5_video())
# # anim.save('tsne.gif', writer='imagemagick', fps=20)

### UMAP

In [None]:
try:
  from umap import UMAP
except:
  !pip install umap-learn
finally:
  from umap import UMAP

In [None]:
# Embed X in two dimensions with UMAP.
# To experiment, swap the metric, e.g. metric='cosine'.
umap = UMAP(
  n_components=2,
  metric='euclidean',
)
X_umap2 = umap.fit_transform(X)

In [None]:
# Show the 2D UMAP embedding in its own figure.
plot_2d(X=X_umap2, y=y, labels=labels)

In [None]:
# Embed X in three dimensions with UMAP.
# To experiment, swap the metric, e.g. metric='cosine'.
umap = UMAP(
  n_components=3,
  metric='euclidean',
)
X_umap3 = umap.fit_transform(X)

In [None]:
# Show the 3D UMAP embedding in its own figure.
plot_3d(X=X_umap3, y=y, labels=labels)

In [None]:
# fig = plt.figure(figsize=(8, 5), dpi=100)
# ax = fig.add_subplot(1, 1, 1, projection='3d')
# plot_3d(X_umap3, y, labels, ax=ax)
# fig.tight_layout(pad=0)

# def init():
#     ax.view_init(20, 0)
#     return fig,

# def update(frame):
#     angle = frame % 360
#     ax.view_init(20, angle)
#     return fig,

# anim = FuncAnimation(fig, update, frames=tqdm(range(1, 30, 2), total=360, desc="Creating video"), interval=1000//20, init_func=init, blit=True, )
# anim.save('umap.mp4', fps=20, extra_args=['-vcodec', 'libx264'])
# HTML(anim.to_html5_video())
# # anim.save('umap.gif', writer='imagemagick', fps=20)