# Python Machine Learning for Biology
# Dimensionality Reduction and Manifold Learning

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

### Principal Components Analysis

#### Load the cancer dataset

#### Store features and response as variables

#### Before applying PCA, each feature should be centered on zero with unit variance

#### Use PCA to find the first two principal components of the breast cancer dataset

#### Plot the PCA-transformed version of the breast cancer dataset

#### Plot the magnitude of each feature value for the first two principal components

In [None]:
# Column labels of the feature table X, kept as a plain list so they can be
# used as tick labels when plotting.
feature_names = X.columns.tolist()

In [None]:
# Heat map of pca.components_: one row per principal component, one column
# per original feature, coloured by the component's weight on that feature.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation='none', cmap='plasma')

# One tick per feature, placed on the left edge of its column (imshow centres
# column k on x = k). The number of ticks must equal the number of labels:
# Matplotlib raises ValueError when set_xticklabels receives a label count that
# differs from the fixed tick locations, and the former
# np.arange(-.5, len(feature_names)) produced one tick too many.
plt.gca().set_xticks(np.arange(len(feature_names)) - 0.5)
# Two ticks (0.5 and 1.5) for the two component rows labelled below.
plt.gca().set_yticks(np.arange(0.5, 2))
plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)

# Horizontal colour bar annotated at the smallest weight, zero and the largest
# weight; the large pad leaves room for the rotated feature names.
plt.colorbar(orientation='horizontal',
             ticks=[pca.components_.min(), 0, pca.components_.max()],
             pad=0.65)
plt.show()

### Independent Work
Conduct a PCA on the iris dataset to reduce dimensions down to 2.

### Manifold Learning Methods

### Multidimensional scaling (MDS)

In [None]:
from sklearn.manifold import MDS

#### Make 'hello' data

In [None]:
def make_hello(N=1000, rseed=42):
    """Return 2-D points sampled from the letters of the word "HELLO".

    The word is rendered with matplotlib and written to ``hello.png`` in the
    current working directory; that image is then read back and used as a
    mask for uniformly drawn random points.

    Parameters
    ----------
    N : int
        Maximum number of points to return. ``4 * N`` candidates are drawn
        and only those that land on a non-white pixel are kept, so fewer
        than ``N`` points may come back.
    rseed : int
        Seed for the ``numpy.random.RandomState`` used to draw candidates.

    Returns
    -------
    numpy.ndarray
        Array with two columns (x, y), sorted by increasing x.
    """
    # Render the text on an axis-free canvas that fills the whole figure.
    figure, axes = plt.subplots(figsize=(4, 1))
    figure.subplots_adjust(left=0, right=1, bottom=0, top=1)
    axes.axis('off')
    axes.text(0.5, 0.4, 'HELLO', va='center', ha='center', weight='bold', size=85)
    figure.savefig('hello.png')
    plt.close(figure)

    # Read the image back: flip the rows, keep the first colour channel and
    # transpose so that the array is indexed as [column, row].
    from matplotlib.image import imread
    pixels = imread('hello.png')[::-1, :, 0].T
    n_cols, n_rows = pixels.shape

    # Draw candidates in the unit square and map each one to a pixel index.
    generator = np.random.RandomState(rseed)
    points = generator.rand(4 * N, 2)
    col_idx, row_idx = (points * pixels.shape).astype(int).T

    # Keep only the candidates that fall on a pixel darker than pure white.
    on_text = pixels[col_idx, row_idx] < 1
    points = points[on_text]

    # Stretch x by the image's width/height ratio, truncate to N points and
    # return them ordered by x.
    points[:, 0] *= (n_cols / n_rows)
    points = points[:N]
    x_order = np.argsort(points[:, 0])
    return points[x_order]


In [None]:
# Sample (up to) 1000 points lying on the rendered "HELLO" text.
X = make_hello(N=1000)

In [None]:
# Shared scatter styling, reused for later plots of the same points: colour
# each point by its original x coordinate with a 5-level "rainbow" colormap.
# plt.get_cmap is used because plt.cm.get_cmap was removed in Matplotlib 3.9.
colorize = dict(c=X[:, 0], cmap=plt.get_cmap('rainbow', 5))

# Open a fresh figure first: with `%matplotlib notebook` the previous figure
# stays active, so plotting without this would draw on top of it.
plt.figure()
plt.scatter(X[:, 0], X[:, 1], **colorize)
plt.axis('equal')
plt.show()

#### Get pairwise distances

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
# Matrix of distances between every pair of rows of X (pairwise_distances
# is called with its default metric), followed by its shape for display.
D = pairwise_distances(X)
np.shape(D)

In [None]:
# Visualise the pairwise-distance matrix as an image with a colour scale.
# A new figure is opened explicitly: with `%matplotlib notebook` the previous
# figure stays active, so imshow/colorbar would otherwise be drawn into it.
plt.figure()
plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.show()

#### MDS with 2 components

In [None]:
# Configure a two-component MDS that takes a ready-made dissimilarity matrix
# as its input; random_state is fixed to a constant seed.
model = MDS(
    n_components=2,
    dissimilarity='precomputed',
    random_state=1,
)

In [None]:
# Fit the MDS model on the distance matrix D (the model was created with
# dissimilarity='precomputed') and keep the resulting 2-component embedding.
out = model.fit_transform(D)

#### Plot MDS

In [None]:
# Scatter the two MDS coordinates, reusing the colouring defined in `colorize`.
# A new figure is opened explicitly: with `%matplotlib notebook` the previous
# figure stays active, so the scatter would otherwise be drawn into it.
plt.figure()
plt.scatter(out[:, 0], out[:, 1], **colorize)
# Trailing semicolon suppresses the notebook's echo of the return value.
plt.axis('equal');

### Independent Work
Conduct an MDS on the cancer dataset (or another dataset of your choice)

### t-SNE

In [None]:
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
import matplotlib.patheffects as PathEffects

#### Load the data

In [None]:
# Load scikit-learn's digits dataset and display the shape of its data matrix.
digits = load_digits()
digits['data'].shape

In [None]:
# Print the description text that ships with the dataset.
print(digits.DESCR)

#### Take a look at the digits

In [None]:
# Show the first nrows * ncols digit images in a grid, each titled with its
# target label, and save the grid to 'digits-generated.png'.
nrows, ncols = 2, 5
plt.figure(figsize=(6, 3))
plt.gray()  # make grayscale the default colormap for the images below
for i in range(ncols * nrows):
    ax = plt.subplot(nrows, ncols, i + 1)
    ax.matshow(digits.images[i, ...])
    plt.xticks([]); plt.yticks([])
    plt.title(digits.target[i])

# Save once the whole grid has been drawn and before showing it. Calling
# plt.show() inside the loop (as before) displays a half-built figure, and on
# backends that close or block on show the saved PNG would be incomplete.
plt.savefig('digits-generated.png', dpi=150)
plt.show()

#### Reorder the datapoints according to the handwritten numbers

In [None]:
# Group the samples by digit: all rows labelled 0 first, then 1, ... up to 9,
# keeping the original order inside each group. y holds the matching labels.
digit_masks = [digits.target == digit for digit in range(10)]
X = np.concatenate([digits.data[mask] for mask in digit_masks], axis=0)
y = np.concatenate([digits.target[mask] for mask in digit_masks])

#### Run the t-SNE algorithm on the dataset

In [None]:
# Embed the reordered digit samples X with t-SNE using the estimator's default
# settings; no random_state is given here.
digits_proj = TSNE().fit_transform(X)

#### Function to display transformed dataset

In [None]:
def scatter(x, colors):
    """Scatter-plot a 2-D embedding, coloured and annotated by digit class.

    Parameters
    ----------
    x : array-like
        Embedded points; columns 0 and 1 are used as the plot coordinates.
    colors : array-like of int
        Class label of each row of ``x``. Labels index a 10-colour palette,
        and a text label is drawn for each of the classes 0 through 9.

    Returns
    -------
    tuple
        ``(f, ax, sc, txts)``: the new figure, its axes, the scatter
        collection and the list of the ten text labels.
    """
    x = np.asarray(x)
    colors = np.asarray(colors)

    # We choose a color palette with seaborn.
    palette = np.array(sns.color_palette("hls", 10))

    # We create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    # The builtin int replaces np.int, which was removed in NumPy 1.24 and
    # now raises AttributeError.
    sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=40,
                    c=palette[colors.astype(int)])
    # NOTE(review): axis('tight') below appears to re-fit the limits to the
    # data, superseding these fixed limits; confirm which one is intended.
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    # We add the labels for each digit.
    txts = []
    for i in range(10):
        # Position of each label: the median point of that digit's samples.
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        # White outline so the label stays readable on top of the points.
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts

#### Plot

In [None]:
# Draw the t-SNE embedding coloured by digit label, then write the current
# figure to 'digits-tsne.png'.
scatter(x=digits_proj, colors=y)
plt.savefig(fname='digits-tsne.png', dpi=150)