# Data Utilities

This is a library of utilities which we use for exploratory data analysis on the "Mixed Signals" MNIST data.



In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
import h5py
import umap
from sklearn import manifold

In [2]:
def load_data(fname):
    """
    Load dataset and labels from HDF5 file with properties (X, labels).

    Inputs:
    * fname - path of the HDF5 file to open (read-only)

    Returns:

    * X - the full contents of the "X" dataset
    * labels - the full contents of the "labels" dataset, or None when
      the file has no "labels" entry
    """
    with h5py.File(fname, 'r') as f:
        # `Dataset.value` was deprecated in h5py 2.x and removed in h5py 3.0.
        # Indexing with an empty tuple reads the whole dataset and works on
        # both old and new releases.
        X = f["X"][()]
        try:
            labels = f["labels"][()]
        except KeyError:
            # Unlabeled files are allowed; signal that with None.
            labels = None
    return X, labels

In [3]:
def normalize_image_data(X):
    """
    Rescale and mean-center image data, then form its scatter matrix.

    Steps:

    1. Divide pixel intensities by 255.0 (maps 0-255 onto 0.0-1.0).
    2. Subtract the mean taken along axis 0.
    3. Multiply the transposed centered data by the centered data.

    Returns:

    * X0 - the mean centered data
    * Xbar - mean values (along axis 0)
    * CX - the product `X0.T @ X0` of the mean centered data; note that
      it is not divided by the number of rows
    """
    scaled = X / 255.0
    Xbar = np.mean(scaled, axis=0)
    X0 = scaled - Xbar
    return X0, Xbar, np.matmul(X0.T, X0)

In [4]:
def graph_variance_explained(S, n=100):
    """
    Produce a graph of the variance explained by principal components from `S`,
    the vector representation of the diagonal matrix of singular values
    resulting from SVD on the covariance matrix.

    Inputs:
    * S - 1-D sequence of singular values
    * n - maximum number of leading components to plot; if `S` has fewer
      than `n` entries, all of them are plotted

    Two curves are drawn on a new figure: the cumulative sum of `S` as a
    fraction of its total, and each value of `S` as a fraction of `S[0]`.
    """
    S = np.asarray(S)
    # Never ask for more components than exist: otherwise the x values
    # (length n) and the sliced y values (length len(S)) differ in length
    # and plt.plot raises a shape-mismatch error.
    n = min(n, len(S))
    S_cum = np.add.accumulate(S)
    S_tot = S_cum[-1]
    expl = S_cum / S_tot
    X = np.arange(1, n + 1)
    plt.figure(figsize=(10,6))
    plt.plot(X, expl[:n], label="Total Variance Explained")
    plt.plot(X, S[:n] / S[0], label="Singular Value (% of first)")
    plt.legend()

In [5]:
def show_img(x):
    """
    Draw `x` as a 28x28 image with both axes hidden.

    Returns the object produced by `plt.imshow`.
    """
    image = plt.imshow(x.reshape(28, 28))
    ax = image.axes
    # Hide ticks and labels on both axes so only the pixels are visible.
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)
    return image

In [6]:
def show_data(X, cols=5, width=10):
    """
    Show a sample of data values represented by the rows of `X` as a grid of images.

    Inputs:
    * X - data whose rows are each shown with `show_img`
    * cols - number of images per grid row
    * width - figure width; the height is scaled so that grid cells are square

    At most the first 200 rows of `X` are shown.
    """
    n = min(len(X), 200)
    # Ceiling division: the previous `1 + n // cols` added an empty row
    # whenever n was an exact multiple of cols. Keep at least one row so
    # that the figure height is never zero.
    rows = max(1, (n + cols - 1) // cols)
    height = (width / float(cols)) * rows
    plt.figure(figsize=(width, height))
    for i in range(n):
        plt.subplot(rows, cols, i + 1)
        # show_img reshapes to 28x28 itself, so no reshape is needed here.
        show_img(X[i])
    plt.tight_layout()
    plt.subplots_adjust(wspace=0, hspace=0)

In [7]:
def show_pcs(U, n=10):
    """
    Display the first `n` principal components as images.
    These are taken as columns of the `U` matrix from the SVD.

    Inputs:
    * U - 2-D array whose columns are each shown with `show_img`
    * n - number of leading columns to show; if `U` has fewer than `n`
      columns, all of them are shown
    """
    cols = 5
    # Do not index past the last column of U.
    n = min(n, U.shape[1])
    # Ceiling division: the previous `1 + n // 5` reserved an empty row
    # whenever n was a multiple of 5 (including the default n=10).
    rows = max(1, (n + cols - 1) // cols)
    plt.figure(figsize=(15, rows * 2))
    for i in range(n):
        plt.subplot(rows, cols, i + 1)
        show_img(U[:, i])
        plt.title("PC {}".format(i + 1))
    plt.tight_layout()
In [8]:
def show_embeddings(Xlist, titles=None, labels=None):
    """
    Draw several 2-D embeddings side by side as scatterplots in one row.

    Inputs:
    * Xlist - list of embedding matrices, each with shape (N,2)
    * titles - plot titles, one per entry of `Xlist` (optional)
    * labels - per-point values passed to the scatterplot as colors (optional)
    """
    count = len(Xlist)
    plt.figure(figsize=(5 * count, 5))
    for position, embedding in enumerate(Xlist, start=1):
        plt.subplot(1, count, position)
        xs, ys = embedding[:, 0], embedding[:, 1]
        plt.scatter(xs, ys, c=labels, cmap='tab10', s=1)
        plt.axis('equal')
        # Tick marks are removed from both axes.
        plt.xticks([])
        plt.yticks([])
        if titles:
            plt.title(titles[position - 1])
    plt.tight_layout()

In [9]:
def isomap_embedding(X):
    """Return the 2-D Isomap embedding of `X`, built with 15 neighbors."""
    model = manifold.Isomap(n_components=2, n_neighbors=15)
    return model.fit_transform(X)

In [10]:
def lle_embedding(X):
    """Return the 2-D LLE embedding of `X`, built with 15 neighbors."""
    model = manifold.LocallyLinearEmbedding(n_components=2, n_neighbors=15)
    return model.fit_transform(X)

In [15]:
def mlle_embedding(X):
    """Return the 2-D modified LLE (method='modified') embedding of `X`, built with 15 neighbors."""
    model = manifold.LocallyLinearEmbedding(
        method='modified',
        n_components=2,
        n_neighbors=15,
    )
    return model.fit_transform(X)

In [11]:
def umap_embedding(X):
    """Return the 2-D UMAP embedding of `X`, built with 15 neighbors."""
    reducer = umap.UMAP(n_components=2, n_neighbors=15)
    return reducer.fit_transform(X)

In [12]:
import cache_magic

%cache magic is now registered in ipython


In [17]:
%cache

FileNotFoundError: [Errno 2] No such file or directory: '/Users/dbeach/Desktop/mixed_signals/.cache_magic/mlle_mnist_all/data.txt'