In [1]:
import pandas as pd
import numpy as np
from pyemd import emd_samples
from IPython.display import display

# To import packages from the parent directory
import sys
sys.path.insert(0,'..')

# Autoreload allows us to update our code mid-notebook
%load_ext autoreload
%autoreload 2

# Our code
import processing
import visualization

# Data Processing

In [2]:
# Load the sample morphologies into a pandas DataFrame and report how many
# rows were read.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle('../data/sample_morphologies.pickle')
print(f'Length of df: {len(df)}')

Length of df: 496


We will use a data set containing 4 versions (i.e., different random initial values) of trajectories generated with parameters $BR \in \{ 0.50, 0.53, 0.56, 0.59\}$ and $CHI \in \{ 2.2, 2.8, 3.4, 4.0 \}$ for a total of 64 trajectories, each containing a number of morphologies.

Since many of the morphology samples look very similar, we'll only keep the first and every 25th morphology (i.e., when `timestep % 25 == 0`) of each trajectory.  This will reduce processing time and make the visualizations easier to interpret.

In [3]:
# Summarize the data set: the distinct BR, CHI and version values, the number
# of (BR, CHI, version) groups (reported as trajectories), and the row count.
print('Unique BR values:', np.unique(df.BR))
print('Unique CHI values:', np.unique(df.CHI))
print('Unique version values:', np.unique(df.version))
print('Total Trajectories:', len(df.groupby(['BR', 'CHI', 'version'])))
print('Total Morphologies:', len(df))

Unique BR values: [0.5  0.53 0.56 0.59]
Unique CHI values: [2.2 2.8 3.4 4. ]
Unique version values: [0 1 2 3]
Total Trajectories: 64
Total Morphologies: 496


In [4]:
# Print a sample record
sample = df.iloc[206] # fixed positional index picked as an example (not drawn at random)
print(sample) # print the sample record

BR                                                       0.56
CHI                                                       3.4
count                                                     266
image       [0.03752598166465759, 0.03517324849963188, 0.0...
timestep                                                  250
version                                                     2
Name: 206, dtype: object


We can visualize the data as a heatmap with Plotly.  The `visualization.get_image_figure` function generates a template figure and its layout to be used with our data.

In [5]:
# Reshape the sample's image data to 100 x 400 before rendering it.
sample_image = sample.image.reshape((100, 400))
# NOTE(review): `height=250, width=1000` were previously left commented out on
# this call -- confirm get_image_figure accepts them before re-enabling.
visualization.get_image_figure(sample_image)

FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

## Components

We want to isolate each component for individual analysis.

In [6]:
# Split the sample image into its individual components.
components = processing.extract_components(sample_image)

# Show the unmodified image first, for reference.
print('Original')
display(visualization.get_image_figure(sample_image))

# Preview every 25th component, stopping once the index passes 100.
for idx, component in enumerate(components):
    if idx > 100:
        break
    if idx % 25 != 0:
        continue
    print(f'Component {idx}')
    display(visualization.get_image_figure(component))

Original


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

Component 0


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

Component 25


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

Component 50


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

Component 75


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

Component 100


FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

# Earth Mover's Distance

Introduced in the paper ["A Metric for Distributions with Applications to Image Databases"](https://users.cs.duke.edu/~tomasi/papers/rubner/rubnerIccv98.pdf), 1998 (Rubner, Tomasi, and Guibas), the Earth Mover's Distance (EMD) is a metric which can be used to determine the similarity between two distributions.

EMD has many applications, including image database retrieval and GAN training.  A nice, brief explanation can be [found here](https://lilianweng.github.io/lil-log/2017/08/20/from-GAN-to-WGAN.html#what-is-wasserstein-distance), and a more thorough write-up can be [found here](http://infolab.stanford.edu/pub/cstr/reports/cs/tr/99/1620/CS-TR-99-1620.ch4.pdf).

[PyEMD](https://pypi.org/project/pyemd/) provides an efficient implementation of the Earth Mover's Distance.  We can use it to determine pairwise EMDs for our sample morphologies.  To do this, we need to extract signatures from our images.

In [7]:
# Two example morphologies (fixed positional indices), reshaped to 100 x 400.
sample_x = df.iloc[206].image.reshape((100, 400))
sample_y = df.iloc[103].image.reshape((100, 400))

# Render both images, one after the other.
for morphology in (sample_x, sample_y):
    display(visualization.get_image_figure(morphology))

FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

FigureWidget({
    'data': [{'hoverinfo': 'none',
              'showscale': False,
              'type': 'hea…

In [8]:
# Compute a signature for each sample by passing pixel_ratio_sig to
# apply_to_components.
sigs_x = processing.apply_to_components(sample_x, processing.pixel_ratio_sig)
sigs_y = processing.apply_to_components(sample_y, processing.pixel_ratio_sig)

# Earth Mover's Distance between the two signature sets; as the last
# expression in the cell, its value is shown as the cell output.
emd_samples(sigs_x, sigs_y)

0.47036472246696165

To better visualize the distances between morphologies, we may create a distance matrix.

In [29]:
def get_dist_matrix(sigs_x, sigs_y=None):
    """Build a matrix of pairwise Earth Mover's Distances between signatures.

    Args:
        sigs_x: Iterable of signatures; each entry yields one column.
        sigs_y: Iterable of signatures; each entry yields one row. When
            omitted, ``sigs_x`` is compared against itself.

    Returns:
        A list of rows, one per entry of ``sigs_y``; each row holds
        ``emd_samples(x, y)`` for every ``x`` in ``sigs_x``.
    """
    row_sigs = sigs_x if sigs_y is None else sigs_y
    return [[emd_samples(x, y) for x in sigs_x] for y in row_sigs]

We can extract signatures from each of our images with `get_signatures`, which calls `processing.apply_to_components` on every image.

In [30]:
def get_signatures(images, sig_funcs):
    """Compute the signatures of every image.

    Args:
        images: Iterable of images.
        sig_funcs: Forwarded unchanged as the second argument of
            ``processing.apply_to_components``.

    Returns:
        A list with one ``processing.apply_to_components(image, sig_funcs)``
        result per image, in input order.
    """
    return [processing.apply_to_components(image, sig_funcs) for image in images]

In [31]:
# Take the first 10 records, reshape each image to 100 x 400, and compute
# their signatures with pixel_ratio_sig.
images = df.iloc[:10].image.apply(lambda d: d.reshape((100, 400)))
sigs = get_signatures(images, processing.pixel_ratio_sig)

In [43]:
# Pairwise distance matrix of the signatures (compared against themselves),
# rendered as a figure.
viz = visualization.get_distance_matrix_figure(get_dist_matrix(sigs))
viz.data[0].update(zmin=0., zmax=0.4) # fix the color scale bounds to 0.0-0.4
display(viz)

FigureWidget({
    'data': [{'type': 'heatmap',
              'uid': '3d9dcbcd-f636-4c14-980e-96b9bb8aedc7',
 …