In [1]:
import pandas as pd
import numpy as np
from pyemd import emd_samples
import plotly.graph_objects as go
from IPython.display import display

# To import packages from the parent directory
import sys
sys.path.insert(0,'..')

# Autoreload allows us to update our code mid-notebook
%load_ext autoreload
%autoreload 2

# Our code
import processing
import visualization

In [2]:
# Pickled morphology dataset to analyse; the path is relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
file_path = '../data/sample_morphologies.pickle'
# Alternative dataset (uncomment to use it instead):
#file_path = '../data/20200423-1804_states.pickle'
df = pd.read_pickle(file_path)
print(f'Length of df: {len(df)}')

Length of df: 11681


Collect final states

In [None]:
# For each (BR, CHI, version) group keep the first row whose timestep equals
# the group's maximum timestep, then stack those rows into one frame.
group_keys = ['BR', 'CHI', 'version']
final_states = [
    run[run.timestep == max(run.timestep)].iloc[0]
    for _, run in df.groupby(group_keys)
]

df_finals = pd.DataFrame(final_states)

Select 3 samples for analysis.

In [None]:
# Three final-state morphologies, picked by position in df_finals, to inspect
# side by side.
sample_x = df_finals.iloc[40]
sample_y = df_finals.iloc[8]
sample_z = df_finals.iloc[52]

# Each stored image is reshaped to (100, 400) before plotting.
sample_image_x = sample_x.image.reshape((100, 400))
sample_image_y = sample_y.image.reshape((100, 400))
sample_image_z = sample_z.image.reshape((100, 400))

fig_x = visualization.get_image_figure(sample_image_x)
fig_y = visualization.get_image_figure(sample_image_y)
fig_z = visualization.get_image_figure(sample_image_z)

# Show each sample's metadata followed by its figure, with a blank line between
# samples. Driving this from one table keeps label, row and figure in sync; the
# hand-copied version showed sample_y twice and never displayed sample_z.
labelled_samples = [
    ('sample_x', sample_x, fig_x),
    ('sample_y', sample_y, fig_y),
    ('sample_z', sample_z, fig_z),
]
for position, (label, sample_row, sample_fig) in enumerate(labelled_samples):
    if position:
        print()
    print(label)
    print(sample_row)
    display(sample_fig)

Measure and store the signatures and measurements for the samples in the dataset.

In [None]:
# Work on a copy so df_finals keeps only the raw final states.
df_sigs = df_finals.copy()

# Per-component signatures: column name -> function handed to
# processing.apply_to_components for every image.
component_signatures = {
    'perimeter_area_ratio_sig': processing.perimeter_area_ratio_sig,
    'rect_area_ratio_sig': processing.rect_area_ratio_sig,
    'average_pixel_intensity_sig': processing.average_pixel_intensity_sig,
}
# Whole-image measurements computed directly with numpy.
image_measurements = ['average_pixel_intensity', 'fft']

# Pre-create object-dtype columns so a cell can hold a whole array.
# The builtin `object` is used because the `np.object` alias was removed from NumPy.
for column in [*component_signatures, *image_measurements]:
    df_sigs[column] = np.empty(len(df_sigs), dtype=object)

for idx, row in df_sigs.iterrows():
    comp_sample = row.image.reshape((100, 400))
    comp_components = processing.extract_components(comp_sample, background=1)

    for column, signature_fn in component_signatures.items():
        df_sigs.at[idx, column] = processing.apply_to_components(comp_components, signature_fn)

    df_sigs.at[idx, 'average_pixel_intensity'] = np.mean(comp_sample)
    df_sigs.at[idx, 'fft'] = np.fft.fft2(comp_sample)

# Similarity Query

Sort the dataset by most/least similar.

Set the query morphology.

In [None]:
# Positional index (into df_sigs) of the morphology used as the query.
sample_idx = 8
sample = df_sigs.iloc[sample_idx]
# Reshape the stored image to (100, 400) for display.
sample_image = sample.image.reshape((100, 400))

# Show the query's metadata and its image.
print(sample)
display(visualization.get_image_figure(sample_image))

Calculate distances between query sample signatures and database signatures.

In [None]:
# One record per row of df_sigs holding its distance to the query `sample`
# under every measure: EMD between the per-component signature samples, L2
# between the whole-image measurements.
# The signatures are read from the columns already stored in df_sigs. The cell
# used to recompute two of them from a `components` variable that is never
# defined (NameError on a fresh kernel), and those results were never used.
dist_data = []

for idx, row in df_sigs.iterrows():
    dist_data.append({
        # Identifying columns, carried over so results can be traced back.
        'BR': row.BR,
        'CHI': row.CHI,
        'version': row.version,
        'image': row.image,
        'emd_perimeter_area_ratio': emd_samples(row.perimeter_area_ratio_sig, sample.perimeter_area_ratio_sig),
        'emd_rect_area_ratio': emd_samples(row.rect_area_ratio_sig, sample.rect_area_ratio_sig),
        'emd_average_pixel_intensity': emd_samples(row.average_pixel_intensity_sig, sample.average_pixel_intensity_sig),
        'l2_average_pixel_intensity': np.linalg.norm(row.average_pixel_intensity - sample.average_pixel_intensity),
        'l2_fft': np.linalg.norm(row.fft - sample.fft)
    })

df_dist = pd.DataFrame(dist_data)

In [None]:
# Distance columns of df_dist to rank and plot by; the names match the keys
# written into each df_dist record.
dist_list = [
    'emd_perimeter_area_ratio',
    'emd_rect_area_ratio',
    'emd_average_pixel_intensity',
    'l2_average_pixel_intensity',
    'l2_fft'
]

Display the most similar/dissimilar images for each signature.

In [None]:
# For ranks 0..3, show the row at that position when df_dist is sorted in
# ascending order of each distance (rank 0 = smallest distance; presumably the
# query itself -- verify).
banner = '-' * 30
for rank in range(4):
    print(banner, f'Nearest: {rank}', banner, '', sep='\n')
    for dist_name in dist_list:
        print(dist_name)
        ranked = df_dist.sort_values(dist_name)
        nearest = ranked.iloc[rank]
        print(nearest)
        nearest_image = nearest.image.reshape((100, 400))
        display(visualization.get_image_figure(nearest_image))
        print('\n\n====================================================\n\n')

In [None]:
# For ranks 0..2, show the row at that position counted from the end when
# df_dist is sorted in ascending order of each distance (rank 0 = largest).
banner = '-' * 30
for rank in range(3):
    print(banner, f'Furthest: {rank}', banner, '', sep='\n')
    for dist_name in dist_list:
        print(dist_name)
        ranked = df_dist.sort_values(dist_name)
        furthest = ranked.iloc[-1 - rank]
        print(furthest)
        furthest_image = furthest.image.reshape((100, 400))
        display(visualization.get_image_figure(furthest_image))
        print('\n\n====================================================\n\n')

## Distance Matrix

In [None]:
# Distinct parameter values present in df_sigs, in order of first appearance;
# used as the axes of the per-parameter distance matrices.
chi_values = df_sigs.CHI.unique()
br_values = df_sigs.BR.unique()
version_values = df_sigs.version.unique()

def get_dist_matrix(sigs_x, sigs_y=None):
    """Build the matrix of pairwise EMDs between two collections of signatures.

    Args:
        sigs_x: Signatures laid out along the columns of the result.
        sigs_y: Signatures laid out along the rows; when omitted, ``sigs_x``
            is compared against itself.

    Returns:
        A list with one row per entry of ``sigs_y``; each row is a list holding
        ``emd_samples(x, y)`` for every ``x`` in ``sigs_x``.
    """
    row_sigs = sigs_x if sigs_y is None else sigs_y
    return [[emd_samples(x, y) for x in sigs_x] for y in row_sigs]

In [None]:
# Human-readable figure titles for each distance column of df_dist.
dist_name_pretty = {
    'emd_perimeter_area_ratio': 'Average EMD (Perimeter to Area Ratio)',
    'emd_rect_area_ratio': 'Average EMD (Rectangle Area Ratio)',
    'emd_average_pixel_intensity': 'Average EMD (Average Pixel Intensity)',
    'l2_average_pixel_intensity': 'Average L2 (Average Pixel Intensity)',
    'l2_fft': 'Average L2 (FFT)'
}

# Remind the reader which query sample the distances are measured from.
print(sample)
display(visualization.get_image_figure(sample_image))
print()

for dist_name in dist_list:
    print(dist_name)
    # Rows follow chi_values, columns follow br_values; each cell is the mean
    # distance to the query over all versions of that (CHI, BR) pair.
    dist_matrix = []
    for chi in chi_values:
        matrix_row = []
        for br in br_values:
            version_dists = []
            for version in version_values:
                # Leave the query sample out of its own cell's average.
                if chi == sample.CHI and br == sample.BR and version == sample.version:
                    continue
                matches = df_dist[
                    (df_dist.CHI == chi) &
                    (df_dist.BR == br) &
                    (df_dist.version == version)
                ]
                # A parameter combination absent from the data contributes nothing.
                if matches.empty:
                    continue
                version_dists.append(matches.iloc[0][dist_name])
            # An empty cell becomes NaN explicitly rather than via np.mean([]),
            # which yields the same value but emits a RuntimeWarning.
            matrix_row.append(np.mean(version_dists) if version_dists else np.nan)
        dist_matrix.append(matrix_row)

    # LaTeX title. Backslashes are doubled so that \phi and \chi are not
    # treated as (invalid) Python escape sequences; the rendered text is unchanged.
    title = f'''$\\text{{{dist_name_pretty[dist_name]}}}\\\\
    \\text{{Sample: }}(\\phi={sample.BR}, \\chi={sample.CHI}, \\text{{version}}={sample.version})$'''

    viz = visualization.get_distance_matrix_figure(
        dist_matrix,
        labels_x = br_values,
        labels_y = chi_values,
        scaleanchor=None,
        showticklabels=True,
        title=title,
        xtitle=r'$\phi$',
        ytitle=r'$\chi$'
    )
    display(viz)

# MDS

In [None]:
from sklearn.manifold import MDS
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

In [None]:
# Pairwise EMD between the perimeter/area-ratio signatures of all final states,
# with rows and columns ordered by (CHI, BR).
# The sort is done once up front (it used to be repeated for every matrix row)
# and the pairwise loop is delegated to the notebook's get_dist_matrix helper.
# With a single collection the helper's row/column orientation is the transpose
# of a hand-written x-outer/y-inner loop; EMD is symmetric, so the matrix is the same.
sort_by = ['CHI', 'BR']
sorted_sigs = df_sigs.sort_values(sort_by).perimeter_area_ratio_sig.tolist()

dist_matrix = np.array(get_dist_matrix(sorted_sigs))

In [None]:
# Heat-map view of the pairwise distance matrix held in dist_matrix.
plt.imshow(dist_matrix)

In [None]:
# Embed the precomputed distance matrix with MDS (scikit-learn's default output
# is 2-D). MDS starts from a random configuration, so the seed is fixed to make
# the embedding reproducible across kernel restarts.
dist_mds = MDS(dissimilarity='precomputed', random_state=0)
dist_embedding = dist_mds.fit_transform(dist_matrix)

# Rescale each embedding axis independently to the range [0, 1].
dist_embedding -= dist_embedding.min(axis=0)
dist_embedding /= dist_embedding.max(axis=0)

# Colour encodes the row position of each point in dist_matrix.
plt.scatter(dist_embedding[:, 0], dist_embedding[:, 1], c=np.linspace(0, 1, num=dist_embedding.shape[0]))
plt.show()

In [None]:
# List the available BR and CHI values, one array per line.
print(br_values, chi_values, sep='\n')

In [None]:
# Trajectories to embed, each identified by its (br, chi, version) triple and
# paired, in order, with a matplotlib colour code.
keys = [
    { 'br': 0.5, 'chi': 2.8, 'version': 0 },
    { 'br': 0.5, 'chi': 2.8, 'version': 1 },
    { 'br': 0.56, 'chi': 3.4, 'version': 0 },
    { 'br': 0.56, 'chi': 3.4, 'version': 1 }
]
color_options = ['r', 'b', 'k', 'c']

# Parallel lists: sigs[i] is a perimeter/area-ratio signature and colors[i] the
# colour of the trajectory it came from.
sigs = []
colors = []
for key, color_option in zip(keys, color_options):
    # Every stored timestep of this trajectory except timestep 0.
    in_trajectory = (
        (df.BR == key['br'])
        & (df.CHI == key['chi'])
        & (df.version == key['version'])
        & (df.timestep != 0)
    )
    for _, row in df[in_trajectory].iterrows():
        image = row.image.reshape((100, 400))
        comps = processing.extract_components(image, background=1)
        sig = processing.apply_to_components(comps, processing.perimeter_area_ratio_sig)
        # Frames that yield an empty signature are left out.
        if len(sig) > 0:
            sigs.append(sig)
            colors.append(color_option)

In [None]:
# Pairwise EMD between every pair of trajectory signatures in `sigs`.
# Delegates to the notebook's get_dist_matrix helper instead of repeating the
# nested loop. The helper indexes rows by its second collection, which for a
# single collection is the transpose of an x-outer/y-inner loop; EMD is
# symmetric, so the matrix is the same.
dist_matrix = np.array(get_dist_matrix(sigs))

In [None]:
# Embed the trajectory distance matrix with MDS. The seed is fixed because MDS
# starts from a random configuration and would otherwise give a different
# layout on every run.
dist_mds = MDS(dissimilarity='precomputed', random_state=0)
dist_embedding = dist_mds.fit_transform(dist_matrix)

# The embedding axes are deliberately left unscaled here (no min/max
# normalisation), so the plotted coordinates are the raw MDS output.
# One point per signature, coloured by the trajectory it belongs to.
plt.scatter(dist_embedding[:, 0], dist_embedding[:, 1], c=colors)
plt.show()

In [None]:
# Heat-map view of the pairwise distance matrix held in dist_matrix.
plt.imshow(dist_matrix)