In [1]:
import os
import numpy as np
from umap import UMAP
import matplotlib.pyplot as plt

In [2]:
%matplotlib notebook

In [3]:
working_dir = '/media/jswaney/Drive/Justin/organoid_phenotyping/datasets/'
os.listdir(working_dir)

['cyto_labels_combined.npy',
 'cyto_profiles_combined.npy',
 'cyto_profiles_combined_samples.npy',
 'cyto_tsne_combined.npy',
 'cyto_umap_combined.npy',
 'd35_wt',
 'd60_wt',
 'niche_labels_combined.npy',
 'niche_proximities_combined.npy',
 'niche_proximities_samples.npy',
 'niche_proximity_combined.png',
 'niche_tsne_combined.npy',
 'old']

In [4]:
profiles = np.load(os.path.join(working_dir, 'cyto_profiles_combined.npy'))
profiles.shape

(30000, 3, 5)

In [5]:
features = profiles.reshape((30000, -1))

Optional scaling and preprocessing

In [170]:
means = profiles.mean(axis=0).mean(axis=-1) * np.ones((5, 3))
scaled_profiles = profiles / means.T

In [172]:
from sklearn.preprocessing import scale

In [201]:
scaled_features = scale(features)

Flattened profiles seem best here...

In [6]:
np.random.seed(7)
x_umap = UMAP(metric='euclidean').fit_transform(features)
x_umap.shape

(30000, 2)

In [7]:
plt.plot(x_umap[:, 0], x_umap[:, 1], '.', alpha=0.1, markersize=3)
plt.show()

<IPython.core.display.Javascript object>

Filter bad profiles

In [204]:
good_idx = np.where(x_umap[:, 0] > -5)
x_umap = x_umap[good_idx]
features = features[good_idx]

In [274]:
profiles = profiles[good_idx]

In [321]:
plt.figure(figsize=(6, 6))
plt.plot(x_umap[:, 0], x_umap[:, 1], '.', alpha=0.2, markersize=3)
plt.show()

<IPython.core.display.Javascript object>

In [8]:
from sklearn.cluster import AgglomerativeClustering

Agglomerative Clustering

In [11]:
clustering = AgglomerativeClustering(n_clusters=6, linkage='ward').fit(x_umap)
labels = clustering.labels_

class_labels = np.unique(labels)
class_labels

array([0, 1, 2, 3, 4, 5])

In [12]:
plt.figure(figsize=(6, 6))
for i in range(0, class_labels.max() + 1):
    idx = np.where(labels == i)[0]
    plt.plot(x_umap[idx, 0], x_umap[idx, 1], '.', alpha=0.3, markersize=3, label=f'Cluster {i}')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

Show tSNE with these cluster labels

In [211]:
x_tsne = np.load(os.path.join(working_dir, 'cyto_tsne_combined.npy'))

In [212]:
x_tsne = x_tsne[good_idx]

In [304]:
for i in range(0, class_labels.max() + 1):
    idx = np.where(labels == i)[0]
    plt.plot(x_tsne[idx, 0], x_tsne[idx, 1], '.', alpha=0.1, markersize=3)
plt.show()

<IPython.core.display.Javascript object>

Show organoid labels

In [14]:
sample_labels = np.load(os.path.join(working_dir, 'cyto_profiles_combined_samples.npy'))

In [33]:
sample_labels = sample_labels[good_idx]

In [112]:
plt.figure(figsize=(8, 4))

plt.subplot(121)
for i in range(0, class_labels.max() + 1):
    idx = np.where(labels == i)[0]
    plt.plot(x_tsne[idx, 0], x_tsne[idx, 1], '.', alpha=0.1, markersize=3)
    
plt.subplot(122)
for i in np.unique(sample_labels):
    idx = np.where(sample_labels == i)[0]
    if i == 5:
        plt.plot(x_tsne[idx, 0], x_tsne[idx, 1], 'r.', alpha=0.1, markersize=3)
    else:
        plt.plot(x_tsne[idx, 0], x_tsne[idx, 1], 'b.', alpha=0.1, markersize=3)
plt.show()

<IPython.core.display.Javascript object>

In [15]:
plt.figure(figsize=(8, 4))

plt.subplot(121)
for i in range(0, class_labels.max() + 1):
    idx = np.where(labels == i)[0]
    plt.plot(x_umap[idx, 0], x_umap[idx, 1], '.', alpha=0.3, markersize=2, label=f'Cluster {i}')
# plt.legend()
    
plt.subplot(122)
for i in np.unique(sample_labels):
    idx = np.where(sample_labels == i)[0]
    if i == 5:
        plt.plot(x_umap[idx, 0], x_umap[idx, 1], 'r.', alpha=0.2, markersize=2)
    else:
        plt.plot(x_umap[idx, 0], x_umap[idx, 1], 'b.', alpha=0.2, markersize=2)
plt.show()

<IPython.core.display.Javascript object>

Inpsect profiles interactively

In [16]:
from sklearn.neighbors import NearestNeighbors

In [17]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [31]:
x = -6
y = 4

min_val = -8
max_val = 8

embedding = x_umap

fig = plt.figure(figsize=(9, 4))

ax0 = plt.subplot(121)
for i in range(class_labels.max() + 1):
    idx = np.where(labels == i)[0]
    plt.plot(embedding[idx, 0], embedding[idx, 1], '.', markersize=3, alpha=0.1)
line, = plt.plot(x, y, 'k*', markersize=8)
plt.xlabel('tSNE 1')
plt.ylabel('tSNE 2')

ax1 = plt.subplot(122)
ax1.set_ylim([0, 80])

nbrs = NearestNeighbors(1).fit(embedding)
dist, idx = nbrs.kneighbors(np.asarray([[x, y]]))
profile = profiles[idx][0, 0]

names = ['tbr1', 'sox2', 'dn']
colors = ['g', 'r', 'b']
line_profiles = []
for i, p in enumerate(profile):
    line_profile, = plt.plot(p, label=names[i], color=colors[i])
    line_profiles.append(line_profile)
plt.xlabel('distance')
plt.ylabel('count')
plt.legend(loc=1)
    
def update(x=0.0, y=0.0):
    line.set_xdata([x])
    line.set_ydata([y])
    
    _, idx = nbrs.kneighbors(np.asarray([[x, y]]))
    profile = profiles[idx][0, 0]
    for line_profile, p in zip(line_profiles, profile):
        line_profile.set_ydata(p)
    ax1.autoscale_view(None, False, True)
    fig.canvas.draw_idle()
    
interact(update, 
         x=widgets.FloatSlider(min=min_val, max=max_val, step=0.05),
         y=widgets.FloatSlider(min=min_val, max=max_val, step=0.05))

plt.show()

<IPython.core.display.Javascript object>

interactive(children=(FloatSlider(value=0.0, description='x', max=8.0, min=-8.0, step=0.05), FloatSlider(value…

Show sample of profiles from each cluster

In [19]:
colors = ['g', 'r', 'b']
plt.figure(figsize=(4, 12))
for i in np.unique(class_labels):
    idx = np.where(labels == i)[0]
    profile_cluster = profiles[idx]
    average_profile = profile_cluster.mean(axis=0)
    
    plt.subplot(class_labels[-1] + 1, 1, i+1)
    for p, c in zip(average_profile, colors):
        plt.plot(p, color=c)
plt.show()

<IPython.core.display.Javascript object>

In [20]:
n_profiles = 10

plt.figure(figsize=(10, 12))
for i in np.unique(class_labels):
    idx = np.where(labels == i)[0]
    profile_cluster = profiles[idx]
    
    np.random.shuffle(profile_cluster)
    sample = profile_cluster[:n_profiles]
    
    for j, s in enumerate(sample):
        subplot_idx = len(class_labels) * j + i + 1
        plt.subplot(n_profiles, class_labels[-1] + 1, subplot_idx)
        for p, c in zip(s, colors):
            plt.plot(p, color=c)
            plt.axis('off')
plt.show()

<IPython.core.display.Javascript object>

- Euclidean distance metric, 6 clusters on UMAP looks good
- Correlation distance metric, 5 clusters may look better

Build Seaborn Lineplots

In [23]:
import pandas as pd
import seaborn as sns

In [72]:
cluster_to_plot = 5

dist = []
count = []
celltype = []
cluster = []

# cluster_names = ['Interfering',
#                  'Mature',
#                  'Thicc',
#                  'Trash',
#                  'No Tbr1',
#                  'Little Tbr1']

cluster_names = ['Mature',
                 'Trash',
                 'No Tbr1',
                 'Interfering',
                 'Little Tbr1',
                 'Thick']

celltype_names = ['tbr1', 'sox2', 'dn']

for i, profile in enumerate(profiles):
    for k, p in enumerate(profile):
        for b, c in enumerate(p):
            cluster_name = cluster_names[labels[i]]
            if cluster_name == cluster_names[cluster_to_plot]:
                dist.append(b)
                count.append(c)
                cluster.append(cluster_name)
                celltype.append(celltype_names[k])
                
df = pd.DataFrame({'dist': dist, 
                   'count': count, 
                   'celltype': celltype})

In [63]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

In [65]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

In [67]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

In [69]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

In [71]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

In [73]:
sns.lineplot(x="dist", y="count",
             hue="celltype", ci='sd', palette=['g', 'r', 'b'],
             data=df)
plt.title(cluster_names[cluster_to_plot])
plt.show()

<IPython.core.display.Javascript object>

Save the cluster labels

In [75]:
np.save(os.path.join(working_dir, 'cyto_labels_combined.npy'), labels)

Create a classifier