In [None]:
import numpy as np
from numpy import linalg as LA
from scipy.sparse import csc_matrix,coo_matrix
from scipy.sparse.linalg import svds, eigs
from sklearn.decomposition import PCA,TruncatedSVD
import pickle
import pandas as pd
from random import randint

import umap
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans, AffinityPropagation, MeanShift
from sklearn.preprocessing import MinMaxScaler
import kmapper as km
from kmapper.cover import Cover
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import networkx as nx
from community import best_partition # this is not part of networkx

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from matplotlib.colors import ListedColormap
from scipy import ndimage
import imageio
import plotly
import plotly.graph_objs as go

import os

def bbox(img):
    """Return the bounding box of the non-zero content of ``img``.

    Parameters
    ----------
    img : array-like
        Image array; any non-zero entry counts as content.

    Returns
    -------
    (rmin, rmax, cmin, cmax)
        First and last row index, then first and last column index, that
        contain content. Both ends are *inclusive* indices.

    Raises
    ------
    IndexError
        If ``img`` contains no non-zero entry.
    """
    # Indices along axis 0 (rows) / axis 1 (columns) holding any content.
    occupied_rows = np.nonzero(np.any(img, axis=1))[0]
    occupied_cols = np.nonzero(np.any(img, axis=0))[0]

    rmin, rmax = occupied_rows[0], occupied_rows[-1]
    cmin, cmax = occupied_cols[0], occupied_cols[-1]
    return rmin, rmax, cmin, cmax
def embedding(data,dim):
    """Project ``data`` to ``dim`` dimensions with a fixed UMAP configuration.

    Parameters
    ----------
    data : array-like
        Samples to embed; passed unchanged to ``mapper.fit_transform``.
    dim : int
        Number of UMAP output components.

    Returns
    -------
    Whatever ``mapper.fit_transform`` returns for the UMAP projection.
    """
    # NOTE(review): `mapper` is not created in this function; it must already
    # exist in the notebook namespace (presumably a kmapper.KeplerMapper
    # instance -- confirm), otherwise this raises NameError.
    umap_settings = dict(
        n_components=dim,
        n_neighbors=200,
        a=None,
        angular_rp_forest=False,
        b=None,
        init='spectral',
        learning_rate=1.0,
        local_connectivity=1.0,
        metric='euclidean',
        metric_kwds=None,
        min_dist=0.1,
        n_epochs=500,
        negative_sample_rate=10,
        random_state=47,
        repulsion_strength=1.0,
        set_op_mix_ratio=0.5,
        spread=0.25,
        target_metric='categorical',
        target_metric_kwds=None,
        target_n_neighbors=-1,
        target_weight=0.5,
        transform_queue_size=10.0,
        transform_seed=42,
        verbose=False,
    )
    reducer = umap.UMAP(**umap_settings)
    return mapper.fit_transform(data, projection=reducer)

In [None]:
'''Loading the data'''
# One sub-directory per annotated class; class ids 1, 2, 3 follow this order.
directories = ['/home/garner1/Work/dataset/cellImages/training/augmented/Cancer',
               '/home/garner1/Work/dataset/cellImages/training/augmented/Immuno',
              '/home/garner1/Work/dataset/cellImages/training/augmented/Other']
cell_numb = 100  # maximum number of cell sub-directories read per class

# Every file is read and cropped exactly once. Labels and images are collected
# in the same loop iteration, so target[i] always describes images[i].
widths = []    # cropped extent along axis 0 (rows) of each image
heights = []   # cropped extent along axis 1 (columns) of each image
target = []    # class id (1, 2 or 3) of each image
cropped = []   # tight crop of each image, padded to a common shape below
for target_id, directory in enumerate(directories, start=1):
    # os.listdir returns entries in arbitrary order: sort so that the subset
    # selected by [:cell_numb] and the resulting image order are reproducible.
    for cell in sorted(os.listdir(directory))[:cell_numb]:
        path = os.path.join(directory, cell)
        for img_name in sorted(os.listdir(path)):
            img = imageio.imread(os.path.join(path, img_name))
            rmin, rmax, cmin, cmax = bbox(img)
            # bbox returns inclusive indices, hence the +1: slicing up to
            # rmax/cmax would drop the last non-zero row and column.
            crop = img[rmin:rmax + 1, cmin:cmax + 1]
            widths.append(crop.shape[0])
            heights.append(crop.shape[1])
            target.append(target_id)
            cropped.append(crop)

'''Resizing images to small boxes'''
# Zero-pad every crop at its bottom/right edge up to the largest extents seen.
Mwidths = max(widths)
Mheights = max(heights)
images = [
    np.pad(crop,
           ((0, Mwidths - crop.shape[0]), (0, Mheights - crop.shape[1])),
           'constant', constant_values=(0))
    for crop in cropped
]
del cropped  # the padded copies are all that later cells need

# Number of images and number of pixels per (padded) image.
print(len(images),np.asarray(images[0]).shape[0]*np.asarray(images[0]).shape[1])

In [None]:
# sns.set(style='white', context='notebook', rc={'figure.figsize':(168,120)})
# fig, ax_array = plt.subplots(3,10)
# axes = ax_array.flatten()
# for i, ax in enumerate(axes):
#     try:
#         ax.imshow(images[i], cmap='gray_r')
#     except IndexError:
#         i > len(images)
#         pass
# plt.setp(axes, xticks=[], yticks=[], frame_on=False)
# plt.tight_layout(h_pad=0.5, w_pad=0.01)

In [None]:
'''Data loading'''
# Flatten every padded image (2D -> 1D) into one column of a
# (pixels x images) matrix, then expose it transposed so that each row of
# `data` is one image.
pixels_per_image = Mwidths*Mheights
pixel_columns = np.zeros((pixels_per_image, len(images)))
for column, picture in enumerate(images):
    pixel_columns[:, column] = picture.ravel()
data = pixel_columns.T

In [None]:
'''2D visualization of annotated data'''
# Two-component UMAP embedding of the flattened images; random_state is fixed
# so the layout is repeatable.
reducer_2d = umap.UMAP(
    n_neighbors=50,
    min_dist=0.0,
    n_components=2,
    random_state=42,
)
d2_embedding = reducer_2d.fit_transform(data)

In [None]:
fig, ax = plt.subplots()
colors = ['tab:red', "tab:blue", 'tab:green']
labels = ['cancer','immuno','other']

# One boolean mask per annotated class (class ids 1, 2, 3 in `target`).
target_ids = np.asarray(target)
cancer, immuno, other = (target_ids == class_id for class_id in (1, 2, 3))

# Draw each class as its own scatter series so it gets a legend entry.
for mask, color, label in zip((cancer, immuno, other), colors, labels):
    ax.scatter(d2_embedding[mask, 0], d2_embedding[mask, 1],
               c=color, s=10, label=label)

ax.legend()
plt.show()

In [None]:
'''3D visualization of annotated data'''
# Three-component UMAP embedding of the flattened images; random_state is
# fixed so the layout is repeatable.
reducer_3d = umap.UMAP(
    n_neighbors=50,
    min_dist=0.0,
    n_components=3,
    random_state=42,
)
d3_embedding = reducer_3d.fit_transform(data)

In [None]:
# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()

# One boolean mask per annotated class (class ids 1, 2, 3 in `target`).
target_ids = np.asarray(target)
cancer = (target_ids == 1)
immuno = (target_ids == 2)
other = (target_ids == 3)

# One semi-transparent 3D marker trace per class, in legend order.
class_traces = [
    go.Scatter3d(
        x=d3_embedding[mask, 0],
        y=d3_embedding[mask, 1],
        z=d3_embedding[mask, 2],
        mode='markers',
        marker=dict(size=3, color=marker_color, opacity=0.25),
        name=class_name)
    for mask, marker_color, class_name in (
        (cancer, 'red', 'cancer'),
        (immuno, 'blue', 'immuno'),
        (other, 'green', 'other'),
    )
]
fig = go.Figure(data=class_traces)

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [None]:
'''
HDBSCAN clusters in 2D
low min sample size seems to reduce unclustered data;
larger min cluster size decrease cluster numbers
'''
d2_labels = hdbscan.HDBSCAN(min_samples=100,min_cluster_size=100).fit_predict(d2_embedding)

sns.set(style='white', rc={'figure.figsize':(10,8)})
# Points with a negative label are treated as unclustered and drawn in grey.
clustered = (d2_labels >= 0)
# `color=` (not `c=`) for the constant grey: a bare 3-tuple passed as `c` is
# indistinguishable from three values to colour-map, which matplotlib warns
# about and mis-handles when exactly three points are unclustered.
plt.scatter(d2_embedding[~clustered, 0],
            d2_embedding[~clustered, 1],
            color=(0.5, 0.5, 0.5),
            s=20,
            alpha=0.5)
# Clustered points are coloured by their cluster label through the colormap.
plt.scatter(d2_embedding[clustered, 0],
            d2_embedding[clustered, 1],
            c=d2_labels[clustered],
            s=20,
            cmap='Spectral');

In [None]:
'''
HDBSCAN clusters in 3D
low min sample size seems to reduce unclustered data;
larger min cluster size decrease cluster numbers
PCA reduction might not be a good idea because shape space is non-linear and the linear reduction could distort distances and later clustering
'''
d3_labels = hdbscan.HDBSCAN(min_samples=100,min_cluster_size=100).fit_predict(d3_embedding)

fig = go.Figure()
size = 2
# np.unique yields the labels sorted, so trace/legend order is deterministic.
# The loop variable is deliberately not called `cluster`: that name is the
# `sklearn.cluster` module alias imported at the top of the notebook.
for cluster_id in np.unique(d3_labels):
    in_cluster = (d3_labels == cluster_id)
    fig.add_trace(go.Scatter3d(
        x=d3_embedding[in_cluster,0],
        y=d3_embedding[in_cluster,1],
        z=d3_embedding[in_cluster,2],
        name="cluster "+str(cluster_id),
        mode="markers",
        # NOTE(review): a bare number is passed as the marker colour; confirm
        # Plotly renders this as a distinct colour per cluster as intended.
        marker=dict(color=cluster_id+1,size=size, opacity=1)
    ))
fig.update_layout(title_text="HDBSCAN clusters in 3D ",
                  title_font_size=30)
fig.show()
##########
# Points with a negative label are counted as unclustered.
clustered = (d3_labels >= 0)
print('The percentage of clustered data points is '+str(np.sum(clustered) *1.0/ data.shape[0]*100)+'%')


In [None]:
from itertools import compress

# Save one PNG per 3D HDBSCAN label showing every image assigned to it.
# The figure style and grid geometry do not depend on the label, so they are
# set once. The grid is sized from the full image list, so every cluster
# figure shares the same layout and thumbnail scale.
sns.set(style='white', context='notebook', rc={'figure.figsize':(168,120)})
n_cols = 30
# Integer division: plt.subplots requires an integer number of rows
# (true division yields a float under Python 3 and makes subplots fail).
n_rows = len(images)//n_cols + 1

for label in np.unique(d3_labels):
    clustered = (d3_labels == label)
    images_in_cluster = list(compress(images, clustered))
    fig, ax_array = plt.subplots(n_rows,n_cols)
    axes = ax_array.flatten()
    # zip stops at the last image of the cluster; remaining axes stay empty.
    for ax, cell_img in zip(axes, images_in_cluster):
        ax.imshow(cell_img, cmap='gray_r')
    fig.suptitle('Cluster# '+str(label), fontsize=300)
    plt.setp(axes, xticks=[], yticks=[], frame_on=False)
#     plt.tight_layout(h_pad=0.5, w_pad=0.01)
    fig.savefig('cluster_'+str(label)+'.png')
    # These figures are very large; release each one once it is on disk
    # instead of keeping all of them alive until the cell finishes.
    plt.close(fig)