## Documentation of image-based clustering analysis 

This notebook is not used for the OpenCell publication. It documents historical attempts to analyze the target localization encodings (that is, the latent-space representations of the trained cytoself model).

In [None]:
import anndata as ad
import IPython
import json
import leidenalg
import numpy as np
import os
import pandas as pd

import sknetwork
import scanpy as sc
import scanpy.external as sce
import sklearn.cluster
import sklearn.manifold
import sklearn.decomposition
import sys

from matplotlib import pyplot as plt
from matplotlib import rcParams

In [None]:
%load_ext autoreload
%autoreload 1

sys.path.append('../scripts/')
%aimport cytoself_analysis.clustering_workflows
%aimport cytoself_analysis.ground_truth_labels

from cytoself_analysis import (
    loaders, clustering_workflows, ground_truth_labels, analysis_utils, go_utils
)

In [None]:
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False)
rcParams['font.family'] = 'sans-serif'
rcParams['axes.grid'] = False

In [None]:
# optional: update the cached lines/ payloads
ground_truth_labels.cache_target_metadata(data_dirpath=os.path.abspath('../data'))

In [None]:
root_dirpath = '/Users/keith.cheveralls/clustering-results'
res = loaders.load_december_results(root_dirpath=root_dirpath, dataset='full', rep=3)

In [None]:
res.test_labels.shape, res.test_vq2.shape, res.test_vq1_ind.shape, res.test_vq2_ind.shape

In [None]:
res.concatenate_orphans()

In [None]:
res.test_labels.shape, res.test_vq2.shape, res.test_vq2_ind.shape

In [None]:
res.test_labels = ground_truth_labels.merge_all(
    df=res.test_labels, data_dirpath=os.path.abspath('../data')
)

### Create the ClusteringWorkflow instances

In [None]:
# for vectors
adata = res.export_adata(
    vq='vq2', kind='vectors', using='mean', rerun=True, pub_ready_only=True
)

In [None]:
adata.obs.shape

In [None]:
# vectors
cwv = clustering_workflows.ClusteringWorkflow(adata=adata)
cwv.preprocess(
    do_log1p=False,
    do_scaling=False,
    n_top_genes=None,
    n_pcs=200,
)
cwv.calculate_neighbors(n_neighbors=10, n_pcs=200, metric='euclidean')

In [None]:
# adjacencies from the raw data (without PCA)
# sc.pp.neighbors(cwv.adata, n_neighbors=10, use_rep='X', metric='correlation')

In [None]:
# for histograms
adata = res.export_adata(vq='vq1', kind='histograms')
cwh1 = clustering_workflows.ClusteringWorkflow(adata=adata)

In [None]:
# for histograms
adata = res.export_adata(vq='vq2', kind='histograms')
cwh2 = clustering_workflows.ClusteringWorkflow(adata=adata)

In [None]:
# VQ1 histograms
cwh1.preprocess(
    do_log1p=True,
    do_scaling=True,
    n_top_genes=None,
    n_pcs=200,
)
cwh1.calculate_neighbors(n_neighbors=10, n_pcs=200, metric='correlation')

In [None]:
# VQ2 histograms
cwh2.preprocess(
    do_log1p=True,
    do_scaling=True,
    n_top_genes=None,
    n_pcs=200,
)
cwh2.calculate_neighbors(n_neighbors=10, n_pcs=200, metric='correlation')

In [None]:
# aside: how to calculate neighbors directly from the raw vectors
sc.pp.neighbors(cwh2.adata, use_rep='X', n_neighbors=10, metric='correlation')

In [None]:
# overlay the histogram encodings of two targets
plt.figure(figsize=(16, 3))

# `cwh` is not defined anywhere in this notebook (only cwh1 and cwh2 are),
# so the VQ2 histogram workflow is used here
# NOTE(review): switch to cwh1 if the VQ1 histograms were intended
labels = cwh2.adata.obs.copy()
ind = labels.loc[labels.target_name == 'C4orf32'].index[0]
plt.plot(cwh2.adata.X[int(ind), :])

ind = labels.loc[labels.target_name == 'TMEM208'].index[0]
plt.plot(cwh2.adata.X[int(ind), :], alpha=0.7)

In [None]:
# the number of patches per cell line
_ = plt.hist(res.test_labels.cell_line_id.value_counts().values, bins=100)

In [None]:
pca = res.plot_explained_variance(res.target_vectors['vq2']['X'], n_dims=50)

### Compare the computed adata to the cached adata object used by the umap-viewer app

In [None]:
filepath = os.path.join(res.exports_dirpath, '%s-target-%s-adata.h5ad' % ('vq2', 'vectors'))
adata = ad.read_h5ad(filepath)

In [None]:
adata.X.shape

In [None]:
# rows in which any elements are different
np.argwhere(np.any(cwv.adata.X.toarray() != adata.X.toarray(), axis=1))

In [None]:
# rows in which any elements are different
np.argwhere(
    np.any(
        cwv.adata.obsp['connectivities'].toarray() != adata.obsp['connectivities'].toarray(),
        axis=1
    )
)

In [None]:
sc.tl.umap(cwv.adata, init_pos='spectral', min_dist=0.0, random_state=42)

In [None]:
np.argwhere(np.any(cwv.adata.obsm['X_umap'] != adata.obsm['X_umap'], axis=1))

### Load the ClusteringWorkflow from the cached adata object used by the umap viewer

In [None]:
# vectors
cwv = clustering_workflows.ClusteringWorkflow(adata=adata)
cwv.preprocess(
    do_log1p=False,
    do_scaling=False,
    n_top_genes=None,
    n_pcs=200,
)
cwv.calculate_neighbors(n_neighbors=10, n_pcs=200, metric='euclidean')

## 2D UMAPs

In [None]:
cw = cwv
sc.tl.umap(cw.adata, init_pos='spectral', min_dist=0.0, random_state=51)

In [None]:
sc.pl.umap(cw.adata, color='grade_3_annotation', palette='tab10', alpha=0.5)

In [None]:
# compare UMAPs from histograms and vectors
fig, axs = plt.subplots(1, 2, figsize=(16, 7))

sc.pl.umap(cwv.adata, color='grade_3_annotation', palette='tab10', alpha=0.5, ax=axs[0])
axs[0].set_title('VQ2 vectors')

sc.pl.umap(cwh2.adata, color='grade_3_annotation', palette='tab10', alpha=0.5, ax=axs[1])
axs[1].set_title('VQ2 histograms')

### Leiden clustering at a range of resolutions and with multiple seeds

In [None]:
# remove every column whose name starts with 'cluster_id' from the obs dataframe
stale_columns = [
    column for column in cwv.adata.obs.columns if column.startswith('cluster_id')
]
cwv.adata.obs.drop(columns=stale_columns, inplace=True)

In [None]:
seeds = range(10, 20)
resolutions = [
    0.1, 0.25, 0.5, 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100
]

In [None]:
# one Leiden run per random seed, all at resolution=30;
# each run is stored in its own obs column named after the seed
for seed in seeds:
    column = 'cluster_id_leiden_res30_seed%s' % seed
    cwv.run_leiden(resolution=30, random_state=seed, key_added=column)

In [None]:
# one Leiden run per resolution, all with random_state=42;
# each run is stored in its own obs column named after the resolution
for resolution in resolutions:
    column = 'cluster_id_leiden_res%s' % resolution
    cwv.run_leiden(resolution=resolution, random_state=42, key_added=column)

In [None]:
# one Leiden run for every (resolution, seed) combination;
# each run is stored in its own obs column named after both parameters
for resolution in resolutions:
    for seed in seeds:
        column = 'cluster_id_leiden_res%s_seed%s' % (resolution, seed)
        cwv.run_leiden(resolution=resolution, random_state=seed, key_added=column)

In [None]:
d = cwv.adata.obs.copy()

In [None]:
# list all targets that share a Leiden cluster with OSTC
# (the 'cluster_id_leiden_seed42' column used here previously is never created above:
# all 'cluster_id*' columns are dropped and the random_state=42 run at resolution=30
# is stored as 'cluster_id_leiden_res30')
# NOTE(review): confirm that resolution=30 is the run that was intended here
leiden_column = 'cluster_id_leiden_res30'
cluster_id = d.loc[d.target_name == 'OSTC'][leiden_column].iloc[0]
d.loc[d[leiden_column] == cluster_id]

In [None]:
# compare the clusters from different random seeds
# (the 'cluster_id_leiden_seed42' and 'cluster_id_leiden_seed48' columns used here previously
# are never created above; two of the resolution=30 seed runs are compared instead)
# NOTE(review): any pair of seeds from `seeds` can be substituted here
from sklearn import metrics
metrics.adjusted_rand_score(
    cwv.adata.obs['cluster_id_leiden_res30_seed10'],
    cwv.adata.obs['cluster_id_leiden_res30_seed11'],
)

### Find the random seed for Leiden clustering that yields the greatest modularity

In [None]:
adj = cwv.adata.obsp['connectivities'].toarray()
graph = sc._utils.get_igraph_from_adjacency(adj, directed=True)

In [None]:
# run Leiden at resolution=30 once per seed and record the quality of each partition,
# evaluated on the igraph graph built from the connectivities matrix
qualities = []
for seed in range(1000):
    cwv.run_leiden(resolution=30, random_state=seed)

    # the cluster ids of the run that just finished, as plain integers
    membership = list(map(int, cwv.adata.obs.leiden))

    partition = leidenalg.RBConfigurationVertexPartition(
        graph, resolution_parameter=30, initial_membership=membership
    )
    qualities.append(partition.quality())

In [None]:
_ = plt.hist(qualities, bins=30)

In [None]:
best_seed = np.argmax(qualities)
best_seed

In [None]:
min(qualities), max(qualities)

In [None]:
seed = 304
cwv.run_leiden(
    resolution=30, random_state=seed, key_added='cluster_id_leiden_res30_seed%s' % seed
)

In [None]:
# write the target identifiers and the 'leiden' cluster ids to a CSV file
# NOTE(review): this exports the 'leiden' column, not the 'cluster_id_leiden_res30_seed304'
# column written by the previous cell - confirm that 'leiden' holds the intended run
export_columns = ['cell_line_id', 'target_name', 'plate_id', 'well_id', 'leiden']
df = cwv.adata.obs[export_columns].copy()

export_filepath = (
    '/Users/keith.cheveralls/clustering-results/exports/'
    '2020-december-results-full-rep3--only-pub-ready--mean-VQ2-vectors--n_pcs=200--n_neighbors=10--best-leiden--res=30.csv'
)
df.to_csv(export_filepath, index=False)

### Identify 'stable' clusters of targets that are always in the same Leiden cluster

In [None]:
obs = cwv.adata.obs.copy()
# obs.reset_index(inplace=True)

# the obs columns holding the ten resolution=30 Leiden runs (seeds 10 through 19)
leiden_columns = ['cluster_id_leiden_res30_seed%s' % seed for seed in range(10, 20)]

num_targets = obs.shape[0]
leiden_ids = obs[leiden_columns].values

# connectivities[i, j] is the number of Leiden runs in which targets i and j
# were assigned the same cluster id; it is accumulated one run at a time with a
# broadcast comparison instead of a Python loop over all pairs of targets
connectivities = np.zeros((num_targets, num_targets), dtype=int)
for run_ids in leiden_ids.T:
    connectivities += (run_ids[:, None] == run_ids[None, :]).astype(int)

# a target is not counted as its own neighbor
np.fill_diagonal(connectivities, 0)

In [None]:
np.argwhere(connectivities[1, :] > 9).flatten()

In [None]:
connectivities.max()

In [None]:
cwv.adata.obs.reset_index(inplace=True)

In [None]:
# targets without any always-co-clustered partner keep a missing stable id
cwv.adata.obs['stable_leiden_id'] = np.nan

# True where two targets shared a cluster in more than nine of the Leiden runs
# (the comparison creates a new array, so `connectivities` itself is not modified)
always_together = connectivities > 9

stable_id = -1

# this relies on the obs index being the integer row position (see the reset_index above)
for target_ind in cwv.adata.obs.index:
    partner_inds = np.argwhere(always_together[target_ind, :]).flatten()
    if partner_inds.size == 0:
        continue

    # start a new stable cluster containing this target and all of its partners
    stable_id += 1
    cwv.adata.obs.at[target_ind, 'stable_leiden_id'] = stable_id
    for partner_ind in partner_inds:
        cwv.adata.obs.at[partner_ind, 'stable_leiden_id'] = stable_id

        # blank the partner's row so that it does not start a cluster of its own later
        always_together[partner_ind, :] = False

In [None]:
cwv.adata.obs.stable_leiden_id.notna().sum()

In [None]:
_ = plt.hist(cwv.adata.obs.stable_leiden_id.value_counts(), bins=np.arange(15))
_ = plt.hist(cwv.adata.obs.cluster_id_leiden_res30_seed11.value_counts(), bins=np.arange(15), alpha=0.5)

### Are higher-res Leiden clusters contained in single lower-res clusters?

Answer: no.

In [None]:
# for each cluster in one clustering, count the distinct clusters of the other
# clustering that its members fall into, then plot the distribution of these counts
# NOTE(review): both columns below are resolution=30 runs that differ only in the seed,
# despite the low_res/high_res names - confirm which pair of resolutions was intended
d = cwv.adata.obs
low_res = d.cluster_id_leiden_res30_seed10
high_res = d.cluster_id_leiden_res30_seed11

counts = [
    len(set(low_res[high_res == cluster_id])) for cluster_id in set(high_res)
]

_ = plt.hist(counts)

In [None]:
# compare two low-res random seeds
cwv.plot_sankey(
    ground_truth_label='cluster_id_leiden_res0.5_seed10', 
    predicted_label='cluster_id_leiden_res0.5_seed13'
)

### Compare Leiden to agglomerative clustering

In [None]:
# agglomerative using PCA coords
cwv.run_agglomerative(130, method='pca')

In [None]:
# using 3D UMAP
cwv.run_agglomerative(30, method='umap', n_umap_components=3)

In [None]:
# `cwh` is not defined anywhere in this notebook; the neighboring cells in this section
# all operate on the vectors workflow, so its Leiden clusters are plotted here
cwv.plot_umap(color_label='leiden', min_dist=0.1)

In [None]:
cwv.plot_umap(color_label='agg_cluster_id', min_dist=0.1)

In [None]:
cwv.plot_sankey(ground_truth_label='leiden', predicted_label='agg_cluster_id')

### Explore weighted adjacencies

In [None]:
# create a distance-matrix dataframe
cw = cwv
labels = cw.adata.obs.copy()
index_by = 'target_name'

adj = cw.adata.obsp['connectivities'].toarray()
df = pd.DataFrame(data=adj, index=labels[index_by], columns=labels[index_by])
df.shape

In [None]:
target = 'MKI67'
df[target].sort_values(ascending=False).iloc[:10].sort_values(ascending=False)

In [None]:
target = 'C4orf32'
df[target].sort_values(ascending=False).iloc[:20].sort_values(ascending=False)

## Hierarchical clustering of the high-resolution Leiden clusters

This uses the Paris algorithm and the weighted adjacency matrix summed over the high-resolution Leiden clusters.

In [None]:
sc.tl.umap(cwv.adata, min_dist=0, random_state=51)

In [None]:
sc.pl.umap(cwv.adata, color='label_0', palette='tab10', alpha=0.5)

In [None]:
# for pub-ready-only, C4orf32 is clustered in a group of three related proteins 
# only in seed11 and seed18 - in all others, it's with five (two of which unrelated)

In [None]:
cwv.adata.obs

In [None]:
cwv.calculate_paris_hierarchy(leiden_cluster_column='cluster_id_leiden_res30_seed18')
cwv.full_dendrogram.shape

In [None]:
cwv.calculate_paris_hierarchy(leiden_cluster_column='stable_leiden_id')
cwv.full_dendrogram.shape

In [None]:
cwv.full_dendrogram.max(axis=0)

In [None]:
# plot the full dendrogram
svg = cwv.plot_full_dendrogram(using='svg')
IPython.display.SVG(svg)

In [None]:
# plot the cut dendrogram and the UMAP
_ = cwv.plot_dendrogram_umap(cut_threshold=0.20, ground_truth_label='label_0', orientation='left')

### Aside: manually drawing the dendrogram

In [None]:
cut_dendrogram_ids, cut_dendrogram = sknetwork.hierarchy.cut_straight(
    cwv.full_dendrogram, threshold=0.5, return_dendrogram=True
)

In [None]:
# scipy's dendrogram routine computes the plotting coordinates used below;
# `sp_dendrogram` was previously referenced in this cell without ever being imported
from scipy.cluster.hierarchy import dendrogram as sp_dendrogram

# the largest cluster id returned by cut_straight
# (the leaf nodes are numbered 0..num_nodes below)
num_nodes = cut_dendrogram_ids.max()

# these are the ids of the merged nodes (one per row of the cut dendrogram)
compound_node_ids = np.arange(num_nodes + 1, num_nodes + cut_dendrogram.shape[0] + 1)

# node id and distance for the leaf nodes (distance is zero by definition)
leaf_node_data = pd.DataFrame(
    data=list(range(num_nodes + 1)), columns=['node_id']
)
leaf_node_data['kind'] = 'leaf'

# the size is not really zero, but the size of the leiden cluster the leaf node corresponds to
leaf_node_data['size'] = 0

# y position of the leaf nodes is zero by construction
# (that is, the leaf nodes all lie on the x-axis)
leaf_node_data['y'] = 0
leaf_node_data['distance'] = 0

# one row per merged node: its id followed by the four columns of the cut dendrogram
compound_node_data = pd.DataFrame(
    data=np.concatenate((compound_node_ids[:, None], cut_dendrogram), axis=1),
    columns=['node_id', 'child1_id', 'child2_id', 'distance', 'size']
)
compound_node_data['kind'] = 'compound'

# the dataframe of all nodes, indexed by node id
data = pd.concat((leaf_node_data, compound_node_data), axis=0)
data.index = data.node_id

# calculate compound node coords from the scipy dendrogram plotting method
# (this call also draws the dendrogram that is annotated in the loop below)
coords = sp_dendrogram(cut_dendrogram)

# the coords returned by the scipy dendrogram method
# are not sorted by distance, but the cut_dendrogram array is,
# so we need this order to determine the merged_node_id from a row index of the coords
order = np.argsort(np.array(coords['dcoord'])[:, 1])

# invert the permutation once, so that row_rank[ind] is the position of `ind` in `order`
# (this replaces a linear list search on every iteration of the loop)
row_rank = np.argsort(order)

for ind in range(len(coords['dcoord'])):

    # the tricky bit: get the compound node id that corresponds to the current row of the coords array
    compound_node_id = compound_node_ids[row_rank[ind]]

    # the horizontal position and 'width' of the compound nodes
    # (which is what we needed scipy.dendrogram to calculate for us)
    data.at[compound_node_id, 'x'] = np.mean(coords['icoord'][ind])
    data.at[compound_node_id, 'width'] = np.max(coords['icoord'][ind]) - np.min(coords['icoord'][ind])

    # the vertical position is identical to the node distance
    data.at[compound_node_id, 'y'] = coords['dcoord'][ind][1]

    # sanity check - draw the compound node ids on the dendrogram
    plt.plot(
        data.loc[compound_node_id, 'x'],
        data.loc[compound_node_id, 'y'],
        'o',
        c=coords['color_list'][ind]
    )
    plt.gca().annotate(
        str(compound_node_id),
        (data.loc[compound_node_id, 'x'], data.loc[compound_node_id, 'y']),
        xytext=(0, -5),
        textcoords='offset points',
        va='top',
        ha='center'
    )

In [None]:
# determine the x position of the leaf nodes
# (each leaf is placed at one of the two ends of its parent's horizontal span,
# computed from the parent's 'x' and 'width' columns)
for ind, row in data.iterrows():
    if row.kind != 'leaf':
        continue
    # find the node that lists this leaf as its first or second child
    child_column = 'child1_id' if ind in data.child1_id.values else 'child2_id'
    parent_row = data.loc[data[child_column] == ind].iloc[0]
    # the left and right ends of the parent's horizontal span
    left_x = parent_row.x - parent_row.width/2
    right_x = parent_row.x + parent_row.width/2
    # if some node already has an x equal to the right end (presumably the sibling),
    # this leaf takes the left end; otherwise it takes the right end
    # (this depends on the x values written by earlier iterations of this loop)
    if right_x in data.x.values:
        data.at[ind, 'x'] = left_x
    else:
        data.at[ind, 'x'] = right_x

In [None]:
# draw every node of the dendrogram, with edges from compound nodes to their children
plt.figure(figsize=(7, 7))
for ind, row in data.iterrows():
    is_compound = row.kind == 'compound'

    # a grey edge from a compound node to each of its two children
    if is_compound:
        for child_id in (row.child1_id, row.child2_id):
            child = data.loc[child_id]
            plt.plot([row.x, child.x], [row.y, child.y], c='#aaa')

    # the node marker, whose size grows with the node's 'size' column
    plt.plot([row.x], [row.y], 'o', markersize=(row['size']/3 + 3))

    # the node id label, offset in opposite directions for compound and leaf nodes
    label_offset = (0, 5) if is_compound else (0, -5)
    plt.gca().annotate(
        "%d" % row.node_id,
        (row.x, row.y),
        xytext=label_offset,
        textcoords='offset points',
        va='top',
        ha='center',
        fontsize=10
    )

### Aside: construct JSON representations of the dendrogram

In [None]:
# construct the nested JSON required by d3.hierarchy from the cut dendrogram

# the obs column of Leiden cluster ids used to size the leaf nodes
# (this name was previously never defined, so the leaf branch of create_node raised a NameError)
# NOTE(review): this is the column passed to the last calculate_paris_hierarchy call above;
# confirm that it matches the hierarchy from which cut_dendrogram was computed
leiden_cluster_column = 'stable_leiden_id'

num_nodes = cut_dendrogram_ids.max()

# these are the indices of the merged nodes
merged_node_inds = list(np.arange(num_nodes + 1, num_nodes + cut_dendrogram.shape[0] + 1))

# this is an array of
# (merged_node_ind, child1_node_ind, child2_node_ind, distance, merged_node_size)
# dend = np.concatenate((merged_node_inds[:, None], cut_dendrogram), axis=1)

def create_node(node_ind):
    """
    Recursively build the nested dict for the dendrogram node with index `node_ind`.

    Returns a dict with the keys 'type', 'id', 'size', 'distance', and 'children';
    'children' is a list of two nested node dicts for a merged (compound) node
    and an empty list for a leaf node.
    """
    # if the node has children
    if node_ind in merged_node_inds:
        row = cut_dendrogram[merged_node_inds.index(node_ind), :]
        child1_node_ind, child2_node_ind, distance, size = row
        node = {
            'type': 'compound',
            'id': int(node_ind),
            'size': int(size),
            'distance': float(distance),
            'children': [create_node(ind) for ind in [child1_node_ind, child2_node_ind]],
        }

    # if the node is a leaf (terminal) node
    # (size is the number of Leiden clusters in the node)
    else:
        in_node = cwv.adata.obs.paris == node_ind
        node = {
            'type': 'leaf',
            'id': int(node_ind),
            'size': len(cwv.adata.obs.loc[in_node][leiden_cluster_column].unique()),
            'distance': 0,
            'children': [],
        }
    return node

# the last merged node is the root of the dendrogram
result = create_node(max(merged_node_inds))

In [None]:
# JSON string to copy to observable notebooks
json.dumps(result)

In [None]:
# flat lists of nodes and edges, filled in place by flatten_node
nodes = []
edges = []

def flatten_node(node):
    """
    Recursively append `node` and all of its descendants to the `nodes` list,
    and an edge for every parent-child pair to the `edges` list.

    `node` is a nested dict with the keys 'id', 'size', 'distance', and 'children'.
    The 'distance' of an edge is the parent's distance minus the child's distance.
    """
    # the node identifier is stored under the 'id' key
    # (this function previously read a non-existent 'ind' key and raised a KeyError)
    node_id = node['id']
    nodes.append({'id': node_id, 'size': node['size']})
    for child_node in node['children']:
        edges.append({
            'source': node_id,
            'target': child_node['id'],
            'distance': node['distance'] - child_node['distance'],
        })
        flatten_node(child_node)
        
flatten_node(result)

In [None]:
# json dump for d3 force-directed network layout
json.dumps({'nodes': nodes, 'links': edges})

## Exports for the dash app (for 2020-december results)
This is for reference only and documents how the mass-spec clusters were merged with the target_labels dataframe used by the dash app.

## Export the adata objects for creating gridded UMAPs

In [None]:
# reload the results with the loader imported at the top of the notebook
# (`model_results` is never imported here, so the previous call raised a NameError)
# NOTE(review): the first load in this notebook also passed rep=3 - confirm whether it is needed here
res = loaders.load_december_results(
    root_dirpath='/Users/keith.cheveralls/clustering-results/', dataset='full'
)

In [None]:
# for vectors
adata = res.export_adata(vq='vq2', kind='vectors', using='median')
adata.write_h5ad(
    '/Users/keith.cheveralls/image-data/december-results-full-median-vq2-target-vectors-adata.h5ad'
)

In [None]:
adata = ad.read_h5ad(
    '/Users/keith.cheveralls/image-data/december-results-full-median-vq2-target-vectors-adata.h5ad'
)