## Generate the localization-based UMAP, Leiden clusters, and hierarchy

This notebook documents the generation of the figures related to localization-based clustering. This includes the target UMAP, the Leiden clustering, the hierarchical clustering of the Leiden clusters (using the Paris algorithm), GO term enrichment analysis, and the matrix of localization similarities.

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import pathlib
import scanpy as sc
import seaborn as sns
import sys
import os

from matplotlib import pyplot as plt
from matplotlib import rcParams

# enable IPython's autoreload extension; in mode 1 only the modules
# registered with %aimport are reloaded before a cell is executed
%load_ext autoreload
%autoreload 1

# add the directory two levels up to the import path so that the `scripts` package can be imported
# (this relative path assumes the kernel's working directory is the notebook's own directory - TODO confirm)
sys.path.append('../../')
%aimport scripts.cytoself_analysis.clustering_workflows
from scripts.cytoself_analysis import clustering_workflows, go_utils

# plotting defaults for scanpy and matplotlib
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False)
rcParams['font.family'] = 'sans-serif'
rcParams['axes.grid'] = False

In [None]:
# directory from which the input data is read
data_dir = pathlib.Path('../../data/')

# directory to which all figures and tables are exported
# NOTE(review): this is an absolute, machine-specific path
output_dir = pathlib.Path('/Users/keith.cheveralls/Box/KC-opencell-paper/image-based-clustering/')

def timestamp():
    """
    Return today's date as a 'YYYY-MM-DD' string.

    The string is used as a prefix for the names of the exported files.
    """
    # imported locally because `datetime` is not among the notebook's top-level imports
    import datetime

    return datetime.datetime.now().strftime('%Y-%m-%d')

### Load the adata object of target localization encodings
This anndata object includes results of preprocessing: the 200 PCs, kNN matrix, and UMAP coords. The generation of this object is documented in the notebook `generate-localization-encoding.ipynb` and it can be downloaded from Figshare [here](https://figshare.com/articles/dataset/Consensus_protein_localization_encodings_for_all_OpenCell_targets/16754965). 

In [None]:
# read the target localization encodings and wrap them in a clustering workflow
encodings_path = data_dir / 'figshare' / 'final-opencell-target-localization-encodings.h5ad'
adata = ad.read_h5ad(encodings_path)
cwv = clustering_workflows.ClusteringWorkflow(adata=adata)

### UMAP of target localization encodings

In [None]:
# target UMAP, colored by the `grade_3_annotation` column
sc.pl.umap(
    cwv.adata,
    color='grade_3_annotation',
    palette='tab10',
    alpha=0.5,
)

### Leiden clustering

In [None]:
# Leiden resolutions: the optimal low-res (0.631) and intermediate-res (6.31) values
# (from the ARI curves calculated in the clustering-performance notebook),
# followed by the high-res value of 30
resolutions = [0.631, 6.31, 30]

# random seeds to run (seed 18 is the one we use)
seeds = list(range(17, 20))

In [None]:
# run the Leiden clustering once for every (resolution, seed) pair,
# passing a distinct `key_added` for each run
for resolution in resolutions:
    for seed in seeds:
        leiden_key = 'cluster_id_leiden_res%s_seed%s' % (resolution, seed)
        cwv.run_leiden(resolution=resolution, random_state=seed, key_added=leiden_key)

### Paris hierarchy from the Leiden clusters
This uses the Leiden clusters at resolution of 30 with random seed 18.

In [None]:
# build the Paris hierarchy from the resolution-30, seed-18 Leiden clusters
paris_kwargs = dict(leiden_cluster_column='cluster_id_leiden_res30_seed18', shuffled=False)
cwv.calculate_paris_hierarchy(**paris_kwargs)

# display the shape of the resulting full dendrogram
cwv.full_dendrogram.shape

In [None]:
# dendrogram and labeled UMAP for a cut at a threshold of 0.2
_ = cwv.plot_dendrogram_umap(
    cut_threshold=0.2,
    ground_truth_label='label_0',
    orientation='left',
)

In [None]:
# export the full dendrogram in SVG format
cwv.plot_full_dendrogram(using='sp')

# the filename is formatted before it is joined to the directory: `/` and `%` have
# equal precedence and associate left to right, so `path / '...' % value` would
# apply `%` to a Path object and raise a TypeError
dendrogram_filename = '%s-full-dendrogram-seed18.svg' % timestamp()
plt.savefig(output_dir / 'opencell-paris-figure' / dendrogram_filename)

In [None]:
# export the dendrogram-UMAP plot in SVG format
# (cut_threshold was edited by hand to produce each of the three cuts)
cut_threshold = 1.5
fig = cwv.plot_dendrogram_umap(cut_threshold=cut_threshold)

figure_filename = '%s-dendrogram-and-umap-for-paris-hierarchy-cut-at-%s.svg' % (
    timestamp(), cut_threshold
)
fig.savefig(output_dir / 'opencell-paris-figure' / figure_filename)

In [None]:
# plot the UMAPs colored by paris clusters for the two cuts
# (note that the export format must be SVG in order for the scatterplot dots
# to be grouped by cluster in affinity designer)

fig, axs = plt.subplots(1, 3, figsize=(15, 4))
cut_thresholds = [1.5, 0.2]
cmap = sns.color_palette('tab20')

# one panel per cut; the panel and the cluster loops use distinct variable names
# so that the inner loop cannot overwrite the outer loop's state
for ax, threshold in zip(axs, cut_thresholds):
    cwv.cut_dendrogram(threshold)
    obs = cwv.adata.obs.copy()

    # one scatter call per cluster, so that each cluster gets its own color and legend entry
    for color_ind, cluster_id in enumerate(obs.cut_dendrogram_cluster_id.unique()):
        mask = obs.cut_dendrogram_cluster_id == cluster_id
        ax.scatter(
            *cwv.adata.obsm['X_umap'][mask, :].transpose(),
            alpha=0.5,
            color=cmap[color_ind],
            label=cluster_id
        )
    ax.legend()

# the export is disabled by default
# NOTE(review): this writes a PDF, although the note above says SVG is required - confirm
if False:
    plt.savefig(output_dir / 'opencell-paris-figure' / ('%s-target-umap-both-cuts.pdf' % timestamp()))

### Export the UMAP coordinates and cluster ids

This includes the Leiden clusters (for resolution 30 and seed 18), the optimal low-res and intermediate-res Leiden clusters, the branch and module ids from the two Paris hierarchy cuts, and the UMAP coords (for seed 51).

In [None]:
# create the Excel writer to which the sheets are written in the cells below
# (the filename is formatted inside the parentheses, before it is joined to the
# directory; applying `%` after `/` would operate on the Path and raise a TypeError)
excel_file = pd.ExcelWriter(output_dir / ('%s-image-based-clustering.xlsx' % timestamp()))

In [None]:
# hand-defined mapping from the raw branch and module ids to those used in Fig S10
figure_module_ids = ['N%d' % number for number in range(1, 20)]

# the raw module ids, in the reverse of the order in which they are listed here
ordered_raw_module_ids = list(
    reversed([0, 16, 9, 4, 12, 1, 3, 6, 5, 10, 15, 7, 18, 2, 8, 17, 13, 11, 14])
)

branch_id_mapping = {0: '2', 1: '1', 2: '3'}
module_id_mapping = {
    raw_id: figure_id
    for raw_id, figure_id in zip(ordered_raw_module_ids, figure_module_ids)
}
raw_to_figure_ids = {'branches': branch_id_mapping, 'modules': module_id_mapping}

In [None]:
# append the branch ids (cut at 1.5) and the module ids (cut at 0.2) to adata.obs
hierarchy_cuts = [(1.5, 'hierarchy_branch_id'), (0.2, 'hierarchy_module_id')]
for threshold, obs_column in hierarchy_cuts:
    _ = cwv.cut_dendrogram(threshold, key_added=obs_column)

In [None]:
# start from a copy of the per-target annotations and add the UMAP coordinates
target_labels = cwv.adata.obs.copy()

umap_coords = cwv.adata.obsm['X_umap']
target_labels['umap_0'] = umap_coords[:, 0]
target_labels['umap_1'] = umap_coords[:, 1]

# rename the branch and module ids to those used in Fig S10
# (the replaced columns are assigned back explicitly: an in-place replace on a column
# selected from the frame is chained assignment and may leave the frame unchanged)
target_labels['hierarchy_branch_id'] = target_labels['hierarchy_branch_id'].replace(
    to_replace=raw_to_figure_ids['branches']
)
target_labels['hierarchy_module_id'] = target_labels['hierarchy_module_id'].replace(
    to_replace=raw_to_figure_ids['modules']
)

In [None]:
# display the distinct branch and module ids
(
    target_labels['hierarchy_branch_id'].unique(),
    target_labels['hierarchy_module_id'].unique(),
)

In [None]:
# give the seed-18 Leiden columns descriptive names
leiden_column_names = {
    'cluster_id_leiden_res0.631_seed18': 'low_res_leiden_clusters',
    'cluster_id_leiden_res6.31_seed18': 'intermed_res_leiden_clusters',
    'cluster_id_leiden_res30_seed18': 'high_res_leiden_clusters',
}
target_labels.rename(columns=leiden_column_names, inplace=True)

In [None]:
# keep only the columns that are exported, in this order
exported_columns = [
    'cell_line_id',
    'ensg_id',
    'target_name',
    'umap_0',
    'umap_1',
    'low_res_leiden_clusters',
    'intermed_res_leiden_clusters',
    'high_res_leiden_clusters',
    'hierarchy_branch_id',
    'hierarchy_module_id',
]
target_labels = target_labels[exported_columns]

In [None]:
# write the per-target cluster ids to their own sheet, without the index
target_labels.to_excel(
    excel_file,
    sheet_name='cluster-ids',
    index=False,
)

### Enriched GO terms in the Leiden clusters, modules, and branches

This is done using the Panther API. These cells are run separately and manually for the three cluster_id columns: cluster_id_leiden_res30_seed18, hierarchy_branch_id, hierarchy_module_id. Note that this is quite slow. 

In [None]:
# sanity-check: query Panther for the targets of a single module (raw module id 6),
# using all of the target names as the reference list
obs = cwv.adata.obs.copy()
in_module = obs.hierarchy_module_id == 6
target_names = sorted(obs.loc[in_module]['target_name'].unique())
reference_names = obs.target_name.unique()
results = go_utils.query_panther(target_names, reference_target_names=reference_names)
results.head(5)

In [None]:
obs = cwv.adata.obs.copy()
all_target_names = obs.target_name.unique()

# select the clustering to analyze by leaving exactly one of these lines uncommented
# cluster_id_column, label = 'cluster_id_leiden_res30_seed18', 'leiden-res30-seed18'
# cluster_id_column, label = 'hierarchy_branch_id', 'branches'
cluster_id_column, label = 'hierarchy_module_id', 'modules'

all_query_results = []
for cluster_id in obs[cluster_id_column].unique():
    print(cluster_id)

    target_names = np.array(sorted(
        obs.loc[obs[cluster_id_column] == cluster_id]['target_name'].unique()
    ))

    # query each of the three GO annotation datasets separately
    for dataset_kind in ['cc', 'bp', 'mf']:
        try:
            query_results = go_utils.query_panther(
                target_names,
                reference_target_names=all_target_names,
                dataset_kind=dataset_kind
            )
        except Exception as error:
            # best-effort: report the failed query and move on to the next one
            # (the message uses `cluster_id`, the loop variable that is actually defined)
            print(
                'API error on cluster_id %s (%s): %s' % (cluster_id, dataset_kind, error)
            )
            continue

        query_results['cluster_id'] = cluster_id
        query_results['cluster_size'] = len(target_names)
        query_results['go_annotations_type'] = dataset_kind

        all_query_results.append(query_results)

# export the results with p < 0.1; the filter is evaluated on the concatenated
# results themselves rather than on an unrelated `df` variable
if all_query_results:
    (
        pd.concat(all_query_results, axis=0)
        .loc[lambda results: results.pValue < 0.1]
        .to_csv(
            output_dir / ('%s-panther-go-enrichment-for-%s.csv' % (timestamp(), label)),
            index=False
        )
    )
else:
    # pd.concat raises on an empty list, so skip the export when every query failed
    print('No enrichment results to export for %s' % label)

### Clean up and export the GO enrichment results

This uses the raw enrichment results generated above. Terms that are not significantly enriched are filtered out, columns are renamed, and the results are written to the Excel file created above.

In [None]:
# clean up the GO enrichment CSVs
max_p_value = 0.01
min_fold_enrichment = 2

# date prefix of the raw enrichment CSVs to load
enrichment_date = '2021-09-29'

# the columns to export, in this order
columns = [
    'cluster_id',
    'go_annotations_type',
    'go_term_id',
    'go_term_label',
    'cluster_size',
    'number_in_list',
    'fold_enrichment',
    'pValue'
]

go_annotations_names = {
    'cc': 'cellular_component',
    'bp': 'biological_process',
    'mf': 'molecular_function',
}

column_renaming = {'pValue': 'p_value', 'number_in_list': 'go_term_count'}

cluster_kinds = ['branches', 'modules', 'leiden-res30-seed18']
for cluster_kind in cluster_kinds:

    df = pd.read_csv(
        output_dir / ('%s-panther-go-enrichment-for-%s.csv' % (enrichment_date, cluster_kind))
    )

    # replace the abbreviated annotation types with their full names
    # (assigned back explicitly; an in-place replace on a selected column is
    # chained assignment and may leave the frame unchanged)
    df['go_annotations_type'] = df['go_annotations_type'].replace(
        to_replace=go_annotations_names
    )

    # replace the branch and module cluster_ids with those used in the figures
    # (there is no mapping for the Leiden clusters, whose ids are kept as they are)
    id_mapping = raw_to_figure_ids.get(cluster_kind)
    if id_mapping is not None:
        df['cluster_id'] = df['cluster_id'].replace(to_replace=id_mapping)

    # keep only the significantly enriched terms, then sort and rename for the export
    df = (
        df[columns]
        .loc[(df.pValue < max_p_value) & (df.fold_enrichment > min_fold_enrichment)]
        .sort_values(by=['cluster_id', 'go_annotations_type', 'pValue'], ascending=True)
        .rename(columns=column_renaming)
    )
    df.to_excel(
        excel_file,
        sheet_name=('GO-enrichment-%s' % cluster_kind),
        index=False
    )

In [None]:
# manually close the excel file
# (the cluster-id sheet and the GO-enrichment sheets were all written to this one writer)
excel_file.close()

In [None]:
# `df` is the cleaned-up frame from the last iteration of the loop above,
# in which the 'pValue' column has been renamed to 'p_value'
df.sort_values(by='p_value', ascending=True)

In [None]:
# the most frequent GO term labels among the rows with p < 0.001
# (the p-value column is named 'p_value' after the clean-up, not 'pValue')
df.loc[df.p_value < 0.001].go_term_label.value_counts().head(11)

### Calculate the matrix of target-target localization similarities

This matrix is a distance matrix calculated from the target localization encodings, using the correlation distance metric. 

In [None]:
# pairwise target-target matrix computed with the correlation metric
# NOTE(review): n_pcs=200 presumably refers to the 200 PCs stored in the adata object - confirm
dists = cwv.calculate_distance_matrix(metric='correlation', n_pcs=200)

In [None]:
# histogram of all entries of the matrix (assigning to `_` keeps the return value out of the cell output)
_ = plt.hist(dists.flatten(), bins=100)

In [None]:
# wrap the distance matrix in a dataframe whose rows and columns
# are both labeled by target name
index_by = 'target_name'
labels = cwv.adata.obs.copy()
target_index = labels[index_by]
df = pd.DataFrame(dists, index=target_index, columns=target_index)
df.shape

In [None]:
# list the ten targets with the largest values in the C4orf32 column
# NOTE(review): if these values are distances, the largest values belong to the
# *least* similar targets - confirm that descending order is intended here
query_column = df['C4orf32']
top_ten = query_column.sort_values(ascending=False).iloc[:10]
top_ten_names = top_ten.sort_values(ascending=False).index.tolist()
', '.join(top_ten_names)

In [None]:
# round every value to three decimals with the vectorized DataFrame.round
# (this replaces an elementwise np.round applied through the deprecated applymap)
df = df.round(decimals=3)

In [None]:
# export the distances from each target to the orphan FAM241A
# NOTE(review): the column is selected by the name 'C4orf32', presumably the
# label under which FAM241A appears in this dataset - confirm
distances_filename = '%s_vq2-vector-%s-distances-200-PCs-pub-ready-only.csv' % (
    timestamp(), 'correlation'
)
df['C4orf32'].to_csv(output_dir / 'opencell-paris-figure' / distances_filename)