## Heatmap of multilocalizing target annotations
__Keith Cheveralls__<br>
__October 2021__

This notebook generates a heatmap of co-occurring annotations for all multilocalizing OpenCell targets, using the manual annotations. (This heatmap appears in Fig S7B.)

In [None]:
import datetime
import itertools
import numpy as np
import pandas as pd
import pathlib
import scanpy as sc
import seaborn as sns
import sys

from matplotlib import pyplot as plt
from matplotlib import rcParams

%load_ext autoreload
%autoreload 1

sys.path.insert(0, '../')
%aimport scripts.external.sankey
%aimport scripts.annotation_comparisons.datasets
%aimport scripts.annotation_comparisons.definitions
from scripts.annotation_comparisons import datasets, plotting

# directory of input data (a relative path, so it resolves against the working directory)
data_dir = pathlib.Path('../data')

# directory for generated outputs; anchored at the current user's home directory
# instead of a hard-coded '/Users/<name>' prefix so the notebook also runs on other machines
output_dir = pathlib.Path.home() / 'Box' / 'KC-opencell-paper'

def timestamp():
    """Return the current local date as a 'YYYY-MM-DD' string."""
    today = datetime.date.today()
    return today.isoformat()

# plotting defaults: scanpy's figure parameters are set first,
# then the matplotlib font family and grid settings are set on top
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=False)
rcParams.update({
    'font.family': 'sans-serif',
    'axes.grid': False,
})

In [None]:
# load the public annotations table
# (columns used by the cells below: ensg_id, annotation_name, annotation_grade)
annotations_path = data_dir / '2021-09-29-public-annotations-flat.csv'
df = pd.read_csv(annotations_path)

In [None]:
# annotation categories included in the heatmap; the order of this list
# is the row/column order of the pairwise-count matrix built from it
categories_to_plot = [
    'nucleoplasm', 'chromatin', 'nuclear_membrane', 'nuclear_punctae',
    'nucleolus_fc_dfc', 'nucleolus_gc',
    'cytoplasmic', 'cytoskeleton', 'centrosome', 'focal_adhesions',
    'membrane', 'vesicles', 'er', 'mitochondria',
]

In [None]:
# retain only the grade-2 or grade-3 annotations
# (which are necessarily localization annotations)
# the grades are coerced to numbers before comparing so that the filter works
# whether read_csv parsed the column as strings ('2'), ints (2), or floats (2.0);
# comparing a numeric column against the strings '2'/'3' would silently match nothing
# (non-numeric grades become NaN and are dropped by the filter)
grade = pd.to_numeric(df.annotation_grade, errors='coerce')
df = df.loc[grade.isin([2, 3])]
df.shape

In [None]:
# subset of the annotations whose category is one of those to be plotted
is_plotted = df['annotation_name'].isin(categories_to_plot)
dff = df[is_plotted]

In [None]:
# sanity check, displayed as a tuple:
# - shape of the unique ensg_ids among all retained annotations
# - shape of the unique ensg_ids among the annotations in the plotted categories
# - number of ensg_ids that appear in more than one retained annotation row
n_ids_all = df['ensg_id'].unique().shape
n_ids_plotted = dff['ensg_id'].unique().shape
n_ids_multiple = df['ensg_id'].value_counts().gt(1).sum()
(n_ids_all, n_ids_plotted, n_ids_multiple)

In [None]:
# the number of lines that are both cytoplasmic and nucleoplasmic
# (only the annotation_name column is grouped, so the lambda receives a Series of
# names per ensg_id rather than a whole sub-frame; this avoids the pandas >= 2.2
# deprecation warning about apply operating on the grouping column, and
# still sums to a scalar when there are no groups at all)
both = {'cytoplasmic', 'nucleoplasm'}
df.groupby('ensg_id').annotation_name.apply(
    lambda names: both.issubset(names.values)
).sum()

In [None]:
# every unordered pair of distinct categories from the list to be plotted
all_possible_pairs = [*itertools.combinations(categories_to_plot, 2)]

# square table of zeros, labelled by category on both axes,
# in which the pairwise counts are accumulated (object dtype, holding integer zeros)
pairwise_counts = pd.DataFrame(
    0, index=categories_to_plot, columns=categories_to_plot, dtype=object
)

In [None]:
# explicitly count the pairs of categories:
# for each ensg_id, add one to every (row, column) cell whose two categories
# both appear among that ensg_id's annotation names
# (so each diagonal cell counts the ensg_ids annotated with that single category)
grouped = df.groupby('ensg_id')

for target_id in df.ensg_id.unique():
    annotated = set(grouped.get_group(target_id).annotation_name)
    rows_present = [name for name in pairwise_counts.index if name in annotated]
    cols_present = [name for name in pairwise_counts.columns if name in annotated]
    for row_category, col_category in itertools.product(rows_present, cols_present):
        pairwise_counts.at[row_category, col_category] += 1

In [None]:
# normalize each row by the number of ensg_ids annotated with its category
# (nunique rather than count, so that the denominator counts each ensg_id once,
# exactly as the pairwise counting does, even if an annotation row is duplicated)
# NOTE: this divides pairwise_counts in place, so re-running this cell on its own
# divides a second time; re-run the counting cells first
counts = df.groupby('annotation_name').ensg_id.nunique()
for category in pairwise_counts.index:
    pairwise_counts.loc[category] /= counts[category]

In [None]:
# draw the pairwise table as an annotated heatmap on a 10x10-inch figure
fig, ax = plt.subplots(figsize=(10, 10))

heatmap_options = dict(
    cmap='YlGnBu',
    vmax=None,
    square=True,
    linewidths=.5,
    annot=True,
    fmt='0.2f',
)
sns.heatmap(pairwise_counts.astype(float), ax=ax, **heatmap_options)

In [None]:
# make sure the output directory exists before writing
# (to_csv raises if asked to write into a directory that does not exist)
output_dir.mkdir(parents=True, exist_ok=True)
pairwise_counts.to_csv(output_dir / 'multilocalizing-targets-heatmap.csv')