## Comparing OC localization annotations to HPA annotations
__Keith Cheveralls__<br>
__October 2021__

This notebook compares the manual localization annotations of OpenCell targets to the HPA annotations and also to a dataset of annotations from yeast. It is used to generate Sankey diagrams comparing the OC and HPA annotations, and also to generate counts of partial and exact matches for each unique set of annotations. 

In [None]:
import datetime
import numpy as np
import pandas as pd
import pathlib
import scanpy as sc
import sys

from matplotlib import pyplot as plt
from matplotlib import rcParams

%load_ext autoreload
%autoreload 1

sys.path.insert(0, '../')
%aimport scripts.external.sankey
%aimport scripts.annotation_comparisons.datasets
%aimport scripts.annotation_comparisons.definitions
from scripts.annotation_comparisons import datasets, plotting

# Destination folder for every CSV and PDF written by this notebook.
# NOTE(review): this is a machine-specific absolute path; edit it before running elsewhere.
output_dir = pathlib.Path('/Users/keith.cheveralls/Box/KC-opencell-paper/oc-hpa-comparison/')

def timestamp():
    """Return the current local date formatted as 'YYYY-MM-DD'.

    The result is used as a prefix for the filenames of exported CSVs and figures.
    """
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d')

# Global figure styling: apply scanpy's figure parameters first,
# then override the matplotlib font family and switch off axis grids.
sc.settings.set_figure_params(frameon=False, facecolor='white', dpi=80)
rcParams.update({'font.family': 'sans-serif', 'axes.grid': False})

# OpenCell <> HPA

### Load the OpenCell and HPA annotations

Four targets (RBSN, LSM14A, LSM14B, DDX6) have no consensus OC labels, because each of them has only the `big_aggregates` annotation. 

In [None]:
# Label resolution passed to both loaders.
res = 'high'

# OpenCell annotations (map_kind='hpa' presumably maps them onto the HPA label vocabulary -- confirm
# in scripts.annotation_comparisons.datasets).
oc_ants = datasets.load_oc_annotations(
    res=res,
    map_kind='hpa',
    from_preprint=False,
)

# HPA annotations, with exclude_uncertain turned off.
hpa_ants = datasets.load_hpa_annotations(
    exclude_uncertain=False,
    res=res,
)

In [None]:
# Names of the OpenCell targets whose ensg_id does not appear in the HPA annotations.
ensg_not_in_hpa = sorted(
    oc_ants.loc[~oc_ants.ensg_id.isin(hpa_ants.ensg_id), 'target_name'].unique()
)
len(ensg_not_in_hpa)

In [None]:
# Names of the OpenCell targets whose target_name does not appear among the HPA gene names.
names_not_in_hpa = sorted(
    oc_ants.loc[~oc_ants.target_name.isin(hpa_ants.gene_name), 'target_name'].unique()
)
len(names_not_in_hpa)

In [None]:
# Write the loaded OpenCell annotations to a date-stamped CSV (without the dataframe index).
oc_ants.to_csv(
    output_dir / ('%s-opencell-consensus-annotations.csv' % timestamp()),
    index=False,
)

In [None]:
# Aside: rows of the flat public-annotation table whose target is absent from oc_ants
# (and therefore absent from the exports).
# Author's note: these are all targets with grade-3 big_aggregates.
tmp = pd.read_csv('../data/2021-09-29-public-annotations-flat.csv')
tmp[~tmp.target_name.isin(oc_ants.target_name)]

In [None]:
# The 11 most frequent OC label sets: collect each target's consensus labels into a tuple,
# then count how often each tuple occurs.
(
    oc_ants
    .groupby('ensg_id')['consensus_label']
    .agg(tuple)
    .value_counts()
    .head(11)
)

In [None]:
# The 22 most frequent HPA label sets: collect each gene's consensus labels into a tuple,
# then count how often each tuple occurs.
# (The HPA dataframe in this notebook is named `hpa_ants`; no `hpa` variable is ever defined.)
hpa_ants.groupby('ensg_id').consensus_label.agg(tuple).value_counts().head(22)

In [None]:
# Normalized histogram of the number of (non-null) annotation_name rows per ensg_id,
# using bin edges at 1, 2, 3, 4, 5.
plt.figure(figsize=(6, 4))
_ = plt.hist(
    oc_ants.groupby('ensg_id').annotation_name.count(),
    bins=np.arange(1, 6),
    density=True,
    edgecolor='w',
)

### OC-HPA comparison: counts and sankey

Note: for low resolution labels, the sankey diagrams all look nearly the same with or without including cytoskeleton in the 'cytoplasm' category.

In [None]:
# Settings for this comparison: label resolution and whether to drop uncertain HPA annotations.
res = 'high'
exclude_uncertain = False

# Reload both annotation sets with those settings.
oc_ants = datasets.load_oc_annotations(res=res, map_kind='hpa', from_preprint=False)
hpa_ants = datasets.load_hpa_annotations(exclude_uncertain=exclude_uncertain, res=res)

# Merge OC targets with the HPA reference, keeping multilocalizing targets...
oc_hpa = datasets.merge_targets(
    oc_ants,
    reference_ants=hpa_ants,
    reference_kind='hpa',
    how='inner',
    exclude_multilocalizing=False,
)

# ...and again with multilocalizing targets excluded.
oc_hpa_only_one = datasets.merge_targets(
    oc_ants,
    reference_ants=hpa_ants,
    reference_kind='hpa',
    how='inner',
    exclude_multilocalizing=True,
)

# Summary: (shape of full merge, # partial matches, # exact matches,
#           shape of single-label merge, # exact matches among those)
(
    oc_hpa.shape,
    oc_hpa.partial_match.sum(),
    oc_hpa.exact_match.sum(),
    oc_hpa_only_one.shape,
    oc_hpa_only_one.exact_match.sum(),
)

### Sankey for all targets

In [None]:
# Sankey for all merged targets.
# The min_count passed to plot_sankey is 30 for high-resolution labels and 1 otherwise.
if res == 'high':
    min_count = 30
else:
    min_count = 1

plotting.plot_sankey(
    oc_hpa,
    left_category_name='OC',
    right_category_name='HPA',
    res=res,
    min_count=min_count,
    use_dynamic_colormap=False,
)

# Save the figure as a date-stamped PDF whose name records the resolution and min_count.
plt.savefig(
    output_dir / ('%s-oc-hpa--all--%s-res--min-count-%s.pdf' % (timestamp(), res, min_count)),
    bbox_inches='tight',
)

In [None]:
# HPA label sets, ranked by frequency, among targets without even a partial match.
oc_hpa.loc[oc_hpa.partial_match == False, 'consensus_label_hpa'].value_counts()

In [None]:
# Sankey restricted to targets whose OC and HPA labels do not match at all (partial_match is False).
plotting.plot_sankey(
    oc_hpa[oc_hpa.partial_match == False],
    res=res,
    right_category_name='HPA',
    min_count=5,
)
# Optional export (disabled):
# plt.savefig(output_dir / ('oc-hpa--%s-res--mismatched-only.pdf' % res), bbox_inches='tight')

In [None]:
# Sankey for the merge that excludes multilocalizing targets (i.e., targets with only one label).
plotting.plot_sankey(
    oc_hpa_only_one,
    res=res,
    right_category_name='HPA',
    min_count=5,
)
# Optional export (disabled):
# plt.savefig(output_dir / ('oc-hpa--%s-res--wo-multilocz.pdf' % res), bbox_inches='tight')

In [None]:
# Sankey for targets with a single OC label and a single HPA label whose labels differ.
# NOTE(review): unlike the other Sankey calls, res and right_category_name are not passed here,
# so plot_sankey's defaults apply -- confirm that is intended.
plotting.plot_sankey(
    oc_hpa_only_one.loc[
        oc_hpa_only_one.consensus_label_oc != oc_hpa_only_one.consensus_label_hpa
    ],
    min_count=5,
)
# Optional export (disabled):
# plt.savefig(output_dir / 'only-one-targets-mismatched.pdf', bbox_inches='tight')

### Sankey for manually-curated discrepant targets

This should include 'uncertain' HPA annotations, because Manu included these in the curated list of discrepancies. 

In [None]:
# Rebuild the OC-HPA merge at high resolution with uncertain HPA annotations kept.
res = 'high'
exclude_uncertain = False

# Both annotation sets are loaded inline, so oc_ants / hpa_ants are not rebound by this cell.
oc_hpa = datasets.merge_targets(
    oc_ants=datasets.load_oc_annotations(res=res, map_kind='hpa', from_preprint=False),
    reference_ants=datasets.load_hpa_annotations(exclude_uncertain=exclude_uncertain, res=res),
    reference_kind='hpa',
    exclude_multilocalizing=False,
)

In [None]:
# Load the manually curated list of discrepant targets and normalize its key column names
# to the ones used elsewhere in this notebook (target_name, ensg_id).
curated = (
    pd.read_csv('../data/curated-OC-HPA-discrepant-targets.csv')
    .rename(columns={'Gene': 'target_name', 'ENSG ID': 'ensg_id'})
)
curated.shape

In [None]:
# Curated ensg_ids that are absent from the OpenCell annotations.
set(curated.ensg_id) - set(oc_ants.ensg_id)

In [None]:
# Sankey limited to the merged targets that appear in the curated discrepancy list.
plotting.plot_sankey(
    oc_hpa.loc[oc_hpa.ensg_id.isin(curated.ensg_id)],
    res=res,
    min_count=3,
    use_dynamic_colormap=True,
)

# Save the figure as a date-stamped PDF.
plt.savefig(
    output_dir / ('%s-oc-hpa--all--%s-res--curated-discrepant.pdf' % (timestamp(), res)),
    bbox_inches='tight',
)

### Export CSVs

In [None]:
# Supplementary table: export all consensus annotations together with
# the counts of exact and partial matches against HPA.
datasets.export_consensus_annotations(
    output_dir,
    timestamp(),
    reference_kind='hpa',
)

In [None]:
# Merge again with multilocalizing targets excluded, reusing the oc_ants / hpa_ants
# dataframes already in the kernel; then show the number of exact matches and the shape.
oc_hpa_only_one = datasets.merge_targets(
    oc_ants,
    reference_ants=hpa_ants,
    reference_kind='hpa',
    exclude_multilocalizing=True,
)
(oc_hpa_only_one.exact_match.sum(), oc_hpa_only_one.shape)

In [None]:
# Export the targets that have only one OC and one HPA annotation:
# keep the identifying and label columns, sort by match status and then by label, and write to CSV.
(
    oc_hpa_only_one
    .loc[:, ['ensg_id', 'target_name', 'consensus_label_oc', 'consensus_label_hpa', 'exact_match']]
    .sort_values(by=['exact_match', 'consensus_label_oc', 'consensus_label_hpa'])
    .to_csv(
        output_dir / ('%s-oc-hpa-comparison--high-res--targets-with-only-one-label.csv' % timestamp()),
        index=False,
    )
)

In [None]:
# Export the targets with no partial match between OC and HPA labels, sorted by label, to a CSV.
mismatched = oc_hpa[~oc_hpa.partial_match].copy()
(
    mismatched
    .sort_values(by=['consensus_label_oc', 'consensus_label_hpa'])
    .to_csv(
        output_dir / ('%s-oc-hpa-comparison-discrepant-targets.csv' % timestamp()),
        index=False,
    )
)

### Inspect particular mismatched targets

In [None]:
# Targets labeled only 'nucleoplasm' by HPA but only 'er' by OC
# (each label set is compared against a one-element tuple).
oc_hpa[
    (oc_hpa.consensus_label_hpa == ('nucleoplasm',))
    & (oc_hpa.consensus_label_oc == ('er',))
]

### Aside: append links to OC, HPA, Uniprot to the summary of curated discrepancies
This is for the supp table.

In [None]:
# Table of all public targets.
targets = pd.read_csv('../data/all-public-targets.csv')

# Curated discrepancy summary (sheet 'data' of the working spreadsheet).
discreps = pd.read_excel(
    output_dir / 'OC-HPA-discrepancies-tmp.xlsx',
    sheet_name='data',
    engine='openpyxl',
)

# Build an upper-cased target_name key on both sides so the join is case-insensitive.
discreps['target_name'] = discreps['gene_name'].str.upper()
targets['target_name'] = targets['target_name'].str.upper()

# Inner join on the key; comparing the two shapes shows whether any rows were lost or duplicated.
discreps_merged = discreps.merge(targets, on='target_name', how='inner')
(discreps.shape, discreps_merged.shape)

In [None]:
# Add one URL column per external resource, built column-wise from the merged table.
# This avoids the iterrows() + per-cell `.at` pattern (and the placeholder columns it required);
# itertuples() yields one namedtuple per row, so the fields are read exactly as before.

# HPA subcellular page, keyed by the ensg_id that came from the discrepancy sheet (ensg_id_x).
discreps_merged['hpa_link'] = [
    f'https://www.proteinatlas.org/{row.ensg_id_x}/cell'
    for row in discreps_merged.itertuples()
]

# UniProt entry page, keyed by uniprot_id.
discreps_merged['uniprot_link'] = [
    f'https://www.uniprot.org/uniprot/{row.uniprot_id}'
    for row in discreps_merged.itertuples()
]

# OpenCell target page, keyed by cell_line_id.
discreps_merged['opencell_link'] = [
    f'https://opencell.czbiohub.org/target/{row.cell_line_id}'
    for row in discreps_merged.itertuples()
]

In [None]:
# Drop the join key and the raw identifier columns (now encoded in the links), then write the CSV.
# NOTE(review): unlike the other exports in this notebook, index=False is not passed,
# so the dataframe index is written as the first column -- confirm this is intended.
(
    discreps_merged
    .drop(columns=['target_name', 'cell_line_id', 'ensg_id_y', 'uniprot_id'])
    .to_csv(output_dir / 'OC-HPA-discrepancies-w-links.csv')
)

# OpenCell <> yeast

### Inspect yeast-human homologs

There are 3900 human genes (ensg_ids) and 2650 yeast genes. Of these, 1292 are one-to-one. 

In [None]:
# Load the table of yeast-human homologs.
homologs = datasets.load_yeast_homologs()

# (number of distinct human ensg_ids, number of distinct yeast orf_ids) in the homolog table
(homologs['ensg_id'].nunique(), homologs['orf_id'].nunique())

In [None]:
# How many yeast orfs map to a given number of ensg_ids
# (inner value_counts: rows per orf_id; outer value_counts: how many orfs share that row count).
homologs['orf_id'].value_counts().value_counts().head(5)

In [None]:
# How many ensg_ids map to a given number of yeast orfs
# (inner value_counts: rows per ensg_id; outer value_counts: how many ensg_ids share that row count).
homologs['ensg_id'].value_counts().value_counts().head(5)

### Load the OpenCell and yeast labels

In [None]:
# Label resolution passed to both loaders.
res = 'high'

# OpenCell annotations loaded with map_kind='yeast'.
oc_ants = datasets.load_oc_annotations(res=res, map_kind='yeast')

# Yeast annotations with human homologs appended; homologs are not restricted to one-to-one pairs.
yeast_ants = datasets.load_yeast_annotations(
    res=res,
    include_only_1to1_homologs=False,
    append_human_homologs=True,
)

In [None]:
# The number of ensg_ids that map to a given number of orfs:
# reduce to one row per (orf_id, ensg_id) pair, count the rows per ensg_id, then tabulate those counts.
(
    yeast_ants
    .groupby(['orf_id', 'ensg_id'])
    .count()
    .reset_index()
    .ensg_id
    .value_counts()
    .value_counts()
)

In [None]:
# The 5 most frequent OC label sets: collect each target's consensus labels into a tuple,
# then count how often each tuple occurs.
(
    oc_ants
    .groupby('ensg_id')['consensus_label']
    .agg(tuple)
    .value_counts()
    .head(5)
)

In [None]:
# Frequency of each yeast label set, with labels grouped into one tuple per ensg_id.
(
    yeast_ants
    .groupby('ensg_id')['consensus_label']
    .agg(tuple)
    .value_counts()
)

In [None]:
# Merge OC targets with the yeast reference, keeping multilocalizing targets...
oc_yeast = datasets.merge_targets(
    oc_ants,
    reference_ants=yeast_ants,
    reference_kind='yeast',
    exclude_multilocalizing=False,
)

# ...and again with multilocalizing targets excluded.
oc_yeast_only_one = datasets.merge_targets(
    oc_ants,
    reference_ants=yeast_ants,
    reference_kind='yeast',
    exclude_multilocalizing=True,
)

# Summary: (shape of full merge, # partial matches, # exact matches,
#           shape of single-label merge, # exact matches among those)
(
    oc_yeast.shape,
    oc_yeast.partial_match.sum(),
    oc_yeast.exact_match.sum(),
    oc_yeast_only_one.shape,
    oc_yeast_only_one.exact_match.sum(),
)

In [None]:
# Sankey for all merged OC-yeast targets, saved as a date-stamped PDF.
plotting.plot_sankey(
    oc_yeast,
    res=res,
    right_category_name='Yeast',
    min_count=5,
)
plt.savefig(
    output_dir / ('%s-oc-yeast--all--%s-res--min-count-5.pdf' % (timestamp(), res)),
    bbox_inches='tight',
)

In [None]:
# Sankey for the OC-yeast merge that excludes multilocalizing targets.
plotting.plot_sankey(
    oc_yeast_only_one,
    res=res,
    right_category_name='Yeast',
    min_count=1,
)
# Optional export (disabled):
# plt.savefig(output_dir / ('oc-yeast--%s-res--wo-multilocz.pdf' % res), bbox_inches='tight')

In [None]:
# Supplementary table: export the annotations and summary stats for the yeast comparison.
datasets.export_consensus_annotations(
    output_dir,
    timestamp(),
    reference_kind='yeast',
)

In [None]:
# Export the targets with no partial match between OC and yeast labels, sorted by label, to a CSV.
mismatched = oc_yeast[oc_yeast.partial_match == False].copy()

(
    mismatched
    .sort_values(by=['consensus_label_oc', 'consensus_label_yeast'])
    .to_csv(
        output_dir / ('%s-oc-yeast-comparison-discrepant-targets.csv' % timestamp()),
        index=False,
    )
)