## Fig. 2 panel C protein-level consensus annotation

This notebook generates two sets of annotations:
- Graph-based_localization_annotation  
  For each protein, this is the most common cluster annotation among its neighbors in the nearest-neighbor graph.
- consensus_graph_annnotation  
  Same as the graph-based localization annotation, except that proteins whose graph-based annotation is "unclassified" fall back to their own cluster annotation.


In [1]:
import copy
import os
import random
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
import umap
import umap.plot
from tqdm.notebook import tqdm

# Plot styling: ggplot theme; pdf.fonttype 42 makes matplotlib embed TrueType fonts in saved PDFs
plt.style.use('ggplot')
plt.rcParams['pdf.fonttype'] = 42

# Project folders, resolved relative to the directory the notebook is run from
script_path = Path.cwd().parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent / "data"
# The project-local imports below only resolve once the script folder is on sys.path
sys.path.append(str(script_path))
from external import clustering_workflows
# NOTE(review): star import; later cells use gene_neighbor_annots (and Counter) without importing
# them by name, so they presumably come from here -- switch to explicit names once confirmed
from utils.Jaccard_coefficient import *
from utils.label_processing import attach_annotations

# Output folder for everything this notebook writes.
# mkdir(exist_ok=True) creates it only when missing, without a separate existence check.
save_path = Path.cwd() / "output"
save_path.mkdir(parents=True, exist_ok=True)

c:\Users\duo.peng\Documents\Organelle_IP_figures_14-3-3\notebooks\Fig2\panel_C


## load the enrichment table

In [2]:
# Restore the `timestamp` variable from IPython's %store persistence.
# NOTE(review): presumably stored by the enrichment notebook -- confirm. It selects which
# dated enrichment table is loaded below and is used in every output file name.
%store -r timestamp
print(f"Timestamp: {timestamp}")

Timestamp: 2024-07-14


In [3]:
# define files to load
enrichment_dir = Path.cwd().parent.parent / "enrichment"
enrichment_csv_path = enrichment_dir / "output" / "enrichment_and_volcano_tables" / f'{timestamp}_enrichment_table_NOC_prop.csv'

# Load the enrichment table: two header rows become MultiIndex columns, first column is the index.
# Each handler prints a hint and then re-raises. Without the re-raise, `enrichments` would stay
# undefined and the failure would only surface later as an unrelated NameError.
try:
    # load the file
    enrichments = pd.read_csv(enrichment_csv_path, header=[0, 1], index_col=0)
except FileNotFoundError:
    print(f"File {enrichment_csv_path} not found.\nPlease run the enrichment analysis first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {enrichment_csv_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [4]:
# check the enrichment metadata columns
# enrichments["metadata"]

In [5]:
# check the sample columns
# enrichments["sample"]

In [6]:
# Lookup tables used to annotate the metadata columns of the enrichment table
gene_name_csv = data_path / "external" / "canonical_names_and_Itzhak_data.csv"
ground_truth_csv = data_path / "external" / "curated_ground_truth_v9.0.csv"
labels_csv = data_path / "labels" / "cluster_annotation_Dec6.csv"

# Each entry: (lookup csv, column to pull from the lookup, join key in the lookup,
#              join key in the metadata, name of the new metadata column).
# Order matters: the ground-truth step joins on Gene_name_canonical, which the first step adds.
annotation_steps = [
    # attach canonical gene names
    (gene_name_csv, "Gene_name_canonical", "Majority protein IDs", "Majority protein IDs", "Gene_name_canonical"),
    # attach ground truth
    (ground_truth_csv, "compartment", "gene_name_canonical", "Gene_name_canonical", "curated_ground_truth_v9.0"),
    # attach labels
    (labels_csv, "cluster_annotation", "Majority protein IDs", "Majority protein IDs", "cluster_annotation"),
]

for lookup_csv, anno_col, from_on, to_on, new_col_name in annotation_steps:
    lookup_table = pd.read_csv(lookup_csv)
    # take a fresh copy of the metadata so columns added by earlier steps are visible to this one
    to_df = enrichments["metadata"].copy()
    new_col_data = attach_annotations(from_df=lookup_table, to_df=to_df, anno_col=anno_col, from_on=from_on, to_on=to_on)
    enrichments[("metadata", new_col_name)] = new_col_data


## sample selection

In [7]:
# Sample selection for the Leiden and the UMAP algorithms, NOTE: manual sample removal is NOT in this cell

cols = list(enrichments["sample"])
meta_cols = list(enrichments["metadata"])

# The table contains WT controls and other superfluous samples that don't help much in
# separating organelles, so they are not counted. Infected samples are removed as well,
# because they shouldn't be used in calculating the reference UMAP.
excluded_keywords = ("WT", "harsh", "unsorted", "Infected")
samples = [x for x in cols if not any(keyword in x for keyword in excluded_keywords)]

# next, we remove additional samples using bait names
# (the bait is the second "-"-separated field of the sample name; names without "-" are kept whole)
genes = [x.split("-")[1] if "-" in x else x for x in samples]
sample_table = pd.DataFrame({"samples": samples, "bait": genes})

bait_drop_list = ["EXOC2"]  # here we are just removing EXOC2
keep_mask = ~sample_table["bait"].isin(bait_drop_list)
selected_samples = sample_table.loc[keep_mask, "samples"].to_list()

In [8]:
# check the selected samples (after keyword- and bait-based filtering, before manual removal)
# the list is printed sorted for readability; selected_samples itself is not reordered
print(f"the number of selected samples is {len(selected_samples)}")
print(f"the selected samples are {sorted(selected_samples)}")

the number of selected samples is 68
the selected samples are ['01-CAPRIN1', '02-ATG101', '02-COPE', '02-DCP1A', '02-GOLGA2', '02-RICTOR', '03-HSP90AA', '03-HSPA1B', '03-SEC23A', '05-CAV1', '05-EDC4', '05-NCLN', '06-ATP6V1B2', '06-CCDC47', '06-CSNK2A1', '06-CSNK2A2', '06-YWHAB', '07-AP4B1', '07-CLTA', '07-COG8', '07-RAPTOR', '09-ATG101', '09-EDC4', '09-HSP90AA1', '09-PEX3', '09-PSMB7', '09-TOMM20', '10-AP2B1', '10-RTN4', '10-TOMM20', '10-VPS35', '11-CEP350', '11-EEA1', '11-GPR107', '11-SEC31A', '12-ACTB', '12-G3BP1', '12-LAMP1', '12-PNPLA2', '12-RTN4', '12-SEC61B', '12-TOMM20', '13-GOLGA2', '13-RAB11A', '13-RAB14', '13-RAB1A', '13-RAB7A', '14-COPE', '14-GOLGA2', '14-RAB11A', '14-RAB14', '14-RAB1A', '14-RAB7A', '15-G3BP1', '15-GOLGA2', '15-LAMP1', '15-MAP1LC3B', '15-SEC61B', '15-TOMM20', '17-ATP1B3', '17-CAPRIN1', '17-G3BP1', '17-MAP1LC3B', '17-RPL36', '17-SLC30A2', 'NOC_cytosol', 'NOC_nuclear', 'NOC_organelle']


In [9]:
# manually drop a few samples
# for example: to_drop = ["09-HSP90AA1", "09-PSMB7"]
to_drop = [
    "02-EXOC2",
    "06-ATP6V1B2",
    "06-CSNK2A1",
    "06-CSNK2A2",
    "07-AP4B1",
    "02-RICTOR",
    "07-RAPTOR",
    "10-AP2B1",
    "12-PNPLA2",
]
# update the variable: selected_samples (the order of the remaining samples is unchanged)
selected_samples = [sample for sample in selected_samples if sample not in to_drop]

In [10]:
# check the selected samples after manual sample removal
# the list is printed sorted for readability; selected_samples itself is not reordered
print(f"the number of selected samples is {len(selected_samples)}")
print(f"the selected samples are {sorted(selected_samples)}")

the number of selected samples is 60
the selected samples are ['01-CAPRIN1', '02-ATG101', '02-COPE', '02-DCP1A', '02-GOLGA2', '03-HSP90AA', '03-HSPA1B', '03-SEC23A', '05-CAV1', '05-EDC4', '05-NCLN', '06-CCDC47', '06-YWHAB', '07-CLTA', '07-COG8', '09-ATG101', '09-EDC4', '09-HSP90AA1', '09-PEX3', '09-PSMB7', '09-TOMM20', '10-RTN4', '10-TOMM20', '10-VPS35', '11-CEP350', '11-EEA1', '11-GPR107', '11-SEC31A', '12-ACTB', '12-G3BP1', '12-LAMP1', '12-RTN4', '12-SEC61B', '12-TOMM20', '13-GOLGA2', '13-RAB11A', '13-RAB14', '13-RAB1A', '13-RAB7A', '14-COPE', '14-GOLGA2', '14-RAB11A', '14-RAB14', '14-RAB1A', '14-RAB7A', '15-G3BP1', '15-GOLGA2', '15-LAMP1', '15-MAP1LC3B', '15-SEC61B', '15-TOMM20', '17-ATP1B3', '17-CAPRIN1', '17-G3BP1', '17-MAP1LC3B', '17-RPL36', '17-SLC30A2', 'NOC_cytosol', 'NOC_nuclear', 'NOC_organelle']


## data preprocessing

In [11]:
# save a copy of the tables for UMAP

# Flatten the two-level column header, keep the metadata columns plus the selected samples,
# and drop every row with a NaN in any selected sample
# (normalization and UMAP algorithm are not compatible with any NaN values).
umap_table = (
    enrichments.droplevel(0, axis=1)[meta_cols + selected_samples]
    .dropna(subset=selected_samples)
    .copy()
)
# numeric part only: one column per selected sample
quants = umap_table[selected_samples].copy()
print(f"the dimensions of the data table saved for UMAP are {quants.shape}")

quants.to_csv(save_path / f"{timestamp}_quants.csv")
umap_table.to_csv(save_path / f"{timestamp}_umap_table.csv")

the dimensions of the data table saved for UMAP are (8541, 60)


Convert the data into AnnData format


In [12]:
# generating AnnData

# Keep only proteins that have a value in every selected sample. This is the same row filter
# that is applied to umap_table (dropna on the selected samples), so the AnnData rows stay
# aligned one-to-one with umap_table; annotations are later copied between them by position.
sample_values = enrichments['sample'][selected_samples]
complete_rows = sample_values.notna().all(axis=1).to_numpy()
selected = sample_values.loc[complete_rows].copy()
metadata = enrichments['metadata'].loc[complete_rows]

# Pass a float32 array directly instead of relying on the `dtype=` constructor argument,
# which newer anndata releases deprecate. Row and column names are assigned right below.
adata = ad.AnnData(selected.to_numpy(dtype=np.float32))

adata.var_names = selected.columns.to_list()
adata.obs_names = metadata["Protein IDs"].to_list()
# copy the per-protein metadata into obs as plain lists (positional, same row order as X)
for obs_col in [
    "Protein IDs",
    "Majority protein IDs",
    "Gene_name_canonical",
    "curated_ground_truth_v9.0",
    "cluster_annotation",
]:
    adata.obs[obs_col] = metadata[obs_col].to_list()

adata.write_h5ad(save_path / f"adata_{timestamp}.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Gene_name_canonical' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'curated_ground_truth_v9.0' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cluster_annotation' as categorical


## compute knn graph (and save a copy for later use)

In [13]:
# instantiate a ClusteringWorkflow (only the part of the workflow that computes the nearest neighbor graph is used)
# the workflow receives a deep copy, so the AnnData object built above is not the one it holds
kNN_obj = clustering_workflows.ClusteringWorkflow(adata=copy.deepcopy(adata))
# preprocessing
# NOTE(review): n_pcs=None presumably means no PCA reduction is applied -- confirm in clustering_workflows
kNN_obj.preprocess(n_pcs=None)
# compute nearest neighbor graph, requesting 20 neighbors
kNN_obj.calculate_neighbors(n_pcs=None, n_neighbors=20)
# from here on `adata` is rebound to the workflow's AnnData object
adata = kNN_obj.adata
# save a copy of the adata object that contains the kNN graph
knn_adata_path = save_path / f"adata_kNN_{timestamp}.h5ad"
adata.write(knn_adata_path)

In [14]:
# display a summary of the workflow's AnnData object (dimensions and the obs/var/uns/obsp entries it holds)
kNN_obj.adata

AnnData object with n_obs × n_vars = 8541 × 60
    obs: 'Protein IDs', 'Majority protein IDs', 'Gene_name_canonical', 'curated_ground_truth_v9.0', 'cluster_annotation'
    var: 'mean', 'std'
    uns: 'neighbors'
    obsp: 'distances', 'connectivities'

## generate protein-level consensus annotation

In [15]:
# per-protein identifiers as plain Python lists, in adata.obs row order
all_majority_ids = adata.obs["Majority protein IDs"].to_list()
all_genes = adata.obs["Gene_name_canonical"].to_list()

# annotation table with one row per protein, in the same order as adata.obs;
# columns are converted to lists first, so the table gets a fresh default integer index
annot_columns = ["Majority protein IDs", "Gene_name_canonical", "cluster_annotation"]
annot_df = pd.DataFrame({col: adata.obs[col].to_list() for col in annot_columns})

In [16]:
# iterate over all genes and generate protein-level consensus annotation
# NOTE(review): neighbors are looked up by gene name; if gene names can be missing or shared by
# several proteins, confirm that gene_neighbor_annots resolves them to the intended row.
Graph_based_loc_annot = []
genes_without_neighbor_annots = []

for gene in tqdm(all_genes, total=len(all_genes)):
    neighbor_list, neighbor_annot_list = gene_neighbor_annots(gene_name=gene, adata=adata, annot_df=annot_df, gene_name_col="Gene_name_canonical", annot_col="cluster_annotation")
    annot_counts = Counter(neighbor_annot_list)
    if not annot_counts:
        # No neighbor annotations came back: most_common(1)[0] would raise a bare IndexError.
        # Label the protein "unclassified" instead, the same value the consensus step
        # treats as "fall back to the protein's own cluster annotation".
        genes_without_neighbor_annots.append(gene)
        Graph_based_loc_annot.append("unclassified")
        continue
    # get the most common annotation in the neighbor annotation
    # (on ties, Counter.most_common returns the annotation encountered first)
    most_common_annot = annot_counts.most_common(1)[0][0]
    Graph_based_loc_annot.append(most_common_annot)

if genes_without_neighbor_annots:
    print(f"{len(genes_without_neighbor_annots)} proteins had no neighbor annotations and were labelled 'unclassified'")

  0%|          | 0/8541 [00:00<?, ?it/s]

In [17]:
# add the graph-based annotation to the dataframe
graph_col = "Graph-based_localization_annotation"
consensus_col = "consensus_graph_annnotation"
annot_df[graph_col] = Graph_based_loc_annot

# proteins whose graph-based annotation is unclassified
mask = annot_df[graph_col] == "unclassified"
print("Number of proteins with unclassified graph-based annotation:", sum(mask))

# consensus annotation: keep the graph-based annotation, except for unclassified proteins,
# which take their cluster annotation instead
annot_df[consensus_col] = annot_df[graph_col].where(~mask, annot_df["cluster_annotation"])

# save the annotation table
annot_df.to_csv(save_path / f"{timestamp}_graph-based_annotations.csv")

Number of proteins with unclassified graph-based annotation: 1034


In [18]:
graph_col = "Graph-based_localization_annotation"
consensus_col = "consensus_graph_annnotation"

# add the annotations to the adata object (copied by position, annot_df is in adata.obs row order)
adata.obs[graph_col] = annot_df[graph_col].to_list()
adata.obs[consensus_col] = annot_df[consensus_col].to_list()
# save a copy of the (updated) adata object, overwriting the previous one
adata_path = save_path / f"adata_kNN_{timestamp}.h5ad"
adata.write(adata_path)

# add the annotations to the umap_table
# The values are copied by position, so both tables must have the same number of rows.
if len(annot_df) != len(umap_table):
    raise ValueError(
        f"annot_df has {len(annot_df)} rows but umap_table has {len(umap_table)}; "
        "the annotations cannot be attached by position"
    )
# Drop the columns if a previous run of this cell already added them; DataFrame.insert
# raises ValueError on an existing column name, which made the cell fail when re-run.
umap_table = umap_table.drop(columns=[graph_col, consensus_col], errors="ignore")
# both inserts use position 6, so the graph-based column ends up directly before the consensus column
umap_table.insert(6, consensus_col, annot_df[consensus_col].to_list())
umap_table.insert(6, graph_col, annot_df[graph_col].to_list())
# save a copy of the (updated) umap_table
umap_table.to_csv(save_path / f"{timestamp}_umap_table.csv")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Graph-based_localization_annotation' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'consensus_graph_annnotation' as categorical
