## compute umap coordinates for one-IP-per-compartment
This notebook calculates 2D and 3D UMAP embeddings and generates plots.

In [1]:
# Strategy used to choose the single best IP (bait) for each compartment.
#
#   'v1' - pick the IP/bait that has the highest median enrichment for the highlights (orange);
#          the median was computed after retaining positive, significant enrichments.
#
#   'v2' - pick the IP/bait that has the highest number of significantly enriched preys that
#          are expected according to graph-based annotation.
#          TODO: @Manuel Leonetti please confirm this choice.
#
#   'v3' - unbiased scan: pick the IP combination (one-per-compartment) that gives the
#          highest clustering score.

approach = 'v3'  # one of 'v1', 'v2', 'v3'

# stop right away on an unsupported value
assert approach in {'v1', 'v2', 'v3'}

In [2]:
import os, sys, random
from pathlib import Path
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
import umap

import shutil

# matplotlib defaults: ggplot theme; pdf.fonttype 42 stores text as TrueType fonts in PDFs
plt.style.use('ggplot')
plt.rcParams['pdf.fonttype'] = 42

# "script" and "data" sit side by side, seven directory levels above the working directory
project_root = Path.cwd().parent.parent.parent.parent.parent.parent.parent
script_path = project_root / "script"
data_path = project_root / "data"

# extend the import path with the script folder, then import the pyseus plotting helpers
sys.path.append(str(script_path))
from pyseus.plotting import plotly_umap as pu

# everything this notebook writes goes to ./output; create it if it is missing
save_path = Path.cwd() / "output"
if not save_path.exists():
    save_path.mkdir(parents=True)

### load data

In [3]:
# Identifier of the analysis run; it is embedded in the input and output file names below.
# (alternative: restore it from the IPython store with `%store -r timestamp`)
timestamp = "2023-10-21-imp5-for-figures"
print("Timestamp: " + timestamp)

Timestamp: 2023-10-21-imp5-for-figures


In [4]:
def _read_csv_or_explain(csv_path):
    """Read `csv_path` into a DataFrame, using its first column as the index.

    On failure a short hint is printed and the original exception is re-raised,
    so the notebook stops at the load step instead of failing in a later cell
    with a NameError on a table that was never defined.

    Raises:
        FileNotFoundError: the file does not exist.
        pandas.errors.ParserError: the file is not valid CSV.
        Exception: any other error raised by `pd.read_csv`.
    """
    try:
        return pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        print(f"File {csv_path} not found.\nPlease run Fig2_C_consensus_annotation.ipynb first or specify the correct timestamp, current value is {timestamp}")
        raise
    except pd.errors.ParserError:
        print(f"There was an error parsing the CSV file at {csv_path}.")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


# define files to load
input_dir = Path.cwd().parent.parent.parent.parent.parent.parent / "Fig2" / "panel_C" / "output"
umap_table_path = input_dir / f"{timestamp}_umap_table.csv"
quants_path = input_dir / f"{timestamp}_quants.csv"

version = approach
use_best_scan_result = False
if version == 'v1' or version == 'v2':
    # built from `timestamp` so the file name always matches the value reported in the error hint
    best_IP_path = Path.cwd().parent / "output" / f"{timestamp}_best_pulldowns_{version}.csv"
    # load the list of best IPs
    best_IPs = _read_csv_or_explain(best_IP_path)
    # rank by compartment (this used to sit inside the `except` branch and never ran on success)
    # NOTE(review): the CSV schema is not visible here, hence the column check - confirm
    if "compartment" in best_IPs.columns:
        best_IPs = best_IPs.sort_values(by=["compartment"], ascending=[True])
elif version == 'v3':
    # the IP list is taken from the scan results in a later cell
    use_best_scan_result = True

# load the umap table and the quantification table
umap_table = _read_csv_or_explain(umap_table_path)
quants = _read_csv_or_explain(quants_path)


In [5]:
if use_best_scan_result:  # v3: take the IP list from the top-scoring scan combination
    best_IP_path_wNOC = Path.cwd().parent / "scan_IP_combo" / "hpc_results" / "with_NOC" / "IP_combos.1-5.19.tsv_scores.tsv"
    scan_scores = pd.read_csv(best_IP_path_wNOC, sep='\t')
    best_IPs_wNOC = (
        scan_scores[scan_scores["n_compart"] == 19]  # keep rows whose n_compart is 19
        .drop(columns=["idx"])                       # the idx column is not needed
        .reset_index(drop=True)
        .assign(score_sum=lambda tbl: tbl["compartment_score"] + tbl["complex_score"])
        .sort_values(by=["score_sum"], ascending=[False])
    )
    # first row = highest summed score; its "combo" field is a comma-separated IP list
    top_combo = best_IPs_wNOC.iloc[0]["combo"]
    best_IPs = top_combo.split(",")
    print(f"Best IP combo: {best_IPs}")

Best IP combo: ['12-YWHAQ', '12-ACTB', '09-ATG101', '11-CEP350', '10-VPS35', '07-CLTA', '05-NCLN', '02-COPE', '13-GOLGA2', '03-HSPA1B', '12-LAMP1', '12-TOMM20', '05-EDC4', '09-PEX3', '17-ATP1B3', '09-PSMB7', '14-RAB11A', '17-RPL36', '17-G3BP1']


In [6]:
# preview the quantification table as loaded (IP columns plus the NOC_* columns)
quants

Unnamed: 0,13-GOLGA2,12-G3BP1,13-RAB7A,12-YWHAQ,13-RAB14,12-TOMM20,13-RAB11A,13-RAB1A,12-RTN4,12-ACTB,...,09-PSMB7,09-HSP90AA1,11-GPR107,09-PEX3,11-EEA1,09-ATG101,10-TOMM20,NOC_cytosol,NOC_organelle,NOC_nuclear
0,-0.610929,0.512454,-0.048441,0.560037,-0.001493,-0.428346,0.075676,-1.377505,-0.715905,0.560059,...,-0.259385,-0.120096,-1.145514,-0.236974,-1.264236,2.990684,-0.640482,0.235748,0.241247,0.523005
1,-0.430800,0.621600,-1.357550,0.709150,-0.984800,-2.248450,-1.194000,-1.954150,-2.264500,0.859100,...,-0.327100,-2.443700,-2.108900,-0.496950,-1.618800,-0.728450,-2.969400,0.362391,0.286889,0.350721
2,6.484960,-4.707100,4.723722,-3.569950,4.380600,-2.609584,5.609286,5.862022,4.001500,-4.186372,...,-3.399503,-4.311600,5.934600,-0.092300,-1.672800,2.393000,-2.685600,0.054846,0.654148,0.291006
3,0.358972,0.043422,-0.139988,0.358322,-0.668563,-1.284575,-0.421189,0.712387,-0.379881,0.907181,...,-0.279703,-1.047753,0.802264,0.534835,0.255801,-0.673791,-0.196068,1.000000,0.000000,0.000000
4,6.830250,-7.722021,8.747675,1.164950,7.223181,-4.417649,7.422465,7.264575,6.729837,-5.076866,...,-1.587400,-3.389100,5.970200,0.016550,2.055000,-0.412350,-2.382900,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8536,-0.753922,0.327333,-0.105419,-0.961485,-0.302287,0.102512,-0.459466,1.448825,0.557191,0.957033,...,0.667332,0.796147,0.485010,0.335840,-0.501416,0.714542,0.421261,0.765637,0.145727,0.088636
8537,-0.206649,0.049816,0.018155,-0.959656,0.704740,-0.981348,-1.027931,0.104669,0.100258,0.124037,...,-0.741734,-0.241625,-0.315763,1.120229,1.024335,-0.047443,-1.478316,0.000000,0.802964,0.197036
8538,-0.122949,-0.060945,0.771012,0.125994,-0.431806,-0.070560,0.189869,-0.258547,-0.090820,0.089787,...,-0.209416,-0.073516,0.990121,-0.052907,0.362366,0.532208,0.273076,1.000000,0.000000,0.000000
8539,-0.586886,1.071665,0.216161,-0.185768,0.157390,2.320334,-0.277925,-0.526101,0.561332,0.143135,...,0.213513,4.144041,-0.621525,-0.099420,-0.622762,-0.705361,3.242869,0.720741,0.279259,0.000000


In [7]:
# preview the umap table as loaded (protein annotation columns followed by the quant columns)
umap_table

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,cluster_annotation,Graph-based_localization_annotation,consensus_graph_annnotation,13-GOLGA2,12-G3BP1,...,09-PSMB7,09-HSP90AA1,11-GPR107,09-PEX3,11-EEA1,09-ATG101,10-TOMM20,NOC_cytosol,NOC_organelle,NOC_nuclear
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,MAGOHB,,nucleus,nucleus,nucleus,-0.610929,0.512454,...,-0.259385,-0.120096,-1.145514,-0.236974,-1.264236,2.990684,-0.640482,0.235748,0.241247,0.523005
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,RBM8A,,nucleus,nucleus,nucleus,-0.430800,0.621600,...,-0.327100,-2.443700,-2.108900,-0.496950,-1.618800,-0.728450,-2.969400,0.362391,0.286889,0.350721
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,TM9SF4,Golgi,Golgi,Golgi,Golgi,6.484960,-4.707100,...,-3.399503,-4.311600,5.934600,-0.092300,-1.672800,2.393000,-2.685600,0.054846,0.654148,0.291006
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,PTEN,,cytosol,cytosol,cytosol,0.358972,0.043422,...,-0.279703,-1.047753,0.802264,0.534835,0.255801,-0.673791,-0.196068,1.000000,0.000000,0.000000
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,TM9SF2,Golgi,trans-Golgi,trans-Golgi,trans-Golgi,6.830250,-7.722021,...,-1.587400,-3.389100,5.970200,0.016550,2.055000,-0.412350,-2.382900,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8536,X5D7P8,X5D7P8,STK39,STK39,,cytosol,cytosol,cytosol,-0.753922,0.327333,...,0.667332,0.796147,0.485010,0.335840,-0.501416,0.714542,0.421261,0.765637,0.145727,0.088636
8537,X5D8X9,X5D8X9,CNTNAP2,CNTNAP2,,plasma_membrane,plasma_membrane,plasma_membrane,-0.206649,0.049816,...,-0.741734,-0.241625,-0.315763,1.120229,1.024335,-0.047443,-1.478316,0.000000,0.802964,0.197036
8538,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,CRMP1,,cytosol,cytosol,cytosol,-0.122949,-0.060945,...,-0.209416,-0.073516,0.990121,-0.052907,0.362366,0.532208,0.273076,1.000000,0.000000,0.000000
8539,X5DQZ7,X5DQZ7,GPX1,GPX1,,mitochondrion,mitochondrion,mitochondrion,-0.586886,1.071665,...,0.213513,4.144041,-0.621525,-0.099420,-0.622762,-0.705361,3.242869,0.720741,0.279259,0.000000


### subset the data to the best IP/baits

In [8]:
# split the quant columns into IP (pull-down) columns and NOC columns
all_IPs = [i for i in quants.columns if not i.startswith("NOC")]
NOCs = [i for i in quants.columns if i.startswith("NOC")]
# v1/v2: best_IPs was loaded as a DataFrame; its index is used as the list of IP columns.
# The isinstance check keeps this cell re-runnable: once best_IPs is already a list,
# `best_IPs.index` is the list method and `.to_list()` on it would raise AttributeError.
if not use_best_scan_result and isinstance(best_IPs, pd.DataFrame):
    best_IPs = best_IPs.index.to_list()

In [9]:
# Keep every non-quant (annotation) column of umap_table, but among the quant columns
# keep only the selected IPs and the NOC columns.
quant_columns = set(quants.columns)           # built once instead of once per column
selected_columns = set(best_IPs) | set(NOCs)
umap_table = umap_table[
    [col for col in umap_table.columns if col not in quant_columns or col in selected_columns]
].copy()  # independent frame: new columns are assigned to it later without SettingWithCopyWarning
quants = quants[best_IPs + NOCs]

In [10]:
# check the subset: annotation columns plus only the selected IP and NOC_* columns
umap_table

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,cluster_annotation,Graph-based_localization_annotation,consensus_graph_annnotation,13-GOLGA2,12-YWHAQ,...,17-ATP1B3,17-G3BP1,11-CEP350,10-VPS35,09-PSMB7,09-PEX3,09-ATG101,NOC_cytosol,NOC_organelle,NOC_nuclear
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,MAGOHB,,nucleus,nucleus,nucleus,-0.610929,0.560037,...,-0.908800,-2.252888,0.413392,-1.206977,-0.259385,-0.236974,2.990684,0.235748,0.241247,0.523005
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,RBM8A,,nucleus,nucleus,nucleus,-0.430800,0.709150,...,-1.653100,0.980800,0.895300,-0.959200,-0.327100,-0.496950,-0.728450,0.362391,0.286889,0.350721
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,TM9SF4,Golgi,Golgi,Golgi,Golgi,6.484960,-3.569950,...,-3.943550,-1.904600,0.397300,0.601500,-3.399503,-0.092300,2.393000,0.054846,0.654148,0.291006
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,PTEN,,cytosol,cytosol,cytosol,0.358972,0.358322,...,0.276937,0.518615,-0.590800,0.299063,-0.279703,0.534835,-0.673791,1.000000,0.000000,0.000000
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,TM9SF2,Golgi,trans-Golgi,trans-Golgi,trans-Golgi,6.830250,1.164950,...,-1.577950,-2.308600,-0.438800,2.271100,-1.587400,0.016550,-0.412350,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8536,X5D7P8,X5D7P8,STK39,STK39,,cytosol,cytosol,cytosol,-0.753922,-0.961485,...,-1.329812,0.481129,-0.809429,-1.384804,0.667332,0.335840,0.714542,0.765637,0.145727,0.088636
8537,X5D8X9,X5D8X9,CNTNAP2,CNTNAP2,,plasma_membrane,plasma_membrane,plasma_membrane,-0.206649,-0.959656,...,3.428048,-3.976792,1.940446,-0.431210,-0.741734,1.120229,-0.047443,0.000000,0.802964,0.197036
8538,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,CRMP1,,cytosol,cytosol,cytosol,-0.122949,0.125994,...,-0.376828,-1.133368,1.243900,-0.165505,-0.209416,-0.052907,0.532208,1.000000,0.000000,0.000000
8539,X5DQZ7,X5DQZ7,GPX1,GPX1,,mitochondrion,mitochondrion,mitochondrion,-0.586886,-0.185768,...,-0.488650,-0.103649,-0.097903,-0.311805,0.213513,-0.099420,-0.705361,0.720741,0.279259,0.000000


In [11]:
# check the subset: the selected IP columns followed by the NOC_* columns
quants

Unnamed: 0,12-YWHAQ,12-ACTB,09-ATG101,11-CEP350,10-VPS35,07-CLTA,05-NCLN,02-COPE,13-GOLGA2,03-HSPA1B,...,05-EDC4,09-PEX3,17-ATP1B3,09-PSMB7,14-RAB11A,17-RPL36,17-G3BP1,NOC_cytosol,NOC_organelle,NOC_nuclear
0,0.560037,0.560059,2.990684,0.413392,-1.206977,-0.789800,-0.315851,0.173900,-0.610929,-1.601996,...,2.872639,-0.236974,-0.908800,-0.259385,0.744121,-0.334927,-2.252888,0.235748,0.241247,0.523005
1,0.709150,0.859100,-0.728450,0.895300,-0.959200,-0.267200,-1.078600,-1.290000,-0.430800,-0.088200,...,0.816800,-0.496950,-1.653100,-0.327100,-0.883200,0.133400,0.980800,0.362391,0.286889,0.350721
2,-3.569950,-4.186372,2.393000,0.397300,0.601500,-0.980700,2.696400,3.581000,6.484960,-2.283300,...,-1.360200,-0.092300,-3.943550,-3.399503,3.475600,-2.638500,-1.904600,0.054846,0.654148,0.291006
3,0.358322,0.907181,-0.673791,-0.590800,0.299063,0.088772,-0.643163,-0.465978,0.358972,0.380540,...,-1.538733,0.534835,0.276937,-0.279703,0.746658,-0.759594,0.518615,1.000000,0.000000,0.000000
4,1.164950,-5.076866,-0.412350,-0.438800,2.271100,0.228300,1.662000,2.402350,6.830250,-1.497300,...,-0.273400,0.016550,-1.577950,-1.587400,2.189300,-2.485100,-2.308600,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8536,-0.961485,0.957033,0.714542,-0.809429,-1.384804,0.208881,-0.558006,0.662960,-0.753922,0.304750,...,0.309381,0.335840,-1.329812,0.667332,-0.119997,-0.108009,0.481129,0.765637,0.145727,0.088636
8537,-0.959656,0.124037,-0.047443,1.940446,-0.431210,0.354900,1.587381,1.142600,-0.206649,-0.455800,...,-0.403736,1.120229,3.428048,-0.741734,0.754700,-3.245091,-3.976792,0.000000,0.802964,0.197036
8538,0.125994,0.089787,0.532208,1.243900,-0.165505,-0.909993,-0.551448,0.117320,-0.122949,-0.431862,...,0.262057,-0.052907,-0.376828,-0.209416,-0.434443,-0.517841,-1.133368,1.000000,0.000000,0.000000
8539,-0.185768,0.143135,-0.705361,-0.097903,-0.311805,0.597187,0.512849,-0.116548,-0.586886,-1.251409,...,0.393994,-0.099420,-0.488650,0.213513,3.278988,-0.772276,-0.103649,0.720741,0.279259,0.000000


### preprocessing

In [12]:
# scale the table
# NOTE(review): `pu.scale_table` is a pyseus helper that is not shown in this notebook;
# method='standard' presumably means standard (z-score) scaling - confirm against pyseus.
scaled = pu.scale_table(matrix=quants, method='standard')

### compute UMAPs with different seeds and save to file

In [13]:
# Start from an empty "bootstraps" folder so files from earlier runs cannot mix in.
# Path(...) keeps this working even if save_path has been rebound to a plain string.
bootstrap_dir = Path(save_path) / "bootstraps"
if bootstrap_dir.exists():
    shutil.rmtree(bootstrap_dir)
bootstrap_dir.mkdir(parents=True, exist_ok=True)


n_bootstraps = 10
n_neighbors = 20
min_dist = 0.1
metric = 'euclidean'

# flip the umap coordinates (some times the UMAP algorithm flips the coordinates);
# the flip mirrors both axes: x -> max(x) - x, y -> max(y) - y
flip = True

# Draw the seeds without replacement from 0..10000: independent randint() draws could
# repeat a seed, which would overwrite an earlier CSV and leave fewer than n_bootstraps files.
# UMAP_seed stays bound to the last seed after the loop (it is used in the plot file name).
for UMAP_seed in random.sample(range(10001), n_bootstraps):
    # calculate 2D UMAP embeddings
    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=UMAP_seed
    )
    u = fit.fit_transform(scaled)
    umap_table['umap_1'] = u[:, 0]
    umap_table['umap_2'] = u[:, 1]

    # flip the UMAP coordinates
    if flip:
        umap_table["umap_1"] = umap_table["umap_1"].max() - umap_table["umap_1"]
        umap_table["umap_2"] = umap_table["umap_2"].max() - umap_table["umap_2"]

    # save umap embedding to csv file (one file per seed)
    save_name = f"{timestamp}_UMAP_embeddings_seed={UMAP_seed}.csv"
    umap_table.to_csv(bootstrap_dir / save_name, index=False)

In [14]:
# visualize the last UMAP embedding (umap_1 / umap_2 currently stored in umap_table)

# generate 2D UMAP plot
# **choose which annotation column to highlight here**; other choices named by the author:
# cluster_annotation, Protein-level_consensus_annotation
# NOTE(review): Protein-level_consensus_annotation is not among the columns shown in the
# table previews of this notebook - verify it exists before selecting it.
label_to_color = "consensus_graph_annnotation"

fig = pu.interaction_umap(
    umap_table,
    node_name="Gene_name_canonical",
    cluster=label_to_color, opacity=0.35, unlabelled_color="#D0D3D4", unlabelled_opacity=0.1,
    pointsize=6, x="umap_1", y="umap_2",
    categorical=True,
)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file
# save_path is deliberately NOT rebound to the string "output" here: the bootstrap cell
# builds paths with `save_path / "bootstraps"`, which fails on a str when cells are re-run.
save_name = f"{timestamp}_UMAP_2D_seed{UMAP_seed}_{version}.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)