## Fig. 2 panel D umap
This notebook calculates 2D and 3D UMAP embeddings and generates plots.

In [184]:
import os
import random
import sys
from datetime import datetime
from pathlib import Path
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
import umap
import umap.plot
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

# plotting defaults: ggplot style, and font type 42 (TrueType) for PDF exports
plt.style.use('ggplot')
plt.rcParams['pdf.fonttype'] = 42

# the script/ and data/ folders live three levels above the notebook's working directory
project_root = Path.cwd().parent.parent.parent
script_path = project_root / "script"
data_path = project_root / "data"
# make the project-local packages importable before importing from them
sys.path.append(str(script_path))
from pyseus.plotting import plotly_umap as pu
from utils.label_processing import attach_annotations

# output folder for the files written by this notebook; created if it is missing
save_path = Path.cwd() / "output"
save_path.mkdir(parents=True, exist_ok=True)

### load data

In [185]:
# %store -r timestamp USE_FROZEN
# if USE_FROZEN:
#     raise Exception("USE_FROZEN is true, you probably want to skip enrichment and proceed from Fig1")
# date tag used to build the names of the input tables read from panel_C/output
# and of the files this notebook writes
timestamp = "2024-07-24"

In [186]:
# define files to load
input_dir = Path.cwd().parent / "panel_C" / "output"
umap_table_path = input_dir / f"{timestamp}_umap_table.csv"
quants_path = input_dir / f"{timestamp}_quants.csv"


def _load_panel_c_table(csv_path, timestamp_label):
    """Read one panel_C output table, using its first column as the index.

    Parameters
    ----------
    csv_path : path-like
        Location of the CSV file to read.
    timestamp_label : str
        Timestamp used to build the file name; only reported in the error message.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the file does not exist (with a hint on how to produce it).
    pandas.errors.ParserError
        If the file cannot be parsed as CSV.
    """
    try:
        return pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError as err:
        # fail here rather than printing: continuing would leave the table undefined
        # (or stale from a previous run) and break the following cells in a confusing way
        raise FileNotFoundError(
            f"File {csv_path} not found.\nPlease run Fig2_C_consensus_annotation.ipynb first or specify the correct timestamp, current value is {timestamp_label}"
        ) from err
    except pd.errors.ParserError as err:
        raise pd.errors.ParserError(
            f"There was an error parsing the CSV file at {csv_path}."
        ) from err


# load data
umap_table = _load_panel_c_table(umap_table_path, timestamp)
quants = _load_panel_c_table(quants_path, timestamp)

### preprocessing

In [187]:
# scale the table for UMAP
# (the scaled matrix is also the input to the PCA in the next section)
# NOTE(review): method='standard' presumably standardizes each column — confirm against pu.scale_table
scaled = pu.scale_table(matrix=quants, method='standard')

### PCA

In [188]:
# Perform PCA
pca = PCA(n_components=5)
pca.fit(scaled)

# project the data once and reuse the result for both stored components
pca_scores = pca.transform(scaled)
umap_table['PC_1'] = pca_scores[:, 0]
umap_table['PC_2'] = pca_scores[:, 1]

In [189]:
# Variance explained by each principal component
# (one ratio per fitted component; displayed as the cell output)
variance_explained = pca.explained_variance_ratio_
variance_explained

array([0.23070978, 0.16772351, 0.06802476, 0.04993717, 0.03911036])

In [190]:
# generate 2D PCA plot (PC_1 vs PC_2), colored by the chosen annotation column
label_to_color = "consensus_graph_annnotation"  # **choose which annotation column to highlight here** , other choices: cluster_annotation, Protein-level_consensus_annotation

fig = pu.interaction_umap(
    umap_table,
    node_name="Gene_name_canonical",
    cluster=label_to_color, opacity=0.35, unlabelled_color="#D0D3D4", unlabelled_opacity=0.1,
    pointsize=6, x="PC_1", y="PC_2",
    categorical=True,
)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file
# (save_path is the output folder defined in the setup cell; it is deliberately not
# re-bound here so the configured Path is not replaced by a relative string)
save_name = f"{timestamp}_PCA_2Dview.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)





### UMAP

In [191]:
# UMAP parameters (passed to umap.UMAP for both the 2D and the 3D embedding)
n_neighbors = 20
min_dist = 0.1
metric = 'euclidean'

# flip the umap coordinates (sometimes the UMAP algorithm flips the coordinates);
# when True, the 2D coordinates are mirrored as (max - value) after fitting
flip = True

# set the UMAP seed (used as random_state and written into the embeddings CSV file name)
UMAP_seed = 1234

In [192]:
# settings shared by the 2D and the 3D embedding
umap_kwargs = dict(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    random_state=UMAP_seed,
)

# calculate 2D UMAP embeddings
fit = umap.UMAP(**umap_kwargs)
u = fit.fit_transform(scaled)
umap_table['umap_1'] = u[:, 0]
umap_table['umap_2'] = u[:, 1]

# flip the UMAP coordinates: mirror each 2D axis as (max - value)
if flip:
    for axis_col in ("umap_1", "umap_2"):
        # Series.max() is vectorized, unlike the builtin max() over a Series
        umap_table[axis_col] = umap_table[axis_col].max() - umap_table[axis_col]

# calculate 3D UMAP embeddings (same settings, three output dimensions; not flipped)
fit3D = umap.UMAP(**umap_kwargs, n_components=3)
u3D = fit3D.fit_transform(scaled)
# add the UMAP coordinates to the table
umap_table['3D_umap_1'] = u3D[:, 0]
umap_table['3D_umap_2'] = u3D[:, 1]
umap_table['3D_umap_3'] = u3D[:, 2]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



### save UMAP embeddings

In [193]:
# write the table (annotations, quants, PCA and UMAP coordinates) to a CSV file
# whose name records the timestamp and the UMAP seed; the row index is not written
save_name = f"{timestamp}_UMAP_embeddings_seed={UMAP_seed}.csv"
embeddings_csv = Path(save_path) / save_name
umap_table.to_csv(embeddings_csv, index=False)

In [194]:
# preview the table with the PC_* and *umap_* coordinate columns attached
umap_table

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,cluster_annotation,neighbors,Graph-based_localization_annotation,consensus_graph_annnotation,12-LAMP1,...,NOC_cytosol,NOC_organelle,NOC_nuclear,PC_1,PC_2,umap_1,umap_2,3D_umap_1,3D_umap_2,3D_umap_3
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,MAGOHB,,nucleus,"[[('nucleus', 16), ('ER', 1), ('unclassified',...",nucleus,nucleus,-0.141427,...,0.235748,0.241247,0.523005,-0.035243,0.064276,9.185568,3.134099,6.374478,9.173985,5.213600
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,RBM8A,,nucleus,"[[('nucleus', 45), ('unclassified', 9), ('nucl...",nucleus,nucleus,-0.588500,...,0.362391,0.286889,0.350721,-2.861609,-1.957534,8.701310,1.512504,7.948301,10.094701,5.763598
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,TM9SF4,Golgi,Golgi,"[[('trans-Golgi', 8), ('Golgi', 42), ('ERGIC',...",Golgi,Golgi,3.703700,...,0.054846,0.654148,0.291006,4.119621,8.992140,8.503765,10.134826,9.509655,4.085703,6.368067
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,PTEN,,cytosol,"[[('cytosol', 20)]]",cytosol,cytosol,0.261350,...,1.000000,0.000000,0.000000,-0.840413,-0.663711,4.121713,2.524803,7.276361,8.391106,0.852297
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,TM9SF2,Golgi,trans-Golgi,"[[('Golgi', 6), ('plasma_membrane', 1), ('tran...",trans-Golgi,trans-Golgi,5.499848,...,0.083591,0.697825,0.218584,3.718755,9.240308,8.261353,9.791969,9.555220,4.436825,6.164950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8535,X5D7P8,X5D7P8,STK39,STK39,,cytosol,"[[('unclassified', 11), ('cytosol', 43)]]",cytosol,cytosol,-0.030088,...,0.765637,0.145727,0.088636,-0.959204,-0.818976,4.545286,2.951300,7.063812,8.763442,1.528698
8536,X5D8X9,X5D8X9,CNTNAP2,CNTNAP2,,plasma_membrane,"[[('plasma_membrane', 20), ('nucleolus', 1)]]",plasma_membrane,plasma_membrane,-0.052453,...,0.000000,0.802964,0.197036,-2.247702,2.978923,5.629073,7.283597,9.865761,7.022906,3.905463
8537,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,CRMP1,,cytosol,"[[('cytosol', 21), ('unclassified', 1)]]",cytosol,cytosol,0.327787,...,1.000000,0.000000,0.000000,-0.793347,-0.654640,4.472646,2.778949,7.115033,8.773095,1.307355
8538,X5DQZ7,X5DQZ7,GPX1,GPX1,,mitochondrion,"[[('mitochondrion', 30), ('unclassified', 5), ...",mitochondrion,mitochondrion,0.261512,...,0.720741,0.279259,0.000000,1.578156,0.797008,4.939523,2.923792,6.902620,8.747833,1.828734


In [195]:
# preview the quant columns (positions 9 up to 9 + number of scaled columns);
# this is the slice that the next cell overwrites
umap_table.iloc[:, 9:9+scaled.shape[1]] 

Unnamed: 0,12-LAMP1,13-RAB1A,12-TOMM20,12-SEC61B,12-ACTB,13-RAB11A,13-RAB7A,12-G3BP1,12-RTN4,13-GOLGA2,...,11-SEC31A,09-HSP90AA1,10-RTN4,09-TOMM20,11-CEP350,09-ATG101,10-TOMM20,NOC_cytosol,NOC_organelle,NOC_nuclear
0,-0.141427,0.164993,-0.279302,0.018797,0.288365,0.283929,0.747769,-0.239663,-0.604410,0.228410,...,0.084513,-1.310187,-0.369807,-0.294506,0.212616,3.018858,-1.547331,0.235748,0.241247,0.523005
1,-0.588500,-1.931700,-2.246100,-0.623300,0.860750,-1.187400,-1.335100,0.631900,-2.264500,-0.421200,...,0.979850,-2.443700,-1.899050,-2.430950,0.895300,-0.728450,-2.969400,0.362391,0.286889,0.350721
2,3.703700,6.304299,-2.356032,4.604981,-4.326681,6.214500,5.165999,-5.086800,4.333281,6.801000,...,1.688300,-5.186684,1.852200,-2.612500,0.397300,2.393000,-2.685600,0.054846,0.654148,0.291006
3,0.261350,0.521100,0.744454,0.297697,-0.356495,-1.348290,1.389465,-0.309414,0.163948,0.157317,...,0.361729,-0.195980,0.269211,-0.420635,-0.067346,-0.332397,0.223960,1.000000,0.000000,0.000000
4,5.499848,7.657225,-1.099823,6.447580,-7.542662,7.354080,9.140325,-7.666267,6.273880,7.549848,...,1.179800,-3.389100,1.524950,-1.385850,-0.438800,-0.412350,-2.382900,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8535,-0.030088,-0.838131,-0.523289,0.547829,0.125618,1.058831,0.049707,0.030088,0.124307,0.315381,...,1.326828,-0.147448,0.818616,-0.715761,-0.227490,0.874124,0.660835,0.765637,0.145727,0.088636
8536,-0.052453,-0.518175,-0.658738,0.028252,0.861863,-0.529760,-0.149484,-0.936443,-0.821882,-0.440132,...,1.567885,-0.780498,-0.554957,-1.183749,1.758580,0.498581,-0.680795,0.000000,0.802964,0.197036
8537,0.327787,0.309068,0.558718,-0.120634,0.805817,-0.914660,0.484696,-0.087955,-0.510794,-0.002831,...,1.309921,1.010047,-0.362585,1.048421,-0.334151,-0.265872,-0.425032,1.000000,0.000000,0.000000
8538,0.261512,-0.491176,2.629682,-0.382702,-0.287952,0.725518,0.806167,-0.381006,0.524223,-0.153642,...,-1.048860,3.884103,0.371638,2.954144,-0.481100,-0.570653,2.896576,0.720741,0.279259,0.000000


In [196]:
# save scaled quants to csv file
# The quant columns are addressed by position: they start at column 9 of umap_table
# and span as many columns as `scaled` has. This offset might need to be adjusted
# depending on the number of columns in the quants file.
save_name = f"{timestamp}_scaled_quants.csv"
quant_start = 9
quant_stop = quant_start + scaled.shape[1]
umap_table.iloc[:, quant_start:quant_stop] = scaled
scaled_csv_path = os.path.join(save_path, save_name)
umap_table.to_csv(scaled_csv_path, index=False)

# save 0-1 scaled quants to csv file
def normalize_rowwise(df):
    """Divide every row of ``df`` by that row's sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; each row is normalized independently.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``. Rows with a non-zero sum add up to 1.
        Rows whose sum is exactly 0 are returned unchanged (all zeros for
        non-negative input) instead of becoming NaN/inf through division by zero.
    """
    row_sums = df.sum(axis=1)
    # a zero total would turn the whole row into NaN/inf; divide such rows by 1 instead
    safe_sums = row_sums.where(row_sums != 0, 1)
    return df.div(safe_sums, axis=0)
save_name = f"{timestamp}_01_scaled_quants.csv"
# take the quant columns, set negative values to 0, then divide each row by its sum
quant_block = umap_table.iloc[:, 9:9+scaled.shape[1]]
data = quant_block.mask(quant_block < 0, 0)
umap_table.iloc[:, 9:9+scaled.shape[1]] = normalize_rowwise(data)
normalized_csv_path = os.path.join(save_path, save_name)
umap_table.to_csv(normalized_csv_path, index=False)

In [197]:
# inspect the table after the quant columns were replaced by the row-normalized values
umap_table

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,cluster_annotation,neighbors,Graph-based_localization_annotation,consensus_graph_annnotation,12-LAMP1,...,NOC_cytosol,NOC_organelle,NOC_nuclear,PC_1,PC_2,umap_1,umap_2,3D_umap_1,3D_umap_2,3D_umap_3
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,MAGOHB,,nucleus,"[[('nucleus', 16), ('ER', 1), ('unclassified',...",nucleus,nucleus,0.000000,...,0.000000,0.000000,0.046493,-0.035243,0.064276,9.185568,3.134099,6.374478,9.173985,5.213600
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,RBM8A,,nucleus,"[[('nucleus', 45), ('unclassified', 9), ('nucl...",nucleus,nucleus,0.000000,...,0.006628,0.000000,0.019397,-2.861609,-1.957534,8.701310,1.512504,7.948301,10.094701,5.763598
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,TM9SF4,Golgi,Golgi,"[[('trans-Golgi', 8), ('Golgi', 42), ('ERGIC',...",Golgi,Golgi,0.031953,...,0.000000,0.022114,0.000000,4.119621,8.992140,8.503765,10.134826,9.509655,4.085703,6.368067
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,PTEN,,cytosol,"[[('cytosol', 20)]]",cytosol,cytosol,0.000000,...,0.167236,0.000000,0.000000,-0.840413,-0.663711,4.121713,2.524803,7.276361,8.391106,0.852297
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,TM9SF2,Golgi,trans-Golgi,"[[('Golgi', 6), ('plasma_membrane', 1), ('tran...",trans-Golgi,trans-Golgi,0.052838,...,0.000000,0.027312,0.000000,3.718755,9.240308,8.261353,9.791969,9.555220,4.436825,6.164950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8535,X5D7P8,X5D7P8,STK39,STK39,,cytosol,"[[('unclassified', 11), ('cytosol', 43)]]",cytosol,cytosol,0.000000,...,0.137511,0.000000,0.000000,-0.959204,-0.818976,4.545286,2.951300,7.063812,8.763442,1.528698
8536,X5D8X9,X5D8X9,CNTNAP2,CNTNAP2,,plasma_membrane,"[[('plasma_membrane', 20), ('nucleolus', 1)]]",plasma_membrane,plasma_membrane,0.000000,...,0.000000,0.101677,0.000000,-2.247702,2.978923,5.629073,7.283597,9.865761,7.022906,3.905463
8537,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,CRMP1,,cytosol,"[[('cytosol', 21), ('unclassified', 1)]]",cytosol,cytosol,0.002191,...,0.179616,0.000000,0.000000,-0.793347,-0.654640,4.472646,2.778949,7.115033,8.773095,1.307355
8538,X5DQZ7,X5DQZ7,GPX1,GPX1,,mitochondrion,"[[('mitochondrion', 30), ('unclassified', 5), ...",mitochondrion,mitochondrion,0.000000,...,0.054638,0.000000,0.000000,1.578156,0.797008,4.939523,2.923792,6.902620,8.747833,1.828734


### update annotation 
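add `[all_IPs] Graph-based_localization_annotation` and `[all_IPs] consensus_graph_annnotation` columns,  
taken from 2023-10-21-imp5-for-figures (the existing `cluster_annotation`, `Graph-based_localization_annotation` and `consensus_graph_annnotation` columns are kept)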

In [198]:
# attach the annotation columns from the 2023-10-21-imp5-for-figures label file,
# matched on "Majority protein IDs" and stored under an "[all_IPs] " prefix
gene_name_csv = data_path / "labels" / "2023-10-21-imp5-for-figures_graph-based_annotations.csv"
lookup_table = pd.read_csv(gene_name_csv)

for c in ["Graph-based_localization_annotation", "consensus_graph_annnotation"]:
    # hand copies to the helper so neither the lookup table nor umap_table
    # can be modified by it
    new_col_data = attach_annotations(
        from_df=lookup_table.copy(),
        to_df=umap_table.copy(),
        anno_col=c,
        from_on="Majority protein IDs",
        to_on="Majority protein IDs",
    )
    umap_table[f"[all_IPs] {c}"] = new_col_data

In [199]:
# construct 14-3-3 vs other column:
# keep the label where the annotation is the 14-3-3 scaffold, otherwise store None
scaffold_label = "14-3-3_scaffold"
for source_col, flag_col in (
    ("[all_IPs] Graph-based_localization_annotation", "[all_IPs] 14-3-3_graph"),
    ("[all_IPs] consensus_graph_annnotation", "[all_IPs] 14-3-3_consensus"),
):
    umap_table[flag_col] = [
        scaffold_label if annotation == scaffold_label else None
        for annotation in umap_table[source_col]
    ]

In [200]:
# count the proteins flagged as 14-3-3 scaffold in the consensus-based flag column
umap_table["[all_IPs] 14-3-3_consensus"].value_counts()

[all_IPs] 14-3-3_consensus
14-3-3_scaffold    130
Name: count, dtype: int64

### generate UMAP plots

In [201]:
# flip x and y axis
# NOTE(review): this re-binds `flip`, so the value chosen in the UMAP parameters cell is ignored here.
# NOTE(review): umap_1/umap_2 were already mirrored as (max - value) when the 2D embedding was computed;
# negating them here yields (value - max), i.e. the fitted orientation shifted by a constant — confirm this is intended.
# NOTE(review): the negation modifies umap_table in place, so running this cell a second time undoes it,
# and the embeddings CSV written earlier holds the coordinates from before this negation.
flip = True
if flip:
    umap_table["umap_1"] = -umap_table["umap_1"]
    umap_table["umap_2"] = -umap_table["umap_2"]

# generate 2D UMAP plot
label_to_color = "[all_IPs] consensus_graph_annnotation"  # **choose which annotation column to highlight here** , other choices: cluster_annotation, Protein-level_consensus_annotation

fig = pu.interaction_umap(
    umap_table,
    node_name="Gene_name_canonical", 
    cluster=label_to_color, opacity=0.5, unlabelled_color="#D0D3D4", unlabelled_opacity=0.2,
    pointsize=6, x="umap_1", y="umap_2",
    categorical=True
)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file
# (save_path is re-bound to the relative folder name "output" before the file is written)
save_path = os.path.join("output")
save_name = f"{timestamp}_UMAP_2Dview_ported_consensus_annot_allclusters_v2.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)





In [202]:
# generate 2D UMAP plot that highlights only the proteins flagged as 14-3-3 scaffold
label_to_color = "[all_IPs] 14-3-3_consensus"  # **choose which annotation column to highlight here** , other choices: cluster_annotation, Protein-level_consensus_annotation

fig = pu.interaction_umap(
    umap_table,
    node_name="Gene_name_canonical",
    cluster=label_to_color, opacity=0.5, unlabelled_color="#D0D3D4", unlabelled_opacity=0.2,
    pointsize=6, x="umap_1", y="umap_2",
    categorical=True, custom_colors = ["#ff0000", "#e1e3e1"]
)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file
# (uses the output folder already held in save_path; it is not re-bound here)
save_name = f"{timestamp}_UMAP_2Dview_ported_consensus_annot_1433_v2.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)

### 14-3-3 cluster analysis (useful only when there is a distinct 14-3-3 cluster)

In [203]:
# set to True only when the UMAP shows a distinct 14-3-3 cluster
present_14_3_3_cluster = False

# proteins whose consensus flag is 14-3-3 scaffold; taken as an independent copy so
# that adding a column below does not write into a slice of umap_table
umap_table_1433 = umap_table[umap_table["[all_IPs] 14-3-3_consensus"] == "14-3-3_scaffold"].copy()

if present_14_3_3_cluster:
    # produce a list of clustering patterns of 14-3-3 scaffold proteins
    from sklearn.cluster import DBSCAN

    points = umap_table_1433[["umap_1", "umap_2"]].values

    # DBSCAN parameters
    epsilon = 0.4  # Maximum distance between points to be considered as neighbors
    min_samples = 10  # Minimum number of points to form a cluster

    # Apply DBSCAN
    db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(points)
    labels = db.labels_

    # label -1 marks points that DBSCAN did not assign to any cluster
    umap_table_1433["DBSCAN_cluster"] = ["in 14-3-3 cluster" if i != -1 else "not in 14-3-3 cluster" for i in labels ]

    # Identify outliers (points labeled as -1)
    outliers = points[labels == -1]

    # Plot the points and highlight the outliers (drawn on top of the full set)
    plt.scatter(points[:, 0], points[:, 1], label='Cluster')
    plt.scatter(outliers[:, 0], outliers[:, 1], color='blue', label='Not in cluster')
    plt.legend()
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('DBSCAN 14-3-3 cluster Detection')
    plt.show()

### write 14-3-3 dataframe to file

In [204]:
# rows of umap_table whose "[all_IPs] 14-3-3_consensus" value is "14-3-3_scaffold"
umap_table_1433

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,cluster_annotation,neighbors,Graph-based_localization_annotation,consensus_graph_annnotation,12-LAMP1,...,PC_2,umap_1,umap_2,3D_umap_1,3D_umap_2,3D_umap_3,[all_IPs] Graph-based_localization_annotation,[all_IPs] consensus_graph_annnotation,[all_IPs] 14-3-3_graph,[all_IPs] 14-3-3_consensus
103,A0A024QZX0;Q9P0N9;A0A024R011;Q5SZL6;Q5SZL4;Q5S...,A0A024QZX0;Q9P0N9;A0A024R011;Q5SZL6;Q5SZL4;Q5S...,TBC1D7,TBC1D7,cytosol,unclassified,"[[('cytosol', 21), ('unclassified', 1)]]",cytosol,cytosol,0.030539,...,-0.424400,-4.765238,-2.299360,6.773326,9.107992,1.286784,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
164,A0A024R0T2;Q7L8J4;Q96MW4;B4DSF1;Q96ET3,A0A024R0T2;Q7L8J4;Q96MW4;B4DSF1,SH3BP5L,SH3BP5L,,unclassified,"[[('cytosol', 22), ('unclassified', 1)]]",cytosol,cytosol,0.000000,...,-1.228461,-4.066715,-2.280616,7.086406,9.145860,0.723154,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
165,A0A024R0V3;Q6P597;K7EL76;K7ENJ3;K7ELP9,A0A024R0V3;Q6P597;K7EL76;K7ENJ3,KLC3,KLC3,,unclassified,"[[('cytosol', 20)]]",cytosol,cytosol,0.000000,...,-1.025457,-4.266518,-3.008642,7.492700,8.894593,1.119316,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
183,Q9H0H5;A0A2X0U4T9;A0A024R136;B2RE34;F8VRD2;Q9H...,Q9H0H5;A0A2X0U4T9;A0A024R136;B2RE34,RACGAP1,RACGAP1,,unclassified,"[[('plasma_membrane', 7), ('unclassified', 8),...",plasma_membrane,plasma_membrane,0.198962,...,0.539586,-5.800062,-6.837513,9.647807,7.332375,3.883089,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
194,A0A024R178;P10398;Q96II5;A0A0S2Z3F2;B4DV85;B4D...,A0A024R178;P10398;Q96II5;A0A0S2Z3F2;B4DV85,ARAF,ARAF,,unclassified,"[[('cytosol', 21)]]",cytosol,cytosol,0.000000,...,-1.474833,-4.938622,-2.503399,7.130659,9.436317,1.169868,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8252,Q9ULJ3,Q9ULJ3,ZBTB21,ZBTB21,,unclassified,"[[('unclassified', 13), ('nucleus', 5), ('nucl...",nucleus,nucleus,0.000000,...,1.453976,-7.169866,-3.670943,6.744531,9.210514,3.504864,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
8288,Q9UNW9;A0A6Q8PFC2;F8VYI3;Q9HDB7;M0R1A0,Q9UNW9;A0A6Q8PFC2,NOVA2,NOVA2,,unclassified,"[[('lysosome', 4), ('unclassified', 2), ('ER',...",ER,ER,0.100944,...,0.560931,-9.233232,-7.863008,7.598414,5.306971,6.313887,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
8350,Q9Y2J4;B3KR56;B3KQD2;D6RIC7;D6RBK2;D6RJA4;D6RF...,Q9Y2J4;B3KR56,AMOTL2,AMOTL2,,unclassified,"[[('cytosol', 65), ('mitochondrion', 1), ('Gol...",cytosol,cytosol,0.002016,...,-0.768317,-4.208410,-2.649073,6.976948,8.831423,0.984857,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold
8351,Q9Y2K2;M5FMX3;H0Y4E8;A1A5A9;H7C3X8;A0A024R3J3;...,Q9Y2K2;M5FMX3;H0Y4E8;A1A5A9;H7C3X8;A0A024R3J3,SIK3;KIAA0999,SIK3,cytosol,unclassified,"[[('cytosol', 45), ('mitochondrion', 2), ('unc...",cytosol,cytosol,0.006743,...,-1.281907,-5.324926,-2.852285,7.191870,9.496968,1.842863,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold,14-3-3_scaffold


In [205]:
# pick the columns to export and rename the annotation columns of this table
# so that they carry the "[without 14-3-3 IP]" prefix
without_ip_names = {
    "Graph-based_localization_annotation": "[without 14-3-3 IP] Graph-based_localization_annotation",
    "consensus_graph_annnotation": "[without 14-3-3 IP] consensus_graph_annnotation",
}
if present_14_3_3_cluster:
    export_cols = [
        "Majority protein IDs", "Gene names",
        "Graph-based_localization_annotation", "consensus_graph_annnotation",
        "[all_IPs] Graph-based_localization_annotation", "[all_IPs] consensus_graph_annnotation",
        "DBSCAN_cluster",
    ]
    # the DBSCAN membership column only exists when the cluster analysis was run
    without_ip_names["DBSCAN_cluster"] = "[without 14-3-3 IP] in UMAP 14-3-3 cluster"
else:
    export_cols = [
        "Majority protein IDs", "Gene names", "Gene_name_canonical", "curated_ground_truth_v9.0",
        "Graph-based_localization_annotation", "consensus_graph_annnotation", "neighbors",
        "[all_IPs] Graph-based_localization_annotation", "[all_IPs] consensus_graph_annnotation",
    ]

# selecting and renaming both return new frames, so umap_table_1433 itself is untouched
umap_table_1433_csv = umap_table_1433[export_cols].rename(columns=without_ip_names)

# save to file
save_name = f"{timestamp}_remove_14-3-3_IP.csv"
export_csv_path = os.path.join(save_path, save_name)
umap_table_1433_csv.to_csv(export_csv_path, index=False)

In [206]:
# preview the 14-3-3 table as it was written to the CSV file
umap_table_1433_csv

Unnamed: 0,Majority protein IDs,Gene names,Gene_name_canonical,curated_ground_truth_v9.0,[without 14-3-3 IP] Graph-based_localization_annotation,[without 14-3-3 IP] consensus_graph_annnotation,neighbors,[all_IPs] Graph-based_localization_annotation,[all_IPs] consensus_graph_annnotation
103,A0A024QZX0;Q9P0N9;A0A024R011;Q5SZL6;Q5SZL4;Q5S...,TBC1D7,TBC1D7,cytosol,cytosol,cytosol,"[[('cytosol', 21), ('unclassified', 1)]]",14-3-3_scaffold,14-3-3_scaffold
164,A0A024R0T2;Q7L8J4;Q96MW4;B4DSF1,SH3BP5L,SH3BP5L,,cytosol,cytosol,"[[('cytosol', 22), ('unclassified', 1)]]",14-3-3_scaffold,14-3-3_scaffold
165,A0A024R0V3;Q6P597;K7EL76;K7ENJ3,KLC3,KLC3,,cytosol,cytosol,"[[('cytosol', 20)]]",14-3-3_scaffold,14-3-3_scaffold
183,Q9H0H5;A0A2X0U4T9;A0A024R136;B2RE34,RACGAP1,RACGAP1,,plasma_membrane,plasma_membrane,"[[('plasma_membrane', 7), ('unclassified', 8),...",14-3-3_scaffold,14-3-3_scaffold
194,A0A024R178;P10398;Q96II5;A0A0S2Z3F2;B4DV85,ARAF,ARAF,,cytosol,cytosol,"[[('cytosol', 21)]]",14-3-3_scaffold,14-3-3_scaffold
...,...,...,...,...,...,...,...,...,...
8252,Q9ULJ3,ZBTB21,ZBTB21,,nucleus,nucleus,"[[('unclassified', 13), ('nucleus', 5), ('nucl...",14-3-3_scaffold,14-3-3_scaffold
8288,Q9UNW9;A0A6Q8PFC2,NOVA2,NOVA2,,ER,ER,"[[('lysosome', 4), ('unclassified', 2), ('ER',...",14-3-3_scaffold,14-3-3_scaffold
8350,Q9Y2J4;B3KR56,AMOTL2,AMOTL2,,cytosol,cytosol,"[[('cytosol', 65), ('mitochondrion', 1), ('Gol...",14-3-3_scaffold,14-3-3_scaffold
8351,Q9Y2K2;M5FMX3;H0Y4E8;A1A5A9;H7C3X8;A0A024R3J3,SIK3;KIAA0999,SIK3,cytosol,cytosol,cytosol,"[[('cytosol', 45), ('mitochondrion', 2), ('unc...",14-3-3_scaffold,14-3-3_scaffold


In [207]:
if present_14_3_3_cluster:
    # The column name must match the one given to DBSCAN_cluster when the export
    # table was built ("... in UMAP 14-3-3 cluster"); the previous name raised KeyError.
    # display() is needed because an expression nested inside `if` is not auto-rendered.
    display(umap_table_1433_csv["[without 14-3-3 IP] in UMAP 14-3-3 cluster"].value_counts())

In [208]:
# distribution of the "[without 14-3-3 IP]" consensus annotation among the 14-3-3 scaffold proteins
umap_table_1433_csv["[without 14-3-3 IP] consensus_graph_annnotation"].value_counts()

[without 14-3-3 IP] consensus_graph_annnotation
cytosol               71
nucleus               29
plasma_membrane       15
translation            6
stress_granule         3
ER                     2
actin_cytoskeleton     2
mitochondrion          1
lysosome               1
Name: count, dtype: int64

### extract the neighborhood of 14-3-3 proteins

In [209]:
# # read the adata object
# adata = ad.read_h5ad(Path.cwd().parent/ "panel_C" / "output" / "adata_kNN_2024-07-24.h5ad")

In [210]:
# adata

# all_majority_ids = adata.obs["Majority protein IDs"].to_list()
# all_genes = adata.obs["Gene_name_canonical"].to_list()

# annot_df = pd.DataFrame(
#     list(zip(
#             adata.obs["Majority protein IDs"].to_list(),
#             adata.obs["Gene_name_canonical"].to_list(),
#             adata.obs["cluster_annotation"].to_list(),
#             adata.obs["consensus_graph_annnotation"].to_list()
#         )),
#     columns=["Majority protein IDs", "Gene_name_canonical", "cluster_annotation", "consensus_graph_annnotation"],
# )

In [211]:
# adata

In [212]:
# from utils.Jaccard_coefficient import *

# umap_table_1433_csv["neighborhood"] = None
# umap_table_1433_csv["[without 14-3-3 IP] consensus_graph_annnotation (unclassified fix)"] = umap_table_1433_csv["[without 14-3-3 IP] consensus_graph_annnotation"]

# for idx, row in umap_table_1433_csv.iterrows():
#     gene = row["Gene names"]
#     neighbor_list, neighbor_annot_list = gene_neighbor_annots(gene_name=gene, adata=adata, annot_df=annot_df, gene_name_col="Gene_name_canonical", annot_col="consensus_graph_annnotation")
#     # get the neighbor annotation   
#     umap_table_1433_csv.at[idx, "neighborhood"] = [list(Counter(neighbor_annot_list).items())]

#     if umap_table_1433_csv.at[idx, "[without 14-3-3 IP] consensus_graph_annnotation"] == "unclassified":
#         try:
#             most_common_annot = Counter(neighbor_annot_list).most_common(1)[0][0]
#             if most_common_annot != "unclassified":
#                 umap_table_1433_csv.at[idx, "[without 14-3-3 IP] consensus_graph_annnotation (unclassified fix)"] = most_common_annot
#             else:
#                 second_most_common = Counter(neighbor_annot_list).most_common(2)[1][0]
#                 umap_table_1433_csv.at[idx, "[without 14-3-3 IP] consensus_graph_annnotation (unclassified fix)"] = second_most_common
#         except:
#             pass




In [213]:
# umap_table_1433_csv
# # write to file
# umap_table_1433_csv.to_csv(os.path.join(save_path, save_name), index=False)

In [214]:
# umap_table_1433_csv

In [215]:
# umap_table_1433_csv["[without 14-3-3 IP] consensus_graph_annnotation (unclassified fix)"].value_counts()