## Fig. 2 panel D umap
This notebook calculates 2D and 3D UMAP embeddings and generates the corresponding plots.

In [19]:
import pandas as pd
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import plotly.io as pio
import umap
import anndata as ad
import umap.plot
import random
from pathlib import Path
from datetime import datetime
import anndata as ad

# store fonts as Type 42 (TrueType) in exported PDF files
plt.rcParams.update({'pdf.fonttype': 42})

# project directories, resolved three levels above the current working directory
_project_root = Path.cwd().parent.parent.parent
script_path = _project_root.joinpath("script")
data_path = _project_root.joinpath("data")

# make the project's script directory importable, then load the plotting helpers
sys.path.append(str(script_path))
from pyseus.plotting import plotly_umap as pu

# output directory under the current working directory; created on first use
save_path = Path.cwd().joinpath("output")
if not save_path.exists():
    save_path.mkdir(parents=True)

### load data

In [20]:
# timestamp = datetime.now().strftime('%Y-%m-%d')
# print(f"Timestamp: {timestamp}")    

In [21]:
# Manually pin the timestamp so that the intermediate results from another date are used.
# The value is the filename prefix of the input CSV files loaded below.
timestamp = "2023-12-04"

In [22]:
# define files to load (intermediate results written by the panel_C notebook)
input_dir = Path.cwd().parent / "panel_C" / "output"
umap_table_path = input_dir / f"{timestamp}_umap_table.csv"
quants_path = input_dir / f"{timestamp}_quants.csv"


def _load_csv(csv_path):
    """Read one intermediate CSV file, using its first column as the index.

    A short hint is printed for the common failure modes and the exception is
    then re-raised, so that the notebook stops at this cell instead of running
    on with missing (or stale) variables.

    Parameters
    ----------
    csv_path : str or pathlib.Path
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` does not exist.
    pandas.errors.ParserError
        If the file cannot be parsed as CSV.
    """
    try:
        return pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        print(f"File {csv_path} not found.\nPlease run Fig2_C_consensus_annotation.ipynb first or specify the correct timestamp, current value is {timestamp}")
        raise
    except pd.errors.ParserError:
        print(f"There was an error parsing the CSV file at {csv_path}.")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


# load data
umap_table = _load_csv(umap_table_path)
quants = _load_csv(quants_path)

In [23]:
# display the loaded table for a quick visual check
umap_table

Unnamed: 0,Protein IDs,Majority protein IDs,Gene names,Gene_name_canonical,organelle_ground_truth_v6.0,cluster_annotation,Graph-based_localization_annotation,Protein-level_consensus_annotation,12-SEC61B,13-RAB7A,...,11-EEA1,11-GPR107,11-CEP350,09-ATG101,09-PEX3,11-SEC31A,10-RTN4,NOC_cytosol,NOC_organelle,NOC_nuclear
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,MAGOHB,,nucleus,nucleus,nucleus,-0.139405,0.634090,...,-0.681700,-0.615860,0.160560,3.270767,-0.086078,-1.159713,-0.158394,0.235748,0.241247,0.523005
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,RBM8A,,nucleus,nucleus,nucleus,-0.631700,-1.357550,...,-1.618800,-2.043600,0.895300,-0.728450,-0.496950,0.979850,-1.899050,0.362391,0.286889,0.350721
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,TM9SF4,Golgi,Golgi,Golgi,Golgi,4.080419,4.503702,...,-1.672800,5.885700,0.397300,2.393000,-0.092300,1.688300,1.852200,0.054846,0.654148,0.291006
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,PTEN,,cytosol,cytosol,cytosol,-0.874511,-0.510095,...,-1.261271,-0.394072,0.064974,0.978763,0.488340,-0.652074,0.656888,1.000000,0.000000,0.000000
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,TM9SF2,Golgi,trans-Golgi,trans-Golgi,trans-Golgi,7.104435,8.938121,...,2.055000,5.827450,-0.438800,-0.412350,0.016550,1.179800,1.524950,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8536,X5D7P8,X5D7P8,STK39,STK39,,cytosol,cytosol,cytosol,0.967365,1.069567,...,0.023261,-0.641255,0.428594,-1.117751,-0.880850,0.993200,-0.693133,0.765637,0.145727,0.088636
8537,X5D8X9,X5D8X9,CNTNAP2,CNTNAP2,,plasma_membrane,plasma_membrane,plasma_membrane,-0.021208,0.625338,...,1.104272,-0.473069,1.947072,0.136215,1.171869,1.749173,-1.746368,0.000000,0.802964,0.197036
8538,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,CRMP1,,cytosol,cytosol,cytosol,0.491496,-0.820043,...,0.722372,-0.688941,-0.044831,-0.041730,0.374553,0.029335,-0.180538,1.000000,0.000000,0.000000
8539,X5DQZ7,X5DQZ7,GPX1,GPX1,,mitochondrion,mitochondrion,mitochondrion,0.184980,1.318104,...,-1.142611,0.450772,0.828155,-0.488848,0.120523,-0.320115,-1.075311,0.720741,0.279259,0.000000


### preprocessing

In [26]:
# scale the quantification table; the result is the input matrix for UMAP
# NOTE(review): method='standard' presumably means z-score standardization — confirm in pyseus' scale_table
scaled = pu.scale_table(matrix=quants, method='standard')

### UMAP

In [27]:
# UMAP parameters (passed to umap.UMAP for both the 2D and the 3D embedding)
n_neighbors = 20
min_dist = 0.1
metric = 'euclidean'

# flip the 2D UMAP coordinates (sometimes the UMAP algorithm flips the coordinates);
# when True, each 2D coordinate is replaced by (axis maximum - coordinate); the 3D embedding is not flipped
flip = True

# set the UMAP seed (used as random_state and as part of the output file names)
UMAP_seed = 1234

In [28]:
# settings shared by the 2D and the 3D embedding
umap_kwargs = dict(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    random_state=UMAP_seed,
)

# 2D embedding: fit on the scaled matrix and store one column per axis
fit = umap.UMAP(**umap_kwargs)
u = fit.fit_transform(scaled)
for axis, column in enumerate(("umap_1", "umap_2")):
    umap_table[column] = u[:, axis]

# optionally mirror both 2D axes (coordinate -> axis maximum - coordinate)
if flip:
    max_x = umap_table["umap_1"].max()
    max_y = umap_table["umap_2"].max()
    umap_table["umap_1"] = max_x - umap_table["umap_1"]
    umap_table["umap_2"] = max_y - umap_table["umap_2"]

# 3D embedding: same settings, three output components
fit3D = umap.UMAP(n_components=3, **umap_kwargs)
u3D = fit3D.fit_transform(scaled)
# add the 3D UMAP coordinates to the table
for axis, column in enumerate(("3D_umap_1", "3D_umap_2", "3D_umap_3")):
    umap_table[column] = u3D[:, axis]

### save UMAP embeddings

In [29]:
# write the table, now including the UMAP coordinate columns, to a csv file (index not written)
save_name = f"UMAP_embeddings_seed={UMAP_seed}.csv"
embeddings_file = Path(save_path) / save_name
umap_table.to_csv(embeddings_file, index=False)

### generate UMAP plots

In [33]:
# generate 2D UMAP plot
label_to_color = "Graph-based_localization_annotation" # **choose which annotation column to highlight here** , other choices: cluster_annotation, Protein-level_consensus_annotation

# points are labelled by canonical gene name and colored by the chosen annotation column
fig = pu.interaction_umap(umap_table,
    node_name='Gene_name_canonical', cluster=label_to_color, opacity = 0.35,
    unlabelled_color='#D0D3D4', unlabelled_opacity=0.1, pointsize = 6,
    x='umap_1', y='umap_2',
    categorical=True)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file in the output directory defined in the setup cell;
# save_path is intentionally not reassigned here, so the shared setting keeps its value
save_name = f"UMAP_2Dview_seed={UMAP_seed}.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)

In [34]:
# generate 3D UMAP plot
label_to_color = "Graph-based_localization_annotation" # **choose which annotation column to highlight here**

# points are labelled by canonical gene name and colored by the chosen annotation column
fig = pu.interaction_3D_umap(umap_table,
    node_name='Gene_name_canonical', cluster=label_to_color,
    unlabelled_color='#D0D3D4', unlabelled_opacity=0.1,
    x='3D_umap_1', y='3D_umap_2', z='3D_umap_3',
    categorical=True)
fig.update_layout(width=1200, height=800)

fig.show()

# save the figure as an html file in the output directory defined in the setup cell;
# save_path is intentionally not reassigned here, so the shared setting keeps its value
save_name = f"UMAP_3Dview_seed={UMAP_seed}.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)