## Fig. 2 panel D umap
This notebook calculates 2D and 3D UMAP embeddings and generates the corresponding plots.

In [8]:
import pandas as pd
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import plotly.io as pio
import umap
import anndata as ad
import umap.plot
import random
from pathlib import Path
from datetime import datetime
import anndata as ad

# Use font type 42 for PDF export (matplotlib's TrueType setting, keeps text editable in vector editors).
plt.rcParams.update({'pdf.fonttype': 42})

# Project-level folders live three directory levels above the current working directory.
project_root = Path.cwd().parent.parent.parent
script_path = project_root / "script"
data_path = project_root / "data"

# Make the project's script folder importable (needed for the pyseus / utils imports that follow).
sys.path.append(str(script_path))
from pyseus.plotting import plotly_umap as pu
from utils.label_processing import attach_annotations

# Folder (inside the current working directory) that receives every file written by this notebook.
save_path = Path.cwd() / "output"
# exist_ok=True removes the check-then-create race and keeps the cell safe to re-run;
# parents=True mirrors os.makedirs, which also creates missing parent folders.
save_path.mkdir(parents=True, exist_ok=True)

### Load the enrichment table

In [9]:
# Today's date formatted as YYYY-MM-DD; it is used as the prefix of the enrichment table file name.
timestamp = f"{datetime.now():%Y-%m-%d}"
print("Timestamp: " + timestamp)

Timestamp: 2023-12-05


In [11]:
# Manually set the timestamp to use the intermediate results from another date.
# NOTE: this overrides the value computed from today's date in the previous cell;
# the enrichment table loaded below is the one whose file name starts with this date.
timestamp = "2023-12-04"

In [12]:
# define files to load
enrichment_dir = Path.cwd().parent.parent / "enrichment"
enrichment_csv_path = enrichment_dir / "output" / "enrichment_and_volcano_tables" / f'{timestamp}_enrichment_table_NOC_prop.csv'

# Load the enrichment table: the first two rows form a two-level column header
# (e.g. "metadata" / "sample" on the top level) and the first column is the row index.
#
# Every handler prints a human-readable hint and then re-raises. Swallowing the error
# would leave `enrichments` undefined and make the following cells fail with an
# unrelated NameError instead of pointing at the real cause.
try:
    enrichments = pd.read_csv(enrichment_csv_path, header=[0,1], index_col=0)
except FileNotFoundError:
    print(f"File {enrichment_csv_path} not found.\nPlease run the enrichment analysis first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {enrichment_csv_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [13]:
# check the enrichment metadata columns
# enrichments["metadata"]

In [14]:
# check the sample columns
# enrichments["sample"]

In [15]:
def _attach_metadata_column(csv_path, anno_col, new_col, from_on, to_on):
    """Read a lookup CSV and store one of its columns as enrichments[("metadata", new_col)].

    Parameters
    ----------
    csv_path : path-like
        CSV file holding the annotation to attach.
    anno_col : str
        Column of the lookup table to pull values from.
    new_col : str
        Name of the column created (or overwritten) under the "metadata" header of `enrichments`.
    from_on : str
        Key column in the lookup table.
    to_on : str
        Key column in the enrichment metadata.

    Notes
    -----
    Modifies the notebook-level `enrichments` table in place.
    `attach_annotations` is a project helper; presumably it returns one value per
    row of `to_df`, matched on the key columns -- confirm in utils.label_processing.
    """
    lookup_table = pd.read_csv(csv_path)
    # Take a fresh copy of the metadata on every call so that columns attached by an
    # earlier call (e.g. Gene_name_canonical) are available as key columns here.
    to_df = enrichments["metadata"].copy()
    enrichments[("metadata", new_col)] = attach_annotations(
        from_df=lookup_table, to_df=to_df, anno_col=anno_col, from_on=from_on, to_on=to_on
    )


# attach canonical gene names
gene_name_csv = data_path / "external" / "canonical_names_and_Itzhak_data.csv"
_attach_metadata_column(
    gene_name_csv,
    anno_col="Gene_name_canonical",
    new_col="Gene_name_canonical",
    from_on="Majority protein IDs",
    to_on="Majority protein IDs",
)

# attach ground truth (keyed on the canonical gene name attached just above)
ground_truth_csv = data_path / "external" / "organelle_curated_ground_truth_v6.0.csv"
_attach_metadata_column(
    ground_truth_csv,
    anno_col="organelle",
    new_col="organelle_ground_truth_v6.0",
    from_on="gene_name_canonical",
    to_on="Gene_name_canonical",
)

# attach labels
labels_csv = data_path / "labels" / "cluster_annotation_Dec5.csv"
_attach_metadata_column(
    labels_csv,
    anno_col="cluster_annotation",
    new_col="cluster_annotation",
    from_on="Majority protein IDs",
    to_on="Majority protein IDs",
)


### Sample selection

In [16]:
# Sample selection for the Leiden and the UMAP algorithms, NOTE: manual sample removal is NOT in this cell

# Column names found under the two top-level headers of the enrichment table.
cols = list(enrichments['sample'])
meta_cols = list(enrichments['metadata'])

# The table holds superfluous samples as well as WTs; they do not help much in
# separating organelles, so they are left out. Infected samples are excluded too,
# because they should not contribute to the reference UMAP.
excluded_keywords = ('WT', 'harsh', 'unsorted', 'Infected')
samples = [
    name for name in cols
    if not any(keyword in name for keyword in excluded_keywords)
]

# Next, remove additional samples by bait name: the bait is the text after the first
# '-' in the sample name, or the whole name when there is no '-'.
genes = [name.split('-')[1] if '-' in name else name for name in samples]
sample_table = pd.DataFrame({'samples': samples, 'bait': genes})

bait_drop_list = ['EXOC2'] # here we are just removing EXOC2
selected_samples = [
    sample for sample, bait in zip(samples, genes)
    if bait not in bait_drop_list
]

In [17]:
# Report how many samples were selected and list them in sorted order.
print(
    f"the number of selected samples is {len(selected_samples)}",
    f"the selected samples are {sorted(selected_samples)}",
    sep="\n",
)

the number of selected samples is 69
the selected samples are ['01-CAPRIN1', '02-ATG101', '02-COPE', '02-DCP1A', '02-GOLGA2', '02-RICTOR', '03-HSP90AA', '03-HSPA1B', '03-SEC23A', '05-CAV1', '05-EDC4', '05-NCLN', '06-ATP6V1B2', '06-CCDC47', '06-CSNK2A1', '06-CSNK2A2', '06-YWHAB', '07-AP4B1', '07-CLTA', '07-COG8', '07-RAPTOR', '09-ATG101', '09-EDC4', '09-HSP90AA1', '09-PEX3', '09-PSMB7', '09-TOMM20', '10-AP2B1', '10-RTN4', '10-TOMM20', '10-VPS35', '11-CEP350', '11-EEA1', '11-GPR107', '11-SEC31A', '12-ACTB', '12-G3BP1', '12-LAMP1', '12-PNPLA2', '12-RTN4', '12-SEC61B', '12-TOMM20', '12-YWHAQ', '13-GOLGA2', '13-RAB11A', '13-RAB14', '13-RAB1A', '13-RAB7A', '14-COPE', '14-GOLGA2', '14-RAB11A', '14-RAB14', '14-RAB1A', '14-RAB7A', '15-G3BP1', '15-GOLGA2', '15-LAMP1', '15-MAP1LC3B', '15-SEC61B', '15-TOMM20', '17-ATP1B3', '17-CAPRIN1', '17-G3BP1', '17-MAP1LC3B', '17-RPL36', '17-SLC30A2', 'NOC_cytosol', 'NOC_nuclear', 'NOC_organelle']


In [18]:
# Samples removed by hand, listed by their full sample name.
# for example: to_drop = ["09-HSP90AA1", "09-PSMB7"]
to_drop = [
    "02-EXOC2",
    "06-ATP6V1B2",
    "06-CSNK2A1",
    "06-CSNK2A2",
    "07-AP4B1",
    '02-RICTOR',
    "07-RAPTOR",
    "10-AP2B1",
    "12-PNPLA2",
]
# update the variable: selected_samples
selected_samples = list(filter(lambda name: name not in to_drop, selected_samples))

In [19]:
# Report the selection again, now that the manual sample removal has been applied.
print(
    f"the number of selected samples is {len(selected_samples)}",
    f"the selected samples are {sorted(selected_samples)}",
    sep="\n",
)

the number of selected samples is 61
the selected samples are ['01-CAPRIN1', '02-ATG101', '02-COPE', '02-DCP1A', '02-GOLGA2', '03-HSP90AA', '03-HSPA1B', '03-SEC23A', '05-CAV1', '05-EDC4', '05-NCLN', '06-CCDC47', '06-YWHAB', '07-CLTA', '07-COG8', '09-ATG101', '09-EDC4', '09-HSP90AA1', '09-PEX3', '09-PSMB7', '09-TOMM20', '10-RTN4', '10-TOMM20', '10-VPS35', '11-CEP350', '11-EEA1', '11-GPR107', '11-SEC31A', '12-ACTB', '12-G3BP1', '12-LAMP1', '12-RTN4', '12-SEC61B', '12-TOMM20', '12-YWHAQ', '13-GOLGA2', '13-RAB11A', '13-RAB14', '13-RAB1A', '13-RAB7A', '14-COPE', '14-GOLGA2', '14-RAB11A', '14-RAB14', '14-RAB1A', '14-RAB7A', '15-G3BP1', '15-GOLGA2', '15-LAMP1', '15-MAP1LC3B', '15-SEC61B', '15-TOMM20', '17-ATP1B3', '17-CAPRIN1', '17-G3BP1', '17-MAP1LC3B', '17-RPL36', '17-SLC30A2', 'NOC_cytosol', 'NOC_nuclear', 'NOC_organelle']


### data preprocessing

In [20]:
# Finalize the table for leiden and umap: drop the top header level, keep the
# metadata columns plus the selected samples, and discard rows that have a NaN in
# any selected sample (normalization and UMAP are not compatible with NaN values).
table_columns = meta_cols + selected_samples
umap_table = (
    enrichments.droplevel(0, axis=1)[table_columns]
    .copy()
    .dropna(subset=selected_samples)
)

# Numeric part of the table (selected samples only); this is what gets scaled and embedded.
quants = umap_table.loc[:, selected_samples].copy()
print(f"the dimensions of the data table used for UMAP are {quants.shape}")

the dimensions of the data table used for UMAP are (8541, 61)


In [21]:
# scale the table for UMAP
# `scaled` is the matrix handed to both the 2D and the 3D UMAP fits further down.
# NOTE(review): method='standard' presumably standardizes each sample column
# (zero mean, unit variance) -- confirm in pyseus.plotting.plotly_umap.scale_table.
scaled = pu.scale_table(matrix=quants, method='standard')

### save a copy of the data in anndata format 
(for generating k-NN graph)

In [22]:
# generating AnnData
# NOTE(review): unlike umap_table, rows containing NaN are NOT dropped here, so this
# object can hold more rows than the table used for UMAP -- confirm this is intended
# for the k-NN graph.
metadata = enrichments['metadata']
selected = enrichments['sample'][selected_samples].copy()
adata = ad.AnnData(selected)

# Variables are the selected samples; observations are named by their protein IDs.
adata.var_names = selected.columns.to_list()
adata.obs_names = metadata["Protein IDs"].to_list()

# Copy the annotation columns into the per-observation table.
obs_columns = (
    "Protein IDs",
    "Majority protein IDs",
    "Gene_name_canonical",
    "organelle_ground_truth_v6.0",
    "cluster_annotation",
)
for obs_col in obs_columns:
    adata.obs[obs_col] = metadata[obs_col].to_list()

# Write the object into the notebook's output folder, tagged with the timestamp in use.
adata.write_h5ad(save_path / f"adata_{timestamp}.h5ad")


Transforming to str index.




The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.

... storing 'Gene_name_canonical' as categorical

The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.

... storing 'organelle_ground_truth_v6.0' as categorical

The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Reordering categories will always return a new Categorical object.

... storing 'cluster_annotation' as categorical


### UMAP

In [23]:
# UMAP parameters (passed unchanged to umap.UMAP for both the 2D and the 3D embedding)
n_neighbors = 20
min_dist = 0.1
metric = 'euclidean'

# Flip the UMAP coordinates (sometimes the UMAP algorithm flips the coordinates).
# When True, both axes of the 2D embedding are mirrored (max - value); the 3D embedding is left as is.
flip = True

# Set the UMAP seed (used as random_state for both fits, and included in the output file names)
UMAP_seed = 1234

In [24]:
# Settings shared by the 2D and the 3D embedding.
umap_settings = dict(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    random_state=UMAP_seed,
)

# calculate 2D UMAP embeddings and store them next to the metadata
fit = umap.UMAP(**umap_settings)
u = fit.fit_transform(scaled)
for axis_idx, col_name in enumerate(("umap_1", "umap_2")):
    umap_table[col_name] = u[:, axis_idx]

# flip the UMAP coordinates: mirror each 2D axis by subtracting it from its own maximum
if flip:
    max_x = max(umap_table["umap_1"])
    max_y = max(umap_table["umap_2"])
    for col_name, axis_max in (("umap_1", max_x), ("umap_2", max_y)):
        umap_table[col_name] = axis_max - umap_table[col_name]


# calculate 3D UMAP embeddings (same settings, three output components; not flipped)
fit3D = umap.UMAP(n_components=3, **umap_settings)
u3D = fit3D.fit_transform(scaled)
# add the UMAP coordinates to the table
for axis_idx, col_name in enumerate(("3D_umap_1", "3D_umap_2", "3D_umap_3")):
    umap_table[col_name] = u3D[:, axis_idx]

### Save UMAP embeddings

In [25]:
# Write the table (metadata, sample values and UMAP coordinates) to a CSV file in
# the output folder; the row index is not written.
save_name = f"UMAP_embeddings_seed={UMAP_seed}.csv"
embeddings_csv = os.path.join(save_path, save_name)
umap_table.to_csv(embeddings_csv, index=False)

### Generate UMAP plots

In [26]:
# reload modules
# Re-executes the already-imported plotting module `pu` so that edits made to its
# source file take effect without restarting the kernel.
import importlib
importlib.reload(pu)

<module 'pyseus.plotting.plotly_umap' from 'c:\\Users\\duo.peng\\Documents\\Organelle_IP_figures\\script\\pyseus\\plotting\\plotly_umap.py'>

In [28]:
# generate 2D UMAP plot (highlighted by the chosen annotation column)
label_to_color = "cluster_annotation" # **choose which annotation column to highlight here**

fig = pu.interaction_umap(umap_table,
    node_name='Gene_name_canonical', cluster=label_to_color, opacity = 0.35,
    unlabelled_color='#D0D3D4', unlabelled_opacity=0.1, pointsize = 6,
    x='umap_1', y='umap_2',
    categorical=True)
fig.update_layout(width=1200, height=800)

fig.show()

# Save the figure as an html file in the notebook's output folder.
# `save_path` is deliberately NOT reassigned here: rebinding it to the plain string
# "output" would break earlier cells that use it as a Path (save_path / ...) on re-run.
# The file name now carries the ".html" extension (the dot was previously missing).
save_name = f"UMAP_2Dview_seed={UMAP_seed}.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)

In [29]:
# generate 3D UMAP plot (highlighted by the chosen annotation column)
label_to_color = "cluster_annotation" # **choose which annotation column to highlight here**

fig = pu.interaction_3D_umap(umap_table,
    node_name='Gene_name_canonical', cluster=label_to_color,
    unlabelled_color='#D0D3D4', unlabelled_opacity=0.1,
    x='3D_umap_1', y='3D_umap_2', z='3D_umap_3',
    categorical=True)
fig.update_layout(width=1200, height=800)

fig.show()

# Save the figure as an html file in the notebook's output folder.
# `save_path` is deliberately NOT reassigned here: rebinding it to the plain string
# "output" would break earlier cells that use it as a Path (save_path / ...) on re-run.
save_name = f"UMAP_3Dview_seed={UMAP_seed}.html"
pio.write_html(fig, file=os.path.join(save_path, save_name), auto_open=False)