# Fig. 5 panel A aligned umap

Please first run the dependency notebooks in the following directories:
- 1.infected_enrichment
- 2.control_enrichment

In [24]:
import math, os, sys
from pathlib import Path
import anndata as ad
import numpy as np
import pandas as pd
import umap
from tqdm.notebook import tqdm

# The shared "script" and "data" folders live four directory levels above the
# notebook's working directory.
repo_root = Path.cwd().parent.parent.parent.parent
script_path = repo_root / "script"
data_path = repo_root / "data"
# Make the project-local modules under "script" importable.
sys.path.append(str(script_path))

from pyseus.plotting import plotly_umap as pu
from utils import *

# Everything this notebook writes goes to ./output (created if missing).
output_folder = Path.cwd() / "output"
output_folder.mkdir(parents=True, exist_ok=True)

## Load enrichment tables (for both infected and uninfected)

In [25]:
# Restore `fig5_timestamp` from IPython's %store database; it is used below to
# build the file names of the enrichment tables.
# NOTE(review): presumably stored by one of the dependency notebooks — confirm
# it has been stored before running this cell.
%store -r fig5_timestamp
print(f"Timestamp: {fig5_timestamp}")

Timestamp: 2023-10-21-for-figures


In [26]:
# load enrichment tables

# require uninfected samples to match infected samples
uninf_match_inf = True

uninfected_enrichment_path = Path.cwd().parent / "2.control_enrichment" / "output" / "enrichment_and_volcano_tables" / f'{fig5_timestamp}_uninf_enrichment_table_NOC_prop.csv'
infected_enrichment_path = Path.cwd().parent / "1.infected_enrichment" / "output" / "enrichment_and_volcano_tables" / f'{fig5_timestamp}_inf_enrichment_table_NOC_prop.csv'


def _read_enrichment_table(csv_path, dependency_name):
    """Read one enrichment table (two header rows, first column as index).

    Parameters
    ----------
    csv_path : path-like
        Location of the enrichment CSV.
    dependency_name : str
        Name of the notebook set that produces the file; used in the hint
        printed when the file is missing.

    Returns
    -------
    pandas.DataFrame
        The table with a two-level column index.

    Raises
    ------
    FileNotFoundError, pandas.errors.ParserError
        Re-raised after printing a hint. Previously the error was swallowed,
        which let the notebook continue and fail later with an unrelated
        NameError on the missing table variable.
    """
    try:
        return pd.read_csv(csv_path, header=[0, 1], index_col=0, low_memory=False)
    except FileNotFoundError:
        print(f"File {csv_path} not found.\n please run the {dependency_name} notebooks first.")
        raise
    except pd.errors.ParserError:
        print(f"There was an error parsing the CSV file at {csv_path}.")
        raise


enrichments_uninfected = _read_enrichment_table(uninfected_enrichment_path, "uninfected_enrichment")
enrichments_infected = _read_enrichment_table(infected_enrichment_path, "infected_enrichment")

print("The dimenensions of loaded enrichment tables (uninfected, infected) are:")
print(enrichments_uninfected.shape)
print(enrichments_infected.shape)

The dimenensions of loaded enrichment tables (uninfected, infected) are:
(8537, 50)
(8376, 51)


In [27]:
# drop multi-index: keep only the second header row as column names.
# The nlevels guard makes this cell safe to re-run; calling droplevel on
# columns that are already single-level raises.
if enrichments_uninfected.columns.nlevels > 1:
    enrichments_uninfected = enrichments_uninfected.droplevel(0, axis=1)
if enrichments_infected.columns.nlevels > 1:
    enrichments_infected = enrichments_infected.droplevel(0, axis=1)

In [28]:
# rename NOC fractions: strip the "_UnInfected" suffix from the three NOC
# columns of the uninfected table (one rename call, no in-place mutation)
noc_column_renames = {
    "NOC_cytosol_UnInfected": "NOC_cytosol",
    "NOC_organelle_UnInfected": "NOC_organelle",
    "NOC_nuclear_UnInfected": "NOC_nuclear",
}
enrichments_uninfected = enrichments_uninfected.rename(columns=noc_column_renames)

In [29]:
# select the sample columns: names that start with a digit or with "NOC"
sample_prefixes = tuple("0123456789") + ("NOC",)

uninfected_cols = [
    name for name in enrichments_uninfected.columns if name.startswith(sample_prefixes)
]
infected_cols = [
    name for name in enrichments_infected.columns if name.startswith(sample_prefixes)
]

### Preprocessing

sample exclusion

In [30]:
meta_cols = ["Protein IDs", "Majority protein IDs", "Gene names"]


def _is_relevant_sample(column_name):
    """Return True unless the column name contains "WT" or "harsh"."""
    return not ("WT" in column_name or "harsh" in column_name)


# remove non-relevant samples from the mass spectrometry master file
uninfected_cols_filtered = list(filter(_is_relevant_sample, uninfected_cols))
infected_cols_filtered = list(filter(_is_relevant_sample, infected_cols))

# The uninfected table carries all annotation columns; the infected table only
# keeps "Protein IDs", which is the key used for the later merge.
umap_table_uninfected = enrichments_uninfected[meta_cols + uninfected_cols_filtered]
umap_table_infected = enrichments_infected[["Protein IDs"] + infected_cols_filtered]

In [31]:
# Non-informative IPs to remove from the mass spectrometry master file. Per the
# original note, these are IPs for soluble targets that peripherally bind
# membranes; they were not successful at pulling down membrane compartments and
# were therefore removed from subsequent analyses.
base_exclusions = ['09-PSMB7', '09-HSP90AA1', '10-AP2B1', "10-EXOC2"]
excl_name = "excl_" + "_".join(base_exclusions)
# Each excluded IP is listed twice: its uninfected name and its "_Infected" twin.
exclude_list = base_exclusions + [f"{ip_name}_Infected" for ip_name in base_exclusions]
print(f"excluded: {exclude_list}")

excluded: ['09-PSMB7', '09-HSP90AA1', '10-AP2B1', '10-EXOC2', '09-PSMB7_Infected', '09-HSP90AA1_Infected', '10-AP2B1_Infected', '10-EXOC2_Infected']


remove virus proteins

In [32]:
# Drop every row whose "Protein IDs" entry contains "OC43" (the virus proteins).
is_viral_uninfected = umap_table_uninfected["Protein IDs"].str.contains("OC43")
is_viral_infected = umap_table_infected["Protein IDs"].str.contains("OC43")

umap_table_uninfected = umap_table_uninfected[~is_viral_uninfected]
umap_table_infected = umap_table_infected[~is_viral_infected]

match uninfected and infected samples

In [33]:
# tally the infected samples
cols = umap_table_infected.columns
# NOC columns: the text before the first "_" is exactly "NOC"
cols_NOC = [name for name in cols if name.partition("_")[0] == "NOC"]
# IP sample columns: the text before the first "-" is numeric, minus the
# explicitly excluded samples
cols_sams = [
    name
    for name in cols
    if name.partition("-")[0].isdigit() and name not in exclude_list
]

# sorted IP samples first, NOC fractions last
cols_inf = sorted(cols_sams) + cols_NOC

In [34]:
# match between infected and uninfected samples
# generate an uninfected sample list that have the same samples as the infected
def in_list_ele(ele, lst):
    """Return True if `ele` occurs inside (``in``) at least one item of `lst`.

    For string items this is a substring test, not an equality test.
    """
    return any(ele in item for item in lst)


# Uninfected columns whose name appears inside one of the infected sample
# names in `cols_inf`; a column literally named "organelle" is skipped.
# NOTE(review): the match is a substring test, so a name that is a prefix of a
# different infected sample name would also match — confirm this cannot happen.
cols_uninf_match = sorted(
    column
    for column in umap_table_uninfected.columns
    if column != "organelle" and in_list_ele(column, cols_inf)
)

#### Merge uninfected and infected enrichment tables  
The merged table will have superfluous columns; we keep track of the samples used for the aligned UMAP with two lists of column names: ``cols_uninf_match`` and ``cols_inf``.

In [35]:
# Merge the uninfected and infected UMAP tables, keeping only proteins present
# in both (inner join on "Protein IDs"). The suffixes apply only to column
# names that occur in both tables.
merged = pd.merge(
    umap_table_uninfected,
    umap_table_infected,
    how="inner",
    on="Protein IDs",
    suffixes=["_uninf", "_inf"],
)

In [36]:
# remove the excluded samples from the merged table
# Only columns that are still present are dropped, and `merged` is rebound
# rather than mutated in place, so re-running this cell is a no-op instead of
# raising KeyError on the already-removed columns.
excluded_present = [col for col in exclude_list if col in merged.columns]
merged = merged.drop(columns=excluded_present)

In [37]:
# prepare objects for aligning umap

sample_prefixes = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "NOC")

# exclude the annotation columns (anything that is not a sample / NOC column)
excl = [i for i in merged.columns if not i.startswith(sample_prefixes)]

if uninf_match_inf:  # BOOL, if require uninfected samples to match infected samples
    uninf_wanted = cols_uninf_match
else:
    # `cols_uninf` was never defined, so this branch used to raise NameError.
    # NOTE(review): the fallback uses every filtered uninfected sample column
    # that is not explicitly excluded — confirm this is the intended set.
    cols_uninf = [i for i in uninfected_cols_filtered if i not in exclude_list]
    uninf_wanted = cols_uninf

uninf = [i for i in merged.columns if i not in excl and i in uninf_wanted]
inf = [i for i in merged.columns if i not in excl and i in cols_inf]

# manually add NOC columns that received a merge suffix (only happens when the
# same NOC column name exists in both tables)
uninf = uninf + [
    i for i in merged.columns if i.startswith("NOC") and i.endswith("_uninf")
]
inf = inf + [i for i in merged.columns if i.startswith("NOC") and i.endswith("_inf")]


def nicely_print_samples(samples):
    """Print sample names grouped by the text before their first "-".

    Consecutive names sharing that prefix are printed on one line separated by
    ", "; a name with a different prefix starts a new line. The output begins
    with a newline, so the first group appears on the second printed line.
    """
    pieces = []
    previous_prefix = None
    for position, name in enumerate(samples):
        prefix = name.split("-")[0]
        continues_group = position > 0 and prefix == previous_prefix
        pieces.append(f", {name}" if continues_group else f"\n{name}")
        previous_prefix = prefix
    print("".join(pieces))


print("Uninfected samples used for aligned UMAP:")
nicely_print_samples(sorted(uninf))
print("\nInfected samples used for aligned UMAP:")
nicely_print_samples(sorted(inf))

# Missing enrichment values are replaced by 0 before scaling.
uninfected_matrix = merged[uninf].fillna(0).to_numpy()
infected_matrix = merged[inf].fillna(0).to_numpy()

# scale the data (each condition is scaled on its own)
embedding_uninfected = pu.scale_table(matrix=uninfected_matrix, method="standard")
embedding_infected = pu.scale_table(matrix=infected_matrix, method="standard")

embeddings = [embedding_uninfected, embedding_infected]

# Row k of the uninfected matrix is the same protein as row k of the infected
# matrix, so the single relation between the two slices is the identity mapping.
row_ids = range(len(embedding_uninfected))
constant_dict = dict(zip(row_ids, row_ids))
constant_relations = [constant_dict]

Uninfected samples used for aligned UMAP:

09-ATG101, 09-EDC4, 09-PEX3, 09-TOMM20
10-RTN4, 10-TOMM20, 10-VPS35
11-CEP350, 11-EEA1, 11-GPR107, 11-SEC31A
12-LAMP1, 12-YWHAQ
14-COPE, 14-GOLGA2, 14-RAB11A, 14-RAB14, 14-RAB1A, 14-RAB7A
17-ATP1B3, 17-CAPRIN1, 17-G3BP1, 17-MAP1LC3B, 17-RPL36, 17-SLC30A2
NOC_cytosol
NOC_nuclear
NOC_organelle

Infected samples used for aligned UMAP:

09-ATG101_Infected, 09-EDC4_Infected, 09-PEX3_Infected, 09-TOMM20_Infected
10-RTN4_Infected, 10-TOMM20_Infected, 10-VPS35_Infected
11-CEP350_Infected, 11-EEA1_Infected, 11-GPR107_Infected, 11-SEC31A_Infected
12-LAMP1_Infected, 12-YWHAQ_Infected
14-COPE_Infected, 14-GOLGA2_Infected, 14-RAB11A_Infected, 14-RAB14_Infected, 14-RAB1A_Infected, 14-RAB7A_Infected
17-ATP1B3_Infected, 17-CAPRIN1_Infected, 17-G3BP1_Infected, 17-MAP1LC3B_Infected, 17-RPL36_Infected, 17-SLC30A2_Infected
NOC_cytosol_Infected
NOC_nuclear_Infected
NOC_organelle_Infected


### Save data for other analyses, e.g. Leiden clustering etc.

In [38]:
# generating AnnData for Leiden (uninfected samples)
# Cast to float32 up front instead of using AnnData's `dtype=` constructor
# argument, which newer anndata releases flag as deprecated.
selected = merged[uninf].fillna(0).astype(np.float32)
adata = ad.AnnData(selected)

adata.obs_names = merged["Protein IDs"].to_list()
adata.var_names = selected.columns.to_list()
# carry the protein annotations along as per-observation metadata
for annotation in ("Protein IDs", "Majority protein IDs", "Gene names"):
    adata.obs[annotation] = merged[annotation].to_list()

# save the adata object to file
adata.write(output_folder / "[for_leiden]_adata_aln_uninf.h5ad")


Transforming to str index.



In [39]:
# generating AnnData for Leiden (infected samples)
# Cast to float32 up front instead of using AnnData's `dtype=` constructor
# argument, which newer anndata releases flag as deprecated.
selected = merged[inf].fillna(0).astype(np.float32)
adata = ad.AnnData(selected)

adata.obs_names = merged["Protein IDs"].to_list()
adata.var_names = selected.columns.to_list()
# carry the protein annotations along as per-observation metadata
for annotation in ("Protein IDs", "Majority protein IDs", "Gene names"):
    adata.obs[annotation] = merged[annotation].to_list()

# save the adata object to file
adata.write(output_folder / "[for_leiden]_adata_aln_inf.h5ad")


Transforming to str index.



## Aligned UMAP

In [40]:
# define a helper function
def calculate_distance(neighbors_mapper):
    '''Calculate, for each point, the Euclidean distance between its positions
    in the two aligned embeddings.

    The distance uses every embedding component. The previous implementation
    read only the first two components, so for embeddings with more than two
    dimensions (e.g. n_components=10) most coordinates were ignored. For 2D
    embeddings the result is unchanged.

    Input: neighbors_mapper object exposing ``embeddings_``, where
        ``embeddings_[0]`` and ``embeddings_[1]`` are arrays of shape
        (n_points, n_components) with rows in the same point order.
    Output: list of floats, one distance per point.
    Raises: ValueError if the two embeddings do not have the same shape.
    '''
    # float64 so the result does not depend on the embedding's storage dtype
    coords_uninf = np.asarray(neighbors_mapper.embeddings_[0], dtype=float)
    coords_inf = np.asarray(neighbors_mapper.embeddings_[1], dtype=float)

    if coords_uninf.shape != coords_inf.shape:
        raise ValueError(
            f"embeddings have different shapes: {coords_uninf.shape} vs {coords_inf.shape}"
        )

    # row-wise Euclidean norm of the displacement between the two embeddings
    return np.linalg.norm(coords_uninf - coords_inf, axis=1).tolist()

### 2D aligned umap (for visualization)

In [41]:
# calculate aligned umap (2D, fixed seed, used for visualization)
aligned_umap_2d_params = dict(
    n_neighbors=20,
    metric="euclidean",
    min_dist=0.1,
    # alignment_window_size is left at its default
    # alignment_regularisation was optimized using a sweep. Larger values work
    # harder to keep points aligned across embeddings (at the cost of the
    # embedding quality at each slice); smaller values focus more on the
    # individual embeddings and less on aligning them with each other.
    alignment_regularisation=0.002,
    n_epochs=300,  # This value was optimized using a sweep.
    random_state=42,
    verbose=False,
    n_components=2,
)
neighbors_mapper = umap.AlignedUMAP(**aligned_umap_2d_params).fit(
    embeddings, relations=constant_relations
)

### 10D aligned umap (for remodeling score)

Run 200 times with different random seeds; this takes about 5 hours.  
To reduce the run time, set a smaller number of runs, e.g. n_seeds = 10.

In [42]:
# Original note: takes about 4 hours on a laptop, and multi-processing cannot
# be used to parallelize the runs because each umap.AlignedUMAP instance is
# already parallelized when using random seeds.
n_seeds = 200
if n_seeds < 100:
    print("WARNING:\nn_seeds is less than 100, which may not be enough to calculate the coefficient of variation accurately.")

aligned_umap_kwargs = dict(
    n_neighbors=20,
    metric="euclidean",
    min_dist=0.1,
    # Larger alignment_regularisation keeps points aligned across embeddings
    # (at the cost of per-slice embedding quality); smaller values put less
    # emphasis on aligning the embeddings with each other.
    alignment_regularisation=0.002,
    n_epochs=300,
    random_state=None,  # calculate aligned UMAPs with different random seeds
    verbose=False,
    n_components=10,
)

# one independently seeded 10D aligned UMAP per run
list_of_alignments = []
for _ in tqdm(range(n_seeds), desc="Calculating aligned UMAPs", total=n_seeds):
    list_of_alignments.append(
        umap.AlignedUMAP(**aligned_umap_kwargs).fit(embeddings, relations=constant_relations)
    )

Calculating aligned UMAPs:   0%|          | 0/200 [00:00<?, ?it/s]

In [43]:
# calculate the distance for each alignment (one list of per-point distances per run)
distances = [calculate_distance(alignment) for alignment in list_of_alignments]

# Per-point summary statistics across the runs (axis 0 indexes the runs).
distance_matrix = np.asarray(distances)
dist_mean_10D = distance_matrix.mean(axis=0)
dist_variance_10D = distance_matrix.var(axis=0)
dist_std_10D = distance_matrix.std(axis=0)
# coefficient of variation = standard deviation / mean
dist_cv_10D = dist_std_10D / dist_mean_10D

### 2D aligned umap (for remodeling score)

Run 200 times with different random seeds; this takes about 5 hours.  
To reduce the run time, set a smaller number of runs, e.g. n_seeds = 10.

In [44]:
# Original note: takes about 4 hours on a laptop, and multi-processing cannot
# be used to parallelize the runs because each umap.AlignedUMAP instance is
# already parallelized when using random seeds.
n_seeds = 200
if n_seeds < 100:
    print("WARNING:\nn_seeds is less than 100, which may not be enough to calculate the coefficient of variation accurately.")

aligned_umap_kwargs = dict(
    n_neighbors=20,
    metric="euclidean",
    min_dist=0.1,
    # Larger alignment_regularisation keeps points aligned across embeddings
    # (at the cost of per-slice embedding quality); smaller values put less
    # emphasis on aligning the embeddings with each other.
    alignment_regularisation=0.002,
    n_epochs=300,
    random_state=None,  # calculate aligned UMAPs with different random seeds
    verbose=False,
    n_components=2,
)

# one independently seeded 2D aligned UMAP per run
list_of_alignments = []
for _ in tqdm(range(n_seeds), desc="Calculating aligned UMAPs", total=n_seeds):
    list_of_alignments.append(
        umap.AlignedUMAP(**aligned_umap_kwargs).fit(embeddings, relations=constant_relations)
    )

Calculating aligned UMAPs:   0%|          | 0/200 [00:00<?, ?it/s]

In [45]:
# calculate the distance for each alignment (one list of per-point distances per run)
distances = [calculate_distance(alignment) for alignment in list_of_alignments]

# Per-point summary statistics across the runs (axis 0 indexes the runs).
distance_matrix = np.asarray(distances)
dist_mean_2D = distance_matrix.mean(axis=0)
dist_variance_2D = distance_matrix.var(axis=0)
dist_std_2D = distance_matrix.std(axis=0)
# coefficient of variation = standard deviation / mean
dist_cv_2D = dist_std_2D / dist_mean_2D

## Save the 2D embeddings and 2D, 10D distances (along with other columns) to csv

In [46]:
# save aligned umap embeddings to csv
merged["JointUMAP_uninf_1"] = neighbors_mapper.embeddings_[0].T[0]
merged["JointUMAP_uninf_2"] = neighbors_mapper.embeddings_[0].T[1]
merged["JointUMAP_inf_1"] = neighbors_mapper.embeddings_[1].T[0]
merged["JointUMAP_inf_2"] = neighbors_mapper.embeddings_[1].T[1]

# save 2D distances to csv
merged[f"2d_mean_distance_traveled ({n_seeds} bootstraps)"] = dist_mean_2D
merged[f"2d_coefficient_variance_distance_traveled ({n_seeds} bootstraps)"] = dist_cv_2D

# save 10D distances to csv
merged[f"10d_mean_distance_traveled ({n_seeds} bootstraps)"] = dist_mean_10D
merged[f"10d_coefficient_variance_distance_traveled ({n_seeds} bootstraps)"] = dist_cv_10D

save_path = output_folder / f"{fig5_timestamp}_AlignedUMAP_embeddings_and_distances.csv"
merged.to_csv(save_path, index=False)

# save the number of seeds used to calculate the distances
%store n_seeds

Stored 'n_seeds' (int)
