## Compute UMAP coordinates for Organelle IP (bootstrapping)
This notebook calculates 2D and 3D UMAP embeddings and generates plots.

In [14]:
import os, sys, random, shutil
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import umap

# Plot appearance: ggplot theme, with fonts in PDF output written as Type 42
plt.style.use('ggplot')
plt.rcParams.update({'pdf.fonttype': 42})

# The shared "script" and "data" folders sit seven directory levels above
# the current working directory.
base_dir = Path.cwd()
for _level in range(7):
    base_dir = base_dir.parent
script_path = base_dir / "script"
data_path = base_dir / "data"

# Make the script folder importable before importing the plotting helpers
# (presumably this is where the pyseus package lives -- confirm)
sys.path.append(os.fspath(script_path))
from pyseus.plotting import plotly_umap as pu

# Folder that receives this notebook's output files.
# exist_ok=True makes the call safe to repeat on every run and avoids the
# check-then-create race of testing for existence first (same pattern as the
# "bootstraps" sub-folder created further down).
save_path = Path.cwd() / "output"
os.makedirs(save_path, exist_ok=True)

### load data

In [15]:
# Tag embedded in the names of the input files loaded below and of the
# embedding files written at the end of the notebook.
# To read the value from IPython's store instead, run: %store -r timestamp
timestamp = "2023-10-21-imp5-for-figures"
timestamp_message = f"Timestamp: {timestamp}"
print(timestamp_message)

Timestamp: 2023-10-21-imp5-for-figures


In [16]:
# define files to load (per the hint printed on failure, these come from
# Fig2_C_consensus_annotation.ipynb)
input_dir = Path.cwd().parent.parent.parent.parent.parent.parent / "Fig2" / "panel_C" / "output"
umap_table_path = input_dir / f"{timestamp}_umap_table.csv"
quants_path = input_dir / f"{timestamp}_quants.csv"


def _load_table(csv_path):
    """Read ``csv_path`` into a DataFrame, using its first column as the index.

    On failure a human-readable hint is printed and the original exception is
    re-raised, so the notebook stops at this cell instead of continuing with
    an undefined (or stale, from a previous kernel session) table.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
        pandas.errors.ParserError: if the file cannot be parsed as CSV.
        Exception: any other error raised while reading the file.
    """
    try:
        return pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        print(f"File {csv_path} not found.\nPlease run Fig2_C_consensus_annotation.ipynb first or specify the correct timestamp, current value is {timestamp}")
        raise
    except pd.errors.ParserError:
        print(f"There was an error parsing the CSV file at {csv_path}.")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


# load data
umap_table = _load_table(umap_table_path)
quants = _load_table(quants_path)

### preprocessing

In [17]:
# Rescale the quantification table with the 'standard' method of
# pu.scale_table; the result is the input to the UMAP fits below.
# (presumably z-score standardization -- confirm in pyseus)
scaled = pu.scale_table(
    method='standard',
    matrix=quants,
)

### compute UMAPs with different seeds and save to file

In [18]:
# Uncomment to discard embeddings left over from earlier runs:
# shutil.rmtree(save_path / "bootstraps")
bootstrap_dir = save_path / "bootstraps"
os.makedirs(bootstrap_dir, exist_ok=True)

# Settings shared by every bootstrap
n_bootstraps = 10
n_neighbors = 20
min_dist = 0.1
metric = 'euclidean'

# flip the umap coordinates (sometimes the UMAP algorithm flips the coordinates)
flip = True

# Seed for the seed generator itself. None draws a fresh set of UMAP seeds on
# every run; set an integer to make the whole series of bootstraps repeatable.
master_seed = None

# Draw the UMAP seeds WITHOUT replacement from 0..10000 (inclusive). Each seed
# is part of the output file name, so a repeated seed would overwrite an
# earlier embedding and leave fewer than n_bootstraps files on disk.
# sample() raises ValueError if n_bootstraps exceeds the 10001 available seeds.
seed_rng = random.Random(master_seed)
umap_seeds = seed_rng.sample(range(10001), n_bootstraps)

for UMAP_seed in umap_seeds:
    # calculate 2D UMAP embeddings
    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=UMAP_seed,
    )
    u = fit.fit_transform(scaled)

    # The coordinate columns of umap_table are overwritten on every iteration;
    # after the loop the table holds the embedding of the last seed.
    umap_table['umap_1'] = u[:, 0]
    umap_table['umap_2'] = u[:, 1]

    # flip the UMAP coordinates: mirror each axis about its own maximum
    if flip:
        # Series.max() is vectorised and skips NaN, unlike the builtin max()
        max_x = umap_table["umap_1"].max()
        max_y = umap_table["umap_2"].max()
        umap_table["umap_1"] = max_x - umap_table["umap_1"]
        umap_table["umap_2"] = max_y - umap_table["umap_2"]

    # save umap embedding to csv file
    # NOTE(review): umap_table was read with index_col=0, so index=False leaves
    # that first column out of the saved file -- confirm this is intended.
    save_name = f"{timestamp}_UMAP_embeddings_seed={UMAP_seed}.csv"
    umap_table.to_csv(bootstrap_dir / save_name, index=False)