## Extract common metabolites from datasets and store them

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import seaborn as sb
import anndata as ad
import scanpy as sc
import os



### V11L12-109 neuroconcat


In [None]:
# Read the FMP10 metabolite table and make the compound names underscore-separated.
# NOTE: `metabolites_file` is assigned in a later cell of this notebook; run that cell first.
metabolites_df = pd.read_csv(metabolites_file).assign(
    Compound=lambda table: table["Compound"].str.replace(" ", "_")
)


In [None]:
# Run configuration for the "neuro" dataset: labels and file locations first, data load last.
dataset_name = "neuro"
output_dir = "/lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro"
metabolites_file = "/lustre/groups/ml01/workspace/eirini.giannakoulia/vitatrack/datasets/FMP10_metabolite_weights.csv"

# Concatenated MSI AnnData object that the following cells filter.
path_msi = "/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets/MSI_concat_neurotransmitters.h5ad"
adata_msi = sc.read_h5ad(path_msi)

In [None]:
# Inline (scratch) version of filter_msi_by_fmp10_metabolites + save_filtered_data.
# Uses adata_msi, metabolites_file, dataset_name and output_dir from the configuration cell.

# Maximum |peak m/z - reference m/z| (strictly below) that counts as a match.
tolerance = 0.01

# Load the FMP10 metabolite list
metabolites_df = pd.read_csv(metabolites_file)
metabolites_df["Compound"] = metabolites_df["Compound"].str.replace(" ", "_")

# Reference masses and their compound names, row-aligned; rows without a mass are skipped
targets = metabolites_df.dropna(subset=["Observed Mass (m/z)"])
target_mz_values = targets["Observed Mass (m/z)"].to_numpy(dtype=float)
target_names = targets["Compound"].to_numpy()

# Convert AnnData var_names to floats
available_mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# For every peak take the first reference (in file order) within tolerance and remember
# the peak's column position. Working with positions avoids comparing float -> str
# round-trips against the original var_names (a peak named "371" would become "371.0").
column_for_compound = {}
for col, mz in enumerate(available_mz_values):
    hits = np.flatnonzero(np.abs(target_mz_values - mz) < tolerance)
    if hits.size:
        # A later peak matching the same compound replaces the earlier one.
        column_for_compound[target_names[hits[0]]] = col

# Order compounds by column position so names line up with the filtered columns
ordered = sorted(column_for_compound.items(), key=lambda item: item[1])
matched_metabolites = {name: available_mz_values[col] for name, col in ordered}

# Filter AnnData object down to the matched columns
keep = np.zeros(len(available_mz_values), dtype=bool)
keep[np.array([col for _, col in ordered], dtype=int)] = True
adata_filtered = adata_msi[:, keep].copy()

# Rename var_names to metabolite names
adata_filtered.var_names = [name for name, _ in ordered]

print(f"Filtered MSI data to {len(matched_metabolites)} matched metabolites.")

# output_dir from the configuration cell already ends with the dataset name,
# so it is used as-is instead of appending dataset_name a second time.
os.makedirs(output_dir, exist_ok=True)

# Save the full filtered AnnData object
all_metabolites_path = f"{output_dir}/{dataset_name}_common_metabolites.h5ad"
adata_filtered.write(all_metabolites_path)
print(f"Saved all metabolites to: {all_metabolites_path}")

# Save one file per metabolite. var_names now hold compound names, so select by name.
for metabolite in matched_metabolites:
    metabolite_adata = adata_filtered[:, adata_filtered.var_names == metabolite].copy()
    # "/" in a compound name (e.g. "HVA/MOPEGAL") would otherwise be read as a directory
    safe_name = str(metabolite).replace("/", "-")
    metabolite_path = f"{output_dir}/{dataset_name}_{safe_name}.h5ad"
    metabolite_adata.write(metabolite_path)
    print(f"Saved {metabolite} to: {metabolite_path}")

In [None]:
def filter_msi_by_fmp10_metabolites(adata_msi, metabolites_file, tolerance=0.01):
    """Keep only the MSI peaks whose m/z matches an FMP10 reference metabolite.

    Each peak (a ``var_names`` entry parsed with ``float``) is assigned to the first
    reference row, in file order, whose ``Observed Mass (m/z)`` differs from it by
    strictly less than ``tolerance``. If several peaks map to the same compound, the
    last such peak is the one that is kept.

    Parameters
    ----------
    adata_msi
        AnnData-like object whose ``var_names`` are m/z values parseable by ``float``.
    metabolites_file
        Path to a CSV with a ``Compound`` and an ``Observed Mass (m/z)`` column.
        Spaces in compound names are replaced by underscores.
    tolerance
        Maximum absolute m/z difference (exclusive) for a peak to count as a match.

    Returns
    -------
    tuple
        ``(adata_filtered, matched_metabolites)``: a copy of ``adata_msi`` restricted
        to the matched peaks with ``var_names`` set to the compound names, and a dict
        mapping compound name to the matched peak m/z, in the same order as the
        columns. ``(None, {})`` when nothing matches.
    """
    # Load the FMP10 metabolite list
    metabolites_df = pd.read_csv(metabolites_file)
    metabolites_df["Compound"] = metabolites_df["Compound"].str.replace(" ", "_")

    # Reference masses and their compound names, row-aligned; rows without a mass are skipped
    targets = metabolites_df.dropna(subset=["Observed Mass (m/z)"])
    target_mz_values = targets["Observed Mass (m/z)"].to_numpy(dtype=float)
    target_names = targets["Compound"].to_numpy()

    # Convert AnnData var_names to floats
    available_mz_values = np.array([float(mz) for mz in adata_msi.var_names])

    # Remember the column position of each matched peak. Working with positions avoids
    # comparing float -> str round-trips against the original var_names (a peak named
    # "371" would become "371.0" and silently be dropped).
    column_for_compound = {}
    for col, mz in enumerate(available_mz_values):
        hits = np.flatnonzero(np.abs(target_mz_values - mz) < tolerance)
        if hits.size:
            column_for_compound[target_names[hits[0]]] = col

    if not column_for_compound:
        print("No matching metabolites found.")
        return None, {}

    # Order compounds by column position so the new names line up with the kept columns
    ordered = sorted(column_for_compound.items(), key=lambda item: item[1])
    matched_metabolites = {name: available_mz_values[col] for name, col in ordered}

    # Filter AnnData object down to the matched columns
    keep = np.zeros(len(available_mz_values), dtype=bool)
    keep[np.array([col for _, col in ordered], dtype=int)] = True
    adata_filtered = adata_msi[:, keep].copy()

    # Rename var_names to metabolite names
    adata_filtered.var_names = [name for name, _ in ordered]

    print(f"Filtered MSI data to {len(matched_metabolites)} matched metabolites.")

    return adata_filtered, matched_metabolites


def save_filtered_data(adata_filtered, matched_metabolites, dataset_name, output_path):
    """Write the filtered data set plus one ``.h5ad`` file per matched metabolite.

    Files go to ``{output_path}/{dataset_name}/``, which is created if missing, so
    ``output_path`` should not already end with the dataset name.

    Parameters
    ----------
    adata_filtered
        AnnData-like object, normally the first return value of
        ``filter_msi_by_fmp10_metabolites`` (``var_names`` are compound names).
    matched_metabolites
        Mapping of compound name to matched m/z.
    dataset_name
        Used as sub-directory name and as file-name prefix.
    output_path
        Parent directory of the dataset directory.
    """
    output_dir = f"{output_path}/{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Save the full filtered AnnData object
    all_metabolites_path = f"{output_dir}/{dataset_name}_common_metabolites.h5ad"
    adata_filtered.write(all_metabolites_path)
    print(f"Saved all metabolites to: {all_metabolites_path}")

    var_names = adata_filtered.var_names
    for metabolite, mz in matched_metabolites.items():
        # After filtering, columns are named by compound; selecting by str(mz) never
        # matched. The m/z string is still accepted for objects that were not renamed.
        if metabolite in var_names:
            column_key = metabolite
        elif str(mz) in var_names:
            column_key = str(mz)
        else:
            continue

        metabolite_adata = adata_filtered[:, var_names == column_key].copy()
        # "/" in a compound name (e.g. "HVA/MOPEGAL") would otherwise be read as a directory
        safe_name = str(metabolite).replace("/", "-")
        metabolite_path = f"{output_dir}/{dataset_name}_{safe_name}.h5ad"
        metabolite_adata.write(metabolite_path)
        print(f"Saved {metabolite} to: {metabolite_path}")

In [3]:
# Dataset label and export location used by the cells below.
dataset_name = "neuro"
output_dir = "/lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro"

# Load the concatenated MSI data.
path_msi = "/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets/MSI_concat_neurotransmitters.h5ad"
adata_msi = sc.read_h5ad(path_msi)

# Restrict the data to peaks listed in the FMP10 metabolite table (default tolerance).
adata_filtered, matched_metabolites = filter_msi_by_fmp10_metabolites(
    adata_msi,
    "/lustre/groups/ml01/workspace/eirini.giannakoulia/vitatrack/datasets/FMP10_metabolite_weights.csv",
)

  utils.warn_names_duplicates("obs")


Filtered MSI data to 28 matched metabolites.


  utils.warn_names_duplicates("obs")


In [9]:
# Show the (compound name, matched m/z) pairs returned by the filter step.
matched_metabolites.items()

dict_items([('GABA', np.float64(371.17565)), ('Creatinine', np.float64(381.17134)), ('4-hydroxy benzaldehyde single', np.float64(390.14928)), ('Taurine', np.float64(393.12703)), ('Creatine', np.float64(399.18195)), ('Spermidine', np.float64(413.27046)), ('DA', np.float64(421.19136)), ('DOPAL-d3', np.float64(423.18201)), ('Hordenine single', np.float64(433.23603)), ('3-MT', np.float64(435.20692)), ('5-HT', np.float64(444.20715)), ('L-Tyrosine single', np.float64(449.19753)), ('HVA/MOPEGAL', np.float64(450.17016)), ('5-HIAA', np.float64(459.16353)), ('3-OMD', np.float64(479.20822)), ('Vanillactic acid', np.float64(480.17509)), ('Histamine double', np.float64(646.29606)), ('DOPAC/DOPEGAL (*)', np.float64(673.24799)), ('Tryptamine double', np.float64(681.30491)), ('3-MT double (*)', np.float64(688.29517)), ('6-OHDA double (*)', np.float64(690.27454)), ('5-HT double (*)', np.float64(697.30673)), ('3-MT double', np.float64(702.3195)), ('5-HT double', np.float64(711.31089)), ('5-HIAA double (

In [None]:
# Print one "name, m/z" line per matched metabolite.
for metabolite in matched_metabolites:
    mz = matched_metabolites[metabolite]
    print(f"{metabolite}, {mz}")

GABA,371.17565
Creatinine,381.17134
4-hydroxy benzaldehyde single,390.14928
Taurine,393.12703
Creatine,399.18195
Spermidine,413.27046
DA,421.19136
DOPAL-d3,423.18201
Hordenine single,433.23603
3-MT,435.20692
5-HT,444.20715
L-Tyrosine single,449.19753
HVA/MOPEGAL,450.17016
5-HIAA,459.16353
3-OMD,479.20822
Vanillactic acid,480.17509
Histamine double,646.29606
DOPAC/DOPEGAL (*),673.24799
Tryptamine double,681.30491
3-MT double (*),688.29517
6-OHDA double (*),690.27454
5-HT double (*),697.30673
3-MT double,702.3195
5-HT double,711.31089
5-HIAA double (*),712.25852
L-Tyrosine double,716.28983
3,4 dihydroxy Phenylalanine methyl ester double,746.30025
6-OHDA triple (*),957.37883


In [14]:
# Selecting by the m/z string yields 0 variables (see the output below): the filter step
# replaced var_names with compound names, so "371.17565" is no longer a variable name.
adata_filtered[:, adata_filtered.var_names == "371.17565"]

View of AnnData object with n_obs × n_vars = 5443 × 0
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide', 'msi_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'msi_highly_variable'
    uns: 'hvg', 'msi_concat_clusters', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [None]:
# Write one .h5ad file per matched metabolite directly into output_dir.
# save_filtered_data(adata_filtered, matched_metabolites, dataset_name, output_dir) is not
# used here: it appends dataset_name to its output path, and output_dir already ends with it.
os.makedirs(output_dir, exist_ok=True)

for metabolite in matched_metabolites:
    # var_names hold compound names after filtering, so select by name; selecting by
    # str(mz) returned an empty slice.
    metabolite_adata = adata_filtered[:, adata_filtered.var_names == metabolite].copy()
    # "/" in a compound name (e.g. "HVA/MOPEGAL") would otherwise be read as a directory
    safe_name = str(metabolite).replace("/", "-")
    metabolite_path = f"{output_dir}/{dataset_name}_{safe_name}.h5ad"
    metabolite_adata.write(metabolite_path)
    print(f"Saved {metabolite} to: {metabolite_path}")

Saved all metabolites to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro/neuro_common_metabolites.h5ad
Saved GABA to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_GABA.h5ad
Saved Creatinine to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_Creatinine.h5ad
Saved 4-hydroxy benzaldehyde single to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_4-hydroxy benzaldehyde single.h5ad
Saved Taurine to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_Taurine.h5ad
Saved Creatine to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_Creatine.h5ad
Saved Spermidine to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_Spermidine.h5ad
Saved DA to: /lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/neuro/neuro_DA.h5ad
Saved DOPAL-d3 to: /lustre/groups/ml0

In [44]:
# Only the last expression of a cell is echoed, so this displays matched_metabolites;
# the bare `adata_filtered` line has no visible effect.
adata_filtered
matched_metabolites

{'GABA': np.float64(371.17565),
 'Creatinine': np.float64(381.17134),
 '4-hydroxy benzaldehyde single': np.float64(390.14928),
 'Taurine': np.float64(393.12703),
 'Creatine': np.float64(399.18195),
 'Spermidine': np.float64(413.27046),
 'DA': np.float64(421.19136),
 'DOPAL-d3': np.float64(423.18201),
 'Hordenine single': np.float64(433.23603),
 '3-MT': np.float64(435.20692),
 '5-HT': np.float64(444.20715),
 'L-Tyrosine single': np.float64(449.19753),
 'HVA/MOPEGAL': np.float64(450.17016),
 '5-HIAA': np.float64(459.16353),
 '3-OMD': np.float64(479.20822),
 'Vanillactic acid': np.float64(480.17509),
 'Histamine double': np.float64(646.29606),
 'DOPAC/DOPEGAL (*)': np.float64(673.24799),
 'Tryptamine double': np.float64(681.30491),
 '3-MT double (*)': np.float64(688.29517),
 '6-OHDA double (*)': np.float64(690.27454),
 '5-HT double (*)': np.float64(697.30673),
 '3-MT double': np.float64(702.3195),
 '5-HT double': np.float64(711.31089),
 '5-HIAA double (*)': np.float64(712.25852),
 'L-Tyro