## Extract common metabolites from datasets and store them

In [None]:
# import scanpy as sc
# import numpy as np
# import pandas as pd
# from tqdm import tqdm
# from scipy.spatial import cKDTree
# from sklearn import model_selection
# import seaborn as sns
# import squidpy as sq
# from sklearn.preprocessing import StandardScaler
# from xgboost import XGBRegressor
# from sklearn.metrics import r2_score
# import matplotlib.pyplot as plt
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.decomposition import TruncatedSVD
# from scipy.stats import spearmanr, pearsonr

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import seaborn as sb
import anndata as ad
import scanpy as sc




In [2]:
import os

### V11L12-109_A1 slide

In [42]:
# def get_common_metabolites(matrix_type):
#     """Returns common metabolite m/z values for a given matrix type."""
#     common_metabolites = {
#         "FMP": {
#             "Glucose": 203.0532,
#             "Acetylcholine": 184.0734,
#             "Lactate": 113.0223,
#             "Glutamate": 170.0429,
#             "GABA": 126.0532,
#             "Choline": 126.0894,
#             "ATP": 530.1759,
#             "Serotonin": 199.0847,
#             "Dopamine": 176.0688,
#             "Histamine": 134.0688
#         },
#         "DHB": {
#             "Glucose": 181.0707,
#             "Acetylcholine": 146.1170,
#             "Lactate": 91.0399,
#             "Glutamate": 148.0604,
#             "GABA": 104.0708,
#             "Choline": 104.1070,
#             "ATP": 508.1934,
#             "Serotonin": 177.1023,
#             "Dopamine": 154.0863,
#             "Histamine": 112.0863
#         }
#     }
#     return common_metabolites.get(matrix_type, {})

# def filter_metabolites(adata_msi, matrix_type):
#     tolerance = 0.05
#     matched_metabolites = {}
    
#     for metabolite, target_mz in metabolite_mz.items():
#         matched_mz = [mz for mz in available_mz_values if abs(mz - target_mz) < tolerance]
#         if matched_mz:
#             matched_metabolites[metabolite] = matched_mz[0]

#     if not matched_metabolites:
#         raise ValueError("No common metabolites found in the MSI data.")

#     filtered_indices = [str(mz) in map(str, matched_metabolites.values()) for mz in adata_msi.var_names]
#     adata_filtered = adata_msi[:, filtered_indices].copy()
#     adata_filtered.var_names = [metabolite for metabolite, mz in matched_metabolites.items() if str(mz) in adata_filtered.var_names]
    
#     return adata_filtered, matched_metabolites

def filter_msi_by_fmp10_metabolites(adata_msi, metabolites_file, tolerance=0.01):
    
    # Load the FMP10 metabolite list
    metabolites_df = pd.read_csv(metabolites_file)
    
    # Extract the observed m/z values from the file
    target_mz_values = metabolites_df["Observed Mass (m/z)"].dropna().values

    # Convert AnnData var_names to floats
    available_mz_values = np.array([float(mz) for mz in adata_msi.var_names])

    # Find matching m/z values and keep track of names
    matched_metabolites = {}
    for mz in available_mz_values:
        for target in target_mz_values:
            if abs(mz - target) < tolerance:
                compound_name = metabolites_df.loc[metabolites_df["Observed Mass (m/z)"] == target, "Compound"].values[0]
                matched_metabolites[compound_name] = mz
                break
    # matched_metabolites = {}
    # for mz in available_mz_values:
    #     close_match = [target for target in target_mz_values if abs(mz - target) < tolerance]
    #     if close_match:
    #         compound_name = metabolites_df.loc[metabolites_df["Observed Mass (m/z)"] == close_match[0], "Compound"].values[0]
    #         matched_metabolites[compound_name] = close_match[0]
    
    if not matched_metabolites:
        print("No matching metabolites found.")
        return None, {}

    # # Find exact matches between MSI data and metabolite list
    # matched_metabolites = {
    #     metabolites_df.loc[metabolites_df["Observed Mass (m/z)"] == mz, "Compound"].values[0]: mz
    #     for mz in available_mz_values if mz in target_mz_values
    # }
    
    # if not matched_metabolites:
    #     print("No matching metabolites found.")
    #     return None, {}
    
    # Filter AnnData object based on matched m/z values
    filtered_indices = [str(mz) in map(str, matched_metabolites.values()) for mz in adata_msi.var_names]
    adata_filtered = adata_msi[:, filtered_indices].copy()

    # Rename var_names to metabolite names
    new_var_names = [name for name in matched_metabolites.keys() if str(matched_metabolites[name]) in adata_filtered.var_names]
    adata_filtered.var_names = new_var_names

    print(f"Filtered MSI data to {len(matched_metabolites)} matched metabolites.")

    return adata_filtered, matched_metabolites


def save_filtered_data(adata_filtered, slide, plate, matched_metabolites):
    output_dir = f"/lustre/groups/ml01/workspace/eirini.giannakoulia/common_metabolites/{slide}_{plate}/"
    os.makedirs(output_dir, exist_ok=True)

    # Save the full filtered AnnData object
    all_metabolites_path = f"{output_dir}/{slide}_{plate}_common_metabolites.h5ad"
    adata_filtered.write(all_metabolites_path)
    print(f"Saved all metabolites to: {all_metabolites_path}")

    # Save each metabolite separately with its name
    for metabolite, mz in matched_metabolites.items():
        metabolite_adata = adata_filtered[:, [str(mz) in adata_filtered.var_names]]
        metabolite_path = f"{output_dir}/{slide}_{plate}_{metabolite}.h5ad"
        metabolite_adata.write(metabolite_path)
        print(f"Saved {metabolite} to: {metabolite_path}")


In [43]:
# slide = "V11L12-109"  
# plate = "A1"  
# matrix = "FMP"  

# base_path = "/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets"
# path_msi = f"{base_path}/{slide}/{slide}_{plate}/{slide}_{plate}.MSI_MOSCOT_paired.h5ad"
path_msi = "/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets/MSI_concat_neurotransmitters.h5ad"
# path_msi = "/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/V11L12-038_A1.MSI_MOSCOT_paired_hvg.h5ad"
adata_msi = sc.read_h5ad(path_msi)

print(f"Filtering MSI data for common metabolites in {matrix} matrix...")
adata_filtered, matched_metabolites = filter_msi_by_fmp10_metabolites(adata_msi, "/lustre/groups/ml01/workspace/eirini.giannakoulia/vitatrack/datasets/FMP10_metabolite_weights.csv")

# save_filtered_data(adata_filtered, slide, plate, matched_metabolites)


Filtering MSI data for common metabolites in FMP matrix...
Filtered MSI data to 28 matched metabolites.


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [44]:
adata_filtered
matched_metabolites

{'GABA': np.float64(371.17565),
 'Creatinine': np.float64(381.17134),
 '4-hydroxy benzaldehyde single': np.float64(390.14928),
 'Taurine': np.float64(393.12703),
 'Creatine': np.float64(399.18195),
 'Spermidine': np.float64(413.27046),
 'DA': np.float64(421.19136),
 'DOPAL-d3': np.float64(423.18201),
 'Hordenine single': np.float64(433.23603),
 '3-MT': np.float64(435.20692),
 '5-HT': np.float64(444.20715),
 'L-Tyrosine single': np.float64(449.19753),
 'HVA/MOPEGAL': np.float64(450.17016),
 '5-HIAA': np.float64(459.16353),
 '3-OMD': np.float64(479.20822),
 'Vanillactic acid': np.float64(480.17509),
 'Histamine double': np.float64(646.29606),
 'DOPAC/DOPEGAL (*)': np.float64(673.24799),
 'Tryptamine double': np.float64(681.30491),
 '3-MT double (*)': np.float64(688.29517),
 '6-OHDA double (*)': np.float64(690.27454),
 '5-HT double (*)': np.float64(697.30673),
 '3-MT double': np.float64(702.3195),
 '5-HT double': np.float64(711.31089),
 '5-HIAA double (*)': np.float64(712.25852),
 'L-Tyro

In [None]:
# adata_filtered

AnnData object with n_obs × n_vars = 5443 × 3
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide', 'msi_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'msi_highly_variable'
    uns: 'hvg', 'msi_concat_clusters', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [None]:
matched_metabolites

{'Serotonin': np.float64(177.01202), 'Dopamine': np.float64(154.026815)}

In [71]:
test_msi = sc.read_h5ad("/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets/V11L12-038_A1/V11L12-038_A1_common_metabolites.h5ad")

In [73]:
test_msi.X

array([[13.4366044 , 11.10103741],
       [12.91320665, 12.19834288],
       [12.9797991 , 11.21413827],
       ...,
       [12.89470855, 11.76717277],
       [12.42302945, 10.86158308],
       [12.81337379, 11.18244822]])

In [65]:
non_zero_count/(5775*3)

0.6071572871572871

In [66]:
sc.read_h5ad("/lustre/groups/ml01/workspace/eirini.giannakoulia/datasets/V11L12-038_B1/V11L12-038_B1_common_metabolites.h5ad")

AnnData object with n_obs × n_vars = 5775 × 3
    obs: 'technology'
    var: 'technology'
    obsm: 'spatial'

In [5]:
adata_filtered.var_names

Index(['146.1181', '147.076965', '148.0607', '175.11939999999998',
       '205.09774'],
      dtype='object')

In [6]:
adata_rna = sc.read('/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/V11L12-038_A1.RNA_MOSCOT_paired_hvg.h5ad')
adata_msi = sc.read('/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/V11L12-038_A1.MSI_MOSCOT_paired_hvg.h5ad')

Glucose

In [None]:
# Define the expected m/z values based on ionization mode
glucose_mz_values = [180.0634, 181.0707, 179.0561, 203.0532]  # Adjust for ionization modes

# Convert var_names (m/z values) to floats
mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# Find the closest match in the dataset (since experimental m/z values may have slight shifts)
tolerance = 0.01  # Adjust if needed
matching_mz = [mz for mz in glucose_mz_values if np.any(np.abs(mz_values - mz) < tolerance)]

if matching_mz:
    print(f"Matching m/z values found: {matching_mz}")
    # Subset AnnData to keep only the matched variable(s)
    adata_glucose = adata_msi[:, [str(mz) in map(str, matching_mz) for mz in adata_msi.var_names]]
else:
    print("No matching m/z found in the dataset.")


Matching m/z values found: [203.0532]


In [None]:
glucos  

[203.0532]

Dopamine

In [9]:
# Define expected dopamine m/z values for MALDI-FMP10
dopamine_mz_values = [154.0863, 176.0688, 192.0428]  # Protonated, Na+, K+ adducts

# Convert var_names (m/z values) from strings to floats
mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# Set a small tolerance for m/z matching (adjust if needed)
tolerance = 0.01  

# Find matching m/z values in the dataset
matching_mz = [mz for mz in dopamine_mz_values if np.any(np.abs(mz_values - mz) < tolerance)]

if matching_mz:
    print(f"Matching m/z values found: {matching_mz}")
    # Subset the AnnData object to keep only the matching m/z values
    adata_dopamine = adata_msi[:, [str(mz) in map(str, matching_mz) for mz in adata_msi.var_names]]
else:
    print("No matching m/z found in the dataset.")

No matching m/z found in the dataset.


GABA

In [17]:
# Define expected GABA m/z values for MALDI-FMP10
gaba_mz_values = [104.0708, 126.0532, 142.0272]  # Protonated, Na+, K+ adducts

# Convert var_names (m/z values) from strings to floats
mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# Set a small tolerance for m/z matching (adjust if needed)
tolerance = 0.8

# Find matching m/z values in the dataset
matching_mz = [mz for mz in gaba_mz_values if np.any(np.abs(mz_values - mz) < tolerance)]

if matching_mz:
    print(f"Matching m/z values found: {matching_mz}")
    # Subset the AnnData object to keep only the matching m/z values
    adata_gaba = adata_msi[:, [str(mz) in map(str, matching_mz) for mz in adata_msi.var_names]]
else:
    print("No matching m/z found in the dataset.")


No matching m/z found in the dataset.


Acetylocholine

In [18]:
# Define expected acetylcholine m/z values for MALDI-FMP10
ach_mz_values = [146.1170, 168.0994, 184.0734]  # Protonated, Na+, K+ adducts

# Convert var_names (m/z values) from strings to floats
mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# Set a small tolerance for m/z matching (adjust if needed)
tolerance = 0.01  

# Find matching m/z values in the dataset
matching_mz = [mz for mz in ach_mz_values if np.any(np.abs(mz_values - mz) < tolerance)]

if matching_mz:
    print(f"Matching m/z values found: {matching_mz}")
    # Subset the AnnData object to keep only the matching m/z values
    adata_ach = adata_msi[:, [str(mz) in map(str, matching_mz) for mz in adata_msi.var_names]]
else:
    print("No matching m/z found in the dataset.")

Matching m/z values found: [184.0734]


In [23]:
# Sodium adduct m/z values for 5 highly probable metabolites
probable_na_mz_values = [
    113.0223,  # Lactate [M+Na]+
    170.0429,  # Glutamate [M+Na]+
    126.0532,  # GABA [M+Na]+
    126.0894,  # Choline [M+Na]+
    530.1759   # ATP [M+Na]+
]
# Convert var_names (m/z values) from strings to floats
mz_values = np.array([float(mz) for mz in adata_msi.var_names])

# Set a small tolerance for m/z matching
tolerance = 0.05

# Find matching m/z values in the dataset
matching_mz = [mz for mz in probable_mz_values if np.any(np.abs(mz_values - mz) < tolerance)]

if matching_mz:
    print(f"Matching m/z values found: {matching_mz}")
    # Subset the AnnData object to keep only the matching m/z values
    adata_probable = adata_msi[:, [str(mz) in map(str, matching_mz) for mz in adata_msi.var_names]]
else:
    print("No matching probable metabolites found.")

Matching m/z values found: [170.0429]


In [25]:
matching_mz

[170.0429]

In [24]:
adata_probable

View of AnnData object with n_obs × n_vars = 2681 × 0
    obs: 'technology', 'clusters', 'split', 'og_index'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'clusters', 'clusters_colors', 'hvg', 'log1p', 'moranI', 'neighbors', 'pca', 'spatial_neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'raw_counts', 'spatial', 'spatial_warp'
    varm: 'PCs'
    obsp: 'connectivities', 'distances', 'spatial_connectivities', 'spatial_distances'