# scRNA preprocessing notebook restricting genes [for given gene subset]

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import squidpy as sq
from sklearn.preprocessing import StandardScaler
# from sklearn import model_selection
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt



genes to keep: TH, DDC, SLC6A3, COMT, MAOA, MAOB, PINK1, PRKN, PARK7, NDUFS2, NDUFV1, ATP5A1, GPX1, SOD2, GSR, NFE2L2, TNF, IL6, CX3CR1, TREM2, SNCA, LRRK2, GBA, HSPA1A  

Metabolites (m/z values) to keep: 154.09, 169.07, 181.08, 198.08, 135.02, 117.00, 119.00, 90.05, 89.02, 400.29, 428.32, 703.58, 731.61, 782.57, 806.57, 523.38, 521.35, 369.35, 650.00, 308.09, 613.17, 305.24, 351.23, 351.23

To make this notebook easy to run on other datasets, the user should provide the input paths, the names of the final datasets, and the destination where they will be saved.

In [4]:
# Input files: the paired RNA and MSI AnnData datasets (.h5ad) to be subsetted.
path_rna = "/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/V11L12-038_A1.RNA_MOSCOT_paired.h5ad"
path_msi = "/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/V11L12-038_A1.MSI_MOSCOT_paired.h5ad"


# Output files for the subsetted datasets. Each value must be a full file path that
# ends with the file name, e.g. "path_to_dataset_dir/file_name.h5ad".
save_rna_ds_path = "/lustre/groups/ml01/workspace/eirini.giannakoulia/hand_prep/V11L12-038_A1.RNA_MOSCOT_paired_selectedgenesDHB.h5ad"
save_msi_ds_path = "/lustre/groups/ml01/workspace/eirini.giannakoulia/hand_prep/V11L12-038_A1.MSI_MOSCOT_paired_selectedgenesDHB.h5ad"

In [5]:
# Load the MSI and RNA AnnData objects (MSI first, then RNA) from the configured paths.
adata_msi, adata_rna = (sc.read_h5ad(h5ad_path) for h5ad_path in (path_msi, path_rna))

In [6]:
# Display the MSI AnnData summary (dimensions and available annotation slots).
adata_msi

AnnData object with n_obs × n_vars = 2681 × 2754
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'clusters', 'clusters_colors', 'hvg', 'log1p', 'moranI', 'neighbors', 'pca', 'spatial_neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances', 'spatial_connectivities', 'spatial_distances'

In [7]:
# Display the RNA AnnData summary (dimensions and available annotation slots).
adata_rna

AnnData object with n_obs × n_vars = 2681 × 14479
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'n_genes', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'technology'
    uns: 'clusters', 'clusters_colors', 'hvg', 'log1p', 'moranI', 'neighbors', 'pca', 'spatial', 'spatial_neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances', 'spatial_connectivities', 'spatial_distances'

The genes are assumed to be given in the comma-separated format shown above.

In [8]:
# Requested genes as a comma-separated string (edit this line for another gene set).
genes_str = "TH, DDC, SLC6A3, COMT, MAOA, MAOB, PINK1, PRKN, PARK7, NDUFS2, NDUFV1, ATP5A1, GPX1, SOD2, GSR, NFE2L2, TNF, IL6, CX3CR1, TREM2, SNCA, LRRK2, GBA, HSPA1A"

# Split on commas, trim whitespace and drop empty entries (e.g. from a trailing comma).
gene_list = [gene.strip() for gene in genes_str.split(",") if gene.strip()]

# Case-insensitive lookup: upper-cased name -> name exactly as stored in adata_rna.var_names.
gene_upper_map = {gene.upper(): gene for gene in adata_rna.var_names}

# Keep the requested genes that exist in the dataset, in the order they were requested.
# dict.fromkeys removes repeats (e.g. the same gene typed twice or in different case),
# so subsetting with this list cannot produce duplicated columns.
valid_genes = list(dict.fromkeys(
    gene_upper_map[gene.upper()] for gene in gene_list if gene.upper() in gene_upper_map
))

# Requested genes that are absent from the dataset; inspect this list to see what was dropped.
missing_genes = [gene for gene in gene_list if gene.upper() not in gene_upper_map]

# Number of requested genes found in the dataset.
len(valid_genes)

21

In [9]:
# Subset the RNA data to the matched genes (as a copy) and display its summary.
adata_rna_subset = adata_rna[:, valid_genes].copy()
adata_rna_subset

AnnData object with n_obs × n_vars = 2681 × 21
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'n_genes', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'technology'
    uns: 'clusters', 'clusters_colors', 'hvg', 'log1p', 'moranI', 'neighbors', 'pca', 'spatial', 'spatial_neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances', 'spatial_connectivities', 'spatial_distances'

In [10]:
# Save the gene-restricted RNA dataset; the destination path is set near the top of the notebook.
adata_rna_subset.write(save_rna_ds_path)

The metabolites are assumed to be given in the comma-separated format shown above.

In [11]:
# Target m/z values as a comma-separated string (edit this line for another metabolite set).
metabolites_str = "154.09, 169.07, 181.08, 198.08, 135.02, 117.00, 119.00, 90.05, 89.02, 400.29, 428.32, 703.58, 731.61, 782.57, 806.57, 523.38, 521.35, 369.35, 650.00, 308.09, 613.17, 305.24, 351.23, 351.23"

# Parse the wanted m/z values as floats, skipping empty entries (e.g. from a trailing comma).
target_mzs = [float(x.strip()) for x in metabolites_str.split(",") if x.strip()]
tol = 0.1  # maximum allowed |peak - target| for a peak to count as a match

# Parse each peak name once instead of once per target. Names that cannot be parsed
# as a number are skipped, the same way the closest-peak selection cell handles them.
peak_mz_by_name = {}
for peak in adata_msi.var_names:
    try:
        peak_mz_by_name[peak] = float(peak)
    except ValueError:
        continue

# Every distinct peak lying within `tol` of at least one target. Filtering the peaks in
# var_names order (instead of collecting them into a set) makes the result order reproducible.
valid_metabolites = [
    peak
    for peak, peak_val in peak_mz_by_name.items()
    if any(abs(peak_val - target) <= tol for target in target_mzs)
]

In [12]:
# Number of requested m/z targets (repeated values in the input string are counted each time).
len(target_mzs)

24

In [13]:
# Number of distinct peaks that lie within the tolerance of at least one target.
len(valid_metabolites)

42

Using the full, unprocessed slides, many of the matched m/z peaks are extremely close to each other (possibly indicating that they are ratios from the same metabolite). This is unwanted, so the next block of code tries to resolve the problem by selecting, for each given value, the peak with the smallest absolute difference to it:

In [14]:
# Parse each peak name once (instead of once per target), keeping var_names order.
parsed_peaks = []
for peak in adata_msi.var_names:
    try:
        parsed_peaks.append((peak, float(peak)))
    except ValueError:
        continue  # skip if conversion fails

# For every target keep only the single closest peak, provided it lies within `tol`.
selected_metabolites = []
unmatched_targets = []  # targets with no peak within `tol`; inspect to see what was dropped
for target in target_mzs:
    best_peak = None
    best_diff = None
    for peak, peak_val in parsed_peaks:
        diff = abs(peak_val - target)
        # Strict '<' keeps the earliest peak in var_names order when two are equally close.
        if diff <= tol and (best_diff is None or diff < best_diff):
            best_diff = diff
            best_peak = peak
    if best_peak is None:
        unmatched_targets.append(target)
    else:
        selected_metabolites.append(best_peak)

# Remove duplicates in case different target values end up selecting the same peak
# (dict.fromkeys keeps the first occurrence, so the target order is preserved).
selected_metabolites = list(dict.fromkeys(selected_metabolites))

# Subset the AnnData object to include only the selected metabolite peaks
adata_msi_subset = adata_msi[:, selected_metabolites].copy()
print(selected_metabolites)

# Save the metabolite-restricted MSI dataset; the destination path is set near the top of the notebook.
adata_msi_subset.write(save_msi_ds_path)

['154.05953499999998', '169.07619', '198.089485', '400.342015', '428.372025', '703.56658', '731.596855', '782.5560700000001', '806.57208', '523.354765', '369.35074', '308.089615', '613.153765', '305.302855', '351.13323']


In [15]:
# Display the summary of the metabolite-restricted MSI dataset.
adata_msi_subset

AnnData object with n_obs × n_vars = 2681 × 15
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'clusters', 'clusters_colors', 'hvg', 'log1p', 'moranI', 'neighbors', 'pca', 'spatial_neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances', 'spatial_connectivities', 'spatial_distances'