In [1]:
import numpy as np
import pandas as pd
import cupy as cp  # For GPU array conversion
from xgboost import XGBRegressor
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, r2_score  # Using MSE then taking sqrt
import scanpy as sc
import yaml # type: ignore
import hashlib
import os
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import squidpy as sq
from scipy.sparse import issparse



In [2]:
input_rna = "/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/RNA_concat_lipids.h5ad"
input_metabolomics = "/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/MSI_concat_lipids.h5ad"


In [3]:
adata_rna = sc.read_h5ad(input_rna)
adata_msi = sc.read_h5ad(input_metabolomics)
adata_rna.obs_names_make_unique()
adata_msi.obs_names_make_unique()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [4]:
top_genes = 200
split_name = 'half_split'
n_components = 20
n_neighbors = 6
featsel = 'svd_graph'


In [5]:
#Split into train test
rna_train = adata_rna[adata_rna.obs[split_name] == "train"]
rna_test = adata_rna[adata_rna.obs[split_name] == "test"]

#----------------------------------------------sc-seqRNA----------------------------------------------#
#-----SVD-----#
svd_reducer = TruncatedSVD(n_components=n_components)

svd_features_train = svd_reducer.fit_transform(rna_train.X.toarray())
rna_train.obsm["svd_features"] = svd_features_train

svd_features_test = svd_reducer.fit_transform(rna_test.X.toarray())
rna_test.obsm["svd_features"] = svd_features_test

#-----GRAPH-----#
# #Check if og data has spatial connectivities or else create the connectivity matrix 
if "spatial_connectivities" not in rna_train.var.columns:
    rna_train.obs_names_make_unique()
    rna_test.obs_names_make_unique()

    rna_train.obs_names = rna_train.obs.og_index.tolist().copy()
    rna_train.obs_names_make_unique()
    rna_train.obs_names = rna_train.obs_names + "_11"

    rna_test.obs_names = rna_test.obs.og_index.tolist().copy()
    rna_test.obs_names_make_unique()
    rna_test.obs_names = rna_test.obs_names + "_22"
    adata_temp = sc.concat([rna_train, rna_test])
    sq.gr.spatial_neighbors(adata_temp, coord_type="grid", spatial_key="spatial", n_neighs=n_neighbors)
    svd_reducer = TruncatedSVD(n_components=n_components)

    graph_feat_train = svd_reducer.fit_transform(adata_temp[rna_train.obs_names].obsp["spatial_connectivities"])
    graph_feat_test = svd_reducer.fit_transform(adata_temp[rna_test.obs_names].obsp["spatial_connectivities"])
else:
        svd_reducer = TruncatedSVD(n_components=n_components)
        graph_feat_train = svd_reducer.fit_transform(rna_train[rna_train.obs_names].obsp["spatial_connectivities"])
        graph_feat_test = svd_reducer.fit_transform(rna_test[rna_test.obs_names].obsp["spatial_connectivities"])

##Concatenate the standardized features as obtained by svd applied on adata.X and on the s
sc_svd = StandardScaler()
sc_gr = StandardScaler()

rna_sg_train = np.concatenate([sc_svd.fit_transform(svd_features_train), \
                                        sc_gr.fit_transform(graph_feat_train)],
                                        axis=1)
rna_train.obsm["svd_graph"] = rna_sg_train

rna_sg_test = np.concatenate([sc_svd.fit_transform(svd_features_test), \
                                        sc_gr.fit_transform(graph_feat_test)],
                                        axis=1)
rna_test.obsm["svd_graph"] = rna_sg_test




msi_train = adata_msi[adata_msi.obs[split_name] == "train"]
msi_test = adata_msi[adata_msi.obs[split_name] == "test"]



In [9]:
adata_rna_train = rna_train
adata_rna_test = rna_test
adata_msi_train = msi_train
adata_msi_test = msi_test

In [10]:
def convert_to_dense(matrix):
    """Converts a sparse matrix to dense if necessary."""
    if issparse(matrix):
        return matrix.toarray()
    return matrix

In [11]:
#adding feature selection as a param to select correct parts of the adata
if featsel == "hvg":
    X_train = adata_rna_train.X  
    X_test = adata_rna_test.X  
    Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
elif featsel == "hvg_svd":
    X_train = adata_rna_train.obsm["svd_features"]
    X_test = adata_rna_test.obsm["svd_features"]
    Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
elif featsel == "hvg_svd_graph":
    X_train = adata_rna_train.obsm["svd_graph"]
    X_test = adata_rna_test.obsm["svd_graph"] 
    Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
elif featsel == "svd":
    X_train = adata_rna_train.obsm["svd_features"]
    X_test = adata_rna_test.obsm["svd_features"]
    Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
elif featsel == "svd_graph":
    X_train = adata_rna_train.obsm["svd_graph"]
    X_test = adata_rna_test.obsm["svd_graph"]
    Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
else:
    raise ValueError(f"Unsupported feature selection method: {featsel}")

In [12]:
from sklearn.linear_model import LinearRegression

In [14]:
from sklearn.metrics import r2_score, root_mean_squared_error
from scipy.stats import spearmanr, pearsonr


In [15]:
# Convert to dense if needed
X_train = convert_to_dense(X_train)
X_test = convert_to_dense(X_test)
Y_train = convert_to_dense(Y_train)
Y_test = convert_to_dense(Y_test)

# Fit linear regression
lin = LinearRegression()
lin.fit(X_train, Y_train)

# Predictions and evaluation
Y_pred = lin.predict(X_test)

#Pearson spearman
pearson_corr = pearsonr(Y_pred.flatten(), Y_test.flatten())[0]
spearman_corr = spearmanr(Y_pred.flatten(), Y_test.flatten())[0]

#MSE and R2
# mse_train = root_mean_squared_error(Y_train, Y_train_pred)
mse_test = root_mean_squared_error(Y_test, Y_pred)
# r2_train = r2_score(matching_msi_train, msi_train_pred)
r2_test = r2_score(Y_test, Y_pred)

# Save results to a DataFrame
results = pd.DataFrame({
'mse': [mse_test],
'r2': [r2_test],
'pearson': [pearson_corr],
'spearman': [spearman_corr]})


In [16]:
results

Unnamed: 0,mse,r2,pearson,spearman
0,0.232766,-0.048329,0.960787,0.893783


In [17]:
adata_rna

AnnData object with n_obs × n_vars = 5618 × 14196
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide', 'gex_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'gex_highly_variable'
    uns: 'gex_concat_clusters', 'hvg', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [18]:
input_rna

'/lustre/groups/ml01/workspace/anastasia.litinetskaya/code/vitatrack/datasets/RNA_concat_lipids.h5ad'

In [7]:

#----------------------------------------------MSI----------------------------------------------#
#MSI processed only for highly variable metabolites, kept hvg_ for uniformality in vars
if "highly_variable" not in adata_msi.var.columns:
    sc.pp.highly_variable_genes(adata_msi, flavor='seurat', n_top_genes=top_genes)
hvg_msi = adata_msi[:, adata_msi.var["highly_variable"]]

hvg_msi_train = hvg_msi[hvg_msi.obs[split_name] == "train"]
hvg_msi_test = hvg_msi[hvg_msi.obs[split_name] == "test"]

#----------------------------------------------SAVE----------------------------------------------#


View of AnnData object with n_obs × n_vars = 2810 × 2000
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide', 'gex_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'gex_highly_variable'
    uns: 'gex_concat_clusters', 'hvg', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [None]:
hvg_rna_train
hvg_rna_test

View of AnnData object with n_obs × n_vars = 2808 × 2000
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide', 'gex_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'gex_highly_variable'
    uns: 'gex_concat_clusters', 'hvg', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [10]:
hvg_msi_train


View of AnnData object with n_obs × n_vars = 2810 × 500
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide', 'msi_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'msi_highly_variable'
    uns: 'hvg', 'msi_concat_clusters', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [11]:
hvg_msi_test

View of AnnData object with n_obs × n_vars = 2808 × 500
    obs: 'technology', 'clusters', 'random_split', 'half_split', 'slide', 'msi_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'msi_highly_variable'
    uns: 'hvg', 'msi_concat_clusters', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'