In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

import os

from xgboost import XGBRegressor
from xgboost import DMatrix

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error





Set up datasets

In [2]:
featsel = "hvg_svd"
base_path = f"/lustre/groups/ml01/workspace/eirini.giannakoulia/pipeline/dataset/processed/neurotransmitters_spatialy_highly_variable/{featsel}"

X_train = sc.read_h5ad(os.path.join(base_path,"rna_dataset_train.h5ad"))
X_test = sc.read_h5ad(os.path.join(base_path,"rna_dataset_test.h5ad"))
Y_train = sc.read_h5ad(os.path.join(base_path,"msi_dataset_train.h5ad"))
Y_test = sc.read_h5ad(os.path.join(base_path,"msi_dataset_test.h5ad"))

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [None]:
params = {'alpha': 50, 'lambda': 100, 'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'early_stopping_rounds': 10}


In [4]:
from scipy.sparse import issparse


In [5]:
import cupy as cp  # For GPU array conversion

In [8]:
def ensure_gpu(data):
    """
    Convert the data to a GPU array if it is not already.
    If the data is a pandas DataFrame or NumPy array, convert it to a CuPy array.
    """
    # If data already has the __cuda_array_interface__, assume it is a GPU array.
    if hasattr(data, '__cuda_array_interface__'):
        return data
    # If data is a DataFrame, convert its underlying NumPy array.
    if isinstance(data, pd.DataFrame):
        return cp.asarray(data.values)
    # If data is a NumPy array, convert it.
    if isinstance(data, np.ndarray):
        return cp.asarray(data)
    # Otherwise, attempt conversion.
    return cp.asarray(data)

def ensure_cpu(data):
    """
    Convert the data to a CPU (NumPy) array if it is on the GPU.
    """
    if hasattr(data, '__cuda_array_interface__'):
        return cp.asnumpy(data)
    return data
def run_xgboost(
        adata_rna_train,
        adata_rna_test,
        adata_msi_train,
        adata_msi_test, 
        params, 
        featsel,
        **kwargs):
    
    # Select features based on the provided method.
    if featsel == "hvg":
        X_train = adata_rna_train.X  
        X_test = adata_rna_test.X  
        Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
    elif featsel == "hvg_svd":
        X_train = adata_rna_train.obsm["svd_features"]
        X_test = adata_rna_test.obsm["svd_features"]
        Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
    elif featsel == "hvg_svd_graph":
        X_train = adata_rna_train.obsm["svd_graph"]
        X_test = adata_rna_test.obsm["svd_graph"] 
        Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
    elif featsel == "svd":
        X_train = adata_rna_train.obsm["svd_features"]
        X_test = adata_rna_test.obsm["svd_features"]
        Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
    elif featsel == "svd_graph":
        X_train = adata_rna_train.obsm["svd_graph"]
        X_test = adata_rna_test.obsm["svd_graph"]
        Y_train, Y_test = adata_msi_train.X, adata_msi_test.X
    else:
        raise ValueError(f"Unsupported feature selection method: {featsel}")

    if issparse(X_train):
        X_train = X_train.toarray()
    if issparse(X_test):
        X_test = X_test.toarray()
    if issparse(Y_train):
        Y_train = Y_train.toarray()
    if issparse(Y_test):
        Y_test = Y_test.toarray()

    #Since the model is configured to run on GPU (device="cuda"),
    #convert all input data to GPU arrays if they are not already.
    #If you don't need that, you can just comment out the conversions
    X_train = ensure_gpu(X_train)
    X_test = ensure_gpu(X_test)
    Y_train = ensure_gpu(Y_train)
    Y_test = ensure_gpu(Y_test)

    # XGBoost Hyperparameters
    alpha = float(params.get("alpha", 10))         # L1 regularization (Lasso)
    lambda_ = float(params.get("lambda", 50))       # L2 regularization (Ridge)
    max_depth = int(params.get("max_depth", 5))
    learning_rate = float(params.get("learning_rate", 0.1))
    n_estimators = int(params.get("n_estimators", 500))
    subsample = float(params.get("subsample", 0.9))
    colsample_bytree = float(params.get("colsample_bytree", 0.7))
    min_child_weight = int(params.get("min_child_weight", 2))
    early_stopping_rounds = int(params.get("early_stopping_rounds", 20))
    n_jobs = int(params.get("n_jobs", 15))


    # Initialize XGBoost model on GPU
    xgb_model = XGBRegressor(
        device="cuda",
        reg_alpha=alpha, 
        reg_lambda=lambda_,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        early_stopping_rounds=early_stopping_rounds,
        n_jobs=n_jobs
    )

    # Train the model
    xgb_model.fit(
        X_train, 
        Y_train, 
        eval_set=[(X_test, Y_test)], 
        verbose=True
    )

    # Predict on the test data
    Y_pred = xgb_model.predict(X_test)

    # Convert predictions and test labels back to CPU (NumPy) for evaluation
    Y_pred = ensure_cpu(Y_pred)
    Y_test_cpu = ensure_cpu(Y_test)

    # Pearson and Spearman correlation
    pearson_corr = pearsonr(Y_pred.flatten(), Y_test_cpu.flatten())[0]
    spearman_corr = spearmanr(Y_pred.flatten(), Y_test_cpu.flatten())[0]

    # Compute Root Mean Squared Error and R2 score
    rmse_test = root_mean_squared_error(Y_test_cpu, Y_pred)
    r2_test = r2_score(Y_test_cpu, Y_pred)
    mae_test = mean_absolute_error(Y_test_cpu, Y_pred)

    #Save results to a DataFrame
    metrics = pd.DataFrame({
        'rmse': [rmse_test],
        'mae': [mae_test],
        'r2': [r2_test],
        'pearson': [pearson_corr],
        'spearman': [spearman_corr]
    })

    # #Add this for interpretability later, check outputs of each model's preds
    # predictions = pd.DataFrame({
    #     'y_true': Y_test_cpu.flatten(),
    #     'y_pred': Y_pred.flatten()
    # })
    return metrics


In [10]:
run_xgboost(
        adata_rna_train = X_train,
        adata_rna_test = X_test,
        adata_msi_train = Y_train,
        adata_msi_test = Y_test, 
        params = params, 
        featsel = "hvg_svd")

[0]	validation_0-rmse:1.55429
[1]	validation_0-rmse:1.49564
[2]	validation_0-rmse:1.44026
[3]	validation_0-rmse:1.38805
[4]	validation_0-rmse:1.33886
[5]	validation_0-rmse:1.29266
[6]	validation_0-rmse:1.24925
[7]	validation_0-rmse:1.20841
[8]	validation_0-rmse:1.17011
[9]	validation_0-rmse:1.13431
[10]	validation_0-rmse:1.10072
[11]	validation_0-rmse:1.06918
[12]	validation_0-rmse:1.03978
[13]	validation_0-rmse:1.01226
[14]	validation_0-rmse:0.98665
[15]	validation_0-rmse:0.96278
[16]	validation_0-rmse:0.94055
[17]	validation_0-rmse:0.91982
[18]	validation_0-rmse:0.90057
[19]	validation_0-rmse:0.88268
[20]	validation_0-rmse:0.86627
[21]	validation_0-rmse:0.85104
[22]	validation_0-rmse:0.83686
[23]	validation_0-rmse:0.82380
[24]	validation_0-rmse:0.81172
[25]	validation_0-rmse:0.80058
[26]	validation_0-rmse:0.79032
[27]	validation_0-rmse:0.78086
[28]	validation_0-rmse:0.77220
[29]	validation_0-rmse:0.76414
[30]	validation_0-rmse:0.75677
[31]	validation_0-rmse:0.75006
[32]	validation_0-

Unnamed: 0,rmse,mae,r2,pearson,spearman
0,0.654776,0.510165,-0.00391,0.908177,0.849818


	rmse	mae	r2	pearson	spearman
0	0.654776	0.510165	-0.00391	0.908177	0.849818


In [29]:
adata_rna_train = sc.read_h5ad("/home/icb/eirini.giannakoulia/pipeline/dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad")
adata_rna_test = sc.read_h5ad("/home/icb/eirini.giannakoulia/pipeline/dataset/processed/lipids/hvg_svd_graph/rna_dataset_test.h5ad")
adata_msi_train = sc.read_h5ad("/home/icb/eirini.giannakoulia/pipeline/dataset/processed/lipids/hvg_svd_graph/rna_dataset_train.h5ad")
adata_msi_test = sc.read_h5ad("/home/icb/eirini.giannakoulia/pipeline/dataset/processed/lipids/hvg_svd_graph/rna_dataset_test.h5ad")


In [30]:
adata_rna_train

AnnData object with n_obs × n_vars = 2810 × 2000
    obs: 'in_tissue', 'array_row', 'array_col', 'mt_frac', 'total_counts', 'n_counts', 'clusters', 'technology', 'random_split', 'half_split', 'og_index', 'slide', 'gex_concat_clusters'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'spatialy_highly_variable', 'gex_highly_variable'
    uns: 'gex_concat_clusters', 'hvg', 'neighbors', 'pca', 'slide_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial', 'spatial_warp', 'svd_features', 'svd_graph'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [31]:
if "spatial_connectivities" not in adata_rna_train.obsp:
    print("oops")
else:
    print("this works")


oops


In [32]:
params={}

In [50]:
X_train = adata_rna_train.obsm["svd_graph"]
X_test = adata_rna_test.obsm["svd_graph"] 
Y_train, Y_test = adata_msi_train.X, adata_msi_test.X


# Since the model is configured to run on GPU (device="cuda"),
# convert all input data to GPU arrays if they are not already.
from scipy.sparse import issparse

if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()
if issparse(Y_train):
    Y_train = Y_train.toarray()
if issparse(Y_test):
    Y_test = Y_test.toarray()

X_train = ensure_gpu(X_train)
X_test = ensure_gpu(X_test)
Y_train = ensure_gpu(Y_train)
Y_test = ensure_gpu(Y_test)

# XGBoost Hyperparameters
alpha = float(params.get("alpha", 50))         # L1 regularization (Lasso)
lambda_ = float(params.get("lambda", 100))       # L2 regularization (Ridge)
max_depth = int(params.get("max_depth", 3))
learning_rate = float(params.get("learning_rate", 0.05))
n_estimators = int(params.get("n_estimators", 1000))
subsample = float(params.get("subsample", 0.8))
colsample_bytree = float(params.get("colsample_bytree", 0.8))
min_child_weight = int(params.get("min_child_weight", 3))
early_stopping_rounds = int(params.get("early_stopping_rounds", 20))
n_jobs = int(params.get("n_jobs", 20))


In [51]:
# Initialize XGBoost model on GPU
xgb_model = XGBRegressor(
    device="cuda",
    reg_alpha=alpha, 
    reg_lambda=lambda_,
    max_depth=max_depth,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_weight=min_child_weight,
    early_stopping_rounds=early_stopping_rounds,
    n_jobs=20
)

In [52]:
# Train the model
xgb_model.fit(
    X_train, 
    Y_train, 
    eval_set=[(X_test, Y_test)]
)

[0]	validation_0-rmse:0.55078
[1]	validation_0-rmse:0.54177
[2]	validation_0-rmse:0.53351
[3]	validation_0-rmse:0.52596
[4]	validation_0-rmse:0.51887
[5]	validation_0-rmse:0.51241
[6]	validation_0-rmse:0.50640
[7]	validation_0-rmse:0.50099
[8]	validation_0-rmse:0.49592
[9]	validation_0-rmse:0.49130
[10]	validation_0-rmse:0.48707
[11]	validation_0-rmse:0.48325
[12]	validation_0-rmse:0.47969
[13]	validation_0-rmse:0.47650
[14]	validation_0-rmse:0.47355
[15]	validation_0-rmse:0.47082
[16]	validation_0-rmse:0.46839
[17]	validation_0-rmse:0.46615
[18]	validation_0-rmse:0.46408
[19]	validation_0-rmse:0.46215
[20]	validation_0-rmse:0.46047
[21]	validation_0-rmse:0.45891
[22]	validation_0-rmse:0.45750
[23]	validation_0-rmse:0.45620
[24]	validation_0-rmse:0.45501
[25]	validation_0-rmse:0.45391
[26]	validation_0-rmse:0.45288
[27]	validation_0-rmse:0.45194
[28]	validation_0-rmse:0.45112
[29]	validation_0-rmse:0.45037
[30]	validation_0-rmse:0.44968
[31]	validation_0-rmse:0.44905
[32]	validation_0-

In [46]:
# Predict on the test data
Y_pred = xgb_model.predict(X_test)

# Convert predictions and test labels back to CPU (NumPy) for evaluation
Y_pred = ensure_cpu(Y_pred)
Y_test_cpu = ensure_cpu(Y_test)

# Pearson and Spearman correlation
pearson_corr = pearsonr(Y_pred.flatten(), Y_test_cpu.flatten())[0]
spearman_corr = spearmanr(Y_pred.flatten(), Y_test_cpu.flatten())[0]

# Compute Root Mean Squared Error and R2 score
mse_test = np.sqrt(mean_squared_error(Y_test_cpu, Y_pred))
r2_test = r2_score(Y_test_cpu, Y_pred)


In [None]:
Y_pred

In [38]:
metrics_df = pd.DataFrame({
    "pearson_corr": [pearson_corr],
    "spearman_corr": [spearman_corr],
    "mse_test": [mse_test],
    "r2_test": [r2_test]
})

print(metrics_df)  #slightly different because of transfer to cpu?? sparse???



   pearson_corr  spearman_corr  mse_test   r2_test
0      0.613199       0.387206  0.442813 -0.024167


In [None]:
metrics_df = pd.DataFrame({
    "pearson_corr": [pearson_corr],
    "spearman_corr": [spearman_corr],
    "mse_test": [mse_test],
    "r2_test": [r2_test]
})

print(metrics_df)


   pearson_corr  spearman_corr  mse_test   r2_test
0      0.757909       0.401818  0.232958 -0.155623


In [8]:
# Initialize XGBoost model
xgb_model = XGBRegressor(
    reg_alpha=alpha, 
    reg_lambda=lambda_,
    max_depth=max_depth,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    min_child_weight=min_child_weight,
    early_stopping_rounds=early_stopping_rounds,
    n_jobs=4,
    eval_metric="rmse"  # Move eval_metric here!
)



In [9]:
X_train

array([[ 0.05498842, -0.06983834,  0.0322768 , ...,  0.51202821,
        -0.26191808, -0.46791773],
       [ 0.05441122, -0.06843096,  0.03007761, ...,  0.49982992,
        -0.25936533, -0.45385613],
       [ 0.00235679,  0.00355971, -0.00435029, ...,  0.07794559,
        -0.02062425,  0.03063063],
       ...,
       [-0.29679115, -0.10184726,  0.32446133, ...,  0.19241489,
         0.07684792, -0.02325384],
       [-0.29679115, -0.10184726,  0.32446133, ...,  0.19241489,
         0.07684792, -0.02325384],
       [-0.0161145 , -0.02394049,  0.00454063, ...,  0.05357949,
        -0.03564126,  0.02442186]])

In [10]:
Y_train

array([[17.21782452, 13.4366044 , 14.03917749, ...,  0.        ,
        11.85679868, 11.93244451],
       [17.11674455, 12.91320665, 14.6769257 , ...,  0.        ,
         0.        ,  0.        ],
       [17.1153376 , 12.9797991 , 13.45785454, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [17.03575537, 12.89470855, 13.74117638, ...,  0.        ,
         0.        ,  0.        ],
       [16.75114047, 12.42302945, 14.0629563 , ...,  9.13668217,
         0.        ,  0.        ],
       [17.16235163, 12.81337379, 13.68246508, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
# Train the model
xgb_model.fit(
    X_train, 
    Y_train, 
    eval_set=[(X_test, Y_test)], 
    verbose=True
)

[0]	validation_0-rmse:5.93997
[1]	validation_0-rmse:5.75319
[2]	validation_0-rmse:5.57800
[3]	validation_0-rmse:5.41405
[4]	validation_0-rmse:5.26070
[5]	validation_0-rmse:5.11733
[6]	validation_0-rmse:4.98342
[7]	validation_0-rmse:4.85855
[8]	validation_0-rmse:4.74220
[9]	validation_0-rmse:4.63398
[10]	validation_0-rmse:4.53341
[11]	validation_0-rmse:4.43994
[12]	validation_0-rmse:4.35316
[13]	validation_0-rmse:4.27279
[14]	validation_0-rmse:4.19829
[15]	validation_0-rmse:4.12931
[16]	validation_0-rmse:4.06567
[17]	validation_0-rmse:4.00673
[18]	validation_0-rmse:3.95235
[19]	validation_0-rmse:3.90218
[20]	validation_0-rmse:3.85597
[21]	validation_0-rmse:3.81344
[22]	validation_0-rmse:3.77428
[23]	validation_0-rmse:3.73811
[24]	validation_0-rmse:3.70483
[25]	validation_0-rmse:3.67423
[26]	validation_0-rmse:3.64612
[27]	validation_0-rmse:3.62027
[28]	validation_0-rmse:3.59655
[29]	validation_0-rmse:3.57478
[30]	validation_0-rmse:3.55473
[31]	validation_0-rmse:3.53633
[32]	validation_0-

In [12]:
# Predictions and evaluation
Y_pred = xgb_model.predict(X_test)

# Pearson and Spearman correlation
pearson_corr = pearsonr(Y_pred.flatten(), Y_test.flatten())[0]
spearman_corr = spearmanr(Y_pred.flatten(), Y_test.flatten())[0]

#MSE and R2
mse_test = root_mean_squared_error(Y_test, Y_pred)
r2_test = r2_score(Y_test, Y_pred)

In [13]:
# Save results to a DataFrame
results = pd.DataFrame({
    "mse": [mse_test],
    "r2": [r2_test],
    "pearson": [pearson_corr],
    "spearman": [spearman_corr],
    "alpha": [alpha],
    "lambda": [lambda_],
    "max_depth": [max_depth],
    "learning_rate": [learning_rate],
    "n_estimators": [n_estimators],
    "subsample": [subsample],
    "colsample_bytree": [colsample_bytree],
    "min_child_weight": [min_child_weight]
})


In [14]:
results #hvgsvd_graph

Unnamed: 0,mse,r2,pearson,spearman,alpha,lambda,max_depth,learning_rate,n_estimators,subsample,colsample_bytree,min_child_weight
0,2.688927,0.048431,0.84764,0.906086,50.0,100.0,3,0.05,500,0.8,0.8,3


In [22]:
results

Unnamed: 0,mse,r2,pearson,spearman,alpha,lambda,max_depth,learning_rate,n_estimators,subsample,colsample_bytree,min_child_weight
0,2.702831,0.043914,0.845553,0.905244,50.0,100.0,3,0.05,500,0.8,0.8,3


In [None]:
res = run_xgboost(adata_rna=adata_rna,adata_metabolomics=adata_msi, featsel=feat_sel, params=params)