In [1]:
import os

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

from sklearn.linear_model import ElasticNet


from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import root_mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim

## Fix the data

In [15]:
featsel = "hvg_svd_graph"
task = "lipids"
# rna_data = f"./dataset/processed/vitatrack/{featsel}/rna_dataset.h5ad"
# msi_data = f"./dataset/processed/vitatrack/{featsel}/msi_dataset.h5ad"
adata_rna_train = sc.read_h5ad(f"/home/icb/eirini.giannakoulia/pipeline/dataset/processed/{task}/{featsel}/rna_dataset_train.h5ad")
adata_rna_test = sc.read_h5ad(f"/home/icb/eirini.giannakoulia/pipeline/dataset/processed/{task}/{featsel}/rna_dataset_test.h5ad")
adata_msi_train = sc.read_h5ad(f"/home/icb/eirini.giannakoulia/pipeline/dataset/processed/{task}/{featsel}/rna_dataset_train.h5ad")
adata_msi_test = sc.read_h5ad(f"/home/icb/eirini.giannakoulia/pipeline/dataset/processed/{task}/{featsel}/rna_dataset_test.h5ad")

In [16]:
if featsel in ["hvg", "svd", "hvg_svd"]:
    X_train = adata_rna_train.X  
    X_test = adata_rna_test.X  
elif featsel in ["hvg_svd_graph", "svd_graph"]:
    # Use SVD features if available; otherwise, fall back to graph-based features
    X_train = adata_rna_train.obsm.get("svd_features", adata_rna_train.obsm["svd_graph"])
    X_test = adata_rna_test.obsm.get("svd_features", adata_rna_test.obsm["svd_graph"])
else:
    raise ValueError(f"Unsupported feature selection method: {featsel}")

Y_train = adata_msi_train.X
Y_test = adata_msi_test.X



In [18]:
X_train, Y_train


(array([[15.899316  , -0.59822905,  4.01089   , ...,  1.2812364 ,
          0.03816527, -0.99586916],
        [17.993187  ,  0.7389461 ,  3.6721458 , ...,  0.8969562 ,
         -0.2374196 , -0.48757803],
        [16.580381  ,  0.1163224 ,  4.891423  , ...,  2.0224001 ,
          0.984409  , -1.6672854 ],
        ...,
        [16.951843  ,  0.06999123,  4.1847334 , ..., -0.4984367 ,
         -0.5534885 ,  1.6347868 ],
        [17.505602  ,  4.364483  ,  0.5860475 , ..., -0.637969  ,
         -0.696187  ,  0.47918594],
        [17.505602  ,  4.364483  ,  0.5860475 , ..., -0.637969  ,
         -0.696187  ,  0.47918594]], dtype=float32),
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 653723 stored elements and shape (2810, 2000)>)

In [19]:
from sklearn.linear_model import ElasticNet
from scipy.sparse import issparse


### Elastic Net


In [20]:
def convert_to_dense(matrix):
    """Converts a sparse matrix to dense if necessary."""
    if issparse(matrix):
        return matrix.toarray()
    return matrix

In [23]:
# Convert to dense if needed
X_train = convert_to_dense(X_train)
X_test = convert_to_dense(X_test)
Y_train = convert_to_dense(Y_train)
Y_test = convert_to_dense(Y_test)



In [24]:
alpha = 0.001
l1_ratio = 0.1

# Initialize and train the ElasticNet model
elastic_net = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
elastic_net.fit(X_train, Y_train)


In [27]:
# Make predictions on the test set
Y_pred = elastic_net.predict(X_test)

# Compute evaluation metrics
pearson_corr = pearsonr(Y_pred.flatten(), Y_test.flatten())[0]
spearman_corr = spearmanr(Y_pred.flatten(), Y_test.flatten())[0]
rmse_test = root_mean_squared_error(Y_test, Y_pred)
r2_test = r2_score(Y_test, Y_pred)

# Save the evaluation results to a DataFrame
results = pd.DataFrame({
    'rmse': [rmse_test],
    'r2': [r2_test],
    'pearson': [pearson_corr],
    'spearman': [spearman_corr]
})


In [13]:
results

Unnamed: 0,rmse,r2,pearson,spearman
0,0.008467,0.95909,0.999562,0.551562


In [28]:
results

Unnamed: 0,rmse,r2,pearson,spearman
0,0.374174,-0.02775,0.604335,0.362582


## Conditional Variational Autoencoder (CVAE)
This script defines a CVAE that conditions on the scRNA features (X) to reconstruct the metabolomic profile (Y). 
During training the encoder uses the concatenated input (X and Y) to learn a latent representation,
 and the decoder learns to reconstruct Y from X and a sampled latent variable.
  At inference time (when only X is available) we sample the latent from a standard normal distribution and decode.



In [29]:
def convert_to_tensor(X):
    # If X is sparse, convert to dense
    if issparse(X):
        X = X.toarray()
    # If it's not a numpy array, try to convert it to one
    if not isinstance(X, np.ndarray):
        # If it's already a torch tensor, move it to CPU and convert to numpy
        if isinstance(X, torch.Tensor):
            X = X.cpu().numpy()
        else:
            X = np.array(X)
    return torch.tensor(X, dtype=torch.float32)

class CVAE(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, latent_dim):
        super(CVAE, self).__init__()
        # Encoder: takes concatenated (x, y)
        self.encoder = nn.Sequential(
            nn.Linear(input_dim + output_dim, hidden_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder: takes concatenated (x, z)
        self.decoder = nn.Sequential(
            nn.Linear(input_dim + latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)  # inherits device from std
        return mu + eps * std

    def forward(self, x, y):
        # Concatenate condition (x) and target (y) for encoding
        encoder_input = torch.cat([x, y], dim=1)
        h = self.encoder(encoder_input)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        z = self.reparameterize(mu, logvar)
        # Decode using condition x and latent variable z
        decoder_input = torch.cat([x, z], dim=1)
        y_recon = self.decoder(decoder_input)
        return y_recon, mu, logvar

In [30]:
params = {
    'hidden_dim': 256,    # Size of the hidden layer in both encoder and decoder
    'latent_dim': 20,     # Dimension of the latent space
    'lr': 0.0005,         # Learning rate for the optimizer
    'epochs': 150,        # Number of training epochs
    'batch_size': 64      # Batch size during training
}


In [31]:
from scipy.sparse import issparse


In [32]:
# def run_cvae(adata_rna_train,
#             adata_rna_test,
#             adata_msi_train,
#             adata_msi_test,
#             params,
#             featsel,
#             **kwargs):
# # --- Feature selection ---
# --- Convert sparse matrices to dense arrays if needed ---
if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()
if issparse(Y_train):
    Y_train = Y_train.toarray()
if issparse(Y_test):
    Y_test = Y_test.toarray()


In [33]:
# --- Convert to torch tensors if necessary ---
X_train = convert_to_tensor(X_train)
X_test = convert_to_tensor(X_test)
Y_train = convert_to_tensor(Y_train)
Y_test = convert_to_tensor(Y_test)

# --- Device Setup: use GPU if available ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
Y_train = Y_train.to(device)
X_test = X_test.to(device)
Y_test = Y_test.to(device)

# --- Hyperparameters ---
hidden_dim = int(params.get('hidden_dim', 128))
latent_dim = int(params.get('latent_dim', 10))
lr = float(params.get('lr', 1e-3))
epochs = int(params.get('epochs', 100))
batch_size = int(params.get('batch_size', 32))

input_dim = X_train.shape[1]
output_dim = Y_train.shape[1]

# --- Initialize model and optimizer ---
model = CVAE(input_dim, output_dim, hidden_dim, latent_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
mse_loss = nn.MSELoss()



In [34]:
# --- Training loop ---
model.train()
dataset = torch.utils.data.TensorDataset(X_train, Y_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    epoch_loss = 0.0
    num_batches = 0
    for x_batch, y_batch in dataloader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_recon, mu, logvar = model(x_batch, y_batch)
        # Reconstruction loss
        recon_loss = mse_loss(y_recon, y_batch)
        # KL divergence loss
        kl_loss = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
        loss = recon_loss + kl_loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    avg_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


Epoch 1/150, Loss: 0.3535
Epoch 2/150, Loss: 0.2119
Epoch 3/150, Loss: 0.1909
Epoch 4/150, Loss: 0.1811
Epoch 5/150, Loss: 0.1751
Epoch 6/150, Loss: 0.1709
Epoch 7/150, Loss: 0.1683
Epoch 8/150, Loss: 0.1663
Epoch 9/150, Loss: 0.1648
Epoch 10/150, Loss: 0.1635
Epoch 11/150, Loss: 0.1626
Epoch 12/150, Loss: 0.1617
Epoch 13/150, Loss: 0.1609
Epoch 14/150, Loss: 0.1602
Epoch 15/150, Loss: 0.1595
Epoch 16/150, Loss: 0.1590
Epoch 17/150, Loss: 0.1585
Epoch 18/150, Loss: 0.1580
Epoch 19/150, Loss: 0.1575
Epoch 20/150, Loss: 0.1572
Epoch 21/150, Loss: 0.1567
Epoch 22/150, Loss: 0.1563
Epoch 23/150, Loss: 0.1560
Epoch 24/150, Loss: 0.1557
Epoch 25/150, Loss: 0.1553
Epoch 26/150, Loss: 0.1550
Epoch 27/150, Loss: 0.1547
Epoch 28/150, Loss: 0.1545
Epoch 29/150, Loss: 0.1542
Epoch 30/150, Loss: 0.1539
Epoch 31/150, Loss: 0.1537
Epoch 32/150, Loss: 0.1535
Epoch 33/150, Loss: 0.1532
Epoch 34/150, Loss: 0.1530
Epoch 35/150, Loss: 0.1528
Epoch 36/150, Loss: 0.1526
Epoch 37/150, Loss: 0.1524
Epoch 38/1

In [49]:
model.eval()
with torch.no_grad():
    z_sample = torch.randn(X_test.size(0), latent_dim).to(device)
    decoder_input = torch.cat([X_test, z_sample], dim=1)
    Y_pred = model.decoder(decoder_input).cpu().numpy()

# --- Evaluation metrics ---
Y_test_np = Y_test.cpu().numpy()
pearson_corr = pearsonr(Y_pred.flatten(), Y_test_np.flatten())[0]
spearman_corr = spearmanr(Y_pred.flatten(), Y_test_np.flatten())[0]
rmse_test = np.sqrt(root_mean_squared_error(Y_test_np, Y_pred))
r2_test = r2_score(Y_test_np, Y_pred)

results = pd.DataFrame({
    'rmse': [rmse_test],
    'r2': [r2_test],
    'pearson': [pearson_corr],
    'spearman': [spearman_corr]
})



In [50]:
results

Unnamed: 0,rmse,r2,pearson,spearman
0,0.567462,0.067784,0.773216,0.435394


## GNN

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

class GNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

def run_gnn(adata_rna_train,
            adata_rna_test,
            adata_msi_train,
            adata_msi_test,
            params,
            featsel,
            **kwargs):
    # --- Expecting 'edge_index' in kwargs (a torch.LongTensor of shape [2, num_edges]) ---
    if 'edge_index' not in kwargs:
        raise ValueError("GNN requires 'edge_index' in kwargs.")
    edge_index = kwargs['edge_index']

    # --- Feature selection ---
    if featsel in ["hvg", "svd"]:
        X_train = adata_rna_train.X  
        X_test = adata_rna_test.X  
    elif featsel in ["hvg_svd", "hvg_svd_graph", "svd_graph"]:
        X_train = adata_rna_train.obsm.get("svd_features", adata_rna_train.obsm["svd_graph"])
        X_test = adata_rna_test.obsm.get("svd_features", adata_rna_test.obsm["svd_graph"])
    else:
        raise ValueError(f"Unsupported feature selection method: {featsel}")
    Y_train = adata_msi_train.X
    Y_test = adata_msi_test.X

    # --- Convert to torch tensors ---
    X_train = torch.tensor(np.array(X_train), dtype=torch.float32)
    Y_train = torch.tensor(np.array(Y_train), dtype=torch.float32)
    X_test = torch.tensor(np.array(X_test), dtype=torch.float32)
    Y_test = torch.tensor(np.array(Y_test), dtype=torch.float32)

    # --- Create torch_geometric Data objects ---
    data_train = Data(x=X_train, edge_index=edge_index, y=Y_train)
    data_test = Data(x=X_test, edge_index=edge_index, y=Y_test)

    # --- Device Setup: use GPU if available ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_train = data_train.to(device)
    data_test = data_test.to(device)

    # --- Hyperparameters ---
    hidden_dim = int(params.get('hidden_dim', 64))
    lr = float(params.get('lr', 1e-3))
    epochs = int(params.get('epochs', 100))
    input_dim = X_train.shape[1]
    output_dim = Y_train.shape[1]

    # --- Initialize model and optimizer ---
    model = GNNModel(input_dim, hidden_dim, output_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    mse_loss = nn.MSELoss()

    # --- Training loop ---
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data_train)
        loss = mse_loss(out, data_train.y)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    # --- Inference ---
    model.eval()
    with torch.no_grad():
        Y_pred = model(data_test).cpu().numpy()

    # --- Evaluation metrics ---
    Y_test_np = Y_test.cpu().numpy()
    pearson_corr = pearsonr(Y_pred.flatten(), Y_test_np.flatten())[0]
    spearman_corr = spearmanr(Y_pred.flatten(), Y_test_np.flatten())[0]
    rmse_test = np.sqrt(mean_squared_error(Y_test_np, Y_pred))
    r2_test = r2_score(Y_test_np, Y_pred)

    results = pd.DataFrame({
        'rmse': [rmse_test],
        'r2': [r2_test],
        'pearson': [pearson_corr],
        'spearman': [spearman_corr]
    })

    return results
