In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from scipy.stats import t, shapiro, kstest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

In [27]:
#data preparation
def prepare_data():
    #loading data
    abundance = pd.read_csv('~/icr/simko/data/simko2_data/passport_prots.csv', index_col=0)
    abundance.index = abundance.index.astype(str)

    #removing cell lines with over 4000 nans
    nans_per_cl = abundance.isna().sum(axis=0)
    abundance_cl_filtered = abundance.loc[:, nans_per_cl<4000]

    #getting rid of protein with over 80% NaN (from the dataset filtered by CLs)
    prot_nan_count = abundance_cl_filtered.isna().sum(axis=1)
    prot_nan_percent = (prot_nan_count/abundance_cl_filtered.shape[1])*100

    abundance_filtered = abundance_cl_filtered[prot_nan_percent<80]

    #imputing witht the lower quartile average for each protein
    #set the protein names as the index - ignores it while we find the lower quartile

    def average_lower_quartile(x):
        sorted_abundances = x.dropna().sort_values()
        lower_qt_values = sorted_abundances.iloc[:int(len(sorted_abundances) * 0.25)]
        return lower_qt_values.mean()


    lower_qt_averages = abundance_filtered.apply(average_lower_quartile, axis=1)

    abundance_filtered_no_nan = abundance_filtered.apply(lambda x: x.fillna(lower_qt_averages[x.name]), axis=1)

    #transposing
    abundance_imputed = abundance_filtered_no_nan.T

    #scaling the imputed data
    scaler = StandardScaler()
    scaled_data = pd.DataFrame(scaler.fit_transform(abundance_imputed), index=abundance_imputed.index, columns=abundance_imputed.columns)

    return scaled_data

scaled_data = prepare_data()

In [33]:
#definsing autoencoder
# Define the Conditional VAE
class ConditionalVAE(nn.Module):
    def __init__(self, input_dim, cond_dim, hidden_dim, latent_dim):
        """
        input_dim: Number of protein features.
        cond_dim: Dimensionality of the condition vector (e.g., same as input_dim if you use a binary mask).
        hidden_dim: Number of hidden units.
        latent_dim: Size of the latent space.
        """
        super(ConditionalVAE, self).__init__()
        self.input_dim = input_dim
        self.cond_dim = cond_dim
        
        # Encoder: input is concatenation of data and condition.
        self.fc1 = nn.Linear(input_dim + cond_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        
        # Decoder: latent vector concatenated with condition.
        self.fc3 = nn.Linear(latent_dim + cond_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)
    
    def encode(self, x, c):
        # Concatenate the protein data and the condition vector.
        x_cond = torch.cat([x, c], dim=1)
        h = F.relu(self.fc1(x_cond))
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def decode(self, z, c):
        # Concatenate the latent code and the condition vector.
        z_cond = torch.cat([z, c], dim=1)
        h = F.relu(self.fc3(z_cond))
        x_recon = self.fc4(h)
        return x_recon
    
    def forward(self, x, c):
        mu, logvar = self.encode(x, c)
        z = self.reparameterize(mu, logvar)
        x_recon = self.decode(z, c)
        return x_recon, mu, logvar

# Loss function: Reconstruction loss (MSE for continuous data) + KL divergence
def CVAE_loss_function(x_recon, x, mu, logvar):
    # Reconstruction loss: Mean Squared Error summed over features.
    recon_loss = F.mse_loss(x_recon, x, reduction='sum')
    # KL divergence loss
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kl_loss

In [37]:
# ---------------------------
# Example: Synthetic Proteomics Data & Knockout Condition
# ---------------------------

#creating tensor
X = scaled_data
X_tensor = torch.tensor(X.values, dtype=torch.float32)
#batch_size = 32
#dataset = Data.TensorDataset(X_tensor)
#dataloader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

#defining dimensions
input_dim = X_tensor.shape[1]  # number of proteins
cond_dim = X_tensor.shape[1]     # using a binary mask with same size (1 means active, 0 means knockout)
hidden_dim = 32
latent_dim = 10
num_samples = 895

# Create a condition vector: all ones initially
c_data = torch.ones(X.shape[0], X.shape[1])  # Shape: (num_samples, input_dim)

#selecting pbrm1 to be knocked out
knockout_index = X.columns.get_loc('PBRM1')

#setting pbrm1 to 0 in the condition data
c_data[:, knockout_index] = 0

# Create a dataset and DataLoader
dataset = TensorDataset(X_tensor, c_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Instantiate the model and optimizer
model = ConditionalVAE(input_dim, cond_dim, hidden_dim, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)



In [38]:
# ---------------------------
# Training Loop
# ---------------------------
num_epochs = 20
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_x, batch_c in dataloader:
        optimizer.zero_grad()
        x_recon, mu, logvar = model(batch_x, batch_c)
        loss = CVAE_loss_function(x_recon, batch_x, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataset):.4f}")


Epoch 1/20, Loss: 7045.9399
Epoch 2/20, Loss: 6944.0444
Epoch 3/20, Loss: 6932.0866
Epoch 4/20, Loss: 6922.5169
Epoch 5/20, Loss: 6915.3936
Epoch 6/20, Loss: 6909.5964
Epoch 7/20, Loss: 6905.4986
Epoch 8/20, Loss: 6902.1624
Epoch 9/20, Loss: 6899.7941
Epoch 10/20, Loss: 6897.8827
Epoch 11/20, Loss: 6896.6188
Epoch 12/20, Loss: 6895.5571
Epoch 13/20, Loss: 6894.7640
Epoch 14/20, Loss: 6894.1056
Epoch 15/20, Loss: 6893.8854
Epoch 16/20, Loss: 6893.4927
Epoch 17/20, Loss: 6893.2022
Epoch 18/20, Loss: 6893.0658
Epoch 19/20, Loss: 6892.9404
Epoch 20/20, Loss: 6893.0669


In [36]:
# ---------------------------
# Inference: Simulate a Knockout
# ---------------------------
model.eval()
with torch.no_grad():
    # Select a sample protein abundance profile
    sample_x = x_data[0:1]
    
    # Create a condition vector that represents a knockout scenario.
    # For instance, set the protein at index 10 to zero (knockout) while leaving others intact.
    sample_c = torch.ones(1, cond_dim)
    sample_c[0, 10] = 0  # knockout protein 10
    
    # Generate the reconstructed profile under the knockout condition.
    x_recon, _, _ = model(sample_x, sample_c)
    
    print("Original protein profile (sample):")
    print(sample_x)
    print("\nReconstructed profile under knockout condition (protein 10 knocked out):")
    print(x_recon)

NameError: name 'x_data' is not defined