# Autoencoder

We inspired our encoder in the one used in CFGEN, which has is a multi-layer perceptron (MLP) with two hidden layers of dimensions
[512, 256] that map the input to a 50-dimensional latent space.

In [1]:
import anndata
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from typing import List, Optional, Callable
import torch.nn as nn
import torch.nn.functional as F


In [2]:

# -------------------------------
# Define MLP (like the one in CFGEN)
# -------------------------------
class MLP(nn.Module):
    def __init__(self, 
                 dims: List[int],
                 batch_norm: bool = True, 
                 dropout: bool = True, 
                 dropout_p: float = 0.1, 
                 activation: Optional[Callable] = nn.ELU, 
                 final_activation: Optional[str] = None):
        super().__init__()
        self.dims = dims
        layers = []
        for i in range(len(dims[:-2])):
            block = [nn.Linear(dims[i], dims[i+1])]
            if batch_norm:
                block.append(nn.BatchNorm1d(dims[i+1]))
            block.append(activation())
            if dropout:
                block.append(nn.Dropout(dropout_p))
            layers.append(nn.Sequential(*block))
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)
        if final_activation == "tanh":
            self.final_activation = nn.Tanh()
        elif final_activation == "sigmoid":
            self.final_activation = nn.Sigmoid()
        else:
            self.final_activation = None

    def forward(self, x):
        x = self.net(x)
        return x if self.final_activation is None else self.final_activation(x)

In [3]:
# -------------------------------
# Negative Binomial log-likelihood
# -------------------------------
def negative_binomial_log_likelihood(x, mu, theta, eps=1e-8):
    t1 = torch.lgamma(theta + eps) + torch.lgamma(x + 1.0) - torch.lgamma(x + theta + eps)
    t2 = (theta * (torch.log(theta + eps) - torch.log(mu + theta + eps))) + \
         (x * (torch.log(mu + eps) - torch.log(mu + theta + eps)))
    return t1 + t2


In [4]:

# -------------------------------
# NB Autoencoder
# -------------------------------
class NB_Autoencoder(nn.Module):
    def __init__(self,
                 num_features: int,
                 latent_dim: int = 50,
                 hidden_dims: List[int] = [512, 256],
                 dropout_p: float = 0.1,
                 l2_reg: float = 1e-5,
                 kl_reg: float = 1e-3):
        super().__init__()
        self.num_features = num_features
        self.latent_dim = latent_dim
        self.l2_reg = l2_reg
        self.kl_reg = kl_reg

        self.encoder = MLP(
            dims=[num_features, *hidden_dims, latent_dim],
            batch_norm=True,
            dropout=True,
            dropout_p=dropout_p
        )

        self.decoder = MLP(
            dims=[latent_dim, *hidden_dims[::-1], num_features],
            batch_norm=True,
            dropout=True,
            dropout_p=dropout_p
        )

        self.log_theta = nn.Parameter(torch.randn(num_features) * 0.01)

    def forward(self, x):
        z = self.encoder(x)
        mu = F.softplus(self.decoder(z))
        theta = torch.exp(self.log_theta).unsqueeze(0).expand_as(mu)
        return {"z": z, "mu": mu, "theta": theta}

    def loss_function(self, x, outputs):
        mu = outputs["mu"]
        theta = outputs["theta"]
        z = outputs["z"]
        nll = -negative_binomial_log_likelihood(x, mu, theta).sum(dim=1).mean()
        l2_loss = sum((p**2).sum() for p in self.parameters()) * self.l2_reg
        kl_loss = (z**2).mean() * self.kl_reg
        loss = nll + l2_loss + kl_loss
        return {"loss": loss, "nll": nll, "l2": l2_loss, "kl": kl_loss}


In [16]:
#check if it's corrupted
import anndata as ad
input_file_path = "/dtu/blackhole/06/213542/paperdata/pbmc3k_train.h5ad"
#adata = ad.read_h5ad("/zhome/99/9/213566/deepL/adata_preprocessed_subset.h5ad", backed='r')
adata = ad.read_h5ad(input_file_path, backed='r')
adata.obs.head()


Unnamed: 0_level_0,n_genes,percent_mito,n_counts,cell_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CCAATTTGAACGTC-1,932,0.010483,2671.0,CD14+ Monocytes
AACAGCACAAGAGT-1,343,0.024709,688.0,CD14+ Monocytes
AGAGATGACTGAAC-1,678,0.023834,1972.0,CD4 T cells
AATAGGGAGAATGA-1,756,0.036002,1861.0,CD8 T cells
GCCTGACTCTCAAG-1,757,0.014493,2001.0,CD14+ Monocytes


In [27]:



# -------------------------------
# Main training + encoding
# -------------------------------
    # --- Hyperparameters ---
input_file = input_file_path
latent_dim = 50
hidden_dims = [512, 256]
batch_size = 512
epochs = 5            # short run to check
learning_rate = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Load data ---
adata = anndata.read_h5ad(input_file)
X = adata.X
if hasattr(X, "toarray"):
    X = X.toarray()
X = torch.tensor(X, dtype=torch.float32)

dataset = TensorDataset(X)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # --- Initialize model ---
num_genes = adata.n_vars
model = NB_Autoencoder(num_features=num_genes, latent_dim=latent_dim, hidden_dims=hidden_dims)
model = model.to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # --- Training loop ---
for epoch in range(epochs):
    epoch_loss = 0
    for batch in dataloader:
        x_batch = batch[0].to(device)
        outputs = model(x_batch)
        loss_dict = model.loss_function(x_batch, outputs)
        loss = loss_dict["loss"]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * x_batch.size(0)

    epoch_loss /= len(dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.3f}, NLL: {loss_dict['nll'].item():.3f}")

    # --- Save trained model ---
model_file = input_file_path.replace(".h5ad", "_nb_autoencoder.pt")
torch.save(model.state_dict(), model_file)
print(f"Trained model saved to {model_file}")

    # --- Encode all cells into latent space ---
model.eval()
all_z = []
with torch.no_grad():
    for batch in DataLoader(dataset, batch_size=batch_size):
        x_batch = batch[0].to(device)
        z = model(x_batch)["z"].cpu().numpy()
        all_z.append(z)
latent = np.concatenate(all_z, axis=0)

    # --- Save latent space to AnnData ---
adata.obsm["X_latent"] = latent
output_file = input_file.replace(".h5ad", "_with_latent.h5ad")
adata.write(output_file)
print(f"Latent space saved to {output_file}")



Epoch 1/5, Loss: 5834.202, NLL: 5413.974
Epoch 2/5, Loss: 4930.286, NLL: 4476.204
Epoch 3/5, Loss: 4199.324, NLL: 3906.454
Epoch 4/5, Loss: 3704.829, NLL: 3630.023
Epoch 5/5, Loss: 3445.340, NLL: 3424.888
Trained model saved to /dtu/blackhole/1e/213566/datasets/pbmc3k/pbmc3k_train_nb_autoencoder.pt
Latent space saved to /dtu/blackhole/1e/213566/datasets/pbmc3k/pbmc3k_train_with_latent.h5ad


Running on new data:

In [28]:
input_file_path2 = "/dtu/blackhole/1e/213566/datasets/pbmc3k/pbmc3k_test.h5ad"
new_adata = anndata.read_h5ad(input_file_path2)

In [30]:
# --- Load model ---
num_genes = new_adata.n_vars
model = NB_Autoencoder(num_features=num_genes, latent_dim=50, hidden_dims=[512, 256])
model.load_state_dict(torch.load("/dtu/blackhole/1e/213566/datasets/pbmc3k/pbmc3k_train_nb_autoencoder.pt"))
model = model.to(device)
model.eval()

# Encode new cells
X_new = new_adata.X
if hasattr(X_new, "toarray"):
    X_new = X_new.toarray()
X_new = torch.tensor(X_new, dtype=torch.float32).to(device)

with torch.no_grad():
    z_new = model(X_new)["z"].cpu().numpy()

# Save to AnnData
new_adata.obsm["X_latent"] = z_new
new_adata.write("/dtu/blackhole/1e/213566/datasets/pbmc3k/new_cells_with_latent.h5ad")
