In [1]:
import h5py
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# --------------------------------------------------------------------
# 1. Load Training Data with Slice Information
# --------------------------------------------------------------------
h5_file_path = "/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5"

# Read every training slide into its own DataFrame, tagging each row with the
# slide it came from so the origin survives the concatenation below.
train_spot_tables = {}
with h5py.File(h5_file_path, "r") as f:
    train_spots = f["spots/Train"]
    for slide, spots in train_spots.items():
        slide_table = pd.DataFrame(np.array(spots))
        train_spot_tables[slide] = slide_table.assign(slice_name=slide)

# Stack all slides into one table with a fresh 0..n-1 index.
train_df = pd.concat(train_spot_tables.values(), ignore_index=True)

# Column layout is assumed, not read from the file: x, y, then 35 cell-type
# abundances (C1..C35), then the slide tag added above.
cell_types = [f"C{n}" for n in range(1, 36)]
train_df.columns = ["x", "y", *cell_types, "slice_name"]
print("Training data shape:", train_df.shape)

# --------------------------------------------------------------------
# 2. Define Datasets for Foundation Autoencoder and Main Mapping
# --------------------------------------------------------------------
# Dataset for the foundation autoencoder: uses the cell abundance vector.
class FoundationDataset(Dataset):
    """Dataset of cell-abundance vectors used to train the autoencoder.

    The given array-like is copied into one float32 tensor at construction;
    each item is a single row of that tensor.
    """

    def __init__(self, abundances):
        # Convert once up front so per-item access is a plain tensor index.
        self.data = torch.tensor(abundances, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored rows."""
        n_rows = len(self.data)
        return n_rows

    def __getitem__(self, idx):
        """Return the abundance row at position ``idx``."""
        row = self.data[idx]
        return row

# Dataset for training the main model: maps (x, y) coordinates to the foundation embedding.
class MainMappingDataset(Dataset):
    """Paired dataset of spot coordinates and their target embeddings.

    Both inputs are copied into float32 tensors. Item ``i`` is the tuple
    ``(coords[i], embeddings[i])``.

    Raises:
        ValueError: If ``coords`` and ``embeddings`` do not have the same
            number of rows.
    """

    def __init__(self, coords, embeddings):
        self.coords = torch.tensor(coords, dtype=torch.float32)
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        # Fail fast on misaligned inputs: the length is taken from ``coords``,
        # so a mismatch would otherwise surface as an IndexError mid-epoch or
        # as trailing embeddings that are silently never used.
        if self.coords.shape[:1] != self.embeddings.shape[:1]:
            raise ValueError(
                "coords and embeddings must have the same number of rows, "
                f"got {tuple(self.coords.shape)} and {tuple(self.embeddings.shape)}"
            )

    def __len__(self):
        """Return the number of (coordinate, embedding) pairs."""
        return len(self.coords)

    def __getitem__(self, idx):
        """Return the ``(coords, embedding)`` pair at position ``idx``."""
        return self.coords[idx], self.embeddings[idx]

# --------------------------------------------------------------------
# 3. Define the Foundation Autoencoder
# --------------------------------------------------------------------
class FoundationAutoencoder(nn.Module):
    """
    Two-part network over cell-abundance vectors.

    ``encoder`` compresses an ``input_dim`` vector to ``embed_dim`` values and
    ``decoder`` maps that embedding back to ``input_dim`` values. Each part is
    Linear -> ReLU -> Linear with a ``hidden_dim``-wide middle layer.
    """

    def __init__(self, input_dim=35, embed_dim=16, hidden_dim=64):
        super().__init__()
        # Encoder is built before the decoder so parameter registration (and
        # random initialisation) happens in that order.
        self.encoder = self._two_layer_mlp(input_dim, hidden_dim, embed_dim)
        self.decoder = self._two_layer_mlp(embed_dim, hidden_dim, input_dim)

    @staticmethod
    def _two_layer_mlp(n_in, n_hidden, n_out):
        """Build Linear(n_in, n_hidden) -> ReLU -> Linear(n_hidden, n_out)."""
        return nn.Sequential(
            nn.Linear(n_in, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_out),
        )

    def forward(self, x):
        """Return ``(embedding, reconstruction)`` for the input batch ``x``."""
        emb = self.encoder(x)
        return emb, self.decoder(emb)

# --------------------------------------------------------------------
# 4. Define the Main Model: Mapping from Coordinates to Embedding
# --------------------------------------------------------------------
class MainModelMapping(nn.Module):
    """
    Small MLP that regresses an embedding vector from input coordinates.

    The network is Linear -> ReLU -> Linear: ``input_dim`` features in,
    ``hidden_dim`` hidden units, ``embed_dim`` values out. At inference the
    script feeds its output to the foundation decoder to obtain abundances.
    """

    def __init__(self, input_dim=2, embed_dim=16, hidden_dim=64):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the predicted embedding for the input batch ``x``."""
        predicted = self.net(x)
        return predicted

# --------------------------------------------------------------------
# 5. Define Training Functions for Each Model
# --------------------------------------------------------------------
def train_foundation_autoencoder(model, dataloader, num_epochs=20, lr=0.001, device='cpu'):
    """Train an autoencoder to reconstruct its input using Adam and MSE loss.

    Args:
        model: Module whose forward pass returns ``(embedding, reconstruction)``.
        dataloader: Re-iterable source of input batches (e.g. a ``DataLoader``);
            it is iterated once per epoch.
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate passed to Adam.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        samples_seen = 0
        for data in dataloader:
            data = data.to(device)
            _, recon = model(data)
            loss = criterion(recon, data)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # MSELoss returns the batch mean; weight it by batch size so the
            # epoch figure is a true per-sample mean even with a short last batch.
            batch_size = data.size(0)
            epoch_loss += loss.item() * batch_size
            samples_seen += batch_size
        # Normalise by the samples actually processed, not len(dataset): the two
        # differ with drop_last or a subsampling sampler, and an empty loader
        # must not raise ZeroDivisionError.
        mean_loss = epoch_loss / samples_seen if samples_seen else float("nan")
        print(f"Foundation Autoencoder Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss:.4f}")
    return model

def train_main_mapping(model, dataloader, num_epochs=20, lr=0.001, device='cpu'):
    """Train a coordinate-to-embedding regressor using Adam and MSE loss.

    Args:
        model: Module mapping a batch of coordinates to predicted embeddings.
        dataloader: Re-iterable source of ``(coords, target_embedding)`` batches
            (e.g. a ``DataLoader``); it is iterated once per epoch.
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate passed to Adam.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        samples_seen = 0
        for coords, target_emb in dataloader:
            coords, target_emb = coords.to(device), target_emb.to(device)
            pred_emb = model(coords)
            loss = criterion(pred_emb, target_emb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # MSELoss returns the batch mean; weight it by batch size so the
            # epoch figure is a true per-sample mean even with a short last batch.
            batch_size = coords.size(0)
            epoch_loss += loss.item() * batch_size
            samples_seen += batch_size
        # Normalise by the samples actually processed, not len(dataset): the two
        # differ with drop_last or a subsampling sampler, and an empty loader
        # must not raise ZeroDivisionError.
        mean_loss = epoch_loss / samples_seen if samples_seen else float("nan")
        print(f"Main Mapping Model Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss:.4f}")
    return model

# --------------------------------------------------------------------
# 6. Train the Foundation Autoencoder
# --------------------------------------------------------------------
# Create dataset from the training cell abundance vectors.
# Prefer the GPU when one is visible; everything below is moved to `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Wrap the 35 abundance columns of every training spot and serve them in
# shuffled mini-batches of 32.
foundation_dataset = FoundationDataset(train_df[cell_types].to_numpy())
foundation_loader = DataLoader(
    foundation_dataset,
    batch_size=32,
    shuffle=True,
)

foundation_model = FoundationAutoencoder(input_dim=35, embed_dim=16, hidden_dim=64)
print("Training Foundation Autoencoder...")
foundation_model = train_foundation_autoencoder(
    foundation_model,
    foundation_loader,
    num_epochs=200,
    lr=0.001,
    device=device,
)

# --------------------------------------------------------------------
# 7. Precompute Foundation Embeddings for Training Spots
# --------------------------------------------------------------------
# Put the trained autoencoder in inference mode, then encode every training
# spot in a single forward pass without tracking gradients.
foundation_model.eval()
train_abundances = torch.tensor(train_df[cell_types].values, dtype=torch.float32)
train_abundances = train_abundances.to(device)
with torch.no_grad():
    # forward() returns (embedding, reconstruction); only the embedding is kept.
    train_embeddings = foundation_model(train_abundances)[0]
# Hand the embeddings back as a NumPy array on the host.
train_embeddings = train_embeddings.cpu().numpy()

# --------------------------------------------------------------------
# 8. Train the Main Model to Predict Embeddings from Coordinates
# --------------------------------------------------------------------
# Pair each training spot's (x, y) position with its precomputed embedding and
# serve the pairs in shuffled mini-batches of 32.
main_dataset = MainMappingDataset(train_df[["x", "y"]].to_numpy(), train_embeddings)
main_loader = DataLoader(
    main_dataset,
    batch_size=32,
    shuffle=True,
)

main_model = MainModelMapping(input_dim=2, embed_dim=16, hidden_dim=64)
print("Training Main Mapping Model...")
main_model = train_main_mapping(
    main_model,
    main_loader,
    num_epochs=200,
    lr=0.001,
    device=device,
)

# --------------------------------------------------------------------
# 9. Inference on Test Data and Submission Creation
# --------------------------------------------------------------------
# Pull the single test slide ("S_7") out of the HDF5 file into memory.
with h5py.File(h5_file_path, "r") as f:
    test_spots = f["spots/Test"]
    test_array = np.array(test_spots["S_7"])
test_df = pd.DataFrame(test_array)

# The table is expected to hold either (x, y) or (x, y, Test_set); any other
# width is rejected. Only the coordinates are kept.
n_test_cols = test_df.shape[1]
if n_test_cols not in (2, 3):
    raise ValueError("Unexpected number of columns in test data.")
test_df.columns = ["x", "y", "Test_set"][:n_test_cols]
test_df = test_df[["x", "y"]]

test_coords = test_df[["x", "y"]].to_numpy()
test_coords_tensor = torch.tensor(test_coords, dtype=torch.float32).to(device)

# Both networks are used for inference only from here on.
main_model.eval()
foundation_model.eval()
with torch.no_grad():
    # Coordinates -> embedding (main model) -> abundances (foundation decoder).
    predicted_embeddings = main_model(test_coords_tensor)
    predicted_abundances = foundation_model.decoder(predicted_embeddings)
predicted_abundances = predicted_abundances.cpu().numpy()

submission_file = "submission.csv"

# One row per test spot: an ID taken from the test table's index, followed by
# one predicted abundance column per cell type.
submission_df = pd.DataFrame(predicted_abundances, columns=cell_types)
submission_df.insert(loc=0, column='ID', value=test_df.index)
submission_df.to_csv(submission_file, index=False)
print(f"Submission file '{submission_file}' created!")


Training data shape: (8349, 38)
Training Foundation Autoencoder...
Foundation Autoencoder Epoch 1/200, Loss: 0.3422
Foundation Autoencoder Epoch 2/200, Loss: 0.0376
Foundation Autoencoder Epoch 3/200, Loss: 0.0228
Foundation Autoencoder Epoch 4/200, Loss: 0.0178
Foundation Autoencoder Epoch 5/200, Loss: 0.0143
Foundation Autoencoder Epoch 6/200, Loss: 0.0118
Foundation Autoencoder Epoch 7/200, Loss: 0.0097
Foundation Autoencoder Epoch 8/200, Loss: 0.0076
Foundation Autoencoder Epoch 9/200, Loss: 0.0060
Foundation Autoencoder Epoch 10/200, Loss: 0.0054
Foundation Autoencoder Epoch 11/200, Loss: 0.0050
Foundation Autoencoder Epoch 12/200, Loss: 0.0051
Foundation Autoencoder Epoch 13/200, Loss: 0.0042
Foundation Autoencoder Epoch 14/200, Loss: 0.0038
Foundation Autoencoder Epoch 15/200, Loss: 0.0038
Foundation Autoencoder Epoch 16/200, Loss: 0.0034
Foundation Autoencoder Epoch 17/200, Loss: 0.0035
Foundation Autoencoder Epoch 18/200, Loss: 0.0032
Foundation Autoencoder Epoch 19/200, Loss: