In [1]:
import h5py
import numpy as np
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import os
from torchvision import transforms
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import lr_scheduler, SGD
import time
from tempfile import TemporaryDirectory
import pandas as pd

In [2]:
# Load and display (x,y) spot locations and cell type annotation table for Train slides
with h5py.File("/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5", "r") as f:
    train_spots = f["spots/Train"]
    
    # Dictionary to store DataFrames for each slide
    train_spot_tables = {}
    
    for slide_name in train_spots.keys():
        # Load dataset as NumPy structured array
        spot_array = np.array(train_spots[slide_name])
        
        # Convert to DataFrame
        df = pd.DataFrame(spot_array)
        
        # Store in dictionary
        train_spot_tables[slide_name] = df

# Example: Display the spots table for slide 'S_1'
train_spot_tables['S_1']

Unnamed: 0,x,y,C1,C2,C3,C4,C5,C6,C7,C8,...,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35
0,1554,1297,0.014401,0.057499,0.022033,0.001704,0.533992,1.511707,0.015313,0.020029,...,1.009572e-03,2.068237,0.121361,0.007344,0.000017,0.036891,0.035934,0.118937,0.001472,0.050057
1,462,1502,0.116196,0.197176,0.110600,0.042614,5.587681,0.006885,0.096346,0.001711,...,6.918171e-04,0.014442,0.000238,0.024071,0.000023,0.217589,0.100662,0.004027,0.004122,0.049491
2,1488,1548,0.133284,0.035880,0.061352,0.003073,1.104479,0.009174,0.009175,0.000114,...,9.577447e-05,0.149792,0.001401,0.000699,0.000009,0.024491,0.018810,0.004171,0.000425,0.015348
3,1725,1182,0.087715,0.235223,0.090382,0.013902,8.760482,0.140912,0.188859,0.010154,...,1.964150e-03,0.142549,0.002036,0.047165,0.000022,0.180372,0.202981,0.003709,0.001845,0.116022
4,581,1113,0.128468,0.066399,0.098982,0.047022,3.425771,0.001009,0.026881,0.000468,...,7.189078e-05,0.005920,0.000048,0.006359,0.000585,0.052661,0.032168,0.000107,0.000107,0.013103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,1055,701,0.047236,0.112721,0.052490,0.002092,0.000913,0.393805,0.045220,0.013414,...,7.530558e-03,0.559967,0.154793,0.133587,0.000013,0.051041,0.113100,0.003147,0.004971,0.128265
2193,1225,862,0.070764,0.119310,0.193938,0.001239,0.003851,0.588956,0.037731,0.004072,...,3.219223e-02,0.667076,0.094247,0.136623,0.000009,0.050518,0.119685,0.027743,0.016413,0.287171
2194,765,1479,0.194491,0.048068,0.150061,0.002978,0.119206,0.000213,0.005332,0.000078,...,2.628421e-04,0.000273,0.000022,0.000027,0.000042,0.029386,0.019977,0.000039,0.000119,0.006778
2195,607,1525,0.002968,0.151899,0.015931,0.000071,1.354983,0.000152,0.019254,0.001267,...,1.377452e-03,0.003754,0.001420,0.088000,0.000536,0.073240,0.113631,0.009040,0.000090,0.025716


In [3]:
# Display spot table for Test slide (only the spot coordinates on 2D array)
with h5py.File("/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5", "r") as f:
    test_spots = f["spots/Test"]
    spot_array = np.array(test_spots['S_7'])
    test_spot_table = pd.DataFrame(spot_array)
    
# Show the test spots coordinates for slide 'S_7'
test_spot_table

Unnamed: 0,x,y,Test_Set
0,1499,1260,2
1,1435,1503,2
2,558,1082,2
3,736,1304,1
4,1257,1592,1
...,...,...,...
2083,736,639,2
2084,1016,684,2
2085,1181,839,2
2086,735,1436,1


In [4]:
hyper_params = {
    "patch_size": 224,
    "batch_size": 32,
    "lr": 0.001,
    "gamma": 0.1,
    "epochs": 12,
    "step": 7,
}

In [5]:
# Define image transformations for training, validation, and test sets
transform = {
    'train': transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    'test': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
}

# Custom dataset class to extract and process patches from HDF5 slides
class CustomDataset(Dataset):
    def __init__(self, data_path, slide_names, transform=None, is_train=True):
        self.transform = transform
        self.data = []
        self.targets = []
        self.patch_size = hyper_params["patch_size"]
        self.slide_names = slide_names
        self.is_train = is_train

        with h5py.File(data_path, "r") as f:
            images = f["images/Train"] if self.is_train else f["images/Test"]
            coords = f["spots/Train"] if self.is_train else f["spots/Test"]
            for slide_name in self.slide_names:
                slide = np.array(images[slide_name])
                spots = np.array(coords[slide_name])
                df = pd.DataFrame(spots)

                # Apply specific x/y shifts for alignment if needed
                x_shift, y_shift = 0, 0
                if slide_name == 'S_1':
                    x_shift, y_shift = 50, 60
                elif slide_name == 'S_2':
                    x_shift, y_shift = 95, 55
                df['x'] -= x_shift
                df['y'] -= y_shift

                # Extract patches and corresponding targets
                for _, row in df.iterrows():
                    x_center, y_center = int(row['x']), int(row['y'])
                    x0 = x_center - self.patch_size // 2
                    y0 = y_center - self.patch_size // 2
                    patch = slide[y0:y0 + self.patch_size, x0:x0 + self.patch_size, :]
                    self.data.append(patch)
                    self.targets.append(row[2:].values)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        target = self.targets[index]
        data = self.data[index]
        if self.transform:
            return (self.transform(data), target) if self.is_train else self.transform(data)
        else:
            return (data, target) if self.is_train else data

# Wrapper to apply transforms to dataset subsets (e.g., train/val split)
class TransformedSubset(Dataset):
    def __init__(self, subset, transform):
        self.subset = subset
        self.transform = transform

    def __getitem__(self, index):
        data, target = self.subset[index]
        if self.transform:
            data = self.transform(data)
        return data, target

    def __len__(self):
        return len(self.subset)

# Dataset paths and slide setup
data_path = "/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5"
batch_size = hyper_params["batch_size"]
train_slides = ['S_1', 'S_2', 'S_3', 'S_4', 'S_5']
val_slide = ['S_6']
test_slide = ['S_7']

# Load full dataset and split into train/val subsets
dataset = CustomDataset(data_path, slide_names=train_slides + val_slide, transform=None)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_subset, val_subset = random_split(dataset, [train_size, val_size])

# Wrap subsets with respective transforms
train_dataset = TransformedSubset(train_subset, transform['train'])
val_dataset = TransformedSubset(val_subset, transform['val'])
test_dataset = CustomDataset(data_path, slide_names=test_slide, is_train=False, transform=transform['test'])

# Create final dataset and dataloaders
dataset = {
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset
}
dataloaders = {
    x: DataLoader(dataset[x], batch_size=batch_size, shuffle=True)
    for x in ['train', 'val']
}
test_loader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=False)
dataset_sizes = {x: len(dataset[x]) for x in ['train', 'val', 'test']}

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")


Using cuda device


In [6]:
# Computes the average Spearman correlation coefficient row-wise
# between the true and predicted values
def rowwise_spearman(y_true, y_pred):
    scores = []
    for yt, yp in zip(y_true, y_pred):
        coef, _ = spearmanr(yt, yp)  # Compute Spearman correlation for each row
        scores.append(coef)
    return np.mean(scores)  # Return average correlation across all rows

In [7]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_score = 0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_score = 0.0
                # Iterate over data.
                for inputs, targets in tqdm(dataloaders[phase]):
                    inputs = inputs.to(device).float()
                    targets = targets.to(device).float()
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)
                        score = rowwise_spearman(outputs.cpu().detach().numpy(), targets.cpu().detach().numpy())
                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_score += score * inputs.size(0)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_score = running_score / dataset_sizes[phase]
                print(f'{phase} Loss: {epoch_loss:.4f} Score: {epoch_score:.4f}')
                
                # deep copy the model
                if phase == 'val' and epoch_score > best_score:
                    best_score = epoch_score
                    torch.save(model.state_dict(), best_model_params_path)

            print()
            
        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Score: {best_score:4f}')
        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path, weights_only=True))
    return model

In [8]:
# Custom transfer learning model using a pretrained CNN as a frozen feature extractor
class TransferModel(nn.Module):
    def __init__(self, pretrained):
        super(TransferModel, self).__init__()
        
        # Extract all layers except the classification head from the pretrained model
        self.feature_extractor = nn.Sequential(*list(pretrained.children())[:-1])
        
        # Custom head: reduces channels, flattens, then maps to 35 output features
        self.custom_layers = nn.Sequential(
            nn.Conv2d(1792, 512, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=1),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Flatten(),
            nn.Linear(256, 35)
        )

        # Freeze pretrained model parameters
        for param in self.feature_extractor.parameters():
            param.requires_grad = False

        # Enable training only on the custom head
        for param in self.custom_layers.parameters():
            param.requires_grad = True

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.custom_layers(x)
        return x


In [9]:
# Set random seed for reproducibility
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed = 123
seed_everything(seed)

# Ensure deterministic behavior in cuDNN
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [10]:
# Set up training parameters and model
epochs = hyper_params["epochs"]
lr = hyper_params["lr"]
gamma = hyper_params["gamma"]
step = hyper_params["step"]

# Load EfficientNet-B4 pretrained weights and initialize custom model
pretrained = models.efficientnet_b4(pretrained=True)
model = TransferModel(pretrained)
model.to(device)

# Define loss function, optimizer, and learning rate scheduler
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=step, gamma=gamma)

# Train the model
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=epochs)

Downloading: "https://download.pytorch.org/models/efficientnet_b4_rwightman-23ab8bcd.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_rwightman-23ab8bcd.pth
100%|██████████| 74.5M/74.5M [00:00<00:00, 202MB/s]


Epoch 0/11
----------


100%|██████████| 209/209 [01:20<00:00,  2.59it/s]


train Loss: 0.2230 Score: 0.5676


100%|██████████| 53/53 [00:08<00:00,  6.34it/s]


val Loss: 0.2016 Score: 0.7342

Epoch 1/11
----------


100%|██████████| 209/209 [01:20<00:00,  2.59it/s]


train Loss: 0.1871 Score: 0.6719


100%|██████████| 53/53 [00:08<00:00,  6.08it/s]


val Loss: 0.1830 Score: 0.7186

Epoch 2/11
----------


100%|██████████| 209/209 [01:21<00:00,  2.56it/s]


train Loss: 0.1787 Score: 0.6931


100%|██████████| 53/53 [00:08<00:00,  5.98it/s]


val Loss: 0.1776 Score: 0.7381

Epoch 3/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.54it/s]


train Loss: 0.1752 Score: 0.7015


100%|██████████| 53/53 [00:08<00:00,  5.96it/s]


val Loss: 0.1741 Score: 0.7536

Epoch 4/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.55it/s]


train Loss: 0.1716 Score: 0.7058


100%|██████████| 53/53 [00:08<00:00,  5.95it/s]


val Loss: 0.1705 Score: 0.7728

Epoch 5/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.54it/s]


train Loss: 0.1676 Score: 0.7114


100%|██████████| 53/53 [00:08<00:00,  6.03it/s]


val Loss: 0.1725 Score: 0.7457

Epoch 6/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.53it/s]


train Loss: 0.1678 Score: 0.7105


100%|██████████| 53/53 [00:08<00:00,  5.97it/s]


val Loss: 0.1638 Score: 0.7632

Epoch 7/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.52it/s]


train Loss: 0.1603 Score: 0.7457


100%|██████████| 53/53 [00:08<00:00,  5.95it/s]


val Loss: 0.1602 Score: 0.7852

Epoch 8/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.52it/s]


train Loss: 0.1618 Score: 0.7605


100%|██████████| 53/53 [00:09<00:00,  5.88it/s]


val Loss: 0.1634 Score: 0.7836

Epoch 9/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.52it/s]


train Loss: 0.1584 Score: 0.7659


100%|██████████| 53/53 [00:08<00:00,  5.94it/s]


val Loss: 0.1641 Score: 0.7793

Epoch 10/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.53it/s]


train Loss: 0.1582 Score: 0.7654


100%|██████████| 53/53 [00:08<00:00,  5.97it/s]


val Loss: 0.1543 Score: 0.7893

Epoch 11/11
----------


100%|██████████| 209/209 [01:22<00:00,  2.52it/s]


train Loss: 0.1583 Score: 0.7669


100%|██████████| 53/53 [00:08<00:00,  5.99it/s]


val Loss: 0.1568 Score: 0.7875

Training complete in 18m 14s
Best val Score: 0.789335


In [11]:
# Run model in evaluation mode on test set and collect predictions
was_training = model.training  # Save current training state
model.eval()  # Switch to evaluation mode (disables dropout, etc.)

preds = []
with torch.no_grad():  # Disable gradient computation for inference
    for i, inputs in enumerate(test_loader):
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds.append(outputs.cpu().numpy())  # Move predictions to CPU

model.train(mode=was_training)  # Restore original training state

# Concatenate predictions from all batches
preds = np.concatenate(preds)

In [12]:
# Create a random submission
# (predictions of cell type abundances for 35 classes across the Test slide spots;
# spot order should be same as in the 'Test' spots table)

# Use the cell type columns from the train spots table; assuming first two columns are (x, y)
cell_type_columns = train_spot_tables['S_1'].columns[2:].values  # Expecting 35 cell types here
indices = test_spot_table.index.values  # All spots on the Test slide

# Create a 2D array of random floats between 0 and 2 for each spot and cell type
#prediction_matrix = 2 * np.random.rand(len(indices), len(cell_type_columns))
predicted_labels = pd.DataFrame(preds, columns=cell_type_columns, index=indices)

predicted_labels.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35
0,0.032836,0.039854,0.025462,0.000133,0.018642,0.000938,0.005183,0.000397,0.002795,-0.000126,...,0.000862,0.000598,0.000448,-2e-06,0.00069,0.024209,0.017196,0.000879,0.000366,0.007307
1,0.035431,0.06509,0.029693,0.0001,0.095727,0.000786,0.010287,0.003091,0.002515,0.000261,...,0.010768,0.000474,0.001566,0.000865,0.041818,0.038171,0.027283,0.004539,0.001431,0.012592
2,0.042189,0.081278,0.041745,0.008947,1.76537,0.000954,0.033754,0.002177,0.001763,-0.000435,...,0.00103,0.000843,0.000871,0.036692,0.000764,0.060153,0.037954,0.002204,0.000492,0.019852
3,0.175396,0.06432,0.127633,0.059151,0.641923,0.000618,0.018592,0.001472,0.002209,-0.000177,...,0.001222,0.000479,0.000555,0.023381,0.001633,0.04602,0.032016,0.002823,0.001817,0.015404
4,0.449737,0.041152,0.3201,0.223742,0.265361,0.000702,0.010742,0.000465,0.001706,-0.000375,...,0.00051,0.00504,0.000256,0.00024,0.000446,0.03383,0.023943,-6.9e-05,0.002304,0.013969


In [13]:
# Prepare submission DataFrame: spot_id column and then predictions for each cell type
submission_df = predicted_labels.copy()
submission_df.insert(0, 'ID', submission_df.index)

# Save the submission file as submission.csv
submission_df.to_csv("./submission.csv", index=False)
print("Submission file 'submission.csv' created!")

Submission file 'submission.csv' created!
