In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5


## Elucidata competition

This is my main notebook for the Elucidata Competition.

Other notebooks:
- [EDA Plotting Cell Type Distribution by Slice](http://https://www.kaggle.com/code/dalloliogm/eda-plotting-cell-distribution-by-slice/notebook)
- [EDA Exploring Cell Type Abundance](https://www.kaggle.com/code/dalloliogm/eda-exploring-cell-type-abundance)

In [2]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RANSACRegressor
from scipy.spatial import KDTree
from scipy.stats import spearmanr
from tqdm import tqdm

class ElucidataPipeline:
    def __init__(self, h5_file_path, test_slide='S_7', submission_path='submission.csv', test_size=0.02, random_state=2024):
        self.h5_file_path = h5_file_path
        self.test_slide = test_slide
        self.submission_path = submission_path
        self.test_size = test_size
        self.random_state = random_state

        # Data holders
        self.train_df = None
        self.test_df = None
        self.smoothed_ranks = None

        # Model I/O
        self.X = None
        self.y = None
        self.X_train = None
        self.X_valid = None
        self.y_train = None
        self.y_valid = None
        self.models = {}
        self.predictions = None

    def load_train_data(self):
        print("Loading training data...")
        with h5py.File(self.h5_file_path, "r") as f:
            train_spots = f["spots/Train"]
            train_spot_tables = {
                slide_name: pd.DataFrame(np.array(train_spots[slide_name]))
                for slide_name in train_spots.keys()
            }
        self.train_df = pd.concat(train_spot_tables.values(), ignore_index=True)
        print(f"Training data loaded. Shape: {self.train_df.shape}")

    def prepare_data(self):
        print("Preparing training data...")
        self.X = self.train_df[['x', 'y']]
        self.y = self.train_df.iloc[:, 2:]  # C1 to C35
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )
        print("Data split into training and validation sets.")

    def define_models(self):
        print("Defining models...")
        self.models = {
            "RANSACRegressor": RANSACRegressor()
        }
        print("Models defined:", list(self.models.keys()))

    def train_models(self):
        print("Training models...")
        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
        print("Model training complete.")

    def validate_models(self):
        print("Validating models on the validation set...")
        preds_valid = {}
        for name, model in self.models.items():
            print(f"Predicting with {name} on validation data...")
            preds_valid[name] = model.predict(self.X_valid)
        return preds_valid

    def load_test_data(self):
        print("Loading test data...")
        with h5py.File(self.h5_file_path, "r") as f:
            test_spots = f["spots/Test"]
            self.test_df = pd.DataFrame(np.array(test_spots[self.test_slide]))
        print(f"Test data loaded. Shape: {self.test_df.shape}")

    def predict_test(self):
        print("Predicting on test data...")
        X_test = self.test_df[['x', 'y']]
        test_preds = np.zeros((X_test.shape[0], self.y.shape[1]))
        for name, model in self.models.items():
            print(f"Predicting with {name}...")
            test_preds += model.predict(X_test)
        test_preds /= len(self.models)
        self.predictions = test_preds
        print("Test predictions complete.")

    def create_submission(self):
        print("Creating submission file...")
        submission_df = pd.DataFrame(self.predictions, columns=self.y.columns)
        submission_df.insert(0, 'ID', self.test_df.index)
        submission_df.to_csv(self.submission_path, index=False)
        print(f"Submission file '{self.submission_path}' created!")

    def compute_smoothed_ranks(self, radius=100):
        print("Computing smoothed ranks from training data...")
        df = self.train_df.copy()
        cell_types = [f"C{i+1}" for i in range(35)]
        df.columns.values[2:] = cell_types  # Rename columns to C1–C35
        long_df = df.melt(id_vars=["x", "y"], var_name="cell_type", value_name="abundance")
        long_df["rank"] = long_df.groupby(["x", "y"])["abundance"].rank(method="dense", ascending=False)

        coords = long_df[["x", "y"]].drop_duplicates().values
        tree = KDTree(coords)

        smoothed = []
        for i, (x, y) in enumerate(tqdm(coords, desc="Smoothing ranks")):
            idx = tree.query_ball_point([x, y], r=radius)
            spot_neighbors = long_df.set_index(["x", "y"]).loc[[tuple(coords[j]) for j in idx]]
            avg_ranks = spot_neighbors.groupby("cell_type")["rank"].mean()
            smoothed.append(avg_ranks)

        smoothed_df = pd.DataFrame(smoothed).reset_index(drop=True)
        smoothed_df["x"] = coords[:, 0]
        smoothed_df["y"] = coords[:, 1]
        self.smoothed_ranks = smoothed_df
        print("Smoothed ranks computed.")

    def score_submission_feasibility(self, radius=100, alpha=0.7):
        print("Scoring submission for biological/spatial plausibility...")
        test_coords = self.test_df[['x', 'y']].to_numpy()
        train_coords = self.smoothed_ranks[["x", "y"]].to_numpy()
        tree = KDTree(train_coords)

        cell_cols = [f"C{i+1}" for i in range(35)]
        avg_global_rank = self.smoothed_ranks[cell_cols].mean().to_numpy()

        submission_df = pd.DataFrame(self.predictions, columns=cell_cols)
        scores = []

        for i, (x, y) in enumerate(test_coords):
            pred = submission_df.iloc[i].to_numpy()
            neighbor_idx = tree.query_ball_point([x, y], r=radius)
            local_sim = None
            if neighbor_idx:
                neighbors = self.smoothed_ranks.iloc[neighbor_idx][cell_cols].mean().to_numpy()
                local_sim, _ = spearmanr(pred, neighbors)
            global_sim, _ = spearmanr(pred, avg_global_rank)

            if local_sim is not None:
                score = alpha * local_sim + (1 - alpha) * global_sim
            else:
                score = global_sim
            scores.append(score)

        final_score = np.nanmean(scores)
        print(f"Feasibility Score: {final_score:.4f}")
        return final_score

    def run_pipeline(self):
        self.load_train_data()
        self.prepare_data()
        self.define_models()
        self.train_models()
        _ = self.validate_models()
        self.load_test_data()
        self.predict_test()
        self.create_submission()


In [3]:
pipeline = ElucidataPipeline("/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5")
pipeline.run_pipeline()

# Optional: compute and score submission
pipeline.compute_smoothed_ranks(radius=100)
pipeline.score_submission_feasibility()


Loading training data...
Training data loaded. Shape: (8349, 37)
Preparing training data...
Data split into training and validation sets.
Defining models...
Models defined: ['RANSACRegressor']
Training models...
Training RANSACRegressor...
Model training complete.
Validating models on the validation set...
Predicting with RANSACRegressor on validation data...
Loading test data...
Test data loaded. Shape: (2088, 3)
Predicting on test data...
Predicting with RANSACRegressor...
Test predictions complete.
Creating submission file...
Submission file 'submission.csv' created!
Computing smoothed ranks from training data...


Smoothing ranks: 100%|██████████| 8341/8341 [09:14<00:00, 15.04it/s]


Smoothed ranks computed.
Scoring submission for biological/spatial plausibility...
Feasibility Score: -0.2239


-0.22391028778107622