In [1]:
import h5py
import numpy as np
import cudf
import cupy as cp
import pandas as pd  # Used only for CSV export if desired
import matplotlib.pyplot as plt

from cuml.preprocessing import StandardScaler
from cuml.ensemble import RandomForestRegressor

class GPUMultiOutputRegressor:
    """
    A simple multi-output regressor that fits one independent estimator
    (e.g. a cuML RandomForestRegressor) per target column.

    Parameters
    ----------
    estimator : callable
        Estimator class or factory. It is called as
        ``estimator(n_estimators=..., random_state=..., **estimator_kwargs)``
        and the returned object must provide ``fit(X, y)`` and ``predict(X)``.
    n_estimators : int, default 100
        Forwarded to every estimator instance.
    random_state : int, default 42
        Forwarded to every estimator instance.
    **estimator_kwargs
        Any additional keyword arguments, forwarded unchanged to every
        estimator instance.

    Attributes
    ----------
    models_ : dict
        Maps each target column name to its fitted estimator. Rebuilt from
        scratch on every call to ``fit``.
    """
    def __init__(self, estimator, n_estimators=100, random_state=42, **estimator_kwargs):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimator_kwargs = estimator_kwargs
        self.models_ = {}  # Dictionary to store one model per target

    def fit(self, X, Y):
        """
        Fit one estimator per column of ``Y``.

        ``Y`` must expose ``.columns`` and column access via ``Y[col]``
        (a cuDF DataFrame in this notebook). Returns ``self``.
        """
        # Build into a fresh dict so models from a previous fit (possibly on a
        # different set of targets) never leak into later predictions, and so
        # a failure part-way through leaves the previous state untouched.
        fitted = {}
        for col in Y.columns:
            model = self.estimator(
                n_estimators=self.n_estimators,
                random_state=self.random_state,
                **self.estimator_kwargs,
            )
            model.fit(X, Y[col])
            fitted[col] = model
        self.models_ = fitted
        return self

    def predict(self, X):
        """
        Predict every fitted target for ``X``.

        Returns a cuDF DataFrame with one column per target, in the order the
        targets were fitted.

        Raises
        ------
        RuntimeError
            If no model has been fitted yet.
        """
        if not self.models_:
            raise RuntimeError("GPUMultiOutputRegressor is not fitted; call fit() before predict().")
        preds = {col: model.predict(X) for col, model in self.models_.items()}
        return cudf.DataFrame(preds)

class CellTypeGPUPipeline:
    """
    A GPU-accelerated pipeline for cell type prediction using RAPIDS.

    The pipeline reads spot tables from an HDF5 file (groups "spots/Train" and
    "spots/Test"), derives features from the spot coordinates, scales them with
    a cuML StandardScaler and fits one regressor per target column.

    This version includes feature engineering:
      - It augments the original features ("x" and "y") with:
          * x2: x^2
          * y2: y^2
          * xy: x * y
          * dist: sqrt(x^2 + y^2)
    """
    # Names of the raw coordinate columns used as model inputs; every other
    # column of a training table is treated as a target.
    FEATURE_COLS = ("x", "y")

    def __init__(self, h5_file_path):
        self.h5_file_path = h5_file_path
        self.train_spot_tables = {}  # slide name -> cuDF DataFrame
        self.cell_type_columns = None  # set by prepare_training_set()
        self.scaler = StandardScaler()  # cuML StandardScaler
        self.model = None  # To be set after training

    @staticmethod
    def _read_spot_table(dataset):
        """Convert one HDF5 spot dataset into a cuDF DataFrame (via NumPy and pandas)."""
        # np.array() copies the dataset into host memory, so the result remains
        # usable after the HDF5 file has been closed.
        return cudf.DataFrame.from_pandas(pd.DataFrame(np.array(dataset)))

    def load_train_data(self):
        """
        Loads training spot data from the H5 file and converts each slide to a cuDF DataFrame.

        Every dataset under "spots/Train" is stored in ``self.train_spot_tables``
        keyed by its slide name.
        """
        with h5py.File(self.h5_file_path, "r") as f:
            train_spots = f["spots/Train"]
            for slide_name in train_spots.keys():
                self.train_spot_tables[slide_name] = self._read_spot_table(train_spots[slide_name])
        print("Training data loaded (GPU version) successfully.")

    def engineer_features(self, X):
        """
        Applies feature engineering on the DataFrame X containing 'x' and 'y' columns.

        Returns a copy of X (the input is not modified) with these columns added:
            - x2: x^2
            - y2: y^2
            - xy: x * y
            - dist: sqrt(x^2 + y^2)
        """
        X = X.copy()
        X['x2'] = X['x'] ** 2
        X['y2'] = X['y'] ** 2
        X['xy'] = X['x'] * X['y']
        # Euclidean distance from the origin; reuses the squares computed above.
        X['dist'] = (X['x2'] + X['y2']) ** 0.5
        return X

    def prepare_training_set(self, slide_id='S_1'):
        """
        Prepares training features and targets from a given slide.

        The columns named 'x' and 'y' are the raw features; every other column
        is treated as a target (cell type abundance) and recorded in
        ``self.cell_type_columns``. Both parts are cast to float32 and the
        features are augmented with ``engineer_features``.

        Returns
        -------
        (X, y) : the engineered feature frame and the target frame.

        Raises
        ------
        ValueError
            If ``slide_id`` has not been loaded by ``load_train_data``.
        """
        if slide_id not in self.train_spot_tables:
            raise ValueError(f"Slide {slide_id} not found in training data.")
        df = self.train_spot_tables[slide_id]
        feature_cols = list(self.FEATURE_COLS)
        target_cols = [col for col in df.columns if col not in feature_cols]
        self.cell_type_columns = target_cols
        # Convert features and targets to float32 for GPU ops.
        X = df[feature_cols].astype('float32')
        y = df[target_cols].astype('float32')
        X = self.engineer_features(X)
        return X, y

    def load_test_data(self, slide_id):
        """
        Loads test spot data for a given slide and returns a cuDF DataFrame.

        Raises
        ------
        ValueError
            If ``slide_id`` is not present under "spots/Test".
        """
        with h5py.File(self.h5_file_path, "r") as f:
            test_spots = f["spots/Test"]
            if slide_id not in test_spots:
                raise ValueError(f"Slide {slide_id} not found in test data.")
            test_df = self._read_spot_table(test_spots[slide_id])
        print(f"Test data for slide {slide_id} loaded (GPU version) successfully.")
        return test_df

    def prepare_test_set(self, test_df):
        """
        Prepares test features by selecting coordinate columns and applying feature engineering.
        """
        X = test_df[list(self.FEATURE_COLS)].astype('float32')
        return self.engineer_features(X)

    def train(self, X_train, y_train):
        """
        Scales training features and trains a multi-output model on GPU.

        Fits ``self.scaler`` on ``X_train``, trains one cuML
        RandomForestRegressor per column of ``y_train`` and stores the result
        in ``self.model``, which is also returned.
        """
        # Fit the scaler and transform X_train on GPU.
        X_train_scaled = self.scaler.fit_transform(X_train)
        # Initialize and train a multi-output regressor using cuML's RandomForestRegressor.
        gpu_multioutput = GPUMultiOutputRegressor(RandomForestRegressor)
        gpu_multioutput.fit(X_train_scaled, y_train)
        self.model = gpu_multioutput
        print("GPU model training complete.")
        return self.model

    def predict(self, X_test):
        """
        Scales test features and predicts cell type abundances.

        Raises
        ------
        RuntimeError
            If ``train`` has not been called yet.
        """
        # Fail early with a clear message instead of an AttributeError on None.
        if self.model is None:
            raise RuntimeError("Model has not been trained; call train() before predict().")
        X_test_scaled = self.scaler.transform(X_test)
        return self.model.predict(X_test_scaled)

    def create_submission(self, test_df, predictions, submission_filename="submission.csv"):
        """
        Creates a submission CSV file. Converts GPU DataFrames to pandas before export.

        The file contains an 'ID' column taken from ``test_df.index`` followed
        by the prediction columns. ``predictions`` itself is left unmodified.
        """
        # Work on a copy: insert() modifies the frame in place, so operating on
        # `predictions` directly would change the caller's object and make a
        # second call fail because the 'ID' column already exists.
        pred_df = predictions.copy()
        pred_df.insert(0, 'ID', test_df.index)
        submission_pdf = pred_df.to_pandas()
        submission_pdf.to_csv(submission_filename, index=False)
        print(f"Submission file '{submission_filename}' created!")


In [2]:

# Example usage: train on one slide, predict another, and write a submission file.
if __name__ == "__main__":
    # Run configuration, kept in one place so paths and slides are easy to change.
    h5_file_path = "/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5"
    TRAIN_SLIDE_ID = "S_1"
    TEST_SLIDE_ID = "S_7"
    SUBMISSION_FILENAME = "submission.csv"

    # Initialize the pipeline.
    pipeline = CellTypeGPUPipeline(h5_file_path)

    # Load every training slide, then build the training set from a single slide
    # (with feature engineering applied).
    pipeline.load_train_data()
    X_train, y_train = pipeline.prepare_training_set(slide_id=TRAIN_SLIDE_ID)

    # Train the model on that slide only; the other loaded slides are not used here.
    pipeline.train(X_train, y_train)

    # Load the test slide, prepare it with the same feature engineering, and predict.
    test_df = pipeline.load_test_data(slide_id=TEST_SLIDE_ID)
    X_test = pipeline.prepare_test_set(test_df)
    predictions = pipeline.predict(X_test)

    # Sanity check: the submission needs exactly one prediction row per test spot.
    if len(predictions) != len(test_df):
        raise ValueError(
            f"Expected {len(test_df)} prediction rows, got {len(predictions)}."
        )

    # Create and save the submission CSV file.
    pipeline.create_submission(test_df, predictions, submission_filename=SUBMISSION_FILENAME)


Training data loaded (GPU version) successfully.


  return init_func(self, *args, **kwargs)


GPU model training complete.
Test data for slide S_7 loaded (GPU version) successfully.
Submission file 'submission.csv' created!
