<a href="https://www.kaggle.com/code/dalloliogm/submission-ensemble?scriptVersionId=240054101" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Ensemble 

This notebook ensembles the best submissions from public notebooks. These have been manually downloaded and uploaded to the elucidata-submissions dataset.

Enjoy and good luck!

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def _align_to_reference(pred, ids, ref_ids, ref_columns, fname):
    """Reorder one submission's columns and rows to match the reference submission.

    Args:
        pred (pd.DataFrame): Prediction columns of the submission (no "ID" column).
        ids (pd.Series): The "ID" column of the same submission, in file order.
        ref_ids (pd.Series): The "ID" column of the reference submission.
        ref_columns (list): Prediction column names of the reference submission.
        fname (str): File name, used only in error messages.

    Returns:
        pd.DataFrame: ``pred`` with columns in ``ref_columns`` order and rows in
        ``ref_ids`` order.

    Raises:
        ValueError: If the submission does not contain exactly the same columns
            and IDs as the reference, or its IDs are ambiguous (duplicated).
    """
    if list(pred.columns) != ref_columns:
        if len(pred.columns) != len(ref_columns) or set(pred.columns) != set(ref_columns):
            raise ValueError(f"{fname}: prediction columns differ from the first submission")
        pred = pred[ref_columns]

    if len(ids) != len(ref_ids):
        raise ValueError(f"{fname}: number of rows differs from the first submission")

    if not np.array_equal(ids.to_numpy(), ref_ids.to_numpy()):
        # Same rows in a different order: look up where each reference ID lives.
        if not ids.is_unique:
            raise ValueError(f"{fname}: duplicated IDs, cannot align rows")
        order = pd.Index(ids).get_indexer(pd.Index(ref_ids))
        if (order < 0).any():
            raise ValueError(f"{fname}: IDs differ from the first submission")
        pred = pred.iloc[order]

    return pred


def _load_submissions(submission_folder, use_rank, rank_axis):
    """Load every ``submission_<digits>.csv`` in a folder as aligned arrays.

    Returns:
        tuple: ``(names, scores, predictions, ids, columns)`` where ``names`` is the
        list of file names, ``scores`` the score parsed from each name,
        ``predictions`` a list of 2-D arrays (one per file, all row/column aligned
        to the first file), ``ids`` the "ID" values of the first file and
        ``columns`` its prediction column names.

    Raises:
        ValueError: If no matching file is found or a file cannot be aligned.
    """
    pattern = re.compile(r"submission_(\d+)\.csv")
    names, scores, predictions = [], [], []
    ref_ids = None
    ref_columns = None

    # Sorted so that model order (tie-breaking, diagnostics columns) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for fname in sorted(os.listdir(submission_folder)):
        # fullmatch: ignore names with trailing junk such as "submission_1.csv.bak".
        match = pattern.fullmatch(fname)
        if not match:
            continue

        # The digits in the file name are read as the decimals of a score in [0, 1).
        # NOTE(review): leading zeros are dropped, so "0612" and "612" both give 0.612
        # and "059" gives 0.59 -- confirm this matches the naming convention used.
        score = float("0." + match.group(1).lstrip("0"))

        df = pd.read_csv(os.path.join(submission_folder, fname))
        ids = df["ID"]
        pred = df[[col for col in df.columns if col != "ID"]]

        if use_rank:
            axis = 0 if rank_axis == "columns" else 1
            pred = pred.rank(axis=axis, method="average")

        if ref_ids is None:
            ref_ids, ref_columns = ids, list(pred.columns)
        else:
            pred = _align_to_reference(pred, ids, ref_ids, ref_columns, fname)

        names.append(fname)
        scores.append(score)
        predictions.append(pred.to_numpy())

    if not predictions:
        raise ValueError(f"No submission_<digits>.csv files found in {submission_folder!r}")

    return names, scores, predictions, ref_ids.to_numpy(), ref_columns


def ensemble_submissions(
    submission_folder,
    use_rank=False,
    rank_axis="columns",  # "columns" or "rows"
    output_prefix="",
    power=3,
    top_k=3,
    blend_alpha=0.8,
    score_threshold=0.3,
):
    """
    Generate multiple ensemble strategies from Kaggle submission files.

    Each ``submission_<digits>.csv`` file in ``submission_folder`` is one model; the
    digits in its name are parsed as its score and used as its ensemble weight.
    All submissions are aligned (by "ID" and column name) to the first file in
    sorted order before being combined. Every ensemble is written as a CSV in the
    same layout as the inputs, plus two diagnostics CSVs.

    Args:
        submission_folder (str): Path to folder containing submission_*.csv files.
        use_rank (bool): If True, rank predictions before ensembling (for Spearman-based tasks).
        rank_axis (str): 'columns' (rank each column) or 'rows' (rank each row).
        output_prefix (str): Optional prefix for output files.
        power (float): Exponent applied to the normalised scores for the
            power-weighted average.
        top_k (int): Number of best-scoring submissions used by the top-k average.
        blend_alpha (float): Weight of the best submission in the
            "best + average of others" blend.
        score_threshold (float): Minimum raw score a submission needs to enter the
            threshold ensemble.

    Raises:
        ValueError: If no submission is found, submissions cannot be aligned,
            all parsed scores are zero, or ``top_k`` is smaller than 1.
    """
    assert rank_axis in {"columns", "rows"}, "rank_axis must be 'columns' or 'rows'"
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # === Step 1: Load submissions and scores ===
    submission_names, scores, predictions, ids, columns = _load_submissions(
        submission_folder, use_rank, rank_axis
    )

    # === Step 2: Stack data ===
    pred_array = np.stack(predictions, axis=-1)  # shape: (N, C, M)
    scores = np.asarray(scores, dtype=float)
    if scores.sum() <= 0:
        raise ValueError("All parsed scores are zero; cannot derive ensemble weights")
    weights = scores / scores.sum()
    _, n_targets, n_models = pred_array.shape

    # Suffix for rank info
    rank_suffix = f"_ranked_{rank_axis}" if use_rank else "_raw"

    def save(values, name):
        """Write one ensemble to '<prefix><name><rank_suffix>.csv' with the ID column first."""
        df_out = pd.DataFrame(values, columns=columns)
        df_out.insert(0, "ID", ids)
        df_out.to_csv(f"{output_prefix}{name}{rank_suffix}.csv", index=False)

    # === 1. Power Weighted Average (global) ===
    w_pow = weights**power
    w_pow /= w_pow.sum()
    ensemble_power = np.average(pred_array, axis=-1, weights=w_pow)
    save(ensemble_power, "ensemble_power_weighted")

    # === 2. Top-k Averaging ===
    # A slice longer than the array simply selects every model.
    top_k_idx = np.argsort(weights)[-top_k:]
    top_k_weights = weights[top_k_idx] / weights[top_k_idx].sum()
    ensemble_top_k = np.average(pred_array[:, :, top_k_idx], axis=-1, weights=top_k_weights)
    save(ensemble_top_k, "ensemble_top_k")

    # === 3. Blend Best + Average of Others ===
    best_idx = np.argmax(weights)
    best_model = pred_array[:, :, best_idx]
    if n_models > 1:
        others_avg = np.delete(pred_array, best_idx, axis=2).mean(axis=2)
        ensemble_blend = blend_alpha * best_model + (1 - blend_alpha) * others_avg
    else:
        # No "others" to average: the mean of an empty slice would be NaN.
        ensemble_blend = best_model.copy()
    save(ensemble_blend, "ensemble_blend_best_avg")

    # === 4. Threshold-Based Ensemble ===
    # Compare the RAW scores with the threshold. The normalised weights sum to 1,
    # so with several models none of them would ever reach a fixed cut-off.
    mask = scores >= score_threshold
    if mask.any():
        selected_weights = scores[mask] / scores[mask].sum()
        ensemble_thresh = np.average(pred_array[:, :, mask], axis=-1, weights=selected_weights)
        save(ensemble_thresh, f"ensemble_threshold_{score_threshold}")

    # === 5. Residual-Based Per-Feature Weighting ===
    # Per target, weight each model by the inverse of its mean squared distance
    # to the power-weighted ensemble (closer to the consensus => larger weight).
    baseline = ensemble_power
    errors = ((pred_array - baseline[:, :, None]) ** 2).mean(axis=0)  # shape: (C, M)
    weights_per_target = 1 / (errors + 1e-6)  # epsilon avoids division by zero
    weights_per_target /= weights_per_target.sum(axis=1, keepdims=True)
    ensemble_residual = np.einsum("ncm,cm->nc", pred_array, weights_per_target)
    save(ensemble_residual, "ensemble_residual_weighted")

    # === 6. Save diagnostics ===
    pd.DataFrame(errors, index=columns, columns=submission_names).to_csv(
        f"{output_prefix}model_target_errors{rank_suffix}.csv"
    )
    pd.DataFrame(weights_per_target, index=columns, columns=submission_names).to_csv(
        f"{output_prefix}residual_weights{rank_suffix}.csv"
    )

    print("✅ Ensembles saved using " + ("ranked " if use_rank else "raw ") + f"predictions (axis={rank_axis})")


In [2]:
# Ensemble the raw predicted values (no rank transform).
ensemble_submissions(
    submission_folder="/kaggle/input/elucidata-submissions",
    use_rank=False,
)



✅ Ensembles saved using raw predictions (axis=columns)


In [3]:
# Rank-based ensembling: each prediction column is ranked on its own
# (values are ranked down the rows of a column) before the submissions are combined.
ensemble_submissions(
    submission_folder="/kaggle/input/elucidata-submissions",
    use_rank=True,
    rank_axis="columns",
)



✅ Ensembles saved using ranked predictions (axis=columns)


In [4]:
# Rank-based ensembling: each row (sample) is ranked on its own
# (values are ranked across the prediction columns of that row) before combining.
ensemble_submissions(
    submission_folder="/kaggle/input/elucidata-submissions",
    use_rank=True,
    rank_axis="rows",
)


✅ Ensembles saved using ranked predictions (axis=rows)
