# In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances


def load_submissions_and_scores(folder_path):
    """Load every scored submission CSV found directly inside *folder_path*.

    A file is considered only if its name ends in ``.csv``. Its score is the
    text after the last underscore of the name (with the ``.csv`` extension
    removed), parsed as a float -- e.g. ``model_0.412.csv`` -> ``0.412``.
    Files whose score cannot be parsed, or that pandas cannot read, are
    reported on stdout and skipped.

    Files are visited in sorted name order so that the returned lists are
    deterministic; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A tuple ``(submissions, scores, filenames)`` of three parallel lists:
        the loaded DataFrames, their float scores, and the file names.
    """
    submissions = []
    scores = []
    filenames = []

    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".csv"):
            continue
        try:
            # splitext removes exactly the extension; str.rstrip(".csv")
            # would instead strip any trailing run of '.', 'c', 's', 'v'.
            stem = os.path.splitext(fname)[0]
            score = float(stem.split("_")[-1])
            df = pd.read_csv(os.path.join(folder_path, fname))
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the load.
            print(f"Skipping {fname}: {e}")
            continue
        submissions.append(df)
        scores.append(score)
        filenames.append(fname)
    return submissions, scores, filenames


def ensemble_best5_diverse(submissions, scores):
    """Build one 5-structure submission from mutually distant candidates.

    Every submission contributes its five structures (columns ``x_k``,
    ``y_k``, ``z_k`` for ``k`` in 1..5) to a common candidate pool. Selection
    is greedy: it starts from structure 1 of the highest-scoring submission
    and then repeatedly adds the candidate whose smallest Euclidean distance
    to the already selected structures is largest. Distances are taken
    between the flattened raw coordinates; no alignment is performed.

    Args:
        submissions: List of DataFrames holding the columns ``ID``,
            ``resname``, ``resid`` and ``x_1``..``z_5``. All of them must
            have the same number of rows.
        scores: One score per submission, in the same order; higher is
            treated as better.

    Returns:
        A new DataFrame with the ``ID``/``resname``/``resid`` columns copied
        from ``submissions[0]`` followed by the selected structures written
        to ``x_1, y_1, z_1, ..., x_5, y_5, z_5`` in selection order.

    Raises:
        ValueError: If there are no submissions, ``scores`` does not match
            ``submissions`` in length, row counts differ between
            submissions, or any coordinate is NaN/infinite.
    """
    meta_cols = ['ID', 'resname', 'resid']
    n_models = 5  # structures read per submission and written to the output

    if len(submissions) == 0:
        raise ValueError("No submissions to ensemble.")
    if len(scores) != len(submissions):
        raise ValueError(
            f"Got {len(scores)} scores for {len(submissions)} submissions."
        )

    # Candidate pool; structure k of submission s sits at s * n_models + (k - 1).
    all_structures = [
        df[[f"x_{k}", f"y_{k}", f"z_{k}"]].values  # shape: (residues, 3)
        for df in submissions
        for k in range(1, n_models + 1)
    ]
    if len({coords.shape for coords in all_structures}) != 1:
        raise ValueError("All submissions must have the same number of rows.")

    # One flattened (residues * 3) row per candidate for distance comparison.
    flat = np.stack([coords.reshape(-1) for coords in all_structures]).astype(float)
    if not np.isfinite(flat).all():
        raise ValueError("Submissions contain NaN or infinite coordinates.")

    # Seed with the first structure of the best-scoring submission.
    first_structure_idx = int(np.argmax(scores)) * n_models
    selected_idx = [first_structure_idx]

    # min_dist[i] is the distance from candidate i to its nearest selected
    # structure. Only distances to the newest pick are ever needed, so the
    # full pairwise matrix is never built. Selected entries are pinned to
    # -inf so they cannot be picked again, even when duplicates exist.
    min_dist = np.linalg.norm(flat - flat[first_structure_idx], axis=1)
    min_dist[first_structure_idx] = -np.inf

    # Greedily pick the most dissimilar remaining structure.
    while len(selected_idx) < n_models:
        next_idx = int(np.argmax(min_dist))
        selected_idx.append(next_idx)
        np.minimum(
            min_dist,
            np.linalg.norm(flat - flat[next_idx], axis=1),
            out=min_dist,
        )
        min_dist[next_idx] = -np.inf

    # Rebuild submission.
    # NOTE(review): metadata comes from submissions[0]; this assumes every
    # submission lists the same rows in the same order -- confirm.
    output = submissions[0][meta_cols].copy()
    for slot, idx in enumerate(selected_idx, start=1):
        coords = all_structures[idx]
        output[f"x_{slot}"] = coords[:, 0]
        output[f"y_{slot}"] = coords[:, 1]
        output[f"z_{slot}"] = coords[:, 2]

    return output


# === Run the ensembling ===
# Directory holding the candidate submission CSVs; each file name is expected
# to end in "_<score>.csv" so the loader can recover its score.
folder = "/kaggle/input/stanford-rna-folding-best-submissions"
# `filenames` is returned alongside the data but is not used below.
submissions, scores, filenames = load_submissions_and_scores(folder)

# Combine the candidates into a single 5-structure submission and write it to
# the current working directory without the DataFrame index column.
ensemble = ensemble_best5_diverse(submissions, scores)
ensemble.to_csv("submission_best5_diverse.csv", index=False)
print("✅ submission_best5_diverse.csv created.")


# Output: ✅ submission_best5_diverse.csv created.
