# In [1]:
import os
import pandas as pd
import numpy as np

def load_submissions_and_scores(folder_path):
    """Load every scored submission CSV found directly inside *folder_path*.

    A file is treated as a submission when its name ends with ``.csv``. Its
    score is the text after the last underscore of the name (with the
    ``.csv`` suffix removed), parsed as a float, e.g.
    ``"submission_0.35.csv"`` -> ``0.35``. Files whose score cannot be
    parsed, or whose contents cannot be read, are reported on stdout and
    skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A tuple ``(submissions, scores, filenames)`` of three parallel lists:
        the loaded DataFrames, their float scores, and the source file names,
        ordered by file name.
    """
    suffix = ".csv"
    submissions = []
    scores = []
    filenames = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and anything order-sensitive built on it) reproducible.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(suffix):
            continue
        try:
            # Slice the suffix off exactly. str.rstrip(".csv") would strip any
            # trailing run of the characters '.', 'c', 's', 'v' instead.
            stem = fname[:-len(suffix)]
            score = float(stem.split("_")[-1])
            df = pd.read_csv(os.path.join(folder_path, fname))
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the load.
            print(f"Skipping {fname}: {e}")
            continue
        submissions.append(df)
        scores.append(score)
        filenames.append(fname)
    return submissions, scores, filenames


def ensemble_weighted_average(submissions, scores, num_structures=5):
    """Blend submissions by a score-weighted mean of their coordinates.

    Every submission must hold the columns ``x_k``, ``y_k``, ``z_k`` for
    ``k = 1..num_structures`` and the same rows in the same order. The
    coordinates are averaged element-wise across submissions, with weights
    proportional to ``scores``.

    Args:
        submissions: Sequence of DataFrames, one per submission.
        scores: One numeric score per submission, in the same order.
        num_structures: Number of predicted structures per row (default 5).

    Returns:
        A new DataFrame with the ``ID``, ``resname`` and ``resid`` columns of
        the first submission followed by the averaged coordinate columns.

    Raises:
        ValueError: If there are no submissions, if ``scores`` and
            ``submissions`` differ in length, or if the submissions do not
            share the same rows.
    """
    meta_cols = ['ID', 'resname', 'resid']

    if len(submissions) == 0:
        raise ValueError("Cannot ensemble an empty list of submissions.")
    if len(scores) != len(submissions):
        raise ValueError(
            f"Got {len(scores)} scores for {len(submissions)} submissions."
        )

    # Normalize scores to weights. When the scores sum to zero the division
    # would fill the output with NaN, so fall back to a plain mean instead.
    raw_scores = np.asarray(scores, dtype=float)
    total = raw_scores.sum()
    if total == 0:
        weights = np.full(len(raw_scores), 1.0 / len(raw_scores))
    else:
        weights = raw_scores / total

    # Column order x_1, y_1, z_1, x_2, ... lets one reshape produce the
    # (num_residues, num_structures, 3) layout.
    coord_cols = [
        f"{axis}_{k}"
        for k in range(1, num_structures + 1)
        for axis in ("x", "y", "z")
    ]

    reference = submissions[0]
    num_rows = len(reference)
    pred_stack = []
    for position, df in enumerate(submissions):
        # Averaging is positional, so misaligned rows would silently blend
        # coordinates of different residues.
        if len(df) != num_rows:
            raise ValueError(
                f"Submission {position} has {len(df)} rows, expected {num_rows}."
            )
        if 'ID' in df.columns and not np.array_equal(
            df['ID'].to_numpy(), reference['ID'].to_numpy()
        ):
            raise ValueError(
                f"Submission {position} has different IDs than submission 0."
            )
        coords = df[coord_cols].to_numpy(dtype=float)
        pred_stack.append(coords.reshape(num_rows, num_structures, 3))
    pred_stack = np.stack(pred_stack, axis=0)  # (num_submissions, num_residues, S, 3)

    # Weighted average across the submission axis -> (num_residues, S, 3)
    weighted_avg = np.tensordot(weights, pred_stack, axes=([0], [0]))

    # Build submission DataFrame
    output = reference[meta_cols].copy()
    for i in range(num_structures):
        output[f"x_{i+1}"] = weighted_avg[:, i, 0]
        output[f"y_{i+1}"] = weighted_avg[:, i, 1]
        output[f"z_{i+1}"] = weighted_avg[:, i, 2]

    return output


def ensemble_best_5_structures(submissions, scores):
    """Build a submission from the five structures of the best-scored inputs.

    Submissions are ranked by score, highest first (ties keep their input
    order). Their structures are laid out in that ranked order and the first
    five are kept. Because every submission contributes five structures, this
    currently returns the coordinates of the single highest-scoring
    submission.

    Args:
        submissions: Sequence of DataFrames holding ``x_k``, ``y_k``, ``z_k``
            columns for ``k = 1..5`` plus ``ID``, ``resname`` and ``resid``.
        scores: One numeric score per submission, in the same order.

    Returns:
        A new DataFrame with the ``ID``, ``resname`` and ``resid`` columns of
        the top-ranked submission followed by the selected coordinates.

    Raises:
        ValueError: If ``scores`` and ``submissions`` differ in length.
        IndexError: If ``submissions`` is empty.
    """
    meta_cols = ['ID', 'resname', 'resid']
    num_structures = 5

    if len(scores) != len(submissions):
        raise ValueError(
            f"Got {len(scores)} scores for {len(submissions)} submissions."
        )

    # Rank by score so "the first five" really come from the highest-scoring
    # submissions. sorted() is stable, so equal scores keep input order.
    ranked = sorted(
        range(len(submissions)), key=lambda idx: scores[idx], reverse=True
    )

    all_structures = [
        submissions[idx][[f"x_{k}", f"y_{k}", f"z_{k}"]].values
        for idx in ranked
        for k in range(1, num_structures + 1)
    ]

    # Dummy selection: keep the leading five of the ranked structures.
    selected = all_structures[:num_structures]  # TODO: Replace with actual selection logic

    # Take metadata from the frame that supplied the coordinates, so the two
    # are guaranteed to describe the same rows.
    output = submissions[ranked[0]][meta_cols].copy()
    for k, coords in enumerate(selected, start=1):
        output[f"x_{k}"] = coords[:, 0]
        output[f"y_{k}"] = coords[:, 1]
        output[f"z_{k}"] = coords[:, 2]

    return output


# Usage example. These statements run at module level: they load every scored
# CSV from `folder`, then write one output file per ensembling strategy using
# relative paths (i.e. into the current working directory).
folder = "/kaggle/input/stanford-rna-folding-best-submissions"
submissions, scores, filenames = load_submissions_and_scores(folder)

# Strategy 1: score-weighted average of the coordinates across all submissions
avg_ensemble = ensemble_weighted_average(submissions, scores)
avg_ensemble.to_csv("submission_weighted_average.csv", index=False)

# Strategy 2: Best-5 (basic) -- five structures picked by ensemble_best_5_structures
best5_ensemble = ensemble_best_5_structures(submissions, scores)
best5_ensemble.to_csv("submission_best5_concat.csv", index=False)
