In [1]:
import pandas as pd
import numpy as np
import os

# Layout of the headerless, tab-separated prediction files read by the ensemble.
_PRED_COLUMNS = ['protein', 'go_term', 'score']
_PRED_DTYPES = {'protein': str, 'go_term': str, 'score': float}
_OUT_COLUMNS = ['protein', 'go_term', 'final_score']
# Tab is the field delimiter of the input files, so joining the two ID fields
# on it keeps "A_B" + "C" distinct from "A" + "B_C" (an underscore would not).
_KEY_SEP = '\t'


def _iter_keyed_chunks(path, chunksize):
    """Yield chunks of one prediction file with a combined 'key' column added."""
    for chunk in pd.read_csv(path, sep='\t', header=None, names=_PRED_COLUMNS,
                             dtype=_PRED_DTYPES, chunksize=chunksize):
        # Rows lacking either ID cannot form a key and are ignored everywhere.
        chunk = chunk.dropna(subset=['protein', 'go_term'])
        yield chunk.assign(key=chunk['protein'] + _KEY_SEP + chunk['go_term'])


def _load_model_scores(path, keys, chunksize):
    """Return one model's scores for `keys` as a Series indexed by unique key."""
    parts = []
    for chunk in _iter_keyed_chunks(path, chunksize):
        hits = chunk.loc[chunk['key'].isin(keys), ['key', 'score']]
        if not hits.empty:
            parts.append(hits)
    if not parts:
        return pd.Series(dtype=float)
    # A file may list the same pair more than once (possibly in different read
    # chunks); collapse to the highest score so each key maps to one value.
    return pd.concat(parts, ignore_index=True).groupby('key', sort=False)['score'].max()


def _ensemble_key_batch(file_paths, keys, chunksize):
    """Build the (protein, go_term, final_score) frame for one batch of keys."""
    per_model = {}
    for i, path in enumerate(file_paths):
        scores = _load_model_scores(path, keys, chunksize)
        # A pair a model did not predict (or scored as NaN) counts as 0.
        per_model[f'score_{i}'] = scores.reindex(keys).fillna(0).to_numpy()
    final_score = pd.DataFrame(per_model).median(axis=1)

    ids = pd.Series(keys).str.split(_KEY_SEP, n=1, expand=True)
    return pd.DataFrame({'protein': ids[0], 'go_term': ids[1],
                         'final_score': final_score})


def median_ensemble(file_paths, output_path='submission.tsv', chunksize=100000):
    """Blend several prediction files by taking the per-pair median score.

    Every input is a headerless TSV with columns protein, go_term, score.
    The union of all (protein, go_term) pairs is scored; a model that lacks a
    pair contributes 0 for it, and a pair repeated within one file contributes
    its highest score. Rows are written sorted by (protein, go_term).

    Args:
        file_paths: Paths of the prediction files to combine (at least one).
        output_path: Destination TSV (headerless: protein, go_term, final_score).
        chunksize: Number of rows per read of an input file, and also the
            number of unique pairs scored per batch. Every batch re-reads
            every input file, so smaller values trade run time for memory.

    Returns:
        DataFrame with columns protein, go_term, final_score, as written to
        `output_path`; the ID columns are kept as strings.

    Raises:
        ValueError: If `file_paths` is empty.
    """
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one prediction file")

    print(f"Processing in chunks of {chunksize}")

    # Pass 1: collect every unique (protein, go_term) key across all models.
    all_keys = set()
    for path in file_paths:
        for chunk in _iter_keyed_chunks(path, chunksize):
            all_keys.update(chunk['key'].values)

    all_keys = sorted(all_keys)
    total_keys = len(all_keys)
    print(f"Total unique predictions: {total_keys}")

    # Pass 2: score the keys batch by batch, streaming each batch to a sibling
    # temp file. The real output is only replaced once everything succeeded,
    # and the temp file never lingers after a failure.
    tmp_path = os.fspath(output_path) + '.tmp'
    try:
        with open(tmp_path, 'w', newline='', encoding='utf-8') as out:
            batch_starts = range(0, total_keys, chunksize)
            for batch_no, start_idx in enumerate(batch_starts, start=1):
                key_batch = all_keys[start_idx:start_idx + chunksize]
                batch_result = _ensemble_key_batch(file_paths, key_batch, chunksize)
                batch_result.to_csv(out, sep='\t', index=False, header=False)
                print(f"Processed chunk {batch_no}")
        os.replace(tmp_path, output_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # The key list is no longer needed; release it before loading the result.
    del all_keys

    if total_keys == 0:
        # read_csv cannot parse an empty file, so build the empty frame directly.
        final_result = pd.DataFrame({
            'protein': pd.Series(dtype=str),
            'go_term': pd.Series(dtype=str),
            'final_score': pd.Series(dtype=float),
        })
    else:
        # Explicit dtypes stop numeric-looking IDs (e.g. "007") turning into ints.
        final_result = pd.read_csv(
            output_path, sep='\t', header=None, names=_OUT_COLUMNS,
            dtype={'protein': str, 'go_term': str, 'final_score': float},
        )

    print(f"Saved median ensemble to {output_path}")
    return final_result


if __name__ == "__main__":
    # Prediction files to blend; both are passed to median_ensemble as-is.
    file_paths = [
        "/kaggle/input/cafa-6-t5-embeddings-with-ensemble/submission.tsv",
        "/kaggle/input/cafa-6-predictions/submission.tsv",
    ]

    # Ten-million-row batches; the result is written to the default output path.
    result = median_ensemble(
        file_paths=file_paths,
        chunksize=10 ** 7,
    )


Processing in chunks of 10000000
Total unique predictions: 34775536
Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Saved median ensemble to submission.tsv
