# In [1]:
import pandas as pd


def ensemble_submissions(file_paths, weights, output_path='submission.tsv'):
    """Blend several prediction files into one weighted-sum submission.

    Each input file is read as a header-less, tab-separated table with three
    columns: protein, GO term, score. The files are outer-joined on the
    (protein, go_term) pair, a pair missing from a file contributes a score
    of 0 for that file, and the final score is ``sum(weights[i] * score_i)``.
    The weights are used as given (they are not normalised).

    The rows are sorted by descending final score and written to
    ``output_path`` as a header-less, tab-separated file with the columns
    protein, go_term, score.

    Args:
        file_paths: Sequence of paths to the input prediction files.
        weights: Sequence of numeric weights, one per entry in ``file_paths``.
        output_path: Destination of the blended submission file.

    Returns:
        The merged ``DataFrame`` with columns ``protein``, ``go_term``,
        ``key`` (``protein + '_' + go_term``), ``score_0`` ... ``score_{n-1}``
        and ``score``, sorted by descending ``score``.

    Raises:
        ValueError: If ``file_paths`` is empty or ``weights`` does not have
            exactly one entry per file.
    """
    if not file_paths:
        raise ValueError("file_paths must contain at least one file")
    if len(weights) != len(file_paths):
        raise ValueError(
            f"Expected {len(file_paths)} weights (one per file), "
            f"got {len(weights)}"
        )

    id_cols = ['protein', 'go_term']
    score_cols = [f'score_{i}' for i in range(len(file_paths))]

    result = None
    for i, (path, score_col) in enumerate(zip(file_paths, score_cols)):
        df = pd.read_csv(
            path,
            sep='\t',
            header=None,
            names=id_cols + [score_col],
            # Keep identifiers as text even if they happen to look numeric.
            dtype={'protein': str, 'go_term': str},
        )
        print(f"Loaded {len(df)} predictions from file {i+1}")
        # Join on the two identifier columns themselves rather than on a
        # concatenated "protein_goterm" string: identifiers may contain '_',
        # so a joined key cannot be split back into its parts reliably.
        if result is None:
            result = df
        else:
            result = result.merge(df, on=id_cols, how='outer')

    # A pair absent from a file counts as a score of 0 for that file.
    for score_col in score_cols:
        result[score_col] = result[score_col].fillna(0)

    result['score'] = sum(
        weight * result[score_col]
        for weight, score_col in zip(weights, score_cols)
    )

    # Kept so the returned frame has the same columns, in the same order,
    # as before (protein, go_term, key, score_0, ..., score).
    result.insert(2, 'key', result['protein'] + '_' + result['go_term'])

    # Stable sort so that rows with equal scores come out in a reproducible
    # order.
    result = result.sort_values('score', ascending=False, kind='mergesort')
    result[['protein', 'go_term', 'score']].to_csv(
        output_path,
        sep='\t',
        index=False,
        header=False
    )

    print(f"\nSaved {len(result)} predictions to {output_path}")
    return result


if __name__ == "__main__":
    # Blend the two submissions below with equal weight; the result is
    # written to the function's default output path.
    weights = [0.5, 0.5]
    file_paths = [
        '/kaggle/input/gaf-submission/submission.tsv',
        '/kaggle/input/cafa-6-predictions/submission.tsv',
    ]

    ensemble_submissions(file_paths=file_paths, weights=weights)

# Output:
# Loaded 2431175 predictions from file 1
# Loaded 31157430 predictions from file 2
#
# Saved 31157430 predictions to submission.tsv
