In [1]:
import os
import pandas as pd
import networkx as nx
from tqdm.auto import tqdm

# Ontology definition file in OBO format; parsed by create_ontology_graph().
obo_file_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo"
# Per-fold prediction files. Each one is read further down as a headerless,
# tab-separated table with the columns protein_id, go_term, score.
submission_files = [
    "/kaggle/input/cafa-dataset/submission_fold1.tsv",
    "/kaggle/input/cafa-dataset/submission_fold2.tsv",
    "/kaggle/input/cafa-dataset/submission_fold3.tsv",
]
# Destination of the averaged and propagated predictions (tab-separated,
# written without header or index).
final_output_file = "submission.tsv"

def create_ontology_graph(obo_path):
    """Parse an OBO file into a child -> parent graph and return its ordering.

    Only ``[Term]`` stanzas are read. Every ``id:`` line becomes a node, and
    every ``is_a:`` or ``relationship: part_of`` line becomes an edge from
    the current term to the referenced parent. Lines belonging to any other
    stanza type (for example ``[Typedef]``) and the file header are ignored,
    so relation names never end up as nodes of the graph.

    If the resulting graph contains cycles, one edge of each remaining cycle
    is removed until the graph is acyclic, so that a topological order
    exists.

    Args:
        obo_path: Path of the OBO file to read (opened as UTF-8 text).

    Returns:
        A ``(topological_order, child_to_parents)`` tuple.
        ``topological_order`` is a list of term ids in which every term
        appears before the parents it points to. ``child_to_parents`` maps
        each term id to the list of its direct parents.

    Raises:
        FileNotFoundError: If ``obo_path`` does not exist.
    """
    if not os.path.exists(obo_path):
        raise FileNotFoundError(f"file not found: {obo_path}")

    ontology_graph = nx.DiGraph()
    current_id = None
    in_term_stanza = False

    with open(obo_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()

            if line.startswith("["):
                # Any stanza header ends the previous stanza; only [Term]
                # stanzas contribute nodes and edges.
                in_term_stanza = line == "[Term]"
                current_id = None
            elif not in_term_stanza:
                continue
            elif line.startswith("id: "):
                current_id = line[len("id: "):].strip()
                ontology_graph.add_node(current_id)
            elif current_id is None:
                # Relationship lines before the stanza's id cannot be attached.
                continue
            elif line.startswith("is_a: "):
                # "is_a: <parent id> ! <name>" -> second token is the parent.
                ontology_graph.add_edge(current_id, line.split()[1])
            elif line.startswith("relationship: part_of "):
                # "relationship: part_of <parent id> ! <name>"
                parts = line.split()
                if len(parts) >= 3:
                    ontology_graph.add_edge(current_id, parts[2])

    # Break cycles one at a time. Re-querying the graph after every removal
    # guarantees the edge being removed still exists, and self-loops are
    # handled because find_cycle reports them as a (node, node) edge.
    while True:
        try:
            cycle_edges = nx.find_cycle(ontology_graph)
        except nx.NetworkXNoCycle:
            break
        source, target = cycle_edges[0][:2]
        ontology_graph.remove_edge(source, target)

    topological_order = list(nx.topological_sort(ontology_graph))
    child_to_parents = {
        node: list(ontology_graph.successors(node))
        for node in ontology_graph.nodes()
    }

    return topological_order, child_to_parents

def propagate_scores(predictions_df, topological_order, child_to_parents):
    """Propagate every protein's term scores upwards to the terms' parents.

    For each protein, terms are processed in ``topological_order``; a term's
    score is copied to each of its parents whenever it is strictly greater
    than the parent's current score (a parent without a score counts as
    0.0). Parents that receive a score this way are processed in turn, so
    scores travel all the way up the hierarchy.

    Only the terms a protein actually has, plus the parents reached from
    them, are visited. They are pulled from a min-heap keyed by their
    position in ``topological_order``, which yields the same processing
    order as scanning the full ordering, without touching the terms the
    protein does not have.

    Args:
        predictions_df: DataFrame with the columns ``protein_id``,
            ``go_term`` and ``score``.
        topological_order: Terms ordered so that every term precedes its
            parents; each term is expected to appear once.
        child_to_parents: Mapping from a term to its direct parents. Terms
            missing from the mapping are treated as having no parents.

    Returns:
        DataFrame with the columns ``protein_id``, ``go_term`` and
        ``score`` containing every original or propagated score that is
        at least 0.001. Terms absent from ``topological_order`` keep their
        original score.
    """
    import heapq  # stdlib; imported here so the block stays self-contained

    print("Log2")

    ordered_terms = list(topological_order)
    rank_of = {term: position for position, term in enumerate(ordered_terms)}

    grouped_by_protein = predictions_df.groupby('protein_id')
    results = []

    for protein_id, protein_group in tqdm(grouped_by_protein, total=grouped_by_protein.ngroups):
        term_scores = dict(zip(protein_group['go_term'], protein_group['score']))

        # Ranks of the terms still waiting to be processed, smallest first.
        pending = [rank_of[term] for term in term_scores if term in rank_of]
        heapq.heapify(pending)

        while pending:
            child_rank = heapq.heappop(pending)
            child_term = ordered_terms[child_rank]
            child_score = term_scores[child_term]

            for parent_term in child_to_parents.get(child_term, []):
                already_scored = parent_term in term_scores
                if child_score > term_scores.get(parent_term, 0.0):
                    term_scores[parent_term] = child_score
                    if not already_scored:
                        # A newly scored parent must be processed as well,
                        # but only if it lies later in the ordering; a term
                        # at an earlier position would already have been
                        # passed by a linear scan.
                        parent_rank = rank_of.get(parent_term)
                        if parent_rank is not None and parent_rank > child_rank:
                            heapq.heappush(pending, parent_rank)

        results.extend(
            (protein_id, term, score)
            for term, score in term_scores.items()
            if score >= 0.001
        )

    return pd.DataFrame(results, columns=['protein_id', 'go_term', 'score'])

# --- Stage 1: load the per-fold predictions and average them ---------------
print("Log1")

all_predictions = [
    pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['protein_id', 'go_term', 'score'],
    )
    for file_path in submission_files
]
combined_predictions = pd.concat(all_predictions, ignore_index=True)

# NOTE(review): the mean is taken over the rows present for each
# (protein, term) pair, so a pair missing from a fold is not counted as a
# zero for that fold — confirm this is the intended ensembling rule.
averaged_predictions = (
    combined_predictions
    .groupby(['protein_id', 'go_term'])['score']
    .mean()
    .reset_index()
)

# --- Stage 2: push scores up the ontology -----------------------------------
ontology_order, parent_mapping = create_ontology_graph(obo_file_path)
final_predictions = propagate_scores(
    averaged_predictions,
    ontology_order,
    parent_mapping,
)

# --- Stage 3: round to three decimals and write the headerless TSV ----------
final_predictions = final_predictions.assign(
    score=final_predictions['score'].round(3)
)
final_predictions.to_csv(final_output_file, sep='\t', header=False, index=False)

Log1
Log2


  0%|          | 0/279437 [00:00<?, ?it/s]