In [1]:
import os
import gc
import pandas as pd
import numpy as np
import networkx as nx
from tqdm.auto import tqdm

# ==========================================
# CONFIGURATION
# ==========================================
OBO_PATH = "/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo"
SUBMISSION_INPUT = '/kaggle/input/cafa6-protein-function-enhanced-nb-v2/submission.tsv'
SUBMISSION_OUTPUT = 'submission.tsv'

# Threshold to remove weak noise.
MIN_SCORE_THRESHOLD = 0.001 

# ==========================================
# 1. FAST GRAPH BUILDER (NetworkX)
# ==========================================
def build_go_graph(go_obo_path):
    print(f"[1/4] Parsing OBO into DAG from {go_obo_path}...")
    go_graph = nx.DiGraph()
    
    if not os.path.exists(go_obo_path):
        raise FileNotFoundError(f"OBO not found: {go_obo_path}")

    with open(go_obo_path, "r") as f:
        cur_id = None
        for line in f:
            line = line.strip()
            if line == "[Term]":
                cur_id = None
            elif line.startswith("id: "):
                cur_id = line.split("id: ")[1].strip()
                go_graph.add_node(cur_id)
            elif line.startswith("is_a: "):
                pid = line.split()[1].strip()
                if cur_id:
                    go_graph.add_edge(cur_id, pid)
            elif line.startswith("relationship: part_of "):
                parts = line.split()
                if len(parts) >= 3:
                    pid = parts[2].strip()
                    if cur_id:
                        go_graph.add_edge(cur_id, pid)

    print("      -> Calculating Topological Order...")
    if not nx.is_directed_acyclic_graph(go_graph):
        cycles = list(nx.simple_cycles(go_graph))
        for cycle in cycles:
            go_graph.remove_edge(cycle[0], cycle[1])

    topo_order = list(nx.topological_sort(go_graph))
    term_parents_map = {n: list(go_graph.successors(n)) for n in go_graph.nodes()}
    
    return topo_order, term_parents_map

# ==========================================
# 2. FAST PROPAGATION (HARD MAX)
# ==========================================
def propagate_hard_max(df, topo_order, term_parents_map):
    print("[3/4] Propagating Scores (Hard Max)...")
    print("      -> Grouping data...")
    
    # Process by group to save memory and ensure isolation
    groups = df.groupby('protein_id')
    final_rows = []
    
    for pid, group in tqdm(groups):
        # Map term -> score
        scores = dict(zip(group['go_term'], group['score']))
        
        # Propagate from leaves to roots
        for child in topo_order:
            if child in scores:
                child_score = scores[child]
                if child in term_parents_map:
                    for parent in term_parents_map[child]:
                        current_parent_score = scores.get(parent, 0.0)
                        # HARD MAX RULE: Parent score must be at least child score
                        if child_score > current_parent_score:
                            scores[parent] = child_score
        
        # Filter and collect
        for term, score in scores.items():
            if score >= MIN_SCORE_THRESHOLD:
                final_rows.append((pid, term, score))

    return pd.DataFrame(final_rows, columns=['protein_id', 'go_term', 'score'])

# ==========================================
# 3. MAIN PIPELINE
# ==========================================

# 1. Setup
topo_order, term_parents_map = build_go_graph(OBO_PATH)

print(f"[2/4] Loading submission...")
# FIXED: names list now has 3 items to match your file
submission = pd.read_csv(SUBMISSION_INPUT, sep='\t', header=None, 
                         names=['protein_id', 'go_term', 'score'], 
                         dtype={'score': np.float32},
                         on_bad_lines='skip') # Skip lines that don't match

# 2. Propagate
submission_improved = propagate_hard_max(submission, topo_order, term_parents_map)

# 3. Save
print(f"[4/4] Saving improved submission ({len(submission_improved)} rows)...")
submission_improved['score'] = submission_improved['score'].round(3)
submission_improved.to_csv(SUBMISSION_OUTPUT, sep='\t', index=False, header=False)

print("------------------------------------------------")
print(f"[Done] Saved to {SUBMISSION_OUTPUT}")
print("------------------------------------------------")
print(submission_improved.head())

[1/4] Parsing OBO into DAG from /kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo...
      -> Calculating Topological Order...
[2/4] Loading submission...
[3/4] Propagating Scores (Hard Max)...
      -> Grouping data...


  0%|          | 0/279437 [00:00<?, ?it/s]

[4/4] Saving improved submission (55558156 rows)...
------------------------------------------------
[Done] Saved to submission.tsv
------------------------------------------------
   protein_id     go_term  score
0  A0A009IHW8  GO:0003953  1.005
1  A0A009IHW8  GO:0007165  1.005
2  A0A009IHW8  GO:0016787  1.006
3  A0A009IHW8  GO:0019677  1.006
4  A0A009IHW8  GO:0050135  1.006
