In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/esm-2/keras/esm2_t6_8m/1/config.json
/kaggle/input/esm-2/keras/esm2_t6_8m/1/preprocessor.json
/kaggle/input/esm-2/keras/esm2_t6_8m/1/tokenizer.json
/kaggle/input/esm-2/keras/esm2_t6_8m/1/metadata.json
/kaggle/input/esm-2/keras/esm2_t6_8m/1/model.weights.h5
/kaggle/input/esm-2/keras/esm2_t6_8m/1/task.json
/kaggle/input/esm-2/keras/esm2_t6_8m/1/task.weights.h5
/kaggle/input/esm-2/keras/esm2_t6_8m/1/assets/tokenizer/vocabulary.txt
/kaggle/input/cafa-6-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-6-protein-function-prediction/IA.tsv
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta
/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-6-protein-function-prediction/T

In [2]:
"""
CAFA-6 PROTEIN FUNCTION PREDICTION - INCREMENTAL IMPROVEMENTS
Base Score: 2.2 → Target: 2.5+

Changes from baseline:
1. FIX: Score clipping bug (max 142 → max 1.0)
2. IMPROVE: Better threshold search range
3. IMPROVE: More labels (4000 → 6000)
4. IMPROVE: IA-weighted loss function
5. IMPROVE: Better CAFA5 ensemble weights
"""

import os, gc, time, warnings

# TensorFlow's native (C++) logging level is controlled by this environment
# variable. It is assigned here, BEFORE `import tensorflow`, because setting it
# after the library has been imported is commonly too late to silence the
# start-up messages (the captured output of this cell still showed them).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, regularizers
import tensorflow.keras.backend as K

# Silence Python-level warnings and TensorFlow's Python logger.
warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')

print("="*80)
print("CAFA-6 PROTEIN FUNCTION PREDICTION - V2 (INCREMENTAL IMPROVEMENTS)")
print("="*80)

# =============================================================================
# CONFIGURATION
# =============================================================================
# Central run configuration. Every tunable used further down is read from this
# dict, so a run can be changed here without touching the pipeline code.
CONFIG = {
    # Input/output locations.
    'BASE': "/kaggle/input/cafa-6-protein-function-prediction",
    'CAFA5_PATH': "/kaggle/input/cafa5-055923-pred/submission.tsv",
    'OUTPUT': "/kaggle/working/submission.tsv",
    
    # CHANGE 1: More labels
    # TOP_K_LABELS is split evenly across the three ontologies (// 3) during
    # label selection.
    'TOP_K_LABELS': 6000,  # 4000 -> 6000
    'MIN_FREQ': 2,  # 3 -> 2 (include rarer terms)
    
    # Network / optimisation hyper-parameters.
    'BATCH_SIZE': 32,
    'EPOCHS': 20,  # More epochs
    'LR': 1e-4,  # Slightly lower LR
    'HIDDEN': [1536, 768, 384],  # widths of the hidden Dense layers, in order
    'DROPOUT': 0.4,  # Less dropout
    'L2_REG': 5e-6,  # Less regularization
    
    # Test-time settings.
    'TOP_K_PRED': 800,  # 500 -> 800 (more predictions per protein)
    'PROP_ROUNDS': 15,  # 10 -> 15 (max child->parent score propagation passes)
    'TEST_BATCH_SIZE': 5000,  # number of test proteins featurised/predicted at once
    
    # Feature-family switches and their parameters.
    'USE_TFIDF': True,
    'USE_AA_COMP': True,
    'USE_DIPEP': True,
    'USE_PHYSICHEM': True,
    'TFIDF_MAX_FEATURES': 15000,  # 12000 -> 15000
    'KMER_SIZE': 3,
    
    # CHANGE 2: IA-weighted loss
    'USE_IA_WEIGHTED_LOSS': True,
    
    # CHANGE 3: Better ensemble
    # Blend weight given to the CAFA5 score; the model score gets (1 - weight).
    'CAFA5_WEIGHT': 0.4,  # 0.3 -> 0.4 (CAFA5 has good performance)
    
    'SEED': 42,
}

# Seed NumPy and TensorFlow from the configured seed so runs are repeatable.
np.random.seed(CONFIG['SEED'])
tf.random.set_seed(CONFIG['SEED'])

# Reference point for the elapsed-time prefix printed by log().
start_time = time.time()

def log(msg, level="INFO"):
    """Print `msg` prefixed with the seconds elapsed since `start_time` and a level tag.

    Args:
        msg: Text to print.
        level: Short tag shown next to the timestamp (padded to 5 characters).
    """
    seconds_since_start = time.time() - start_time
    stamp = f"[{seconds_since_start:7.1f}s]"
    print(f"{stamp} {level:5s} | {msg}")

# =============================================================================
# UTILITY FUNCTIONS (SAME AS BEFORE)
# =============================================================================
def calculate_aa_composition(seq):
    """Return the relative frequency of each of the 20 standard amino acids.

    Args:
        seq: Protein sequence as a string of one-letter codes.

    Returns:
        NumPy array of length 20, ordered as "ACDEFGHIKLMNPQRSTVWY". Each entry
        is the residue count divided by the full sequence length, so residues
        outside the alphabet still count towards the denominator. An empty
        sequence yields all zeros.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    # Guard against division by zero for empty sequences.
    denom = len(seq) or 1
    return np.array([seq.count(residue) / denom for residue in alphabet])

def calculate_dipeptide_composition(seq):
    """Return the relative frequency of 40 fixed dipeptides in `seq`.

    Args:
        seq: Protein sequence as a string of one-letter codes.

    Returns:
        NumPy array of length 40 (one entry per tracked dipeptide, in the order
        listed below). Each entry is the number of overlapping occurrences
        divided by the number of adjacent residue pairs (at least 1).
    """
    tracked_pairs = (
        'AL', 'LA', 'AA', 'LE', 'EA', 'AS', 'LL', 'EL', 'SA', 'VA',
        'AR', 'GA', 'LG', 'AG', 'PA', 'AP', 'GG', 'VS', 'GL', 'LV',
        'KA', 'VE', 'AK', 'TA', 'GS', 'RA', 'AT', 'VL', 'AV', 'DA',
        'LK', 'SG', 'KL', 'EV', 'TL', 'LT', 'KE', 'LS', 'AD', 'SE',
    )

    # Count every overlapping pair of neighbouring residues.
    pair_freq = {}
    for left, right in zip(seq, seq[1:]):
        pair = left + right
        pair_freq[pair] = pair_freq.get(pair, 0) + 1

    # Number of adjacent pairs, floored at 1 to avoid dividing by zero.
    n_windows = len(seq) - 1
    denom = n_windows if n_windows > 1 else 1
    return np.array([pair_freq.get(pair, 0) / denom for pair in tracked_pairs])

def calculate_physicochemical_properties(seq):
    """Summarise a sequence with eight coarse physicochemical descriptors.

    Args:
        seq: Protein sequence as a string of one-letter codes.

    Returns:
        NumPy array of length 8:
        [mean hydropathy, mean residue weight / 150, fraction R/K,
         fraction D/E, fraction S/T/N/Q, fraction A/E/L/M, fraction V/I/F,
         sequence length / 1000].
        Residues missing from the lookup tables contribute 0 to the means.
        An empty sequence yields eight zeros.
    """
    if not seq:
        return np.zeros(8)

    # Per-residue hydropathy values.
    hydropathy = {'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
                  'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
                  'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
                  'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3}
    # Per-residue weights.
    residue_mass = {'A': 89, 'C': 121, 'D': 133, 'E': 147, 'F': 165, 'G': 75,
                    'H': 155, 'I': 131, 'K': 146, 'L': 131, 'M': 149, 'N': 132,
                    'P': 115, 'Q': 146, 'R': 174, 'S': 105, 'T': 119, 'V': 117,
                    'W': 204, 'Y': 181}

    length = len(seq)

    def fraction_of(letters):
        # Share of residues that belong to the given one-letter group.
        return sum(1 for aa in seq if aa in letters) / length

    mean_hydropathy = np.mean([hydropathy.get(aa, 0) for aa in seq])
    mean_mass = np.mean([residue_mass.get(aa, 0) for aa in seq])

    descriptors = [
        mean_hydropathy,
        mean_mass / 150,
        fraction_of('RK'),
        fraction_of('DE'),
        fraction_of('STNQ'),
        fraction_of('AELM'),
        fraction_of('VIF'),
        length / 1000,
    ]
    return np.array(descriptors)

def get_kmers(seq, k=3):
    """Return all overlapping k-mers of `seq` as one space-separated string.

    Args:
        seq: Sequence to tokenise.
        k: Window length.

    Returns:
        The k-mers joined by single spaces; an empty string when `seq` is
        shorter than `k`.
    """
    window_count = len(seq) - k + 1
    windows = (seq[start:start + k] for start in range(window_count))
    return ' '.join(windows)

# =============================================================================
# DATA LOADING (SAME)
# =============================================================================
# Progress marker printed before the FASTA files are read further down.
log("Loading FASTA sequences...")

def read_fasta(path):
    """Parse a FASTA file into a dict mapping protein id -> sequence.

    The id is taken from the header line: if the header contains '|', the
    second '|'-separated field is used; otherwise the first whitespace-separated
    token after '>' is used. Sequence lines are stripped and concatenated.

    Args:
        path: Path of the FASTA file to read.

    Returns:
        Dict of {protein_id: sequence string}. A later record with the same id
        overwrites an earlier one.
    """
    records = {}
    current_id, chunks = None, []
    with open(path) as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                chunks.append(text)
                continue

            # A new header closes the record collected so far.
            if current_id:
                records[current_id] = ''.join(chunks)
            header = text[1:]
            fields = header.split('|')
            current_id = fields[1] if len(fields) > 1 else header.split()[0]
            chunks = []

    # Flush the final record, which has no following header to trigger it.
    if current_id:
        records[current_id] = ''.join(chunks)
    return records

# Read the training and test sequences into {protein_id: sequence} dicts.
train_seqs = read_fasta(f"{CONFIG['BASE']}/Train/train_sequences.fasta")
test_seqs = read_fasta(f"{CONFIG['BASE']}/Test/testsuperset.fasta")

log(f"Loaded {len(train_seqs):,} training sequences")
log(f"Loaded {len(test_seqs):,} test sequences")

# The annotation table is read without a header row and given explicit column
# names; the next line then drops the row whose protein_id is the literal
# 'EntryID' (i.e. the file's own header line, if present).
df_terms = pd.read_csv(f"{CONFIG['BASE']}/Train/train_terms.tsv", sep='\t', header=None,
                       names=['protein_id', 'go_term', 'ontology'])
df_terms = df_terms[df_terms['protein_id'] != 'EntryID'].reset_index(drop=True)

# Per-term weights (column named 'ia'), turned into a {go_term: ia} lookup.
# NOTE(review): read with header=None - assumes IA.tsv has no header row; confirm.
df_ia = pd.read_csv(f"{CONFIG['BASE']}/IA.tsv", sep='\t', header=None, names=['go_term', 'ia'])
ia_weights = dict(zip(df_ia['go_term'], df_ia['ia']))

log(f"Loaded {len(df_terms):,} annotations for {df_terms['protein_id'].nunique():,} proteins")
log(f"Loaded {len(ia_weights):,} IA weights")

# =============================================================================
# GO ONTOLOGY PARSING (SAME)
# =============================================================================
log("Parsing GO ontology hierarchy...")

# parents[term]       -> set of direct parents (via is_a and part_of)
# term_ontology[term] -> namespace string of the term
parents = defaultdict(set)
term_ontology = {}

with open(f"{CONFIG['BASE']}/Train/go-basic.obo") as f:
    cur_id = None
    # True only while inside a [Term] stanza. Other stanza types (for example
    # [Typedef]) also carry "id:" / "is_a:" lines; without this flag their
    # relation names would be recorded as if they were GO terms.
    in_term = False
    for line in f:
        line = line.strip()
        if line.startswith("[") and line.endswith("]"):
            # Any stanza header starts a new record and resets the current id.
            in_term = (line == "[Term]")
            cur_id = None
        elif not in_term:
            continue
        elif line.startswith("id: "):
            cur_id = line.split("id: ")[1]
        elif line.startswith("namespace: "):
            if cur_id:
                term_ontology[cur_id] = line.split("namespace: ")[1]
        elif line.startswith("is_a: ") and cur_id:
            # "is_a: GO:xxxxxxx ! name" -> second whitespace token is the parent id.
            parents[cur_id].add(line.split()[1])
        elif line.startswith("relationship: part_of ") and cur_id:
            # "relationship: part_of GO:xxxxxxx ! name" -> third token is the parent id.
            parts = line.split()
            if len(parts) >= 3:
                parents[cur_id].add(parts[2])

log(f"Parsed {len(parents):,} GO terms")

def get_all_ancestors(term):
    """Collect every term reachable from `term` by following `parents` links.

    Args:
        term: Starting GO term id.

    Returns:
        Set of all direct and indirect parents of `term`. The starting term is
        not included unless it is reachable from itself. Terms without an entry
        in `parents` yield an empty set.
    """
    seen = set()
    pending = [term]
    while pending:
        node = pending.pop()
        for direct_parent in parents.get(node, ()):
            if direct_parent in seen:
                continue
            seen.add(direct_parent)
            pending.append(direct_parent)
    return seen

# =============================================================================
# LABEL PROPAGATION (SAME)
# =============================================================================
log("Propagating labels...")

# Group the annotated terms by protein. Iterating the two columns directly
# yields the same (protein, term) pairs as DataFrame.iterrows() without
# building a Series object for every row.
protein_to_terms = defaultdict(set)
for _protein_id, _go_term in zip(df_terms['protein_id'], df_terms['go_term']):
    protein_to_terms[_protein_id].add(_go_term)

# Many proteins share the same terms, so the ancestor set of each distinct
# term is computed once and reused.
_ancestor_cache = {}

# For every protein: its own terms plus all of their ancestors.
propagated_terms = {}
for i, (protein, terms) in enumerate(protein_to_terms.items()):
    all_terms = set(terms)
    for term in terms:
        if term not in _ancestor_cache:
            _ancestor_cache[term] = get_all_ancestors(term)
        all_terms.update(_ancestor_cache[term])
    propagated_terms[protein] = all_terms
    if (i + 1) % 25000 == 0:
        log(f"Propagated {i+1:,}/{len(protein_to_terms):,} proteins", "PROG")

log(f"Before: {sum(len(v) for v in protein_to_terms.values()):,}, After: {sum(len(v) for v in propagated_terms.values()):,}")

# =============================================================================
# LABEL SELECTION (MORE LABELS)
# =============================================================================
log("Selecting labels...")

# How many proteins carry each (propagated) term.
term_counts = Counter()
for terms in propagated_terms.values():
    term_counts.update(terms)

frequent_terms = {t for t, c in term_counts.items() if c >= CONFIG['MIN_FREQ']}

# Rank the frequent terms once (most common first) and then split the ranking
# by ontology, instead of re-sorting the whole counter for every ontology.
_ranked_frequent = [t for t, _count in term_counts.most_common() if t in frequent_terms]

mf_candidates = [t for t in _ranked_frequent
                 if term_ontology.get(t) == 'molecular_function']
bp_candidates = [t for t in _ranked_frequent
                 if term_ontology.get(t) == 'biological_process']
cc_candidates = [t for t in _ranked_frequent
                 if term_ontology.get(t) == 'cellular_component']

# Equal label budget per ontology.
per_ontology = CONFIG['TOP_K_LABELS'] // 3
selected_mf = mf_candidates[:per_ontology]
selected_bp = bp_candidates[:per_ontology]
selected_cc = cc_candidates[:per_ontology]
top_terms = selected_mf + selected_bp + selected_cc

log(f"Labels: MF={len(selected_mf)}, BP={len(selected_bp)}, CC={len(selected_cc)}, Total={len(top_terms)}")

# Membership is tested once per propagated term of every protein; use a set so
# each test is a hash lookup rather than a scan of the `top_terms` list.
_top_term_set = set(top_terms)

# Keep proteins that have a sequence and at least one selected label.
valid_proteins = [p for p in propagated_terms.keys() if p in train_seqs]
filtered_terms = {p: [t for t in propagated_terms[p] if t in _top_term_set] for p in valid_proteins}
valid_proteins = [p for p in valid_proteins if filtered_terms[p]]

log(f"Training proteins: {len(valid_proteins):,}")

# =============================================================================
# FEATURE EXTRACTION (MORE FEATURES)
# =============================================================================
# Build the training feature matrix by concatenating the enabled feature
# families column-wise, in this order: TF-IDF k-mers, amino-acid composition,
# dipeptide composition, physicochemical descriptors. The same order is used
# again when test batches are featurised, so keep the two in sync.
log("Extracting features for training set...")

if CONFIG['USE_TFIDF']:
    log("  Computing TF-IDF...")
    # Each sequence becomes a space-separated "sentence" of overlapping k-mers.
    train_texts = [get_kmers(train_seqs[p], CONFIG['KMER_SIZE']) for p in valid_proteins]
    tfidf = TfidfVectorizer(analyzer='word', token_pattern=r'(?u)\b\w+\b',
                            max_features=CONFIG['TFIDF_MAX_FEATURES'])
    # The fitted vectorizer (`tfidf`) is reused later to transform test batches.
    # The sparse result is densified here, which is the memory peak of this block.
    X_tfidf = tfidf.fit_transform(train_texts).toarray().astype(np.float32)
    log(f"    TF-IDF: {X_tfidf.shape}")
    del train_texts
    gc.collect()
else:
    X_tfidf = None

feature_parts = []
if X_tfidf is not None:
    feature_parts.append(X_tfidf)

if CONFIG['USE_AA_COMP']:
    log("  Computing AA composition...")
    X_aa = np.array([calculate_aa_composition(train_seqs[p]) for p in valid_proteins], dtype=np.float32)
    feature_parts.append(X_aa)
    # Only the name is dropped; the array stays alive inside feature_parts.
    del X_aa
    gc.collect()

if CONFIG['USE_DIPEP']:
    log("  Computing dipeptide composition...")
    X_dipep = np.array([calculate_dipeptide_composition(train_seqs[p]) for p in valid_proteins], dtype=np.float32)
    feature_parts.append(X_dipep)
    del X_dipep
    gc.collect()

if CONFIG['USE_PHYSICHEM']:
    log("  Computing physicochemical properties...")
    X_physchem = np.array([calculate_physicochemical_properties(train_seqs[p]) for p in valid_proteins], dtype=np.float32)
    feature_parts.append(X_physchem)
    del X_physchem
    gc.collect()

# NOTE(review): if every USE_* switch is False, feature_parts is empty and
# np.concatenate raises - at least one feature family must stay enabled.
X_combined = np.concatenate(feature_parts, axis=1)
log(f"Combined features: {X_combined.shape}")

# Release the per-family arrays now that the combined matrix exists.
del feature_parts
if X_tfidf is not None:
    del X_tfidf
gc.collect()

# Standardise every column; the fitted `scaler` is reused on the test batches.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined).astype(np.float32)
del X_combined
gc.collect()

# =============================================================================
# PREPARE LABELS
# =============================================================================
# Turn the per-protein label lists into a binary indicator matrix whose
# columns follow the sorted order of the selected terms (mlb.classes_).
log("Preparing labels...")

mlb = MultiLabelBinarizer(classes=sorted(top_terms))
y_list = [filtered_terms[p] for p in valid_proteins]
Y = mlb.fit_transform(y_list).astype(np.float32)

log(f"Label matrix: {Y.shape}, sparsity: {(1 - Y.mean()) * 100:.2f}%")

# CHANGE 4: Prepare IA weights for loss
# One weight per label column, in mlb.classes_ order; terms missing from the
# IA table fall back to 1.0. Dividing by the mean rescales the weights so
# their average is 1.
# NOTE(review): a term whose IA value is 0 ends up with zero weight in both the
# loss and the threshold metric - confirm that is intended.
label_ia = np.array([ia_weights.get(t, 1.0) for t in mlb.classes_])
label_ia = label_ia / label_ia.mean()  # Normalize

# =============================================================================
# BUILD MODEL WITH IA-WEIGHTED LOSS
# =============================================================================
# Progress marker; the message is printed even when USE_IA_WEIGHTED_LOSS is
# False and build_model() falls back to plain binary cross-entropy.
log("Building model with IA-weighted loss...")

def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy where each label column is scaled by its IA weight.

    The element-wise cross-entropy is multiplied by the module-level `label_ia`
    vector and then averaged over the last axis.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted probability tensor.

    Returns:
        Tensor holding the weighted loss averaged over the last axis.
    """
    per_label_loss = K.binary_crossentropy(y_true, y_pred)
    ia_scale = K.constant(label_ia, dtype='float32')
    return K.mean(per_label_loss * ia_scale, axis=-1)

def build_model(input_dim, output_dim):
    """Build and compile the feed-forward multi-label classifier.

    The network stacks one Dense -> BatchNormalization -> ReLU -> Dropout group
    per entry of CONFIG['HIDDEN'], followed by a sigmoid output layer.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output labels.

    Returns:
        A compiled Keras model using Adam with CONFIG['LR'] and either the
        IA-weighted loss or plain binary cross-entropy, depending on
        CONFIG['USE_IA_WEIGHTED_LOSS'].
    """
    inputs = layers.Input(shape=(input_dim,))

    hidden = inputs
    for width in CONFIG['HIDDEN']:
        hidden = layers.Dense(
            width,
            kernel_regularizer=regularizers.l2(CONFIG['L2_REG']),
        )(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation('relu')(hidden)
        hidden = layers.Dropout(CONFIG['DROPOUT'])(hidden)

    outputs = layers.Dense(output_dim, activation='sigmoid')(hidden)
    model = models.Model(inputs=inputs, outputs=outputs)

    if CONFIG['USE_IA_WEIGHTED_LOSS']:
        loss_fn = weighted_binary_crossentropy
    else:
        loss_fn = 'binary_crossentropy'

    optimizer = tf.keras.optimizers.Adam(learning_rate=CONFIG['LR'])
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['precision', 'recall'])
    return model

# Instantiate the network for the current feature and label dimensions.
model = build_model(X_scaled.shape[1], Y.shape[1])
log(f"Model params: {model.count_params():,}")

# =============================================================================
# TRAIN
# =============================================================================
log("Splitting data and training...")

# Hold out 15% of the proteins; this split is used for early stopping, LR
# scheduling and, afterwards, the threshold search.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, Y, test_size=0.15, random_state=CONFIG['SEED']
)

log(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Stop after 5 epochs without val_loss improvement (restoring the best
# weights) and halve the learning rate after 3 such epochs.
callbacks_list = [
    callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=CONFIG['BATCH_SIZE'],
    epochs=CONFIG['EPOCHS'],
    callbacks=callbacks_list,
    verbose=2
)

log(f"Trained {len(history.history['loss'])} epochs")

# The full matrices and the training split are no longer needed; X_val/y_val
# are kept because the threshold search below still uses them.
del X_train, y_train, X_scaled, Y
gc.collect()

# =============================================================================
# THRESHOLD OPTIMIZATION (BETTER SEARCH RANGE)
# =============================================================================
log("Optimizing threshold...")

# Predicted probabilities for the held-out validation split.
y_val_pred = model.predict(X_val, batch_size=64, verbose=0)

def calc_weighted_f1(y_true, y_pred_bin, weights):
    """Weighted average of the per-label F1 scores.

    Args:
        y_true: 2-D array of 0/1 ground-truth labels (rows = samples).
        y_pred_bin: 2-D array of 0/1 predictions with the same layout.
        weights: 1-D array with one weight per label column.

    Returns:
        sum(F1_label * weight_label) / sum(weights). A small epsilon in every
        denominator keeps labels without positives from dividing by zero.
    """
    eps = 1e-12
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred_bin == 1), (y_pred_bin == 0)

    # Per-label confusion counts, summed over the sample axis.
    true_pos = (actual_pos & pred_pos).sum(0).astype(float)
    false_pos = (actual_neg & pred_pos).sum(0).astype(float)
    false_neg = (actual_pos & pred_neg).sum(0).astype(float)

    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)
    per_label_f1 = 2 * precision * recall / (precision + recall + eps)

    return (per_label_f1 * weights).sum() / (weights.sum() + eps)

# Sweep candidate probability thresholds and keep the one with the highest
# weighted F1 on the validation split.
# NOTE(review): in the visible code best_thr is only logged and shown in the
# final summary; the prediction writer uses a fixed 0.001 score cut-off
# instead of this threshold.
best_thr, best_f1 = 0.5, 0
# CHANGE: Better threshold search - try lower AND higher thresholds
for t in np.arange(0.005, 0.8, 0.01):  # 0.01-0.5 -> 0.005-0.8
    f1 = calc_weighted_f1(y_val, (y_val_pred >= t).astype(int), label_ia)
    if f1 > best_f1:
        best_f1, best_thr = f1, t

log(f"Best threshold: {best_thr:.3f}, Weighted F1: {best_f1:.4f}")

# Validation data is not needed past this point.
del X_val, y_val, y_val_pred
gc.collect()

# =============================================================================
# LOAD CAFA5 BASELINE
# =============================================================================
log("Loading CAFA5 baseline...")

# Build cafa5_lookup[protein_id][go_term] = score from an earlier submission
# file. Loading is best-effort: if the file cannot be read at all, the lookup
# is left empty and the ensemble step simply never finds a match.
try:
    df_cafa5 = pd.read_csv(CONFIG['CAFA5_PATH'], sep='\t', header=None,
                          names=['protein_id', 'go_term', 'score'])
    cafa5_lookup = defaultdict(dict)
    # Iterate the columns directly rather than DataFrame.iterrows(), which
    # builds a Series object for every row.
    for _protein_id, _go_term, _raw_score in zip(df_cafa5['protein_id'],
                                                 df_cafa5['go_term'],
                                                 df_cafa5['score']):
        try:
            _score = float(_raw_score)
        except (TypeError, ValueError):
            # Non-numeric score (e.g. a header row): skip this line only.
            # Catching just these two types keeps KeyboardInterrupt and
            # genuine bugs visible, unlike a bare `except`.
            continue
        if not np.isfinite(_score):
            # A NaN/inf score would poison the blended score and make the
            # prediction silently disappear from the output.
            continue
        cafa5_lookup[_protein_id][_go_term] = _score
    log(f"CAFA5 covers {len(cafa5_lookup):,} proteins, {sum(len(v) for v in cafa5_lookup.values()):,} predictions")
except Exception as e:
    log(f"CAFA5 not loaded: {e}", "WARN")
    cafa5_lookup = {}

# =============================================================================
# STREAMING TEST PREDICTION
# =============================================================================
# Featurise, predict and write the test set one batch at a time so that only
# TEST_BATCH_SIZE proteins are held in memory as dense features.
log("Predicting test set (streaming mode)...")

test_protein_ids = list(test_seqs.keys())
n_test = len(test_protein_ids)
batch_size = CONFIG['TEST_BATCH_SIZE']

# Column index of every label, and for every label its direct parents that are
# themselves labels (parents outside the label set cannot receive a score).
term_to_idx = {t: i for i, t in enumerate(mlb.classes_)}
restricted_parents = {
    t: [p for p in parents.get(t, []) if p in term_to_idx]
    for t in mlb.classes_
}

# Flatten the hierarchy once into (child column, parent column) pairs instead
# of re-resolving the indices for every batch and every propagation round.
propagation_edges = [
    (term_to_idx[child], term_to_idx[parent])
    for child, parent_list in restricted_parents.items()
    for parent in parent_list
]

total_predictions = 0

with open(CONFIG['OUTPUT'], 'w') as f_out:

    for batch_start in range(0, n_test, batch_size):
        batch_end = min(batch_start + batch_size, n_test)
        batch_ids = test_protein_ids[batch_start:batch_end]

        if (batch_start // batch_size) % 10 == 0:
            log(f"Processing batch {batch_start:,}-{batch_end:,}/{n_test:,}", "PROG")

        # Extract features (same families and same column order as in training)
        batch_features = []

        if CONFIG['USE_TFIDF']:
            batch_texts = [get_kmers(test_seqs[p], CONFIG['KMER_SIZE']) for p in batch_ids]
            batch_tfidf = tfidf.transform(batch_texts).toarray().astype(np.float32)
            batch_features.append(batch_tfidf)
            del batch_texts, batch_tfidf

        if CONFIG['USE_AA_COMP']:
            batch_aa = np.array([calculate_aa_composition(test_seqs[p]) for p in batch_ids], dtype=np.float32)
            batch_features.append(batch_aa)
            del batch_aa

        if CONFIG['USE_DIPEP']:
            batch_dipep = np.array([calculate_dipeptide_composition(test_seqs[p]) for p in batch_ids], dtype=np.float32)
            batch_features.append(batch_dipep)
            del batch_dipep

        if CONFIG['USE_PHYSICHEM']:
            batch_phys = np.array([calculate_physicochemical_properties(test_seqs[p]) for p in batch_ids], dtype=np.float32)
            batch_features.append(batch_phys)
            del batch_phys

        X_batch = np.concatenate(batch_features, axis=1)
        del batch_features

        X_batch = scaler.transform(X_batch).astype(np.float32)

        # Predict
        y_batch_pred = model.predict(X_batch, batch_size=64, verbose=0)
        del X_batch

        # Propagate: a parent's score is raised to its child's score whenever
        # the child scores higher. Up to PROP_ROUNDS passes are made; once a
        # full pass changes nothing, further passes cannot change anything
        # either, so the loop stops early.
        for _ in range(CONFIG['PROP_ROUNDS']):
            changed = False
            for c_idx, p_idx in propagation_edges:
                mask = y_batch_pred[:, c_idx] > y_batch_pred[:, p_idx]
                if mask.any():
                    y_batch_pred[mask, p_idx] = y_batch_pred[mask, c_idx]
                    changed = True
            if not changed:
                break

        # Write predictions
        for i, pid in enumerate(batch_ids):
            scores = y_batch_pred[i]
            # Indices of the TOP_K_PRED highest scores, best first.
            top_idx = np.argsort(scores)[-CONFIG['TOP_K_PRED']:][::-1]

            # Look the protein up once; .get() does not create a key even when
            # cafa5_lookup is a defaultdict.
            protein_cafa5 = cafa5_lookup.get(pid)

            for idx in top_idx:
                score = float(scores[idx])
                go_term = mlb.classes_[idx]

                # CHANGE: Fixed ensemble - clip to [0,1]
                if protein_cafa5 is not None and go_term in protein_cafa5:
                    cafa5_score = protein_cafa5[go_term]
                    score = (1 - CONFIG['CAFA5_WEIGHT']) * score + CONFIG['CAFA5_WEIGHT'] * cafa5_score
                    score = np.clip(score, 0, 1)  # FIX: Ensure scores are in [0,1]

                # Drop near-zero scores to keep the submission small.
                if score > 0.001:
                    f_out.write(f"{pid}\t{go_term}\t{score:.3g}\n")
                    total_predictions += 1

        del y_batch_pred
        gc.collect()

log(f"Submission saved to: {CONFIG['OUTPUT']}")
log(f"Total predictions written: {total_predictions:,}")

# =============================================================================
# ANALYZE SUBMISSION
# =============================================================================
# Re-read the file that was just written and sanity-check the score range
# before printing a human-readable summary of the run.
log("Analyzing submission...")

df_sub = pd.read_csv(CONFIG['OUTPUT'], sep='\t', header=None, names=['pid', 'go', 'score'])

log(f"Final score check - Min: {df_sub['score'].min():.4f}, Max: {df_sub['score'].max():.4f}")

# Scores above 1.0 would indicate that the blending/clipping step misbehaved.
if df_sub['score'].max() > 1.0:
    log("WARNING: Scores exceed 1.0, something is wrong!", "ERROR")

print("\n" + "="*80)
print("ðŸŽ¯ IMPROVEMENTS SUMMARY")
print("="*80)
# NOTE(review): "Avg per protein" divides by the number of distinct proteins in
# the submission and would raise ZeroDivisionError if that number were 0.
print(f"""
CHANGES FROM BASELINE:
  1. âœ… More labels: 4000 â†’ {len(top_terms):,}
  2. âœ… IA-weighted loss: {'Enabled' if CONFIG['USE_IA_WEIGHTED_LOSS'] else 'Disabled'}
  3. âœ… Better threshold search: 0.01-0.5 â†’ 0.005-0.8 (found: {best_thr:.3f})
  4. âœ… Fixed score clipping bug (max was 142, now: {df_sub['score'].max():.4f})
  5. âœ… More predictions per protein: 500 â†’ {CONFIG['TOP_K_PRED']}
  6. âœ… Better CAFA5 ensemble weight: 0.3 â†’ {CONFIG['CAFA5_WEIGHT']}

RESULTS:
  - Training proteins: {len(valid_proteins):,}
  - Test proteins: {df_sub['pid'].nunique():,}
  - Total predictions: {len(df_sub):,}
  - Avg per protein: {len(df_sub)/df_sub['pid'].nunique():.1f}
  - Weighted F1: {best_f1:.4f}

OUTPUT: {CONFIG['OUTPUT']}
""")

print("="*80)
print("âœ… READY FOR SUBMISSION!")
print("="*80)

# Final clean-up. As the last expression of the cell, its return value is
# presumably what the notebook shows as the cell's output.
gc.collect()

2025-11-14 16:37:40.498898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763138260.952377      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763138261.057032      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CAFA-6 PROTEIN FUNCTION PREDICTION - V2 (INCREMENTAL IMPROVEMENTS)
[    0.0s] INFO  | Loading FASTA sequences...
[    3.2s] INFO  | Loaded 82,404 training sequences
[    3.2s] INFO  | Loaded 224,309 test sequences
[    3.7s] INFO  | Loaded 537,027 annotations for 82,404 proteins
[    3.7s] INFO  | Loaded 40,122 IA weights
[    3.7s] INFO  | Parsing GO ontology hierarchy...
[    4.3s] INFO  | Parsed 40,121 GO terms
[    4.3s] INFO  | Propagating labels...
[   24.7s] PROG  | Propagated 25,000/82,404 proteins
[   25.7s] PROG  | Propagated 50,000/82,404 proteins
[   26.6s] PROG  | Propagated 75,000/82,404 proteins
[   26.9s] INFO  | Before: 537,027, After: 3,564,990
[   26.9s] INFO  | Selecting labels...
[   27.4s] INFO  | Labels: MF=2000, BP=2000, CC=2000, Total=6000
[  190.4s] INFO  | Training proteins: 82,404
[  190.4s] INFO  | Extracting features for training set...
[  190.4s] INFO  |   Computing TF-IDF...
[  231.0s] INFO  |     TF-IDF: (82404, 8558)
[  231.5s] INFO  |   Computing AA c

I0000 00:00:1763138566.171559      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763138566.172355      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


[  291.4s] INFO  | Model params: 17,047,536
[  291.4s] INFO  | Splitting data and training...
[  292.8s] INFO  | Train: (70043, 8626), Val: (12361, 8626)
Epoch 1/20


I0000 00:00:1763138584.026347      64 service.cc:148] XLA service 0x7e7ea40165a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763138584.027753      64 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1763138584.027775      64 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1763138584.582739      64 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1763138588.041620      64 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


2189/2189 - 31s - 14ms/step - loss: 0.0729 - precision: 0.0148 - recall: 0.1212 - val_loss: 0.0428 - val_precision: 0.0152 - val_recall: 0.1127 - learning_rate: 1.0000e-04
Epoch 2/20
2189/2189 - 15s - 7ms/step - loss: 0.0322 - precision: 0.0158 - recall: 0.1175 - val_loss: 0.0320 - val_precision: 0.0179 - val_recall: 0.1332 - learning_rate: 1.0000e-04
Epoch 3/20
2189/2189 - 15s - 7ms/step - loss: 0.0269 - precision: 0.0177 - recall: 0.1320 - val_loss: 0.0246 - val_precision: 0.0208 - val_recall: 0.1560 - learning_rate: 1.0000e-04
Epoch 4/20
2189/2189 - 15s - 7ms/step - loss: 0.0223 - precision: 0.0195 - recall: 0.1457 - val_loss: 0.0206 - val_precision: 0.0221 - val_recall: 0.1658 - learning_rate: 1.0000e-04
Epoch 5/20
2189/2189 - 15s - 7ms/step - loss: 0.0191 - precision: 0.0213 - recall: 0.1593 - val_loss: 0.0185 - val_precision: 0.0228 - val_recall: 0.1714 - learning_rate: 1.0000e-04
Epoch 6/20
2189/2189 - 15s - 7ms/step - loss: 0.0172 - precision: 0.0228 - recall: 0.1709 - val_loss

0