# Cellular Component GO term prediction
This notebook applies the trained CC prediction model to the test dataset and writes the final CC term predictions to both a CSV file and a tab-separated text file.

### 1. Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import tensorflow as tf

### 2. Setup
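The saved artifacts (trained model, embedding scaler, and GO-term metadata) are loaded and reused as-is; nothing is retrained in this notebook.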

In [2]:
# Paths to every file this notebook reads or writes.
# All CC artifacts (model, scaler, outputs) are derived from `meta_dir`, and all
# test inputs from `test_dir`, so each directory only has to be changed in one place.
meta_dir       = "metadata/CC"
test_dir       = "data/test"

# inputs
CC_model_path  = os.path.join(meta_dir, "cc_model.h5")
x_test_path    = os.path.join(test_dir, "X_test.npy")
test_ids_path  = os.path.join(test_dir, "test_ids.txt")
scaler_path    = os.path.join(meta_dir, "embed_scaler.pkl")

# outputs
pred_out_csv   = os.path.join(meta_dir, "cc_predictions.csv")

In [3]:
pos_weight = 8.0  # must match training value


def weighted_bce(y_true, y_pred):
    """Binary cross-entropy with positive labels up-weighted by ``pos_weight``.

    Every element's cross-entropy is scaled by
    ``y_true * pos_weight + (1 - y_true)``: ``pos_weight`` where the label is 1
    and 1 where the label is 0. The weighted values are then averaged into a
    single scalar.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted probability tensor.

    Returns:
        Scalar tensor holding the mean weighted cross-entropy.
    """
    element_weights = (1.0 - y_true) + y_true * pos_weight
    element_loss = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    return tf.reduce_mean(element_loss * element_weights)

# Restore the trained CC model from disk. The custom loss is passed in under
# the name "weighted_bce" so Keras can resolve it while loading the file
# (presumably the name the model was compiled with at training time).
model = tf.keras.models.load_model(
    CC_model_path, custom_objects=dict(weighted_bce=weighted_bce)
)

In [4]:
# Load the test feature matrix.
X_test = np.load(x_test_path)

# Load the protein IDs that label the rows of X_test (one ID per line, blank
# lines ignored). Fail with an explicit message when the file is missing.
if not os.path.exists(test_ids_path):
    raise FileNotFoundError(f"Could not find test IDs at: {test_ids_path}")

with open(test_ids_path, "r") as f:
    stripped_lines = (line.strip() for line in f)
    test_ids = [pid for pid in stripped_lines if pid]

# Every row of X_test needs exactly one ID. This is a data check rather than a
# debugging aid, so raise explicitly instead of using `assert` (which is
# removed when Python runs with -O), and report both sizes to ease debugging.
if len(test_ids) != X_test.shape[0]:
    raise ValueError(
        "Mismatch: number of test_ids != rows in X_test "
        f"({len(test_ids)} IDs vs {X_test.shape[0]} rows)"
    )

In [5]:
# Load the CC label metadata stored next to the model.
#
# NOTE(review): both loads below unpickle data (`allow_pickle=True` and
# `pickle.load`), which can execute arbitrary code. Only use them on artifact
# files from a trusted source.

# NOTE(review): `cc_keep` is not referenced by any cell visible in this notebook.
cc_keep = np.load(f"{meta_dir}/go_terms.npy", allow_pickle=True)

with open(f"{meta_dir}/go_map.pkl", "rb") as map_file:
    go_cc_map = pickle.load(map_file)

# Invert the mapping so a model output column index resolves to its GO term.
# NOTE(review): if two GO terms share an index, the later one silently wins.
inv_go_cc_map = dict(zip(go_cc_map.values(), go_cc_map.keys()))

### 3. Test feature normalization, same as training

In [6]:
# Normalize the test features with the scaler saved alongside the model.
# NOTE(review): `pickle.load` can execute arbitrary code; only load trusted files.
with open(scaler_path, "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Column layout, exactly as in training: the first `interpro_dim` columns are
# the InterPro features, everything after them is the ProtT5 embedding.
interpro_dim = 1000
domain_cols = slice(0, interpro_dim)
embed_cols = slice(interpro_dim, None)

Xtest_domains = X_test[:, domain_cols].astype(np.float32)
Xtest_embed = X_test[:, embed_cols].astype(np.float32)

# Only the embedding block goes through the scaler; the InterPro block is
# passed through unchanged.
Xtest_embed_scaled = scaler.transform(Xtest_embed)

# Put the two blocks back together in their original column order.
X_test_scaled = np.concatenate((Xtest_domains, Xtest_embed_scaled), axis=1)

### 4. Running the model and recording the output

In [7]:
# Score every test protein; row i of `test_probs` is later read as the
# per-term score vector for test_ids[i].
test_probs = model.predict(
    X_test_scaled,
    verbose=1,
    batch_size=512,
)



In [11]:
max_protein_terms = 500   # cap of terms per sub-ontology per protein
THR = 0.4                 # minimum score for a term to be reported

# Build the long-format prediction list: one (protein, GO term, score) tuple
# per reported term, with each protein's terms ordered by descending score.
rows = []
for row_no, pid in enumerate(test_ids):
    probs = test_probs[row_no]

    # Terms that clear the threshold; if there are none, fall back to the
    # 10 highest-scoring terms so the protein still gets predictions.
    selected = np.where(probs >= THR)[0]
    if selected.size == 0:
        selected = np.argsort(-probs)[:10]

    # Highest score first, then apply the per-protein cap.
    by_score = np.argsort(-probs[selected])
    ranked = selected[by_score][:max_protein_terms]

    for col in ranked:
        go = inv_go_cc_map[int(col)]
        score = float(probs[col])

        # Scores must lie in (0, 1.000]: drop non-positive values, clamp the rest at 1.
        if score <= 0.0:
            continue
        score = min(score, 1.0)

        # Three significant figures.
        # NOTE(review): `.3g` switches to exponent notation below 1e-4
        # (e.g. "1e-05"), which can occur in the fallback branch. Confirm the
        # consumer of the submission file accepts that form.
        score_str = f"{score:.3g}"
        rows.append((pid, go, score_str))

In [12]:
# Save the predictions as CSV: one row per (protein, GO term) pair, no index column.
pred_df = pd.DataFrame(
    data=rows,
    columns=["Protein_ID", "GO_term", "score"],
)
pred_df.to_csv(path_or_buf=pred_out_csv, index=False)
print("Saved:", pred_out_csv, "rows:", len(pred_df.index))

Saved: metadata/CC\cc_predictions.csv rows: 29642


In [13]:
# Write the same predictions as a header-less, tab-separated text file:
# protein ID, GO term, and formatted score on each line.
submission_txt = os.path.join(meta_dir, "cc_submission.txt")
with open(submission_txt, "w") as out_file:
    out_file.writelines(
        f"{pid}\t{go}\t{score_str}\n" for pid, go, score_str in rows
    )
print("Saved:", submission_txt)

Saved: metadata/CC\cc_submission.txt
