**1. TRAIN CHUNG 1 MODEL, LẤY TOP-K, NHƯ DƯỚI LÀ TOP-K = 150**

**2. TRAIN TÁCH BIỆT 3 MODEL, LẤY TOP C, F, P. NHƯ DƯỚI LÀ LẤY 20-50-175**

In [None]:
# deepgo_pipeline_split_aspect.py
import os
import time
import csv
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

# ---------------------------
# CONFIG
# ---------------------------
# Hyperparameters and output locations shared by the training and prediction
# code further down in this cell.
SEED = 42  # seeds NumPy / PyTorch below and the train/validation split
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16  # mini-batch size of the training and validation loaders
EPOCHS = 40  # epochs per aspect model; also sizes the OneCycleLR schedule
LR = 5e-4  # AdamW learning rate, reused as the OneCycleLR max_lr
WEIGHT_DECAY = 1e-4  # AdamW weight decay
CLIP_NORM = 5.0  # max gradient norm handed to clip_grad_norm_
FEATURE_DROPOUT = 0.2  # per-feature zeroing probability of the training dataset
LABEL_SMOOTH = 0.03  # default eps of bce_label_smooth
# NOTE(review): THRESHOLD is not referenced anywhere in the visible code;
# train_aspect evaluates with a hard-coded threshold of 0.5 -- confirm which
# value is intended.
THRESHOLD = 0.2

# Number of highest-scoring GO terms written per protein, keyed by aspect.
TOP_K = {"C": 20, "F": 50, "P": 175}
OUT_SUBMIT_FINAL = "/kaggle/working/submission.tsv"  # merged submission file

# Seed the NumPy and PyTorch generators (and every CUDA device when present).
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ---------------------------
# LOAD DATA
# ---------------------------
# Feature matrices for training / test proteins and the test protein IDs.
# assumes row r of ids_test identifies row r of X_test -- TODO confirm
X_train = np.load('/kaggle/input/cafa56-end/650_taxon_features_X_INPUT.npy')
X_test = np.load('/kaggle/input/cafa56-end/X_test.npy')
ids_test = np.load('/kaggle/input/cafa56-end/protein_ids_test.npy')

# One sparse label matrix and one GO-term array per aspect (C, F, P).
# assumes column j of Y_<aspect> corresponds to GO_terms_<aspect>[j] and that
# rows are aligned with X_train -- TODO confirm against the preprocessing code
Y_sparse_C = load_npz("/kaggle/input/cafa56-end/Y_C.npz")
GO_terms_C = np.load("/kaggle/input/cafa56-end/GO_terms_C.npy", allow_pickle=True)

Y_sparse_F = load_npz("/kaggle/input/cafa56-end/Y_F.npz")
GO_terms_F = np.load("/kaggle/input/cafa56-end/GO_terms_F.npy", allow_pickle=True)

Y_sparse_P = load_npz("/kaggle/input/cafa56-end/Y_P.npz")
GO_terms_P = np.load("/kaggle/input/cafa56-end/GO_terms_P.npy", allow_pickle=True)

# Aspect-keyed views of the objects above, used by the train / predict loops.
Y_sparse_dict = {"C": Y_sparse_C, "F": Y_sparse_F, "P": Y_sparse_P}
GO_terms_dict = {"C": GO_terms_C, "F": GO_terms_F, "P": GO_terms_P}

# normalization
# Per-feature (axis 0) statistics over ALL rows of X_train; they are computed
# before train_aspect splits off validation rows, so those rows contribute too.
# The 1e-6 keeps the later division finite for constant features.
global_mean = X_train.mean(axis=0).astype(np.float32)
global_std  = X_train.std(axis=0).astype(np.float32) + 1e-6

# IA vector
# Parse "<GO id>\t<value>" lines into a dict. A line that does not split into
# exactly two tab-separated fields (including a blank line) raises ValueError
# at the tuple unpacking.
IA_dict = {}
with open("/kaggle/input/cafa56-end/IA.tsv") as f:
    for line in f:
        go, value = line.strip().split("\t")
        IA_dict[go] = float(value)
# Per-aspect weight vectors ordered like the GO term arrays; terms that are
# missing from IA.tsv get weight 0.0.
IA_vec_C = np.array([IA_dict.get(go, 0.0) for go in GO_terms_C], dtype=np.float32)
IA_vec_F = np.array([IA_dict.get(go, 0.0) for go in GO_terms_F], dtype=np.float32)
IA_vec_P = np.array([IA_dict.get(go, 0.0) for go in GO_terms_P], dtype=np.float32)

# ontology mapping
# The first line is skipped (treated as a header), then each line is split
# into a GO id and its aspect.
# NOTE(review): the file is named .tsv but is split on "," -- confirm the
# delimiter. go2asp is not referenced by any other code visible in this file.
go2asp = {}
with open("/kaggle/input/mapping-wf1/go_to_aspect.tsv") as f:
    next(f)
    for line in f:
        go, asp = line.strip().split(",")
        go2asp[go] = asp

# ---------------------------
# DATASET
# ---------------------------
class ProteinDataset(Dataset):
    """Map-style dataset over a feature matrix with optional sparse labels.

    Args:
        X: 2-D array of features; ``X[i]`` must return a NumPy row.
        Y_sparse: optional SciPy sparse label matrix indexed with the same
            row numbers as ``X``. When ``None`` only features are returned.
        indices: optional row numbers to expose; defaults to every row of
            ``X``.
        mean, std: optional per-feature statistics. Standardisation is
            applied only when both are given.
        feature_dropout: probability of zeroing each individual feature
            when the augmentation fires.
        train: enables the augmentation. It fires for roughly half of the
            fetched samples.

    The augmentation draws from the torch RNG rather than ``np.random``.
    ``DataLoader`` gives every worker process its own torch seed, whereas the
    global NumPy RNG state can be duplicated across workers, which would make
    them produce identical dropout masks.
    """

    def __init__(self, X, Y_sparse=None, indices=None, mean=None, std=None, feature_dropout=0.0, train=True):
        self.X = X
        self.Y = Y_sparse
        self.indices = np.array(indices) if indices is not None else np.arange(X.shape[0])
        self.mean = mean
        self.std = std
        self.feature_dropout = feature_dropout
        self.train = train

    def __len__(self):
        """Return the number of exposed rows."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``x`` (unlabelled) or ``(x, y)`` for the ``idx``-th exposed row."""
        i = int(self.indices[idx])
        x = self.X[i].astype(np.float32)
        if self.mean is not None and self.std is not None:
            x = (x - self.mean) / self.std
        x = torch.from_numpy(x)

        # Feature dropout: applied to ~50% of training samples, zeroing each
        # feature independently with probability `feature_dropout`.
        if self.train and self.feature_dropout > 0.0 and torch.rand(1).item() < 0.5:
            keep = torch.rand(x.shape[0]) >= self.feature_dropout
            x = x * keep.to(x.dtype)

        if self.Y is None:
            return x

        # ravel() (not squeeze()) so a label matrix with a single column
        # still yields a 1-D target of length 1 instead of a 0-d scalar.
        y = torch.from_numpy(self.Y[i].toarray().ravel().astype(np.float32))
        return x, y

# ---------------------------
# MODEL
# ---------------------------
class MLP(nn.Module):
    """Feed-forward network producing one logit per output label.

    Every hidden width contributes ``Linear -> LayerNorm -> GELU -> Dropout``;
    a final ``Linear`` maps to ``output_dim``. The layers live in ``self.net``
    in exactly that order, so ``state_dict`` keys stay ``net.<index>.*``.

    Args:
        input_dim: number of input features.
        output_dim: number of output logits.
        hidden: iterable of hidden-layer widths. The default is a tuple so
            that no mutable object is shared between calls.
        dropout: dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, output_dim, hidden=(1024, 512), dropout=0.3):
        super().__init__()
        layers = []
        in_dim = input_dim
        for h in hidden:
            layers.extend([
                nn.Linear(in_dim, h),
                nn.LayerNorm(h),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
            in_dim = h
        layers.append(nn.Linear(in_dim, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw (pre-sigmoid) logits for ``x``."""
        return self.net(x)

# ---------------------------
# LOSS
# ---------------------------
def bce_label_smooth(logits, targets, pos_weight=None, eps=LABEL_SMOOTH):
    """Mean binary cross-entropy on logits against label-smoothed targets.

    Hard targets of 1 become ``1 - eps`` and hard targets of 0 become
    ``eps / 2`` before the loss is computed.

    Args:
        logits: raw model outputs.
        targets: 0/1 tensor shaped like ``logits``.
        pos_weight: optional per-label weight for the positive term, passed
            straight to ``nn.BCEWithLogitsLoss``.
        eps: smoothing strength.

    Returns:
        Scalar tensor: the element-wise loss averaged over all entries.
    """
    positive_value = 1.0 - eps
    negative_value = eps * 0.5
    soft_targets = targets * positive_value + (1 - targets) * negative_value

    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='none')
    return loss_fn(logits, soft_targets).mean()

# ---------------------------
# dummy eval (original logic kept as-is)
# ---------------------------
@torch.no_grad()
def f1_weighted_batch(y_true, y_pred_bin, IA_vec):
    """Weighted F1 of one batch of binary predictions.

    For every row, precision is the weight of correctly predicted terms over
    the weight of all predicted terms, and recall is the same numerator over
    the weight of all true terms. Rows without any prediction are left out of
    the precision average; rows whose true terms carry no weight are left out
    of the recall average.

    Args:
        y_true: 2-D array, entries equal to 1 mark true terms.
        y_pred_bin: 2-D array of the same shape, entries equal to 1 mark
            predicted terms.
        IA_vec: 1-D array holding one weight per column.

    Returns:
        Harmonic mean of the averaged precision and recall, or ``0.0`` when
        both averages are zero.
    """
    precision_per_row = []
    recall_per_row = []

    for row, truth in enumerate(y_true):
        is_true = truth == 1
        is_pred = y_pred_bin[row] == 1
        hit_weight = IA_vec[is_pred & is_true].sum()

        if is_pred.sum() > 0:
            predicted_weight = IA_vec[is_pred].sum()
            precision_per_row.append(hit_weight / (predicted_weight + 1e-9))

        true_weight = IA_vec[is_true].sum()
        if true_weight > 0:
            recall_per_row.append(hit_weight / (true_weight + 1e-9))

    wpr = np.mean(precision_per_row) if precision_per_row else 0.0
    wrc = np.mean(recall_per_row) if recall_per_row else 0.0
    if (wpr + wrc) > 0:
        return 2 * wpr * wrc / (wpr + wrc + 1e-9)
    return 0.0

def eval_model(model, loader, IA_vec, threshold=0.5, pos_weight=None):
    """Evaluate ``model`` on ``loader`` and report loss and weighted F1.

    Args:
        model: network returning logits; it is switched to eval mode.
        loader: iterable yielding ``(features, labels)`` batches.
        IA_vec: per-label weights forwarded to ``f1_weighted_batch``.
        threshold: probability cut-off used to binarise predictions.
        pos_weight: optional positive-class weights for the loss.

    Returns:
        ``(val_loss, val_f1)``: the sample-weighted mean loss, and the plain
        mean of the per-batch F1 values (each batch counts equally,
        whatever its size).

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    seen = 0
    batch_f1 = []

    with torch.no_grad():
        for features, labels in loader:
            features = features.to(DEVICE)
            labels = labels.to(DEVICE)
            batch_n = features.size(0)

            logits = model(features)
            batch_loss = bce_label_smooth(logits, labels, pos_weight)
            loss_sum += float(batch_loss.item()) * batch_n
            seen += batch_n

            # Binarise the sigmoid outputs on the CPU before scoring.
            predicted = (torch.sigmoid(logits).cpu().numpy() >= threshold).astype(np.float32)
            truth = labels.cpu().numpy().astype(np.float32)
            batch_f1.append(f1_weighted_batch(truth, predicted, IA_vec))

    return loss_sum / seen, float(np.mean(batch_f1))


  

# ---------------------------
# TRAIN FUNCTION
# ---------------------------
def train_aspect(X_train, Y_sparse_aspect, aspect_name, val_threshold=0.5):
    """Train one MLP for a single aspect and checkpoint its best epoch.

    Rows without any stored label entry in this aspect are dropped, the rest
    are split 90/10 into train and validation, and the model is trained for
    ``EPOCHS`` epochs with AdamW and a OneCycleLR schedule. After every epoch
    the model is evaluated on the validation split; whenever the validation
    F1 improves, the weights are saved to
    ``/kaggle/working/best_model_<aspect_name>.pt``.

    Args:
        X_train: 2-D feature array.
        Y_sparse_aspect: SciPy sparse label matrix whose rows are indexed
            like ``X_train``.
        aspect_name: ``"C"``, ``"F"`` or ``"P"``; selects the IA weight
            vector and names the checkpoint file.
        val_threshold: probability cut-off used for the validation F1.

    Returns:
        ``(checkpoint_path, train_losses, val_losses, val_f1s)`` where the
        three lists hold one value per epoch.

    Raises:
        KeyError: if ``aspect_name`` is not one of the known aspects.
    """
    # Resolve the weight vector once, up front: an unknown aspect fails here
    # instead of after a full training epoch.
    IA_vec = {"C": IA_vec_C, "F": IA_vec_F, "P": IA_vec_P}[aspect_name]

    Y_sparse_aspect = Y_sparse_aspect.tocsr()

    # Keep only rows that have at least one stored entry for this aspect.
    row_nnz = np.diff(Y_sparse_aspect.indptr)
    valid_idx = np.where(row_nnz > 0)[0]
    train_idx, val_idx = train_test_split(valid_idx, test_size=0.1, random_state=SEED)

    train_ds = ProteinDataset(X_train, Y_sparse_aspect, train_idx, global_mean, global_std, FEATURE_DROPOUT, True)
    val_ds   = ProteinDataset(X_train, Y_sparse_aspect, val_idx, global_mean, global_std, 0.0, False)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    model = MLP(X_train.shape[1], Y_sparse_aspect.shape[1]).to(DEVICE)

    # Positive-class weights from the training split: negatives / positives
    # per label, clipped to [1, 5]. ravel() keeps the vector 1-D even when
    # the aspect has a single label column.
    train_sparse = Y_sparse_aspect[train_idx]
    label_freq = np.asarray(train_sparse.sum(axis=0)).ravel()
    N_train = len(train_idx)
    pos_weight_arr = (N_train - label_freq) / (label_freq + 1e-8)
    pos_weight_arr = np.clip(pos_weight_arr, 1.0, 5.0)
    pos_weight = torch.tensor(pos_weight_arr, dtype=torch.float32).to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LR, epochs=EPOCHS,
                                              steps_per_epoch=len(train_loader))

    # Mixed precision only makes sense on CUDA; state that explicitly so a
    # CPU run does not rely on the AMP helpers disabling themselves.
    use_amp = DEVICE == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val_f1 = -1
    OUT_MODEL = f"/kaggle/working/best_model_{aspect_name}.pt"

    # Per-epoch history returned to the caller.
    hist_train_loss = []
    hist_val_loss = []
    hist_val_f1 = []

    for epoch in range(1, EPOCHS+1):
        model.train()
        total_loss = 0
        n_samples = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()

            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(xb)
                loss = bce_label_smooth(logits, yb, pos_weight)

            scaler.scale(loss).backward()
            # Unscale before clipping so CLIP_NORM applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()  # OneCycleLR advances once per batch

            total_loss += float(loss.item()) * xb.size(0)
            n_samples += xb.size(0)

        train_loss = total_loss / n_samples

        val_loss, val_f1 = eval_model(model, val_loader, IA_vec, threshold=val_threshold, pos_weight=pos_weight)

        print(f"{aspect_name} Epoch {epoch}: train_loss={train_loss:.6f} val_loss={val_loss:.6f} val_F1={val_f1:.6f}")

        hist_train_loss.append(train_loss)
        hist_val_loss.append(val_loss)
        hist_val_f1.append(val_f1)

        # Checkpoint on strict improvement of the validation F1.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save({"model_state": model.state_dict()}, OUT_MODEL)
            print(f" -> saved best model {aspect_name}")

    return OUT_MODEL, hist_train_loss, hist_val_loss, hist_val_f1

# ---------------------------
# TRAIN 3 aspects + SAVE history
# ---------------------------
# Checkpoint path and per-epoch curves for every aspect, keyed by aspect.
trained_models, history = {}, {}

for asp in ("C", "F", "P"):
    print(f"Training aspect {asp} ...")
    # train_aspect returns the checkpoint path followed by the three curves.
    model_path, *curves = train_aspect(X_train, Y_sparse_dict[asp], asp)
    trained_models[asp] = model_path
    history[asp] = dict(zip(("train_loss", "val_loss", "val_f1"), curves))

# ---------------------------
# PLOT TRAINING CURVES
# ---------------------------
# One figure per aspect: losses on the left panel, validation F1 on the right.
for asp in ("C", "F", "P"):
    curves = history[asp]
    epoch_axis = range(1, len(curves["train_loss"]) + 1)

    plt.figure(figsize=(12, 4))

    # Left panel: training and validation loss.
    plt.subplot(1, 2, 1)
    for key, legend_name in (("train_loss", "Train Loss"), ("val_loss", "Val Loss")):
        plt.plot(epoch_axis, curves[key], label=legend_name)
    plt.title(f"{asp} – Loss Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # Right panel: validation F1.
    plt.subplot(1, 2, 2)
    plt.plot(epoch_axis, curves["val_f1"], color="green", label="Val F1")
    plt.title(f"{asp} – Validation F1")
    plt.xlabel("Epoch")
    plt.ylabel("F1 Score")
    plt.legend()

    plt.tight_layout()
    # Save before show() so the file is written from the populated figure.
    plt.savefig(f"/kaggle/working/{asp}_training_plots.png")
    plt.show()

print("Training curves saved in /kaggle/working/")

# ---------------------------
# PREDICT AND MERGE
# ---------------------------
# Score every test protein with each aspect model and append the top-k terms
# of all three aspects to a single tab-separated submission file.
test_ds = ProteinDataset(X_test, None, mean=global_mean, std=global_std, train=False)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False)

# Every test row needs an ID; fail early instead of part-way through writing.
if len(ids_test) < len(test_ds):
    raise ValueError(
        f"ids_test has {len(ids_test)} entries but X_test has {len(test_ds)} rows"
    )

with open(OUT_SUBMIT_FINAL, "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["ID", "GO_ID", "score"])

    for asp in ["C","F","P"]:
        print(f"Predicting aspect {asp} ...")
        model_path = trained_models[asp]
        GO_terms_aspect = GO_terms_dict[asp]
        # Never ask for more terms than the aspect has.
        top_k = min(TOP_K[asp], len(GO_terms_aspect))

        ckpt = torch.load(model_path, map_location=DEVICE)
        model = MLP(X_train.shape[1], len(GO_terms_aspect)).to(DEVICE)
        model.load_state_dict(ckpt["model_state"])
        model.eval()

        # Running position in ids_test. The loader is not shuffled, so batch
        # rows map to consecutive IDs; tracking the offset avoids repeating
        # the loader's batch size as a magic number.
        offset = 0
        with torch.no_grad():
            for xb in test_loader:
                probs = torch.sigmoid(model(xb.to(DEVICE))).cpu().numpy()
                batch_ids = ids_test[offset:offset + probs.shape[0]]
                offset += probs.shape[0]

                for pid, row in zip(batch_ids, probs):
                    # Column indices of the top_k highest scores, best first.
                    for idx in np.argsort(row)[::-1][:top_k]:
                        score = float(row[idx])
                        if score > 0.0:
                            writer.writerow([pid, GO_terms_aspect[idx], score])

print("All done. Submission file:", OUT_SUBMIT_FINAL)


**3. CÁI NÀY CHỈ LẤY TOP K TỪ MODEL ĐÃ TRAIN**

In [None]:
# import os
# import time
# import csv
# import numpy as np
# from scipy.sparse import load_npz
# from sklearn.model_selection import train_test_split

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader

# import matplotlib.pyplot as plt

# # ---------------------------
# # CONFIG
# # ---------------------------
# SEED = 42
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# BATCH_SIZE = 16
# EPOCHS = 40
# LR = 5e-4
# WEIGHT_DECAY = 1e-4
# CLIP_NORM = 5.0
# FEATURE_DROPOUT = 0.2
# LABEL_SMOOTH = 0.03
# THRESHOLD = 0.2

# TOP_K = {"C": 25, "F": 75, "P": 200}
# OUT_SUBMIT_FINAL = "/kaggle/working/submission.tsv"

# np.random.seed(SEED)
# torch.manual_seed(SEED)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(SEED)

# # ---------------------------
# # LOAD DATA
# # ---------------------------
# X_train = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_esm2_taxon_features_X.npy')
# X_test = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_X_test.npy')
# ids_test = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids_test.npy')

# Y_sparse_C = load_npz("/kaggle/input/cafa56-splitfunc/Y_C.npz")
# GO_terms_C = np.load("/kaggle/input/cafa56-splitfunc/GO_terms_C.npy", allow_pickle=True)

# Y_sparse_F = load_npz("/kaggle/input/cafa56-splitfunc/Y_F.npz")
# GO_terms_F = np.load("/kaggle/input/cafa56-splitfunc/GO_terms_F.npy", allow_pickle=True)

# Y_sparse_P = load_npz("/kaggle/input/cafa56-splitfunc/Y_P.npz")
# GO_terms_P = np.load("/kaggle/input/cafa56-splitfunc/GO_terms_P.npy", allow_pickle=True)

# Y_sparse_dict = {"C": Y_sparse_C, "F": Y_sparse_F, "P": Y_sparse_P}
# GO_terms_dict = {"C": GO_terms_C, "F": GO_terms_F, "P": GO_terms_P}

# # normalization
# global_mean = X_train.mean(axis=0).astype(np.float32)
# global_std  = X_train.std(axis=0).astype(np.float32) + 1e-6

# # IA vector
# IA_dict = {}
# with open("/kaggle/input/cafa6-data/IA.tsv") as f:
#     for line in f:
#         go, value = line.strip().split("\t")
#         IA_dict[go] = float(value)
# IA_vec_C = np.array([IA_dict.get(go, 0.0) for go in GO_terms_C], dtype=np.float32)
# IA_vec_F = np.array([IA_dict.get(go, 0.0) for go in GO_terms_F], dtype=np.float32)
# IA_vec_P = np.array([IA_dict.get(go, 0.0) for go in GO_terms_P], dtype=np.float32)

# # ontology mapping
# go2asp = {}
# with open("/kaggle/input/mapping-wf1/go_to_aspect.tsv") as f:
#     next(f)
#     for line in f:
#         go, asp = line.strip().split(",")
#         go2asp[go] = asp

# class ProteinDataset(Dataset):
#     def __init__(self, X, Y_sparse=None, indices=None, mean=None, std=None, feature_dropout=0.0, train=True):
#         self.X = X
#         self.Y = Y_sparse
#         self.indices = np.array(indices) if indices is not None else np.arange(X.shape[0])
#         self.mean = mean
#         self.std = std
#         self.feature_dropout = feature_dropout
#         self.train = train

#     def __len__(self):
#         return len(self.indices)

#     def __getitem__(self, idx):
#         i = int(self.indices[idx])
#         x = self.X[i].astype(np.float32)
#         if self.mean is not None and self.std is not None:
#             x = (x - self.mean) / self.std
#         if self.train and self.feature_dropout > 0.0 and np.random.rand() < 0.5:
#             mask = (np.random.rand(x.shape[0]) >= self.feature_dropout).astype(np.float32)
#             x = x * mask

#         x = torch.from_numpy(x)

#         if self.Y is not None:
#             y = torch.from_numpy(self.Y[i].toarray().squeeze().astype(np.float32))
#             return x, y
#         else:
#             return x

# class DeepGO_MLP(nn.Module):
#     def __init__(self, input_dim, output_dim, hidden=[1024, 512], dropout=0.3):
#         super().__init__()
#         layers = []
#         in_dim = input_dim
#         for h in hidden:
#             layers.append(nn.Linear(in_dim, h))
#             layers.append(nn.LayerNorm(h))
#             layers.append(nn.GELU())
#             layers.append(nn.Dropout(dropout))
#             in_dim = h
#         layers.append(nn.Linear(in_dim, output_dim))
#         self.net = nn.Sequential(*layers)

#     def forward(self, x):
#         return self.net(x)
            
# test_ds = ProteinDataset(X_test, None, mean=global_mean, std=global_std, train=False)
# test_loader = DataLoader(test_ds, batch_size=128, shuffle=False)

# trained_models = {}
# trained_models["C"] = "/kaggle/input/3splitfunc/other/default/1/best_model_C.pt"
# trained_models["F"] = "/kaggle/input/3splitfunc/other/default/1/best_model_F.pt"
# trained_models["P"] = "/kaggle/input/3splitfunc/other/default/1/best_model_P.pt"

# with open(OUT_SUBMIT_FINAL, "w", newline="") as f:
#     writer = csv.writer(f, delimiter="\t")
#     writer.writerow(["ID", "GO_ID", "score"])

#     for asp in ["C","F","P"]:
#         print(f"Predicting aspect {asp} ...")
#         model_path = trained_models[asp]
#         GO_terms_aspect = GO_terms_dict[asp]
#         top_k = TOP_K[asp]

#         ckpt = torch.load(model_path, map_location=DEVICE)
#         model = DeepGO_MLP(X_train.shape[1], len(GO_terms_aspect)).to(DEVICE)
#         model.load_state_dict(ckpt["model_state"])
#         model.eval()

#         for i, xb in enumerate(test_loader):
#             xb = xb.to(DEVICE)
#             with torch.no_grad():
#                 logits = model(xb)
#                 probs = torch.sigmoid(logits).cpu().numpy()

#             for j in range(probs.shape[0]):
#                 pid = ids_test[i*128 + j]
#                 row_rescore = probs[j]
#                 topk_idx = np.argsort(row_rescore)[::-1][:top_k]
#                 for idx in topk_idx:
#                     score = float(probs[j, idx])
#                     if score > 0.0:
#                         writer.writerow([pid, GO_terms_aspect[idx], score])


# import pandas as pd

# print("------------------------------------------------------")
# print(f"Post-processing: Sorting {OUT_SUBMIT_FINAL} by ID...")

# # 1. Read the TSV file with pandas.
# # NOTE: header=None makes pandas treat the first line as data. The writer
# # above DOES emit an ["ID", "GO_ID", "score"] header row, so change this to
# # header=0 before re-enabling the cell; otherwise that row is sorted as data.
# df = pd.read_csv(OUT_SUBMIT_FINAL, sep='\t', header=None, names=["ID", "GO_ID", "score"])

# # 2. Sort by the ID column.
# df = df.sort_values(by="ID", ascending=True)

# # 3. Overwrite the original file.
# # header=False keeps the column names out of the output file.
# df.to_csv(OUT_SUBMIT_FINAL, sep='\t', index=False, header=False)

# print(f"Sorting complete. Final file saved to {OUT_SUBMIT_FINAL}")
