In [None]:
import warnings
warnings.filterwarnings('ignore')
!pip install biopython obonet --quiet
!pip install transformers biopython --quiet

In [None]:
import torch
from transformers import EsmTokenizer, EsmModel
import re
from Bio import SeqIO
from collections import defaultdict

# --- 1.1. Parse the taxonomy file ---
def parse_taxonomy(taxon_file_path):
    """Read a two-column, tab-separated file into a protein -> taxon lookup.

    Each line is stripped and split on tabs; lines that do not yield exactly
    two fields are ignored. If a protein ID occurs more than once, the last
    taxon seen wins in the mapping, but every taxon seen is still reported.

    Args:
        taxon_file_path: path of the TSV file to read.

    Returns:
        A ``(prot_to_taxon, taxa)`` tuple: the protein-ID -> taxon-ID dict
        (both strings) and the sorted list of distinct taxon IDs.
    """
    prot_to_taxon = {}
    seen_taxa = set()
    with open(taxon_file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                continue
            protein_id, taxon_id = (field.strip() for field in fields)
            prot_to_taxon[protein_id] = taxon_id
            seen_taxa.add(taxon_id)
    return prot_to_taxon, sorted(seen_taxa)

# --- 1.2. Parse the sequence (FASTA) file ---
def parse_sequences(fasta_file_path):
    """Read a FASTA file into a protein-ID -> amino-acid-sequence lookup.

    The protein ID is the token enclosed by the first pair of pipes in the
    record id (e.g. ``sp|A0A1D9BZF0|...`` -> ``A0A1D9BZF0``). Records whose
    id contains no such pipe-delimited token are skipped.

    Args:
        fasta_file_path: path of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        Dict mapping protein ID to the sequence as a plain string.
    """
    accession_re = re.compile(r'\|([A-Z0-9]+)\|')
    prot_to_seq = {}
    for record in SeqIO.parse(fasta_file_path, "fasta"):
        hit = accession_re.search(record.id)
        if hit is None:
            # Header does not follow the db|ACCESSION|name layout.
            continue
        prot_to_seq[hit.group(1)] = str(record.seq)
    return prot_to_seq

# --- Input data ---
# Replace these paths with the location of your own files when running elsewhere:
prot_to_taxon, unique_taxa_from_func = parse_taxonomy("/kaggle/input/cafa56/CAFA56/CAFA56_train_taxonomy.tsv")
prot_to_seq = parse_sequences("/kaggle/input/cafa56/CAFA56/CAFA56_sequences.fasta")

# Alias for the list of distinct taxon IDs returned by parse_taxonomy.
unique_taxa = unique_taxa_from_func





In [None]:
# Map every taxon ID (string) found in the taxonomy file to a contiguous integer index.
taxon_to_index = {taxon: i for i, taxon in enumerate(unique_taxa)}
num_taxon = len(unique_taxa)  # number of distinct taxa in the taxonomy file
print(num_taxon)
# Example: num_taxon = 5

In [None]:
# T·∫£i m√¥ h√¨nh ESM-2
import torch
import gc
from tqdm.auto import tqdm
import time


# Release cached GPU memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

# --- The ten taxon IDs that each get a dedicated one-hot slot ---
top_taxa = [
    "9606", "10090", "3702", "559292", "10116",
    "284812", "83333", "7227", "6239", "83332",
]
taxon_to_index_top = dict(zip(top_taxa, range(len(top_taxa))))
others_index = len(taxon_to_index_top)   # slot shared by every taxon outside top_taxa
num_taxon_top = others_index + 1         # 10 named slots + 1 'others' slot = 11 dimensions

def prot_taxon_onehot(prot_id, prot_to_taxon, num_taxon_top=num_taxon_top, taxon_to_index_top=taxon_to_index_top):
    """Return the one-hot taxon vector of a protein.

    Args:
        prot_id: protein identifier looked up in ``prot_to_taxon``.
        prot_to_taxon: mapping protein ID -> taxon ID (string).
        num_taxon_top: length of the returned vector. Its last position is
            the catch-all 'others' slot. Defaults to the module-level value
            (11: ten named taxa plus 'others').
        taxon_to_index_top: mapping taxon ID -> slot index for the taxa that
            have a dedicated slot.

    Returns:
        A float32 tensor of shape ``(num_taxon_top,)`` with a single 1. The 1
        sits in the taxon's own slot, or in the last slot when the protein
        has no taxon or its taxon has no dedicated slot.
    """
    # Derive the 'others' slot from the requested vector length instead of the
    # module-level `others_index`, so a caller-supplied mapping/length can never
    # index past the end of the vector. With the defaults this is the same slot (10).
    others_slot = num_taxon_top - 1

    taxon_id_str = prot_to_taxon.get(prot_id)
    if taxon_id_str is None:
        index = others_slot
    else:
        index = taxon_to_index_top.get(taxon_id_str, others_slot)

    vec = torch.zeros(num_taxon_top, dtype=torch.float32)
    vec[index] = 1
    return vec


# Checkpoint used for both the tokenizer and the encoder.
model_name = "facebook/esm2_t33_650M_UR50D"
tokenizer = EsmTokenizer.from_pretrained(model_name)
model_esm2 = EsmModel.from_pretrained(model_name)

# Width of the encoder's hidden states, read from the loaded config.
EMBEDDING_DIM = model_esm2.config.hidden_size
print(f"Embedding dimension: {EMBEDDING_DIM}")

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Move the weights to the chosen device and switch to inference mode.
model_esm2 = model_esm2.to(device).eval()

print(f"Model loaded. GPU memory: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB\n")

# --- Per-batch feature extraction ---
def process_and_embed_batch(prot_ids, prot_to_seq, prot_to_taxon, model, tokenizer, device, max_length=1024):
    """Embed each protein and append its one-hot taxon vector.

    Every sequence is cut into chunks that, together with the tokenizer's two
    special tokens, fit into ``max_length`` tokens. Each chunk is run through
    ``model`` on its own, the hidden state at position 0 is kept, and the
    per-chunk vectors are averaged into one sequence embedding.

    NOTE: special tokens are now enabled, so position 0 is the tokenizer's
    leading special token (``<cls>`` for the Hugging Face ESM tokenizer --
    confirm if another tokenizer is passed). The previous code disabled
    special tokens and therefore returned the hidden state of the first
    residue. Features produced before this change are not comparable with
    features produced after it; regenerate train and test features together.

    Args:
        prot_ids: iterable of protein IDs to process.
        prot_to_seq: mapping protein ID -> amino-acid sequence (string).
        prot_to_taxon: mapping protein ID -> taxon ID, forwarded to
            ``prot_taxon_onehot``.
        model: encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        tokenizer: tokenizer matching ``model``.
        device: torch device the inputs are moved to.
        max_length: maximum number of tokens per model call, special tokens
            included.

    Returns:
        List of ``(protein_id, features)`` tuples, where ``features`` is the
        1-D concatenation of the sequence embedding and the taxon one-hot
        vector, located on ``device``. IDs that are missing from
        ``prot_to_seq`` or map to an empty sequence are skipped.

    Raises:
        ValueError: if ``max_length`` leaves no room for at least one residue
            next to the two special tokens.
    """
    if max_length < 3:
        raise ValueError("max_length must be at least 3 (two special tokens + one residue)")
    # Two positions per chunk are reserved for the special tokens.
    residues_per_chunk = max_length - 2

    final_features = []

    for pid in prot_ids:
        seq = prot_to_seq.get(pid)
        if not seq:
            # Unknown ID or empty sequence: there is nothing to embed
            # (torch.stack on an empty chunk list would raise).
            continue

        # --- Split sequences that are too long for a single model call ---
        chunks = [seq[i:i + residues_per_chunk] for i in range(0, len(seq), residues_per_chunk)]
        chunk_embeddings = []

        for chunk in chunks:
            tokens = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
                add_special_tokens=True
            )
            input_ids = tokens['input_ids'].to(device)
            attention_mask = tokens['attention_mask'].to(device)

            with torch.no_grad():
                output = model(input_ids=input_ids, attention_mask=attention_mask)
                # Hidden state of the leading special token. clone() detaches the
                # small vector from the full activation tensor so the latter can
                # actually be freed below.
                emb = output.last_hidden_state[0, 0, :].clone()  # [hidden_dim]
                chunk_embeddings.append(emb)

            del input_ids, attention_mask, output
            torch.cuda.empty_cache()

        # --- Average the chunk embeddings (unweighted) ---
        seq_emb = torch.stack(chunk_embeddings, dim=0).mean(dim=0)  # [hidden_dim]

        # --- Append the taxon one-hot vector ---
        taxon_vec = prot_taxon_onehot(pid, prot_to_taxon).to(device)
        features = torch.cat([seq_emb, taxon_vec], dim=0)
        final_features.append((pid, features))

        del chunk_embeddings, seq_emb, taxon_vec
        torch.cuda.empty_cache()

    return final_features


# --- Sequence length statistics ---
print("Analyzing sequence lengths...")
lengths = list(map(len, prot_to_seq.values()))
print(f"Total proteins: {len(lengths)}")
print(f"Min length: {min(lengths)}, Max length: {max(lengths)}")
print(f"Mean length: {sum(lengths)/len(lengths):.0f}\n")

# --- Main loop with progress reporting ---
final_features_list = []
all_prot_ids = list(prot_to_seq.keys())
BATCH_SIZE = 128   # proteins handed to process_and_embed_batch per call (it still embeds them one at a time)
MAX_LENGTH = 1024  # maximum tokens per model call

# Count the proteins that have both a sequence and a taxon.
valid_count = sum(1 for pid in all_prot_ids if pid in prot_to_seq and pid in prot_to_taxon)
print(f"Valid proteins (c√≥ c·∫£ sequence v√† taxon): {valid_count}/{len(all_prot_ids)}\n")

# The bar counts every protein ID, valid or not, so it reaches 100% at the end.
pbar = tqdm(total=len(all_prot_ids), desc="Processing proteins", unit="protein")

start_time = time.time()
error_count = 0  # number of batches dropped because of CUDA out-of-memory errors

for i in range(0, len(all_prot_ids), BATCH_SIZE):
    batch_ids = all_prot_ids[i:i + BATCH_SIZE]
    valid_batch_ids = [pid for pid in batch_ids if pid in prot_to_seq and pid in prot_to_taxon]
    if valid_batch_ids:
        try:
            batch_features = process_and_embed_batch(
                valid_batch_ids, prot_to_seq, prot_to_taxon,
                model_esm2, tokenizer, device,
                max_length=MAX_LENGTH
            )
            final_features_list.extend(batch_features)
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            # Best effort: drop this batch, free what we can, and say so instead
            # of losing the proteins silently.
            error_count += 1
            torch.cuda.empty_cache()
            tqdm.write(f"OOM: skipped batch starting at index {i} ({len(valid_batch_ids)} proteins)")

    # Advance by the whole batch so the bar matches its `total`.
    pbar.update(len(batch_ids))

pbar.close()


# --- Run statistics ---
# Wall-clock time since start_time was taken.
elapsed_time = time.time() - start_time
print(f"\n{'='*60}")
print(f"‚úì X·ª≠ l√Ω ho√†n t·∫•t!")
print(f"{'='*60}")
# Proteins that produced a feature vector vs. proteins that had both a sequence and a taxon.
print(f"T·ªïng proteins x·ª≠ l√Ω: {len(final_features_list)}/{valid_count}")
print(f"L·ªói OOM: {error_count}")
print(f"Th·ªùi gian: {elapsed_time/60:.1f} ph√∫t ({elapsed_time:.0f}s)")
print(f"T·ªëc ƒë·ªô trung b√¨nh: {len(final_features_list)/elapsed_time:.2f} proteins/gi√¢y")
# Shape of the first feature vector, or 'N/A' when nothing was embedded.
print(f"K√≠ch th∆∞·ªõc feature: {final_features_list[0][1].shape if final_features_list else 'N/A'}")
print(f"GPU memory cu·ªëi: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
print(f"{'='*60}\n")

In [None]:
import numpy as np
import os
print("\n" + "="*60)
print("B·∫Øt ƒë·∫ßu t·ªïng h·ª£p v√† l∆∞u d·ªØ li·ªáu...")
print("="*60)

# --- 1. Split the (protein_id, features) tuples into two parallel lists ---
print("\n[1/4] T√°ch protein IDs v√† features...")
protein_ids = [item[0] for item in final_features_list]
feature_tensors = [item[1] for item in final_features_list]

print(f"   ‚úì ƒê√£ t√°ch {len(protein_ids)} protein IDs")
print(f"   ‚úì ƒê√£ t√°ch {len(feature_tensors)} feature vectors")

# --- 2. Convert to NumPy ---
print("\n[2/4] Chuy·ªÉn ƒë·ªïi PyTorch tensors sang NumPy arrays...")

# 2.1. Stack the per-protein feature tensors into one 2-D tensor
# NOTE(review): torch.stack raises on an empty list, i.e. when no protein was embedded.
print("   - ƒêang stack feature tensors...")
stacked_features = torch.stack(feature_tensors)
print(f"   ‚úì K√≠ch th∆∞·ªõc tensor sau khi stack: {stacked_features.shape}")

# 2.2. Move to the CPU and convert to NumPy
print("   - ƒêang chuy·ªÉn sang CPU v√† NumPy...")
all_features_np = stacked_features.cpu().numpy()
protein_ids_np = np.array(protein_ids)

print(f"   ‚úì Features shape: {all_features_np.shape}")
print(f"   ‚úì Protein IDs shape: {protein_ids_np.shape}")
print(f"   ‚úì Features dtype: {all_features_np.dtype}")

# --- 3. Create the output directory ---
print("\n[3/4] Chu·∫©n b·ªã th∆∞ m·ª•c output...")
output_dir = '/kaggle/working/'  # change if needed
os.makedirs(output_dir, exist_ok=True)
print(f"   ‚úì Th∆∞ m·ª•c output: {output_dir}")

# --- 4. Save the .npy files ---
print("\n[4/4] L∆∞u files .npy...")

# 4.1. Save the feature matrix (X)
features_output_path = os.path.join(output_dir, 'cafa56_esm2_taxon_features_X.npy')
print(f"   - ƒêang l∆∞u features...")
np.save(features_output_path, all_features_np)
file_size_mb = os.path.getsize(features_output_path) / (1024 * 1024)
print(f"   ‚úÖ ƒê√£ l∆∞u Features t·∫°i: {features_output_path}")
print(f"      Shape: {all_features_np.shape}")
print(f"      Size: {file_size_mb:.2f} MB")

# 4.2. Save the protein IDs (row order matches the feature matrix)
ids_output_path = os.path.join(output_dir, 'cafa56_protein_ids.npy')
print(f"\n   - ƒêang l∆∞u protein IDs...")
np.save(ids_output_path, protein_ids_np)
file_size_kb = os.path.getsize(ids_output_path) / 1024
print(f"   ‚úÖ ƒê√£ l∆∞u Protein IDs t·∫°i: {ids_output_path}")
print(f"      Shape: {protein_ids_np.shape}")
print(f"      Size: {file_size_kb:.2f} KB")

# --- 5. Final summary ---
print("\n" + "="*60)
print("‚úì HO√ÄN T·∫§T T·∫§T C·∫¢!")
print("="*60)
print(f"üìä T·ªïng k·∫øt:")
print(f"   - S·ªë proteins: {len(protein_ids_np)}")
print(f"   - Feature dimension: {all_features_np.shape[1]}")
print(f"   - ESM-2 embedding dim: {EMBEDDING_DIM}")
# The one-hot vector appended to every embedding has num_taxon_top entries
# (top taxa + 'others'), not num_taxon (the number of distinct taxa in the file).
print(f"   - Taxon one-hot dim: {num_taxon_top}")
print(f"   - Total feature dim: {EMBEDDING_DIM + num_taxon_top}")
print(f"\nüìÅ Files ƒë√£ l∆∞u:")
print(f"   1. {features_output_path}")
print(f"   2. {ids_output_path}")
print("="*60 + "\n")

# --- 6. Data integrity check (optional) ---
print("üîç Ki·ªÉm tra t√≠nh to√†n v·∫πn d·ªØ li·ªáu...")
try:
    # Reload both files and compare them with the arrays that were saved,
    # so the success message below is only printed when they really match.
    loaded_features = np.load(features_output_path)
    loaded_ids = np.load(ids_output_path)

    if not np.array_equal(loaded_features, all_features_np, equal_nan=True):
        raise ValueError("reloaded features differ from the in-memory array")
    if not np.array_equal(loaded_ids, protein_ids_np):
        raise ValueError("reloaded protein IDs differ from the in-memory array")

    print("   ‚úÖ T·∫•t c·∫£ files ƒë√£ ƒë∆∞·ª£c l∆∞u ƒë√∫ng v√† c√≥ th·ªÉ load l·∫°i!")
    print(f"   ‚úÖ Verified {loaded_features.shape[0]} proteins")

except Exception as e:
    # Best effort: report the problem but keep the notebook running.
    print(f"   ‚ö†Ô∏è L·ªói khi ki·ªÉm tra: {e}")

print("\nüéâ Done! B·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng c√°c file .npy n√†y cho training.\n")

# --- 7. Usage hint ---
print("üí° C√°ch load d·ªØ li·ªáu sau n√†y:")
print("-" * 60)
print("import numpy as np")
print(f"X = np.load('{features_output_path}')")
print(f"protein_ids = np.load('{ids_output_path}')")
print("print(f'X shape: {X.shape}')")
print("print(f'IDs shape: {protein_ids.shape}')")
print("-" * 60 + "\n")

import numpy as np
# Reload from the paths that were just written (identical to the previous
# hard-coded paths for the default output_dir, but still correct if
# output_dir is changed). Note: this rebinds `protein_ids` from a list to an array.
X = np.load(features_output_path)
protein_ids = np.load(ids_output_path)
print(f'X shape: {X.shape}')
print(f'IDs shape: {protein_ids.shape}')


In [None]:
# import numpy as np

# protein_ids = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids.npy')
# X = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_esm2_taxon_features_X.npy')

# print(f'X shape: {X.shape}')
# print(f'IDs shape: {protein_ids.shape}')
# print(f'---')
# # Print additional details
# print(f'X dtype: {X.dtype}')
# print(f'X ndim: {X.ndim}')
# print(f'IDs dtype: {protein_ids.dtype}')
# print(f'IDs ndim: {protein_ids.ndim}')
# # Print the first few elements to check the contents
# print(f'---')
# print(f'X first 5 rows:\n{X[:5]}') 
# print(f'IDs first 5 elements: {protein_ids[:5]}')

In [None]:
# import pandas as pd
# import numpy as np

# # ================================
# # 1. Load the previously saved X and IDs
# # ================================

# IDs = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids.npy')
# X = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_esm2_taxon_features_X.npy')

# print("Loaded X:", X.shape)
# print("Loaded IDs:", IDs.shape)

# # Map ID -> row index in X
# id_to_index = {pid: i for i, pid in enumerate(IDs)}

# # ================================
# # 2. Load the training GO annotation table
# # ================================
# # Format file: EntryID \t term \t aspect
# df = pd.read_csv("/kaggle/input/cafa-m56/CAFA56/CAFA56_train_terms.tsv", sep="\t")

# print("Annotation rows:", len(df))
# df.head()

# # ================================
# # 3. Collect all GO terms present in the training set
# # ================================
# all_go_terms = sorted(df["term"].unique())
# num_go = len(all_go_terms)

# print("Number of GO terms =", num_go)

# # Map GO term -> column of Y
# go_to_idx = {go: i for i, go in enumerate(all_go_terms)}

# # ================================
# # 4. Build the label matrix Y (multi-hot)
# # ================================
# N = len(IDs)
# Y = np.zeros((N, num_go), dtype=np.float32)

# missing = 0

# for _, row in df.iterrows():
#     pid = row["EntryID"]
#     go  = row["term"]

#     if pid not in id_to_index:
#         missing += 1
#         continue

#     i = id_to_index[pid]  # row trong X
#     j = go_to_idx[go]     # c·ªôt GO
#     Y[i, j] = 1.0

# print("Proteins in annotation but NOT in X:", missing)

# # ================================
# # 5. Save Y and the list of GO terms
# # ================================
# np.save("/kaggle/working/Y.npy", Y)
# np.save("/kaggle/working/GO_terms.npy", np.array(all_go_terms))

# print("\n===== DONE =====")
# print("X shape:", X.shape)
# print("Y shape:", Y.shape)
# print("GO_terms shape:", len(all_go_terms))


In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
from collections import Counter

# =============== OBO PARSER ===============
def parse_obo(obo_path):
    """Extract the ``is_a`` hierarchy between GO terms from an OBO file.

    Only ``id: GO:...`` and ``is_a:`` lines are used. ``is_a`` lines are
    attributed to the most recent GO id of the *current* stanza; every stanza
    header (``[Term]``, ``[Typedef]``, ...) closes the previous one, so
    relations declared inside non-term stanzas are never attached to a term.

    Args:
        obo_path: path of the OBO file.

    Returns:
        A ``(parents, children)`` tuple of dicts. ``parents[term]`` lists the
        direct ``is_a`` parents of ``term``; ``children[term]`` lists the
        terms naming ``term`` as a parent. Every parsed term has an entry in
        both dicts; a parent that is referenced but never defined appears as
        a key in ``children`` only.
    """
    parents = {}
    children = {}
    current = None

    # OBO files are UTF-8; do not depend on the platform's default encoding.
    with open(obo_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Any stanza header ends the previous stanza, not just "[Term]".
            if line.startswith("[") and line.endswith("]"):
                current = None
                continue

            if line.startswith("id: GO:"):
                current = line.split("id: ")[1]
                parents[current] = []
                children[current] = []
                continue

            if line.startswith("is_a:") and current:
                # Keep only the target id: drop the "! name" comment and any
                # "{...}" trailing modifier, tolerate a missing space after the tag.
                target = line[len("is_a:"):].split("!", 1)[0].split("{", 1)[0].strip()
                if target:
                    parents[current].append(target)

    for child, plist in parents.items():
        for p in plist:
            children.setdefault(p, []).append(child)

    return parents, children


def filter_leaf_terms(filtered_terms, children):
    """Keep the terms that have no child inside ``filtered_terms``.

    Args:
        filtered_terms: sequence of term IDs to filter.
        children: mapping term -> list of its child terms; a term missing
            from the mapping is treated as having no children.

    Returns:
        List of the retained terms, in their original order.
    """
    candidates = set(filtered_terms)
    return [
        term
        for term in filtered_terms
        if candidates.isdisjoint(children.get(term, []))
    ]

# ==========================================

obo = ('/kaggle/input/cafa56-end/go-basic.obo')
IDs = np.load('/kaggle/input/cafa56-end/650_protein_ids_INPUT.npy')
X = np.load('/kaggle/input/cafa56-end/650_taxon_features_X_INPUT.npy')

# Protein ID -> row index in X / in every label matrix built below.
id_to_index = {pid: i for i, pid in enumerate(IDs)}

parents, children = parse_obo(obo)
df = pd.read_csv("/kaggle/input/cafa-m56/CAFA56/CAFA56_train_terms.tsv", sep="\t")

MIN_FREQ = 21              # keep GO terms annotated at least this many times within an aspect
aspects = ["C", "P", "F"]  # values expected in the "aspect" column

for asp in aspects:
    df_asp = df[df["aspect"] == asp]
    term_counts = Counter(df_asp["term"])

    filtered_terms = sorted([go for go, c in term_counts.items() if c >= MIN_FREQ])
    print(f"[{asp}] freq‚â•{MIN_FREQ}:", len(filtered_terms))

    # NEW: apply leaf filter
    # filtered_terms = filter_leaf_terms(filtered_terms, children)
    # print(f"[{asp}] leaf-only terms:", len(filtered_terms))

    go_to_idx = {go: i for i, go in enumerate(filtered_terms)}
    num_go = len(filtered_terms)

    rows, cols = [], []
    missing = 0  # annotation rows (not distinct proteins) whose protein has no row in X
    N = len(IDs)

    # Plain column iteration: same rows in the same order as DataFrame.iterrows(),
    # without building a Series per row.
    for pid, go in zip(df_asp["EntryID"], df_asp["term"]):
        if go not in go_to_idx:
            continue
        if pid not in id_to_index:
            missing += 1
            continue

        rows.append(id_to_index[pid])
        cols.append(go_to_idx[go])

    data = np.ones(len(rows), dtype=np.float32)
    Y_sparse = sparse.coo_matrix((data, (rows, cols)), shape=(N, num_go))
    # COO keeps duplicate (row, col) entries and they are summed on conversion,
    # so a repeated annotation would become a label of 2.0. Merge duplicates
    # and clamp every stored value back to 1 to keep Y strictly multi-hot.
    Y_sparse.sum_duplicates()
    Y_sparse.data[:] = 1.0

    sparse.save_npz(f"/kaggle/working/Y_{asp}.npz", Y_sparse)
    np.save(f"/kaggle/working/GO_terms_{asp}.npy", np.array(filtered_terms))

    print(f"{asp} done: {num_go} GO terms, missing proteins: {missing}")


In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from collections import Counter

# ================================
# 1. Load X and IDs
# ================================
IDs = np.load('/kaggle/input/cafa56-end/650_protein_ids_INPUT.npy')
X = np.load('/kaggle/input/cafa56-end/650_taxon_features_X_INPUT.npy')

print("Loaded X:", X.shape)
print("Loaded IDs:", IDs.shape)

# Protein ID -> row index in X / Y.
id_to_index = {pid: i for i, pid in enumerate(IDs)}

# ================================
# 2. Load the training GO annotation table
# ================================
df = pd.read_csv("/kaggle/input/cafa-m56/CAFA56/CAFA56_train_terms.tsv", sep="\t")
print("Annotation rows:", len(df))

# ================================
# 3. Filter GO terms by frequency >= MIN_FREQ
# ================================
MIN_FREQ = 21

# ---- Count GO terms ----
term_counts = Counter(df["term"])

# ---- Keep all terms with freq >= MIN_FREQ ----
filtered_terms = sorted([
    go for go, c in term_counts.items() if c >= MIN_FREQ
])

print(f"Original GO terms: {len(term_counts)}")
print(f"Filtered GO terms (freq ‚â• {MIN_FREQ}): {len(filtered_terms)}")

# Mapping GO term -> column index
go_to_idx = {go: i for i, go in enumerate(filtered_terms)}
num_go = len(filtered_terms)

N = len(IDs)
rows = []
cols = []
missing = 0  # annotation rows (not distinct proteins) whose protein has no row in X

# ================================
# 4. Build sparse Y (NO OBO FILTER)
# ================================
# Plain column iteration: same rows in the same order as DataFrame.iterrows(),
# without building a Series for each of the millions of rows.
for pid, go in zip(df["EntryID"], df["term"]):
    # Skip infrequent GO
    if go not in go_to_idx:
        continue

    # Skip proteins not in X
    if pid not in id_to_index:
        missing += 1
        continue

    i = id_to_index[pid]
    j = go_to_idx[go]
    rows.append(i)
    cols.append(j)

data = np.ones(len(rows), dtype=np.float32)
Y_sparse = sparse.coo_matrix((data, (rows, cols)), shape=(N, num_go))
# COO keeps duplicate (row, col) entries and they are summed on conversion,
# so a repeated annotation would become a label of 2.0. Merge duplicates
# and clamp every stored value back to 1 to keep Y strictly multi-hot.
Y_sparse.sum_duplicates()
Y_sparse.data[:] = 1.0

# ================================
# 5. Save outputs
# ================================
sparse.save_npz("/kaggle/working/Y.npz", Y_sparse)
np.save("/kaggle/working/GO_terms.npy", np.array(filtered_terms))

print("\n==== DONE ====")
print(f"Stored {num_go} GO terms in Y.npz")
print("Proteins missing in X:", missing)


Loaded X: (144096, 1291)
Loaded IDs: (144096,)
Annotation rows: 5410821
Original GO terms: 32347
Filtered GO terms (freq ‚â• 0): 32347

==== DONE ====
Stored 32347 GO terms in Y.npz
Proteins missing in X: 25447


In [None]:
# import numpy as np
# from scipy import sparse

# # Load Y dense
# Y = np.load("/kaggle/working/Y.npy", allow_pickle=False)
# GO_terms = np.load("/kaggle/working/GO_terms.npy", allow_pickle=True)

# print("Dense Y shape:", Y.shape)

# # Convert to sparse
# Y_sparse = sparse.csr_matrix(Y)
# sparse.save_npz("/kaggle/working/Y_sparse.npz", Y_sparse)
# print("Saved sparse Y at /kaggle/working/Y_sparse.npz")


In [None]:
# import numpy as np
# from scipy import sparse

# # Load Y sparse
# Y_sparse = sparse.load_npz("/kaggle/working/Y_sparse.npz")
# GO_terms = np.load("/kaggle/working/GO_terms.npy", allow_pickle=True)

# print("Sparse Y loaded")
# print("Type:", type(Y_sparse))
# print("Shape:", Y_sparse.shape)
# print("Number of non-zero entries:", Y_sparse.nnz)
# print("Density (non-zero fraction):", Y_sparse.nnz / (Y_sparse.shape[0] * Y_sparse.shape[1]))

# # Inspect the first few rows
# num_rows_to_check = 5
# for i in range(num_rows_to_check):
#     row = Y_sparse.getrow(i).toarray()  # Chuy·ªÉn sang dense t·∫°m th·ªùi ƒë·ªÉ ki·ªÉm tra
#     print(f"\nRow {i} non-zero indices:", np.nonzero(row)[1])
#     print(f"Row {i} values:", row[0, np.nonzero(row)[1]])


In [None]:
# import numpy as np
# import pandas as pd
# from scipy.sparse import load_npz

# # --- Load d·ªØ li·ªáu ---
# X = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_esm2_taxon_features_X.npy')
# protein_ids = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids.npy')
# Y_sparse = load_npz("/kaggle/working/Y_sparse.npz")  # CSR sparse

# # File train GO
# train_go_file = "/kaggle/input/cafa-m56/CAFA56/CAFA56_train_terms.tsv"  # s·ª≠a ƒë∆∞·ªùng d·∫´n n·∫øu c·∫ßn
# train_df = pd.read_csv(train_go_file, sep='\t')

# # --- Build dict protein_id -> list of GO indices ---
# # First, we need the list of all GO terms
# all_go_terms = np.load("/kaggle/working/GO_terms.npy", allow_pickle=True)
# go_to_index = {go:i for i, go in enumerate(all_go_terms)}

# protein_to_go_indices = {}
# for pid, go, aspect in zip(train_df['EntryID'], train_df['term'], train_df['aspect']):
#     if pid not in protein_to_go_indices:
#         protein_to_go_indices[pid] = []
#     protein_to_go_indices[pid].append(go_to_index[go])

# # --- Spot-check n randomly chosen proteins ---
# def check_features_labels_match(n=5, seed=42):
#     np.random.seed(seed)
#     indices = np.random.choice(len(protein_ids), n, replace=False)
    
#     for idx in indices:
#         pid = protein_ids[idx]
#         row_sparse = Y_sparse[idx]
#         row_indices = row_sparse.nonzero()[1]  # ch·ªâ s·ªë c·ªôt c√≥ nh√£n
#         train_indices = protein_to_go_indices.get(pid, [])
        
#         print(f"Protein ID: {pid}")
#         print(f"Indices in Y_sparse: {sorted(row_indices)}")
#         print(f"Indices in train file: {sorted(train_indices)}")
#         print(f"Match? {set(row_indices) == set(train_indices)}")
#         print("-"*50)

# # --- Ch·∫°y ki·ªÉm tra ---
# check_features_labels_match(n=10)


In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# import numpy as np
# from scipy.sparse import csr_matrix, load_npz

# # --- Load d·ªØ li·ªáu ---
# X = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_esm2_taxon_features_X.npy')
# protein_ids = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids.npy')
# X_test = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_X_test.npy')
# X_test_protein_ids = np.load('/kaggle/input/cafa-m56/CAFA56/cafa56_protein_ids_test.npy')
# Y_sparse = load_npz("/kaggle/input/cafa56-y-label/cafa56_Y_sparse.npz")  # CSR sparse
# all_go_terms = np.load("/kaggle/input/cafa56-y-label/cafa56_GO_terms.npy", allow_pickle=True)

# # --- Dataset class ---
# class ProteinDataset(Dataset):
#     def __init__(self, X, Y_sparse):
#         self.X = torch.tensor(X, dtype=torch.float32)
#         # Convert sparse Y to dense tensor khi training
#         self.Y = torch.tensor(Y_sparse.toarray(), dtype=torch.float32)

#     def __len__(self):
#         return self.X.shape[0]

#     def __getitem__(self, idx):
#         return self.X[idx], self.Y[idx]

# # --- T·∫°o dataset ---
# dataset = ProteinDataset(X, Y_sparse)

# # --- Chia train/test ---
# from sklearn.model_selection import train_test_split

# indices = np.arange(len(dataset))
# train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# from torch.utils.data import Subset

# train_dataset = Subset(dataset, train_idx)
# test_dataset = Subset(dataset, test_idx)

# # --- DataLoader ---
# batch_size = 64
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# # --- Ki·ªÉm tra ---
# for X_batch, Y_batch in train_loader:
#     print(X_batch.shape)  # [batch_size, feature_dim]
#     print(Y_batch.shape)  # [batch_size, num_labels]
#     break


In [2]:
import torch
import numpy as np
from scipy.sparse import load_npz, csr_matrix
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split

def load_data():
    """Load the feature matrices, protein IDs, label matrix and GO term list.

    Returns:
        ``(X, protein_ids, X_test, X_test_protein_ids, Y_sparse, GO)``.
        ``Y_sparse`` comes back in whatever sparse format ``Y.npz`` was saved
        in (``load_npz`` restores the stored format).
    """
    dense_inputs = tuple(
        np.load(path)
        for path in (
            '/kaggle/input/cafa56-end/650_taxon_features_X_INPUT.npy',
            '/kaggle/input/cafa56-end/650_protein_ids_INPUT.npy',
            '/kaggle/input/cafa56-end/X_test.npy',
            '/kaggle/input/cafa56-end/protein_ids_test.npy',
        )
    )
    labels = load_npz("/kaggle/working/Y.npz")
    go_terms = np.load("/kaggle/working/GO_terms.npy", allow_pickle=True)
    return (*dense_inputs, labels, go_terms)

class ProteinDataset(Dataset):
    """Dataset of (feature vector, multi-hot label vector) pairs.

    The label matrix is kept sparse (CSR) and only the requested rows are
    densified in ``__getitem__``. Converting the whole matrix up front costs
    ``n_samples * n_labels * 4`` bytes, which for the shapes in this notebook
    (144096 x 32347) is roughly 18 GB.
    """

    def __init__(self, X, Y_sparse):
        """Store the features as a float32 tensor and the labels as float32 CSR.

        Args:
            X: array-like of shape (n_samples, n_features).
            Y_sparse: label matrix of shape (n_samples, n_labels); any scipy
                sparse format (or a dense 2-D array) is accepted.
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        # CSR supports cheap row indexing; COO (what save_npz stored) does not
        # support indexing at all.
        self.Y = csr_matrix(Y_sparse, dtype=np.float32)

    def __len__(self):
        """Number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(X[idx], Y[idx])`` as float32 tensors.

        An integer index yields 1-D tensors; a slice or index list yields
        2-D tensors, as with the previous dense implementation.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        labels = self.Y[idx].toarray()
        if isinstance(idx, (int, np.integer)):
            # A single CSR row comes back as shape (1, n_labels); drop the row axis.
            labels = labels[0]
        return self.X[idx], torch.from_numpy(labels)

def check_integrity(X_train, ids_train, X_test, ids_test, Y_sparse, GO_terms):
    """Print a consistency report for the loaded arrays.

    Checks that the train features, train IDs and label rows have the same
    length, that the label columns match the GO term list, and that train and
    test features have the same width. It then prints a few sample IDs and
    the number of samples without any label. Mismatches are reported as FAIL
    lines; nothing is raised or returned.

    Args:
        X_train: 2-D train feature array.
        ids_train: sequence of train protein IDs.
        X_test: 2-D test feature array.
        ids_test: sequence of test protein IDs.
        Y_sparse: label matrix of shape (n_samples, n_labels).
        GO_terms: sequence of GO term names, one per label column.
    """
    def sample_positions(count):
        # Positions 0, 1 and 5 (or the last one), restricted to valid indices so
        # that inputs with fewer than two IDs do not raise IndexError.
        return sorted({i for i in (0, 1, min(5, count - 1)) if 0 <= i < count})

    print("== BASIC SHAPE CHECK ==")
    n_X = X_train.shape[0]
    n_ids = len(ids_train)
    n_Y = Y_sparse.shape[0]
    n_labels = Y_sparse.shape[1]
    n_GO = len(GO_terms)

    print(f"X_train : {X_train.shape}")
    print(f"ids_train count : {n_ids}")
    print(f"Y_sparse shape : {Y_sparse.shape}")
    print(f"GO terms count : {n_GO}")
    print(f"X_test  : {X_test.shape}")
    print(f"ids_test count : {len(ids_test)}")

    if n_X != n_ids:
        print("‚úó FAIL: S·ªë l∆∞·ª£ng m·∫´u X_train v√† ids_train kh√¥ng kh·ªõp!")
    else:
        print("‚úì ids_train kh·ªõp s·ªë m·∫´u X_train")

    if n_X != n_Y:
        print("‚úó FAIL: S·ªë l∆∞·ª£ng X_train v√† Y labels kh√¥ng kh·ªõp!")
    else:
        print("‚úì S·ªë m·∫´u X_train & Y labels kh·ªõp nhau")

    if n_labels != n_GO:
        print("‚úó FAIL: S·ªë chi·ªÅu Y (labels) v√† s·ªë GO terms kh√¥ng kh·ªõp!")
    else:
        print("‚úì S·ªë labels kh·ªõp s·ªë GO terms")

    # Check X_test feature dim matches X_train
    if X_test.shape[1] != X_train.shape[1]:
        print("‚úó FAIL: s·ªë chi·ªÅu feature X_test v√† X_train KH√îNG kh·ªõp!")
    else:
        print("‚úì Feature dimension X_test & X_train kh·ªõp")

    print("\n== SAMPLE IDS EXAMPLE ==")
    for i in sample_positions(n_ids):
        print(f"Train sample {i} : id = {ids_train[i]}")

    for i in sample_positions(len(ids_test)):
        print(f"Test sample {i}  : id = {ids_test[i]}")

    # Count samples without any label. Row sums are taken on the sparse matrix
    # directly; converting it to a dense array first would allocate
    # n_samples * n_labels values just for this count.
    row_sums = np.asarray(Y_sparse.sum(axis=1)).ravel()
    n_zero_rows = int(np.count_nonzero(row_sums == 0))
    print(f"\nTrong Y labels c√≥ {n_zero_rows}/{n_Y} m·∫´u to√†n zero-label (kh√¥ng GO term n√†o).")

def build_and_run_loader(X_train, Y_sparse, batch_size=64, random_state=42):
    """Split the data 80/20, wrap both parts in DataLoaders and print batch shapes.

    Args:
        X_train: feature matrix passed to ``ProteinDataset``.
        Y_sparse: label matrix passed to ``ProteinDataset``.
        batch_size: batch size of both loaders.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        ``(train_loader, test_loader)``; only the train loader shuffles.
    """
    full_ds = ProteinDataset(X_train, Y_sparse)
    train_idx, test_idx = train_test_split(
        np.arange(len(full_ds)), test_size=0.2, random_state=random_state
    )
    train_loader = DataLoader(Subset(full_ds, train_idx), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(Subset(full_ds, test_idx), batch_size=batch_size, shuffle=False)

    def show_first_batch(loader, x_label, y_label):
        # Pull a single batch; an empty loader prints nothing.
        first = next(iter(loader), None)
        if first is None:
            return
        Xb, Yb = first
        print(x_label, Xb.shape)
        print(y_label, Yb.shape)

    print("\n== DATALOADER BATCH SHAPE CHECK ==")
    show_first_batch(train_loader, "Train batch Xb.shape:", "Train batch Yb.shape:")
    show_first_batch(test_loader, "Test  batch Xb.shape:", "Test  batch Yb.shape:")

    return train_loader, test_loader

# Entry point of this cell (the recorded output below shows this branch ran):
# load everything, print the consistency report, then build the loaders and
# print one batch shape from each.
if __name__ == "__main__":
    X, ids, X_test, ids_test, Y_sparse, GO = load_data()
    check_integrity(X, ids, X_test, ids_test, Y_sparse, GO)
    # The returned (train_loader, test_loader) pair is discarded here.
    build_and_run_loader(X, Y_sparse)


== BASIC SHAPE CHECK ==
X_train : (144096, 1291)
ids_train count : 144096
Y_sparse shape : (144096, 32347)
GO terms count : 32347
X_test  : (224309, 1291)
ids_test count : 224309
‚úì ids_train kh·ªõp s·ªë m·∫´u X_train
‚úì S·ªë m·∫´u X_train & Y labels kh·ªõp nhau
‚úì S·ªë labels kh·ªõp s·ªë GO terms
‚úì Feature dimension X_test & X_train kh·ªõp

== SAMPLE IDS EXAMPLE ==
Train sample 0 : id = P20536
Train sample 1 : id = O73864
Train sample 5 : id = P33681
Test sample 0  : id = A0A0C5B5G6
Test sample 1  : id = A0A1B0GTW7
Test sample 5  : id = A1A4S6

Trong Y labels c√≥ 0/144096 m·∫´u to√†n zero-label (kh√¥ng GO term n√†o).

== DATALOADER BATCH SHAPE CHECK ==
Train batch Xb.shape: torch.Size([64, 1291])
Train batch Yb.shape: torch.Size([64, 32347])
Test  batch Xb.shape: torch.Size([64, 1291])
Test  batch Yb.shape: torch.Size([64, 32347])
