In [None]:
# ============================================================
# SECTION 1 — IMPORTS
# ============================================================

import pandas as pd
import numpy as np
from datasets import Dataset
from collections import Counter
import gc
import time
import psutil
import os

"""
EXPLANATION OF SECTION 1:
1. pandas (pd)
   - Used for structured/tabular data manipulation.
   - Likely used later for reading protein datasets (e.g., CSV with sequences and GO terms).
   - Provides functions like:
       - read_csv(), sort_values(), unique(), groupby(), reset_index()

2. numpy (np)
   - Used for numerical operations.
   - Deep learning models require numerical arrays.
   - Useful for:
       - vector operations, encoding labels, reshaping data

3. Dataset (from HuggingFace datasets library)
   - Converts data into an optimized format for training.
   - Often used with transformers or large NLP models.
   - If not using HuggingFace Trainer or tokenizers later, this import may be unnecessary.

4. Counter
   - Counts frequency of elements in a list.
   - Very useful for:
       - Counting GO term frequencies, Detecting class imbalance

5. gc (Garbage Collector)
   - Frees memory manually using gc.collect()
   - Important when handling large biological datasets.


6. time
   - Used to measure execution time.

7. psutil
   - Tracks memory usage of current process.
   - Useful for monitoring RAM consumption.

8. os
   - Used for operating system utilities.
   - Here it is used to get current process ID.
"""

# ============================================================
# SECTION 2 — INITIALIZE TIME AND MEMORY TRACKING
# ============================================================

# Wall-clock timestamp (seconds since the epoch) taken at start-up so that
# elapsed runtime can be computed later as time.time() - start_time.
start_time = time.time()
# psutil handle bound to this interpreter's own PID; print_memory_usage()
# reads process.memory_info() from this module-level object.
process = psutil.Process(os.getpid())

"""
EXPLANATION OF SECTION 2:

1. time.time()
   - Returns current time in seconds since epoch.
   - Used to calculate total runtime later: total_time = time.time() - start_time

2. os.getpid()
   - Gets the process ID of the current Python program.

3. psutil.Process(pid)
   - Attaches a memory-monitoring object to this process.
   - Allows access to:
       - memory usage, CPU usage, other process stats
"""

# ============================================================
# SECTION 3 — MEMORY MONITORING FUNCTION
# ============================================================

def print_memory_usage(step_name):
    """Print and return the resident memory of the current process.

    Reads the RSS (resident set size) from the module-level ``process``
    handle, converts it from bytes using 1024 ** 3 as the divisor, prints
    a one-line report labelled with ``step_name`` and returns the value.

    Args:
        step_name: Label placed at the start of the printed line.

    Returns:
        float: Resident memory divided by 1024 ** 3 (reported as "GB").
    """
    bytes_per_gb = 1024 ** 3
    rss_bytes = process.memory_info().rss
    mem_gb = rss_bytes / bytes_per_gb

    print(f"{step_name} - Memory Usage: {mem_gb:.2f} GB")
    return mem_gb

"""
EXPLANATION OF SECTION 3:

Function Purpose:
This function checks how much RAM the program is currently using.

Line-by-line explanation:

1. process.memory_info()
   - Retrieves memory statistics for current process.
   - Returns an object containing:
       - rss (Resident Set Size)
       - vms (Virtual memory size)

2. mem_info.rss
   - RSS = actual physical memory being used.
   - This is the most relevant measure for memory usage.

3. (1024 ** 3)
   - Converts bytes to gigabytes (strictly speaking gibibytes, GiB:
     1 GiB = 1024 ** 3 bytes; the output labels the value "GB").

4. print(f"{step_name} - Memory Usage: {mem_gb:.2f} GB")
   - Displays step name AND memory.
   - This fixes an earlier version in which step_name was unused.

SIMPLIFICATION:
The original function accepted step_name but did not use it.
This version includes it in the printed message.
Apart from the printed message, function behavior remains identical.
"""

# ============================================================
# SECTION 4 — STEP HEADER PRINTING FUNCTION
# ============================================================

def print_step_header(step_num, step_name):
    """Print a three-line banner announcing a pipeline step.

    The banner is a blank line, a rule of 60 "=" characters, the line
    "STEP <step_num>: <step_name>", and a closing rule of the same width.

    Args:
        step_num: Step number shown after "STEP".
        step_name: Short description shown after the colon.

    Returns:
        None. The function only writes to standard output.
    """
    rule = "=" * 60

    # One print call per banner line, in display order.
    for banner_line in ("\n" + rule, f"STEP {step_num}: {step_name}", rule):
        print(banner_line)

"""
EXPLANATION OF SECTION 4:

Purpose:
This function improves readability when running long pipelines.

1. "=" * 60
   - Repeats "=" 60 times.
   - Creates a clear visual separator.

2. print("\n" + separator)
   - Adds spacing before new step.

3. f"STEP {step_num}: {step_name}"
   - Displays structured step numbering.

Why this is useful:
When processing protein datasets, multiple stages exist:
    - Loading data, Cleaning, Encoding, Training, Evaluation

This prevents confusion during long runs.

SIMPLIFICATION:
Original implementation was already clean.
We just stored separator in a variable for clarity.
Function output remains identical.
"""

# ============================================================
# SECTION 5 — START LOGGING EXPERIMENT
# ============================================================

# Capture the baseline memory figure before any data is loaded. The call
# also prints it, labelled "Initial".
initial_mem = print_memory_usage("Initial")

# Start-of-run banner: a title line, the current local time as HH:MM:SS,
# and the baseline memory formatted to two decimals.
print("\nSTARTING PROTEIN GO TERM PROCESSING")
print(f"Start Time: {time.strftime('%H:%M:%S')}")
print(f"Initial Memory: {initial_mem:.2f} GB")

"""
EXPLANATION OF SECTION 5:

1. initial_mem = print_memory_usage("Initial")
   - Captures memory before processing begins.
   - Later we can compare:
         current_memory - initial_memory
   - Helps detect memory leaks.

2. time.strftime('%H:%M:%S')
   - Formats current time as Hour:Minute:Second.

3. Logging statements
   - Useful for reproducibility.
   - Important when running experiments overnight or on servers.
"""


  from .autonotebook import tqdm as notebook_tqdm


Initial - Memory Usage: 0.29 GB

STARTING PROTEIN GO TERM PROCESSING
Start Time: 00:02:11
Initial Memory: 0.29 GB


'\nEXPLANATION OF SECTION 5:\n\n1. initial_mem = print_memory_usage("Initial")\n   - Captures memory before processing begins.\n   - Later we can compare:\n         current_memory - initial_memory\n   - Helps detect memory leaks.\n\n2. time.strftime(\'%H:%M:%S\')\n   - Formats current time as Hour:Minute:Second.\n\n3. Logging statements\n   - Useful for reproducibility.\n   - Important when running experiments overnight or on servers.\n\nSIMPLIFICATION:\nCode is already minimal and clean.\nNo unnecessary complexity here.\n\n'

In [None]:
# ============================================================
# SECTION 1 — IMPORT SeqIO FROM BIOPYTHON
# ============================================================
from Bio import SeqIO

"""
EXPLANATION OF SECTION 1:

Bio.SeqIO is a module from the Biopython library used for reading
biological sequence files such as FASTA, GenBank, etc.

SeqIO can:
- Parse FASTA files automatically
- Extract IDs and sequences cleanly
- Avoid manual string parsing
"""

# ============================================================
# SECTION 2 — LOAD TSV LABEL FILE
# ============================================================

# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where the file exists at exactly this location.
tsv_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/raw/train/uniprotkb_AND_reviewed_true_AND_protein_2025_12_27.tsv"
# Load the tab-separated label table. A later cell reads its
# 'Gene Ontology IDs' column.
labels_df = pd.read_csv(tsv_path, sep='\t')

"""
EXPLANATION OF SECTION 2:

1. pd.read_csv(tsv_path, sep='\t')

   - read_csv() loads tabular data into a pandas DataFrame.
   - sep='\t' tells pandas that the file is TAB-separated.
   - TSV = Tab Separated Values.

The result:
labels_df is a DataFrame (rows × columns).
This DataFrame will later be merged with sequences.
"""

# ============================================================
# SECTION 3 — LOAD FASTA SEQUENCES (SIMPLIFIED & CORRECTED)
# ============================================================

# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where the file exists at exactly this location.
fasta_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/raw/train/uniprotkb_AND_reviewed_true_AND_protein_2025_12_27.fasta"
# Maps protein ID -> sequence stored as a plain Python string.
sequence_dict = {}

for record in SeqIO.parse(fasta_path, "fasta"):
    # UniProt FASTA headers look like:
    # sp|P12345|PROTEIN_NAME
    # We extract the middle ID (P12345)

    header = record.id
    if "|" in header:
        # Pipe-delimited identifier: keep the second field.
        protein_id = header.split("|")[1]
    else:
        # No pipes: fall back to the first whitespace-delimited token.
        protein_id = header.split()[0]
    # NOTE(review): if the same protein_id occurs twice, the later record
    # silently overwrites the earlier one.
    sequence_dict[protein_id] = str(record.seq)

"""
EXPLANATION OF SECTION 3:

We replaced the manual FASTA parsing with SeqIO.parse.

Original complexity:
- Manually tracking current_id
- Manually appending sequence lines
- Handling final sequence outside loop
- Risk of subtle bugs

1. SeqIO.parse(fasta_path, "fasta")

   - Reads FASTA file safely.
   - Automatically handles:
        - Multi-line sequences
        - Header parsing
        - End-of-file cases

2. record.id
   - Returns FASTA header identifier.
   - Example:
        sp|P12345|PROTEIN_NAME

3. header.split("|")[1]
   - Splits string at "|".
   - Index 1 extracts middle ID (P12345).
   - This matches the original logic.

4. sequence_dict[protein_id] = str(record.seq)
   - record.seq is a Seq object.
   - Convert to string for easier handling later.

Result:
sequence_dict:
    {
        "P12345": "MTEYKLVVVGAGGVGKS...",
        ...
    }
"""

# ============================================================
# SECTION 4 — VERIFY LOADED DATA
# ============================================================

# Sanity check: print the number of rows in the label table and the number
# of sequences parsed from the FASTA file so the two can be compared.
print(f"TSV entries: {len(labels_df)}")
print(f"FASTA sequences: {len(sequence_dict)}")

"""
EXPLANATION OF SECTION 4:

1. len(labels_df)
   - Returns number of rows in DataFrame.
   - Equivalent to number of labeled proteins.

2. len(sequence_dict)
   - Returns number of protein sequences parsed.

Why this is important:
We must verify that:
    number of labels ≈ number of sequences

If they differ significantly:
    - Some proteins may lack sequences
    - Some sequences may lack labels
    - IDs may not match

Later, we must ensure proper merging.

IMPORTANT NEXT STEP (very important for deep learning):

We must check (the TSV's ID column is 'Entry', as shown by labels_df.head()):
    set(labels_df['Entry'])
        ∩
    set(sequence_dict.keys())

If intersection is smaller than expected, we may silently lose data.
"""


TSV entries: 105951
FASTA sequences: 105951


"\nEXPLANATION OF SECTION 4:\n\n1. len(labels_df)\n   - Returns number of rows in DataFrame.\n   - Equivalent to number of labeled proteins.\n\n2. len(sequence_dict)\n   - Returns number of protein sequences parsed.\n\nWhy this is important:\nYou must verify that:\n\n    number of labels ≈ number of sequences\n\nIf they differ significantly:\n    - Some proteins may lack sequences\n    - Some sequences may lack labels\n    - IDs may not match\n\nLater, we must ensure proper merging.\n\nIMPORTANT NEXT STEP (very important for deep learning):\n\nWe must check:\n\n    set(labels_df['protein_id_column']) \n        ∩ \n    set(sequence_dict.keys())\n\nIf intersection is smaller than expected,\nyou may silently lose data.\n"

In [None]:
# ============================================================
# SECTION 1 — FUNCTION TO PARSE GO TERM STRINGS
# ============================================================

def parse_go_terms(go_string):
    """
    Convert a semicolon-separated GO string into a Python list.

    Examples:
        Input  → "GO:0008150;GO:0003674"
        Output → ["GO:0008150", "GO:0003674"]

        Input  → "GO:0003953; GO:0007165;"   (space after ';', trailing ';')
        Output → ["GO:0003953", "GO:0007165"]

    This is required because:
    - The dataset stores multiple GO annotations in one cell.
    - Downstream steps (frequency counting, label vocabulary, multi-hot
      encoding) need a structured list, not a raw string.

    Args:
        go_string: Raw cell value. Normally a str; may also be NaN/None
            (no annotation) or an empty / whitespace-only string.
            Any other scalar is converted with str() before splitting.

    Returns:
        list[str]: The GO terms in their original order, each stripped of
        surrounding whitespace. Empty pieces (from a trailing or doubled
        semicolon, or a whitespace-only cell) are dropped. Missing or
        empty input yields [].

    Note:
        Each protein can carry several GO terms, so this output feeds a
        multi-label (not single-class) classification problem.
    """

    # --------------------------------------------------------
    # SUBSECTION 1A — HANDLE MISSING OR EMPTY VALUES
    # --------------------------------------------------------
    # pd.isna() catches NaN/None (protein without GO annotations); the
    # explicit "" check catches cells stored as empty strings.
    # Returning [] instead of None keeps the column type consistent, so
    # later loops over the terms never hit a None. Whether to drop or keep
    # such unlabeled proteins is decided downstream.
    if pd.isna(go_string) or go_string == "":
        return []

    # --------------------------------------------------------
    # SUBSECTION 1B — SPLIT STRING INTO LIST
    # --------------------------------------------------------
    # str() is defensive in case the value is not already a string.
    # strip() removes the space that often follows each semicolon
    # (" GO:0007165" → "GO:0007165").
    stripped_terms = (term.strip() for term in str(go_string).split(';'))

    # Drop empty pieces so "GO:1;" or ";;" or "   " never produce a bogus
    # "" label that would end up in the label vocabulary.
    return [term for term in stripped_terms if term]

# ============================================================
# SECTION 2 — APPLY PARSING FUNCTION TO ENTIRE DATAFRAME
# ============================================================

# Parse every raw GO string into a list and store the result in a new
# column; the original 'Gene Ontology IDs' column is left unchanged.
labels_df['go_terms_list'] = labels_df['Gene Ontology IDs'].apply(parse_go_terms)

"""
EXPLANATION OF SECTION 2:

1. labels_df['Gene Ontology IDs']
   - Selects the column containing raw GO strings.

2. .apply(parse_go_terms)
   - Applies the function to EACH row.
   - Equivalent to:
         for each row:
             parse_go_terms(value)
   But much cleaner to write. (Note: .apply still calls the Python
   function once per element; it is NOT a vectorized operation.)

3. labels_df['go_terms_list'] = ...
   - Creates a NEW column.
   - Does not overwrite original data.
   - Good practice for traceability.

Now your DataFrame contains:

Column: 'Gene Ontology IDs'
    → Raw string format

Column: 'go_terms_list'
    → Python list format

Example:

Before:
    "GO:0003953; GO:0007165"
After:
    ["GO:0003953", "GO:0007165"]

This structured list is REQUIRED for:
    - Counting frequencies (Counter), Building label vocabulary, Multi-hot encoding, Training neural networks
"""

# ============================================================
# SECTION 3 — SIMPLIFICATION ANALYSIS
# ============================================================

"""
A slightly simpler version could be:

    if not isinstance(go_string, str) or go_string.strip() == "":
        return []
    return [term.strip() for term in go_string.split(';')]
"""

# ============================================================
# SECTION 4 — CRITICAL DEEP LEARNING INSIGHT
# ============================================================

"""
IMPORTANT:

Because each protein can have MULTIPLE GO terms,
the task is:
    MULTI-LABEL CLASSIFICATION

This changes everything downstream:

1. We CANNOT use:
       CrossEntropyLoss
   Because that assumes ONE label per sample.

2. We MUST use:
       BCEWithLogitsLoss (if PyTorch)
   or
       BinaryCrossentropy (if Keras)

3. The output layer must be:
       number_of_unique_GO_terms neurons

4. Final activation should be:
       Sigmoid (not Softmax)

Softmax = probabilities sum to 1 (single-label)
Sigmoid = independent probabilities per class (multi-label)
"""


'\nIMPORTANT:\n\nBecause each protein can have MULTIPLE GO terms,\nyour task is:\n\n    MULTI-LABEL CLASSIFICATION\n\nThis changes everything downstream:\n\n1. You CANNOT use:\n       CrossEntropyLoss\n   Because that assumes ONE label per sample.\n\n2. You MUST use:\n       BCEWithLogitsLoss (if PyTorch)\n   or\n       BinaryCrossentropy (if Keras)\n\n3. Your output layer must be:\n       number_of_unique_GO_terms neurons\n\n4. Final activation should be:\n       Sigmoid (not Softmax)\n\nSoftmax = probabilities sum to 1 (single-label)\nSigmoid = independent probabilities per class (multi-label)\n\nThis is a critical conceptual checkpoint.\n'

In [4]:
# Display the first rows of the table to eyeball the new 'go_terms_list'
# column next to the raw 'Gene Ontology IDs' column.
labels_df.head()

Unnamed: 0,Entry,Entry Name,Protein names,Organism,Sequence,Gene Ontology IDs,go_terms_list
0,A0A009IHW8,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,Acinetobacter baumannii (strain 1295743),MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,GO:0003953; GO:0007165; GO:0019677; GO:0050135...,"[GO:0003953, GO:0007165, GO:0019677, GO:005013..."
1,A0A023I7E1,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",Rhizomucor miehei,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,GO:0000272; GO:0005576; GO:0042973; GO:0052861...,"[GO:0000272, GO:0005576, GO:0042973, GO:005286..."
2,A0A024B7W1,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,Zika virus (isolate ZIKV/Human/French Polynesi...,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,GO:0003724; GO:0003725; GO:0003968; GO:0004252...,"[GO:0003724, GO:0003725, GO:0003968, GO:000425..."
3,A0A024RXP8,GUX1_HYPJR,"Exoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobi...",Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,GO:0005576; GO:0016162; GO:0030245; GO:0030248,"[GO:0005576, GO:0016162, GO:0030245, GO:0030248]"
4,A0A024SC78,CUTI1_HYPJR,Cutinase (EC 3.1.1.74),Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,GO:0005576; GO:0016052; GO:0050525,"[GO:0005576, GO:0016052, GO:0050525]"
