In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from collections import Counter
import gc
import time
import psutil
import os

start_time = time.time()
process = psutil.Process(os.getpid())

def print_memory_usage(step_name):
    mem_info = process.memory_info()
    mem_gb = mem_info.rss / (1024 ** 3)

    print(f"{step_name} - Memory Usage: {mem_gb:.2f} GB")
    return mem_gb

def print_step_header(step_num, step_name):
    separator = "=" * 60

    print("\n" + separator)
    print(f"STEP {step_num}: {step_name}")
    print(separator)

initial_mem = print_memory_usage("Initial")

print("\nSTARTING PROTEIN GO TERM PROCESSING")
print(f"Start Time: {time.strftime('%H:%M:%S')}")
print(f"Initial Memory: {initial_mem:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm


  Memory: 0.28 GB

STARTING PROTEIN GO TERM PROCESSING
Time: 07:52:28
Initial memory: 0.28 GB


In [None]:
from Bio import SeqIO

tsv_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/raw/train/uniprotkb_AND_reviewed_true_AND_protein_2025_12_27.tsv"
labels_df = pd.read_csv(tsv_path, sep='\t')

fasta_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/raw/train/uniprotkb_AND_reviewed_true_AND_protein_2025_12_27.fasta"
sequence_dict = {}

for record in SeqIO.parse(fasta_path, "fasta"):

    header = record.id
    if "|" in header:
        protein_id = header.split("|")[1]
    else:
        protein_id = header.split()[0]
    sequence_dict[protein_id] = str(record.seq)

print(f"TSV entries: {len(labels_df)}")
print(f"FASTA sequences: {len(sequence_dict)}")

TSV entries: 105951
FASTA sequences: 105951


In [None]:
def parse_go_terms(go_string):

    if pd.isna(go_string) or go_string == "":
        return []

    return [term.strip() for term in str(go_string).split(';')]

labels_df['go_terms_list'] = labels_df['Gene Ontology IDs'].apply(parse_go_terms)

In [4]:
labels_df.head()

Unnamed: 0,Entry,Entry Name,Protein names,Organism,Sequence,Gene Ontology IDs,go_terms_list
0,A0A009IHW8,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,Acinetobacter baumannii (strain 1295743),MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,GO:0003953; GO:0007165; GO:0019677; GO:0050135...,"[GO:0003953, GO:0007165, GO:0019677, GO:005013..."
1,A0A023I7E1,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",Rhizomucor miehei,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,GO:0000272; GO:0005576; GO:0042973; GO:0052861...,"[GO:0000272, GO:0005576, GO:0042973, GO:005286..."
2,A0A024B7W1,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,Zika virus (isolate ZIKV/Human/French Polynesi...,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,GO:0003724; GO:0003725; GO:0003968; GO:0004252...,"[GO:0003724, GO:0003725, GO:0003968, GO:000425..."
3,A0A024RXP8,GUX1_HYPJR,"Exoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobi...",Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,GO:0005576; GO:0016162; GO:0030245; GO:0030248,"[GO:0005576, GO:0016162, GO:0030245, GO:0030248]"
4,A0A024SC78,CUTI1_HYPJR,Cutinase (EC 3.1.1.74),Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,GO:0005576; GO:0016052; GO:0050525,"[GO:0005576, GO:0016052, GO:0050525]"


In [None]:
filtered_df = labels_df[labels_df['Entry'].isin(sequence_dict.keys())].copy()

filtered_df['sequence'] = filtered_df['Entry'].map(sequence_dict)

train_df = filtered_df[['Entry', 'sequence', 'go_terms_list', 'Organism']].rename(
    columns={
        'Entry': 'accession',
        'go_terms_list': 'go_terms',
        'Organism': 'organism'
    }
)

print(f"Matched proteins: {len(train_df)}")

output_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/training_data_combined.csv"
train_df.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")

Matched proteins: 105951
Saved to: data/processed/training_data_combined.csv


In [6]:
train_df.head()

Unnamed: 0,accession,sequence,go_terms,organism
0,A0A009IHW8,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[GO:0003953, GO:0007165, GO:0019677, GO:005013...",Acinetobacter baumannii (strain 1295743)
1,A0A023I7E1,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,"[GO:0000272, GO:0005576, GO:0042973, GO:005286...",Rhizomucor miehei
2,A0A024B7W1,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,"[GO:0003724, GO:0003725, GO:0003968, GO:000425...",Zika virus (isolate ZIKV/Human/French Polynesi...
3,A0A024RXP8,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,"[GO:0005576, GO:0016162, GO:0030245, GO:0030248]",Hypocrea jecorina (strain ATCC 56765 / BCRC 32...
4,A0A024SC78,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,"[GO:0005576, GO:0016052, GO:0050525]",Hypocrea jecorina (strain ATCC 56765 / BCRC 32...


In [None]:
print(f"Total proteins: {len(train_df)}")

print("\nCollecting unique GO terms and computing frequencies...")
go_counts = Counter(term for go_list in train_df['go_terms'] for term in go_list)
all_go_terms = set(go_counts.keys())

print(f"Found {len(all_go_terms)} unique GO terms total")
mem1 = print_memory_usage("After collecting all terms")

ultra_general = {'GO:0008150', 'GO:0005575', 'GO:0003674'}

filtered_terms = [
    term for term, count in go_counts.most_common()
    if term not in ultra_general
]

top_n_frequent = 2000
top_go_frequent = filtered_terms[:top_n_frequent]

remaining_terms = filtered_terms[top_n_frequent:]

remaining_filtered = [
    term for term in remaining_terms
    if go_counts[term] >= 5
]
np.random.seed(42)

top_go_random = (
    np.random.choice(remaining_filtered, 761, replace=False).tolist()
    if len(remaining_filtered) >= 761
    else remaining_filtered
)

top_go_terms = top_go_frequent + top_go_random

print(f"\nSelected {len(top_go_frequent)} frequent + "
      f"{len(top_go_random)} random = {len(top_go_terms)} total GO terms")

print(f"Most common term: {top_go_frequent[0]} "
      f"(appears {go_counts[top_go_frequent[0]]} times)")

if top_go_random:
    print(f"Sample random term: {top_go_random[0]} "
          f"(appears {go_counts[top_go_random[0]]} times)")

mem2 = print_memory_usage("After term selection")

print(f"Time elapsed: {time.time() - start_time:.1f} seconds")

In [None]:
go_to_index = {go: idx for idx, go in enumerate(top_go_terms)}

num_proteins = len(train_df)
num_labels = len(top_go_terms)

print(f"Total proteins: {num_proteins:,}")
print(f"Total selected GO terms: {num_labels:,}")

binary_matrix = np.zeros((num_proteins, num_labels), dtype=np.int8)
fill_start_time = time.time()
for protein_idx, go_list in enumerate(train_df['go_terms']):
    for go in go_list:
        if go in go_to_index:
            label_idx = go_to_index[go]

            binary_matrix[protein_idx, label_idx] = 1
column_names = [f"label_{go}" for go in top_go_terms]

binary_df = pd.DataFrame(binary_matrix, columns=column_names)

print(f"Created binary DataFrame with {binary_df.shape[1]:,} label columns")

del binary_matrix
gc.collect()
mem3 = print_memory_usage("After binary matrix creation")

print(f"Matrix filling time: {time.time() - fill_start_time:.1f} seconds")
print(f"Total time elapsed: {time.time() - start_time:.1f} seconds")


STEP 1: DATA LOADING AND TERM SELECTION
Total proteins: 105951

Collecting ALL unique GO terms from all proteins...


Found 27615 unique GO terms total
  Memory: 0.50 GB

Selecting balanced GO terms (700 frequent + 300 random)...
Selected 700 frequent + 0 random = 700 total GO terms
Most common: ('GO:0005737', 22662) (appears 0 times)
  Memory: 0.49 GB
Time elapsed: 59.2 seconds


In [None]:
final_df = pd.concat(
    [
        train_df[['sequence', 'accession']],  # original features
        binary_df                              # binary label matrix
    ], axis=1
)
print(f"Final DataFrame shape: {final_df.shape[0]:,} proteins × {final_df.shape[1]:,} columns")

del binary_df
gc.collect()
print("\nVERIFICATION:")

example_accession = final_df.iloc[0]['accession']
example_sequence = final_df.iloc[0]['sequence']

print(f"Example protein: {example_accession}")
print(f"Sequence length: {len(example_sequence):,}")

label_columns = [col for col in final_df.columns if col.startswith('label_')]
print(f"Number of GO term labels: {len(label_columns):,}")

positive_counts = final_df.loc[0, label_columns].sum()
print(f"Positive labels for first protein: {positive_counts} out of {len(label_columns)}")
empty_labels = [col for col in label_columns if final_df[col].sum() == 0]

if empty_labels:
    print(f"Warning: {len(empty_labels)} labels have no positive examples!")
else:
    print("All labels have at least one positive example")

mem4 = print_memory_usage("After final dataframe")
print(f"Total time elapsed: {time.time() - start_time:.1f} seconds")


STEP 2: CREATING BINARY MATRIX
Processing 11 chunks of 10000 proteins each...
  Chunk 1/11: proteins 0 to 9,999
  Chunk 6/11: proteins 50,000 to 59,999
  Chunk 11/11: proteins 100,000 to 105,950
Created binary DataFrame with 700 columns
  Memory: 0.48 GB
Chunk processing time: 161.8 seconds
Total time elapsed: 221.0 seconds


In [None]:
hf_start_time = time.time()

dataset = Dataset.from_pandas(final_df, preserve_index=False)
print(f"Dataset created with {len(dataset):,} proteins")

label_columns = [col for col in dataset.column_names if col.startswith('label_')]
print(f"Number of label columns: {len(label_columns):,}")

print(f"Conversion time: {time.time() - hf_start_time:.1f} seconds")

save_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/protein_go_dataset"
print(f"Saving HuggingFace Dataset to: {save_path}")

dataset.save_to_disk(save_path)
print("Dataset saved successfully.")

metadata = pd.DataFrame({
    'go_term': top_go_terms,
    'count': [go_counts[term] for term in top_go_terms],
    'percentage': [(go_counts[term] / len(final_df)) * 100 for term in top_go_terms],
    'is_frequent': [True] * len(top_go_frequent) + [False] * len(top_go_random)
})

metadata_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/go_terms_metadata.csv"
metadata.to_csv(metadata_path, index=False)
print(f"Saved metadata to: {metadata_path}")

total_time = time.time() - start_time
final_mem = print_memory_usage("Final")

print("\nPERFORMANCE SUMMARY:")
print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
print(f"Memory increase: {final_mem - initial_mem:.2f} GB")
print(f"Time: {time.strftime('%H:%M:%S')}")

print("\nDATA SUMMARY:")
print(f"  • Input proteins: {len(train_df):,}")
print(f"  • Unique GO terms (original): {len(all_go_terms):,}")
print(f"  • Selected GO terms: {len(top_go_terms):,}")
print(f"  • Final dataset size: {final_df.shape[0]:,} × {final_df.shape[1]:,}")
print(f"  • Label representation: multi-hot binary (int8)")

print("\nAll steps completed successfully!")


STEP 3: CREATING FINAL DATAFRAME


Final DataFrame shape: 105,951 proteins × 702 columns

VERIFICATION:
Example protein: A0A009IHW8
Sequence length: 269
Number of GO term labels: 700
Positive labels for first protein: 0 out of 700
  Memory: 0.49 GB
Total time elapsed: 222.8 seconds


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from datasets import load_from_disk

print("✓ All libraries imported successfully")
print_memory_usage("After imports")

def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    print(f"✓ Random seed set to: {seed}")

set_seed(42)
print_memory_usage("After setting seeds")

print("Loading dataset...")
mem_before = print_memory_usage("Before loading")
dataset_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/protein_go_dataset"
dataset = load_from_disk(dataset_path)
print(f"✓ Loaded: {len(dataset):,} proteins")

mem_after = print_memory_usage("After loading")
print(f"✓ Memory used: {mem_after - mem_before:.3f} GB")

label_columns = [col for col in dataset.column_names if col.startswith("label_")]
print(f"✓ Number of GO term labels: {len(label_columns):,}")

dataset.set_format(
    type="torch",
    columns=label_columns  # labels as tensors
)
print("✓ Dataset formatted for PyTorch (labels as tensors)")


STEP 4: CONVERTING TO HUGGINGFACE DATASET
Converting to HuggingFace Dataset in chunks of 5,000...
  Processing HF chunk: rows 0 to 4,999
  Processing HF chunk: rows 25,000 to 29,999
  Processing HF chunk: rows 50,000 to 54,999
  Processing HF chunk: rows 75,000 to 79,999
  Processing HF chunk: rows 100,000 to 104,999
  Processing HF chunk: rows 105,000 to 105,950
Dataset created with 105,951 proteins and 700 labels

STEP 5: SAVING PROCESSED DATA
Saving HuggingFace Dataset to: C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/protein_go_dataset


Saving the dataset (1/1 shards): 100%|██████████| 105951/105951 [06:50<00:00, 258.04 examples/s]


Saved metadata to: C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/go_terms_metadata.csv

STEP 6: PROCESSING COMPLETE
  Memory: 0.15 GB

PERFORMANCE SUMMARY:
Total processing time: 865.0 seconds (14.4 minutes)
Memory increase: -0.14 GB
Time: 22:30:58

DATA SUMMARY:
  • Input proteins: 105,951
  • Unique GO terms (original): 27,615
  • Selected GO terms: 700 (700 frequent + 300 random)
  • Coverage achieved: ~59% of all GO term instances
  • Final dataset size: 105,951 × 702
  • Memory efficient: Using int8 for binary labels

All steps completed successfully!


In [12]:
print_step_header(7, "SETTING UP ENVIRONMENT")

import torch                 
import torch.nn as nn        # Neural network layers
import torch.optim as optim  # Optimization algorithms (Adam, SGD)
from torch.utils.data import Dataset, DataLoader  # For creating datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Turn off warning messages for cleaner output
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully")
print_memory_usage("After imports")


STEP 7: SETTING UP ENVIRONMENT
✓ All libraries imported successfully
  Memory: 0.29 GB


0.2879829406738281

In [13]:
print_step_header(8, "SETTING RANDOM SEEDS")

def set_seed(seed=42):
    """Set random seeds for all random number generators"""
    np.random.seed(seed)            # For NumPy random operations
    torch.manual_seed(seed)         # For PyTorch random operations
    print(f"✓ Random seed set to: {seed}")

# Call the function to set seeds
set_seed(42)

print_memory_usage("After setting seeds")


STEP 8: SETTING RANDOM SEEDS
✓ Random seed set to: 42
  Memory: 0.29 GB


0.2888450622558594

In [14]:
print_step_header(10, "LOADING AND VIEWING DATASET")
from datasets import load_from_disk
    
print("Loading dataset...")
mem_before = print_memory_usage("Before loading")
    
dataset_path = "C:/Users/USER/Documents/cod3astro/ML_AI/ProteinSeq_DL/data/processed/protein_go_dataset"
dataset = load_from_disk(dataset_path)
print(f"✓ Loaded: {len(dataset)} proteins")
    
# Convert to pandas DataFrame 
train_df = dataset.to_pandas()
    
mem_after = print_memory_usage("After loading")
print(f"✓ Memory used: {mem_after - mem_before:.3f} GB")


STEP 10: LOADING AND VIEWING DATASET
Loading dataset...
  Memory: 0.29 GB


✓ Loaded: 105951 proteins
  Memory: 0.46 GB
✓ Memory used: 0.170 GB


In [15]:
train_df.head()

Unnamed: 0,sequence,accession,"label_('GO:0005737', 22662)","label_('GO:0005634', 19439)","label_('GO:0005829', 17513)","label_('GO:0005886', 17277)","label_('GO:0005576', 14427)","label_('GO:0046872', 11214)","label_('GO:0005524', 10448)","label_('GO:0016020', 9338)",...,"label_('GO:0008234', 205)","label_('GO:0046930', 205)","label_('GO:0007416', 205)","label_('GO:0015293', 204)","label_('GO:0019064', 204)","label_('GO:0008198', 204)","label_('GO:0009277', 203)","label_('GO:0022900', 203)","label_('GO:0075512', 202)","label_('GO:0001750', 202)"
0,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,A0A009IHW8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,A0A023I7E1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,MYRKLAVISAFLATARAQSACTLQSETHPPLTWQKCSSGGTCTQQT...,A0A024RXP8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,A0A024SC78,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
train_df.describe(include='all')

Unnamed: 0,sequence,accession,"label_('GO:0005737', 22662)","label_('GO:0005634', 19439)","label_('GO:0005829', 17513)","label_('GO:0005886', 17277)","label_('GO:0005576', 14427)","label_('GO:0046872', 11214)","label_('GO:0005524', 10448)","label_('GO:0016020', 9338)",...,"label_('GO:0008234', 205)","label_('GO:0046930', 205)","label_('GO:0007416', 205)","label_('GO:0015293', 204)","label_('GO:0019064', 204)","label_('GO:0008198', 204)","label_('GO:0009277', 203)","label_('GO:0022900', 203)","label_('GO:0075512', 202)","label_('GO:0001750', 202)"
count,105951,105951,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,...,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0,105951.0
unique,102750,105951,,,,,,,,,...,,,,,,,,,,
top,GSSGLISMPRV,A0A009IHW8,,,,,,,,,...,,,,,,,,,,
freq,55,1,,,,,,,,,...,,,,,,,,,,
mean,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
train_df['seq_length'] = train_df['sequence'].str.len()
train_df['seq_length'].describe()

bins = [0, 50, 100, 200, 256, 512, 1000, 2000, 5000]
labels = [
    '<50', '50–100', '100–200', '200–256',
    '256–512', '512–1000', '1000–2000', '>2000'
]

train_df['length_bin'] = pd.cut(
    train_df['seq_length'],
    bins=bins,
    labels=labels,
    right=False
)

train_df['length_bin'].value_counts().sort_index()


length_bin
<50           5622
50–100        6019
100–200      15837
200–256       8731
256–512      37810
512–1000     23051
1000–2000     7023
>2000         1725
Name: count, dtype: int64

In [18]:
(train_df['seq_length'] > 1024).mean() * 100

np.float64(7.931968551500222)

In [19]:
print(train_df['sequence'].duplicated().sum())
dup_seqs = train_df[train_df['sequence'].duplicated(keep=False)]
dup_seqs

3201


Unnamed: 0,sequence,accession,"label_('GO:0005737', 22662)","label_('GO:0005634', 19439)","label_('GO:0005829', 17513)","label_('GO:0005886', 17277)","label_('GO:0005576', 14427)","label_('GO:0046872', 11214)","label_('GO:0005524', 10448)","label_('GO:0016020', 9338)",...,"label_('GO:0007416', 205)","label_('GO:0015293', 204)","label_('GO:0019064', 204)","label_('GO:0008198', 204)","label_('GO:0009277', 203)","label_('GO:0022900', 203)","label_('GO:0075512', 202)","label_('GO:0001750', 202)",seq_length,length_bin
5,MIVGILTTLATLATLAASVPLEERQACSSVWGQCGGQNWSGPTCCA...,A0A024SH76,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,471,256–512
28,MVRRIASATPMVQSPMSPLGTTYCVRPNPVSLNLQRRPLVIASTDE...,A0A075TJ05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,480,256–512
66,MRSSSLAWALGLVALANAQGSPTQWYDSITGVTFSRFYQQDTDASW...,A0A0A8IDB7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,726,512–1000
112,MGRVTAPEPLSAFHQVAEFVSGEAVLDDWLKQKGLKNQALGAARTF...,A0A0F6B8D8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,161,100–200
113,MLYKGCLMKSDVQLNLRAKESQRALIDAAAEILHKSRTDFILETAC...,A0A0F6B8D9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,95,50–100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105883,AEYDVSDADIEAFYQ,P83764,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15,<50
105885,AEYDVSDADIEAFYQ,P83766,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15,<50
105937,MHNMSDIIEQYIKRLFEESNEDVVEIQRANIAQRFDCVPSQLNYVI...,Q7A799,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,153,100–200
105949,MTIEKNLSDVQQKYADQFQEDVVKSFQTGYGITPDTQIDAGALRRE...,V5XVW4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,463,256–512


In [20]:
print_step_header(11, 'CREATING DATASET CLASS')

class ProteinDataset(Dataset):
    # Prepares protein sequence for the neural network

    def __init__(self, dataframe=train_df, max_seq_length=1024):
        """
        Initialize the dataset
        Args:
            dataframe: Protein data table
            max_seq_length: Maximum length of protein sequences (shorter ones will be padded)
        """
        self.dataframe = dataframe.reset_index(drop=True)
        self.max_seq_length = max_seq_length  # Using shorter sequences for CPU

        # Find which columns are GO term labels
        # These are all columns except sequence, accession
        self.label_cols = []
        for col in dataframe.columns:
            if col not in ['sequence', 'accession']:
                self.label_cols.append(col)

        # Create dictionary to convert amino acids to numbers
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # Standard 20 amino acids
        self.aa_to_idx = {}
        for i, aa in enumerate(self.amino_acids):
            self.aa_to_idx[aa] = i + 1 # Start from 1, 0 will be for padding
        
        print(f"  Found {len(self.label_cols)} GO term labels")

    def __len__(self):
        # Return the number of protein in the dataset
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        # Get one proteinsequence and its labels
        seq = self.dataframe.iloc[idx]['sequence']

        # Convert sequence to numbers
        seq_encoded = self.encode_sequence(seq)

        # Get the GO term labels
        labels = self.dataframe.iloc[idx][self.label_cols].values.astype(np.float32)

        # Return as pytorn tensors
        return{
            'sequence': torch.tensor(seq_encoded, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.float32)
        }
    def encode_sequence(self, sequence):
        # Convert amino acids strings to a list of numbers
        encoded = []
        # Take only first max_seq_length if amino acids if sequence is too long
        if len(sequence) > self.max_seq_length:
            sequence = sequence[:self.max_seq_length]

            # convert each amino acid to its number
            for aa in sequence:
                if aa in self.aa_to_idx:
                    encoded.append(self.aa_to_idx[aa])
                else:
                    encoded.append(0)
                
            # If sequence is too short
            while len(encoded) < self.max_seq_length:
                encoded.append(0)
            return encoded

print("✓ ProteinDataset class created")
print_memory_usage("After creating dataset class")


STEP 11: CREATING DATASET CLASS
✓ ProteinDataset class created
  Memory: 0.67 GB


0.6670570373535156

In [21]:
print_step_header(12, "SPLITTING DATA")

mem_before = print_memory_usage("Before splitting data")

# First split: 70% training, 30% temporary
train_data, temp_data = train_test_split(train_df, test_size=0.3, random_state=42)

# Split temporary data: 50% validation, 50% test (15% each of total)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Create dataset objects for each split
train_dataset = ProteinDataset(train_data, max_seq_length=256)
val_dataset = ProteinDataset(val_data, max_seq_length=256)
test_dataset = ProteinDataset(test_data, max_seq_length=256)

print(f"✓ Data split completed:")
print(f"  Training samples: {len(train_dataset)}")
print(f"  Validation samples: {len(val_dataset)}")
print(f"  Test samples: {len(test_dataset)}")

# Track memory after splitting
mem_after = print_memory_usage("After splitting data")
print(f"  Memory used by splits: {mem_after - mem_before:.2f} GB")

# Force garbage collection to free memory
gc.collect()
print("  Garbage collection performed")
print_memory_usage("After garbage collection")


STEP 12: SPLITTING DATA
  Memory: 0.67 GB
  Found 702 GO term labels
  Found 702 GO term labels
  Found 702 GO term labels
✓ Data split completed:
  Training samples: 74165
  Validation samples: 15893
  Test samples: 15893
  Memory: 0.83 GB
  Memory used by splits: 0.16 GB
  Garbage collection performed
  Memory: 0.83 GB


0.8286895751953125

In [22]:
print_step_header(13, "CREATING CNN MODEL")

class SimpleProteinCNN(nn.Module):
    """Simple CNN for protein function prediction"""
    
    def __init__(self, num_classes=100):
        """
        Initialize the CNN
        Args:
            num_classes: Number of GO terms to predict
        """
        super(SimpleProteinCNN, self).__init__()
        
        # Embedding layer: converts amino acid numbers to vectors
        # 21 = 20 amino acids + 1 for padding/unknown
        self.embedding = nn.Embedding(21, 64, padding_idx=0)
        
        # First CNN layer
        self.conv1 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        
        # Second CNN layer
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=5, padding=2)
        
        # Third CNN layer
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=7, padding=3)
        
        # Global pooling - takes the most important features
        self.pool = nn.AdaptiveMaxPool1d(1)
        
        # First fully connected layer
        self.fc1 = nn.Linear(32, 128)
        
        # Dropout for preventing overfitting
        self.dropout = nn.Dropout(0.3)
        
        # Second fully connected layer
        self.fc2 = nn.Linear(128, 64)
        
        # Output layer - one neuron for each GO term
        self.output = nn.Linear(64, num_classes)
        
        # Activation function
        self.relu = nn.ReLU()
    
    def forward(self, x):
        """
        Forward pass through the network
        Args:
            x: Input tensor with shape (batch_size, sequence_length)
        Returns:
            Predictions for each GO term
        """
        # Step 1: Convert amino acid numbers to vectors
        x = self.embedding(x)  # Shape: (batch, seq_len, 64)
        x = x.permute(0, 2, 1)  # Shape: (batch, 64, seq_len)
        
        # Step 2: Apply CNN layers
        x = self.relu(self.conv1(x))  # Apply first CNN + activation
        x = self.relu(self.conv2(x))  # Apply second CNN + activation
        x = self.relu(self.conv3(x))  # Apply third CNN + activation
        
        # Step 3: Global pooling
        x = self.pool(x)  # Shape: (batch, 32, 1)
        x = x.squeeze(-1)  # Shape: (batch, 32)
        
        # Step 4: Fully connected layers
        x = self.relu(self.fc1(x))  # First FC layer + activation
        x = self.dropout(x)  # Apply dropout
        x = self.relu(self.fc2(x))  # Second FC layer + activation
        x = self.dropout(x)  # Apply dropout again
        
        # Step 5: Output layer (no activation here - will be applied later)
        x = self.output(x)  # Shape: (batch, num_classes)
        
        return x

print("✓ SimpleProteinCNN class created")

# Count how many parameters the model has
sample_model = SimpleProteinCNN(num_classes=len(train_dataset.label_cols))
total_params = sum(p.numel() for p in sample_model.parameters())
print(f"  Model parameters: {total_params:,}")

print_memory_usage("After creating model")


STEP 13: CREATING CNN MODEL
✓ SimpleProteinCNN class created
  Model parameters: 139,550
  Memory: 0.83 GB


0.8319358825683594

In [23]:
print_step_header(14, "PREPARING FOR TRAINING - SIMPLE VERSION")

# ============================================
# 1: SETUP DEVICE (CPU)
# ============================================

print("Step 1: Setting up device...")

device = torch.device('cpu')
print("✓ Using CPU (most reliable for laptops)")

# ============================================
# 2: CHECK OUR DATA
# ============================================

print("\nStep 2: Checking our data...")
print(f"We have {len(train_dataset)} training proteins")

# Check if dataset has label columns
if hasattr(train_dataset, 'label_cols'):
    print(f"Found {len(train_dataset.label_cols)} GO term labels")
else:
    print("Warning: No label columns found!")
    # Create dummy labels
    num_dummy_labels = 100
    print(f"Creating {num_dummy_labels} dummy labels for testing")

# ============================================
# 3: CREATE DATA LOADERS
# ============================================
print("\nStep 3: Creating data loaders...")

batch_size = 8 # Use small batch size for CPU

# Create simple data loaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True,
    num_workers=0  # Important: 0 workers for Windows
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

print(f"✓ Created data loaders")
print(f"  Batch size: {batch_size}")
print(f"  Training batches: {len(train_loader)}")

# ============================================
# 4: CREATE MODEL (SIMPLER VERSION)
# ============================================
print("\nCreating a simpler model...")

# Create a VERY SIMPLE CNN to avoid problems
class VerySimpleProteinCNN(nn.Module):
    """Super simple CNN"""
    
    def __init__(self, num_classes=100):
        super(VerySimpleProteinCNN, self).__init__()
        
        # Simple embedding
        self.embedding = nn.Embedding(21, 32, padding_idx=0)
        
        # Just ONE CNN layer
        self.conv = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        
        # Simple pooling
        self.pool = nn.AdaptiveMaxPool1d(1)
        
        # Just ONE fully connected layer
        self.fc = nn.Linear(64, num_classes)
        
        # Activation
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        
        # CNN
        x = self.relu(self.conv(x))
        
        # Pooling
        x = self.pool(x)
        x = x.squeeze(-1)
        
        # Output
        x = self.fc(x)
        return x

# Create the simple model
try:
    num_classes = 100  # Start with 100 classes
    model = VerySimpleProteinCNN(num_classes=num_classes)
    model = model.to(device)
    print(f"✓ Created VerySimpleProteinCNN")
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"  Parameters: {total_params:,}")
    
except Exception as e:
    print(f"Error creating model: {e}")
    print("Creating the simplest possible model...")
    
    # Last resort: Linear model
    class LinearModel(nn.Module):
        def __init__(self, num_classes=100):
            super(LinearModel, self).__init__()
            self.fc = nn.Linear(256, num_classes)  # Sequence length is 256
            
        def forward(self, x):
            # Simple: average embeddings
            return self.fc(x.float().mean(dim=1))
    
    model = LinearModel(num_classes=100)
    model = model.to(device)
    print("✓ Created LinearModel (simplest possible)")

# ============================================
# 5: SETUP OPTIMIZER (WITH WORKAROUND)
# ============================================
print("\nStep 5: Setting up optimizer (with workaround)...")

# Clear any PyTorch cache
import torch
torch.cuda.empty_cache() if torch.cuda.is_available() else None

# Create loss function (simple, no weights)
criterion = nn.BCEWithLogitsLoss()
print(f"✓ Created loss function")

# Try different optimizers if Adam fails
optimizers_to_try = [
    ('SGD', optim.SGD(model.parameters(), lr=0.01)),
    ('RMSprop', optim.RMSprop(model.parameters(), lr=0.001)),
    ('Adagrad', optim.Adagrad(model.parameters(), lr=0.01))
]

# Try Adam first (with error handling)
try:
    print("Trying Adam optimizer...")
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
    print("✓ Adam optimizer created successfully!")
    
except Exception as e:
    print(f"Adam failed: {e}")
    print("Trying SGD instead...")
    
    # Use SGD (older, more reliable)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    print("✓ Using SGD optimizer instead")

# ============================================
# 6: TEST EVERYTHING
# ============================================
print("\nStep 6: Testing everything works...")

# Test 1: Can we get a batch?
try:
    test_batch = next(iter(train_loader))
    sequences = test_batch['sequence'].to(device)
    labels = test_batch['labels'].to(device)
    
    print(f"✓ Got batch: sequences={sequences.shape}, labels={labels.shape}")
    
    # Test 2: Can model make predictions?
    with torch.no_grad():  # Don't calculate gradients for testing
        outputs = model(sequences)
        print(f"✓ Model can make predictions: outputs={outputs.shape}")
    
    # Test 3: Can we calculate loss?
    loss = criterion(outputs, labels)
    print(f"✓ Can calculate loss: {loss.item():.4f}")
    
    # Test 4: Can we do backpropagation?
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"✓ Can do training step (backpropagation)")
    
    print("\n ALL TESTS PASSED! Ready for training.")
    
except Exception as e:
    print(f"❌ Test failed: {e}")
    print("\nDebugging info:")
    
    # Show what we have
    print(f"  Device: {device}")
    print(f"  Model type: {type(model)}")
    print(f"  Model on device? {next(model.parameters()).device}")
    
    # Try to fix
    print("\nTrying to fix...")
    
    # Reset model
    model = VerySimpleProteinCNN(num_classes=100)
    model = model.to(device)
    
    # Use simplest optimizer
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    
    print("✓ Reset model and optimizer")

# ============================================
# FINAL SETUP
# ============================================

print("\n" + "="*50)
print("FINAL TRAINING SETUP:")
print("="*50)

print(f"1. Device: {device}")
print(f"2. Model: {model.__class__.__name__}")
print(f"3. Parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"4. Optimizer: {optimizer.__class__.__name__}")
print(f"5. Batch size: {batch_size}")
print(f"6. Training samples: {len(train_dataset)}")
print(f"7. Validation samples: {len(val_dataset)}")

print_memory_usage("After setup")

print("\n" + "="*50)
print("READY FOR TRAINING!")
print("="*50)


STEP 14: PREPARING FOR TRAINING - SIMPLE VERSION
Step 1: Setting up device...
✓ Using CPU (most reliable for laptops)

Step 2: Checking our data...
We have 74165 training proteins
Found 702 GO term labels

Step 3: Creating data loaders...
✓ Created data loaders
  Batch size: 8
  Training batches: 9271

Creating a simpler model...
✓ Created VerySimpleProteinCNN
  Parameters: 13,380

Step 5: Setting up optimizer (with workaround)...
✓ Created loss function
Trying Adam optimizer...
✓ Adam optimizer created successfully!

Step 6: Testing everything works...
❌ Test failed: could not convert string to float: '100–200'

Debugging info:
  Device: cpu
  Model type: <class '__main__.VerySimpleProteinCNN'>
  Model on device? cpu

Trying to fix...
✓ Reset model and optimizer

FINAL TRAINING SETUP:
1. Device: cpu
2. Model: VerySimpleProteinCNN
3. Parameters: 13,380
4. Optimizer: SGD
5. Batch size: 8
6. Training samples: 74165
7. Validation samples: 15893
  Memory: 0.90 GB

READY FOR TRAINING!


In [25]:
print_step_header(14, "TRAINING - SIMPLE VERSION")

# ============================================
# PART 1: TRAINING SETUP
# ============================================

print("Starting training...")
print(f"Training samples: {len(train_dataset)}")
print(f"Batch size: {batch_size}")
print(f"Total batches per epoch: {len(train_loader)}")

# Create lists to track progress
train_losses = []
val_losses = []

# How many times to go through all data
num_epochs = 3  
best_loss = float('inf')  

# ============================================
# PART 2: THE TRAINING LOOP
# ============================================
for epoch in range(num_epochs):
    print(f"\n📅 Epoch {epoch + 1}/{num_epochs}")
    print("-" * 30)
    
    # ===== TRAINING PHASE =====
    model.train() 
    total_train_loss = 0
    
    for batch_idx, batch in enumerate(train_loader):
        # Get data
        sequences = batch['sequence'].to(device)
        labels = batch['labels'].to(device)
        
        # Clear old gradients
        optimizer.zero_grad()
        
        # Make predictions
        outputs = model(sequences)
        
        # Calculate loss 
        loss = criterion(outputs, labels)
        
        # Learn from mistakes
        loss.backward()
        optimizer.step()
        
        # Track loss
        total_train_loss += loss.item()
        
        # Show progress every 1000 batches
        if (batch_idx + 1) % 1000 == 0:
            print(f"  Batch {batch_idx + 1}/{len(train_loader)} - Loss: {loss.item():.4f}")
    
    # Calculate average training loss for this epoch
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    # ===== VALIDATION PHASE =====
    model.eval()  # Set model to evaluation mode
    total_val_loss = 0
    
    with torch.no_grad():  # No learning during validation
        for batch in val_loader:
            sequences = batch['sequence'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    
    # ===== SHOW RESULTS =====
    print(f"\n📊 Results for Epoch {epoch + 1}:")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    
    # ===== SAVE BEST MODEL =====
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"  ✅ Saved better model! (Loss improved to {avg_val_loss:.4f})")
    else:
        print(f"  Model did not improve")

# ============================================
# PART 3: TRAINING COMPLETE
# ============================================

print("\n" + "="*50)
print("TRAINING COMPLETE!")
print("="*50)
print(f"✓ Best model saved as: best_model.pth")
print(f"✓ Final training loss: {train_losses[-1]:.4f}")
print(f"✓ Final validation loss: {val_losses[-1]:.4f}")

# ============================================
# PART 4: QUICK SUMMARY
# ============================================

print("\n📈 Training Summary:")
print(f"  Epochs completed: {num_epochs}")
print(f"  Training loss started at: {train_losses[0]:.4f}")
print(f"  Training loss ended at: {train_losses[-1]:.4f}")
if len(train_losses) > 1:
    improvement = train_losses[0] - train_losses[-1]
    print(f"  Improvement: {improvement:.4f}")



STEP 14: TRAINING - SIMPLE VERSION
Starting training...
Training samples: 74165
Batch size: 8
Total batches per epoch: 9271

📅 Epoch 1/3
------------------------------


ValueError: could not convert string to float: '512–1000'