## Data Processing and Train/Val/Test Split

After running this notebook, you should have:

1. Processed CSV datasets

    - `data/processed/train.csv` — **Training set** (70%)  
    - `data/processed/val.csv` — **Validation set** (15%)  
    - `data/processed/test.csv` — **Test set** (15%)  

    - Columns: `Sequence`, `Label` (main EC class 1–7 mapped to labels 0–6)


2. Tokenized datasets (for model training)

    - `data/tokenized/train_dataset/`  
    - `data/tokenized/val_dataset/`
    - `data/tokenized/test_dataset/`

In [None]:
import pandas as pd
import os
import yaml
from sklearn.model_selection import train_test_split

# Read the cleaned dataset produced by the exploration step, then report its
# row count and column names as a quick sanity check.
cleaned_csv_path = '../data/raw/uniprot_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

n_entries = len(df)
column_names = df.columns.tolist()
print(f"Loaded {n_entries:,} entries from cleaned dataset")
print(f"Columns: {column_names}")

In [None]:
# Derive zero-based ML labels (0-6) by shifting the main EC class (1-7) down
# by one. Rows with a missing EC_Main_Class end up with a NaN label.
initial_count = len(df)
df['Label'] = df['EC_Main_Class'] - 1

# Discard the rows that did not receive a label, then store labels as integers
df = df.dropna(subset=['Label'])
df['Label'] = df['Label'].astype(int)

n_labelled = len(df)
n_dropped = initial_count - n_labelled
print(f"\nExtracted labels for {n_labelled:,} entries")
print(f"Dropped {n_dropped:,} entries with invalid EC numbers")

In [None]:
# Report the number of samples per label before any balancing is applied
print("\n=== ORIGINAL CLASS DISTRIBUTION ===")
class_counts = df['Label'].value_counts().sort_index()
for class_label, n_samples in class_counts.items():
    print(f"Class {class_label}: {n_samples:>8,} samples")

# Imbalance ratio = size of the largest class divided by the smallest
original_max, original_min = class_counts.max(), class_counts.min()
imbalance_ratio = original_max / original_min
print(f"\nOriginal imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"(Max class: {original_max:,} / Min class: {original_min:,})")


In [None]:
# Read the per-class sample targets and the random seed from the data config
config_path = "../configs/data.yaml"
with open(config_path, "r") as config_file:
    cfg = yaml.safe_load(config_file)

targets = cfg["targets"]
random_seed = cfg["random_seed"]

# Echo the settings so the run is self-documenting
print("\n=== BALANCING TARGETS FROM CONFIG ===")
print(f"Random seed: {random_seed}")
print("\nTarget samples per class:")
for class_label, n_target in targets.items():
    print(f"  Class {class_label}: {n_target:,} samples")

In [None]:
# Apply class balancing.
#
# For every (label, target) pair in `targets`, the rows of that class are
# randomly sampled down to at most `target` rows; classes with fewer rows than
# their target are kept whole, and classes with no rows are skipped with a
# warning. Sampling and the final shuffle use `random_seed` from the config
# (instead of a hard-coded value), so the seed in the config controls the run.
print("\n=== APPLYING CLASS BALANCING ===")
balanced_dfs = []

for label, target_count in targets.items():
    # NOTE(review): this comparison assumes the config's target keys have the
    # same type as the integer values in df['Label'] - confirm against the YAML
    class_df = df[df['Label'] == label]
    current_count = len(class_df)

    if current_count == 0:
        print(f"Warning: No samples for class {label}")
        continue

    # Never request more rows than the class actually has
    sample_count = min(current_count, target_count)

    # Seeded sampling so the subsample is reproducible for a given config
    sampled_df = class_df.sample(n=sample_count, random_state=random_seed)
    balanced_dfs.append(sampled_df)

    print(f"Class {label}: {current_count:>7,} → {sample_count:>7,}")

# pd.concat fails on an empty list; raise with context instead
if not balanced_dfs:
    raise ValueError(
        "No samples matched any class in the balancing targets; "
        "check that the config's target keys match the values in df['Label']"
    )

# Combine all balanced classes
df_balanced = pd.concat(balanced_dfs, ignore_index=True)

# Shuffle so that rows are no longer grouped by class
df_balanced = df_balanced.sample(frac=1, random_state=random_seed).reset_index(drop=True)

print(f"\n{'Total':<8} {len(df):>7,} → {len(df_balanced):>7,}")

# Compare the imbalance ratio before and after balancing
balanced_class_counts = df_balanced['Label'].value_counts().sort_index()
balanced_max = balanced_class_counts.max()
balanced_min = balanced_class_counts.min()
balanced_imbalance = balanced_max / balanced_min
print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1 → {balanced_imbalance:.2f}:1")

In [None]:
# Keep only the model input (Sequence) and the target (Label) for splitting
split_columns = ['Sequence', 'Label']
df_final = df_balanced.loc[:, split_columns].copy()

n_total = len(df_final)
print("\n=== DATASET READY FOR SPLITTING ===")
print(f"Total samples: {n_total:,}")
print("Features: Sequence")
print("Target: Label (0-6)")

In [None]:
# Stratified 70/15/15 train/val/test split of df_final.
#
# Both splits are stratified on Label and seeded with `random_seed` from the
# config (instead of a hard-coded value), so the seed in the config controls
# the split.
print("\n=== PERFORMING STRATIFIED TRAIN/VAL/TEST SPLIT ===")

# First split: 70% train, 30% temp (which will become 15% val, 15% test)
train_df, temp_df = train_test_split(
    df_final,
    test_size=0.30,
    random_state=random_seed,
    stratify=df_final['Label']
)

# Second split: halve temp into val and test (15% each of the original)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=random_seed,
    stratify=temp_df['Label']
)

# Report the absolute size of each split and its share of the full dataset
total_samples = len(df_final)
for split_name, split_df in (("Train:", train_df), ("Val:", val_df), ("Test:", test_df)):
    share = len(split_df) / total_samples * 100
    print(f"{split_name:<6} {len(split_df):>7,} ({share:.1f}%)")

In [None]:
# Print a per-class table of sample counts for the three splits so the
# stratification can be checked by eye
rule = "-" * 35
print("\n=== CLASS DISTRIBUTION IN SPLITS ===")
print(f"{'Class':<7} {'Train':>8} {'Val':>8} {'Test':>8}")
print(rule)

train_counts = train_df['Label'].value_counts().sort_index()
val_counts = val_df['Label'].value_counts().sort_index()
test_counts = test_df['Label'].value_counts().sort_index()

# One row per label 0-6; a class absent from a split is shown as 0
for class_id in range(7):
    row_counts = [
        counts.get(class_id, 0)
        for counts in (train_counts, val_counts, test_counts)
    ]
    print(f"{class_id:<7} " + " ".join(f"{n:>8,}" for n in row_counts))

print(rule)
split_sizes = [len(train_df), len(val_df), len(test_df)]
print(f"{'Total':<7} " + " ".join(f"{n:>8,}" for n in split_sizes))

In [None]:
# Write the three splits to CSV (without the index) and print a short summary
os.makedirs("../data/processed", exist_ok=True)

train_path = "../data/processed/train.csv"
val_path = "../data/processed/val.csv"
test_path = "../data/processed/test.csv"

outputs = (
    ("Train:", train_df, train_path),
    ("Val:", val_df, val_path),
    ("Test:", test_df, test_path),
)
for _, split_df, csv_path in outputs:
    split_df.to_csv(csv_path, index=False)

print("\n=== SAVED PROCESSED DATASETS ===")
for heading, _, csv_path in outputs:
    print(f"{heading:<6} {csv_path}")

total_sequences = sum(len(split_df) for _, split_df, _ in outputs)
n_classes = train_df['Label'].nunique()
print("\n=== FINAL SUMMARY ===")
print(f"Total sequences: {total_sequences:,}")
print(f"Number of classes: {n_classes}")
print(f"Class balance ratio: {balanced_imbalance:.2f}:1")

print("\nData processing complete! Ready for model training.")

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Tokenize every split with the ESM-2 tokenizer and store the results on disk
# as Hugging Face datasets
print("\n=== STARTING TOKENIZATION ===")

# Re-read the processed splits from their CSV files
train_df = pd.read_csv("../data/processed/train.csv")
val_df = pd.read_csv("../data/processed/val.csv")
test_df = pd.read_csv("../data/processed/test.csv")

# Pull out sequences and labels, one split at a time
train_sequences, train_labels = train_df["Sequence"].tolist(), train_df["Label"].tolist()
val_sequences, val_labels = val_df["Sequence"].tolist(), val_df["Label"].tolist()
test_sequences, test_labels = test_df["Sequence"].tolist(), test_df["Label"].tolist()

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")

# The same tokenizer options are used for all three splits.
# NOTE(review): padding=True pads to the longest sequence within each call, so
# the three splits may end up with different padded lengths - confirm intended
encode_options = {"padding": True, "truncation": True}

print("Tokenizing train set...")
train_tokenized = tokenizer(train_sequences, **encode_options)

print("Tokenizing validation set...")
val_tokenized = tokenizer(val_sequences, **encode_options)

print("Tokenizing test set...")
test_tokenized = tokenizer(test_sequences, **encode_options)

# Wrap the encodings in Dataset objects and attach the labels as "labels"
train_dataset = Dataset.from_dict(train_tokenized)
train_dataset = train_dataset.add_column("labels", train_labels)
val_dataset = Dataset.from_dict(val_tokenized)
val_dataset = val_dataset.add_column("labels", val_labels)
test_dataset = Dataset.from_dict(test_tokenized)
test_dataset = test_dataset.add_column("labels", test_labels)

# Persist each dataset in its own directory under save_dir
save_dir = "../data/tokenized"
os.makedirs(save_dir, exist_ok=True)

saved_splits = (
    ("Train:", train_dataset, f"{save_dir}/train_dataset"),
    ("Val:", val_dataset, f"{save_dir}/val_dataset"),
    ("Test:", test_dataset, f"{save_dir}/test_dataset"),
)
for _, dataset, out_dir in saved_splits:
    dataset.save_to_disk(out_dir)

print("\n=== SAVED TOKENIZED DATASETS ===")
for heading, _, out_dir in saved_splits:
    print(f"{heading:<6} {out_dir}")

n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)
print("\n=== TOKENIZATION SUMMARY ===")
print(f"Train samples: {n_train:,}")
print(f"Val samples:   {n_val:,}")
print(f"Test samples:  {n_test:,}")

print("\nTokenization complete! Ready for model setup.")
