## Data Processing

After running this notebook, you should have:
- `data/processed/uniprot_processed.csv` with enzyme sequences and labels (0-6)
- Columns: `Sequence`, `Label`

Currently, for enzymes annotated with multiple EC numbers, the processing script uses only the first EC number to assign the class.
However, this may change depending on the results of data exploration.

In [None]:
import re
import pandas
import os
import yaml

# Load the raw, tab-separated UniProt table into a DataFrame.
df = pandas.read_csv(
    '../data/raw/uniprot_raw.tsv',
    sep='\t',
)

# A main-class digit (1-7) followed by a numeric subclass. The lookbehind stops
# the match from starting inside a longer number (e.g. the second "1" of
# "11.2") or at a later dotted component (e.g. the "2" of "8.2.1.1").
_EC_MAIN_CLASS_RE = re.compile(r'(?<![\d.])([1-7])\.\d+')


def extract_primary_ec_class(ec_string):
    """Return the zero-based EC main class of an EC number annotation.

    Args:
        ec_string: Value of the EC number field. May hold several EC numbers
            separated by ';', in which case only the first one is used.
            Non-string values (e.g. NaN for a missing entry) are converted
            with ``str`` and simply fail to match.

    Returns:
        An int in 0-6 (EC main class 1-7 minus one), or ``None`` when the
        first entry has no main class in 1-7 followed by a numeric subclass.
    """
    # Take first EC number if multiple
    first_ec = str(ec_string).split(';')[0].strip()

    match = _EC_MAIN_CLASS_RE.search(first_ec)
    if match is None:
        return None

    # Convert EC 1-7 to classes 0-6 for ML
    return int(match.group(1)) - 1

# Derive an integer class label from each row's "EC number" entry; entries
# that extract_primary_ec_class cannot parse yield a missing label.
df['Label'] = df['EC number'].apply(extract_primary_ec_class)

# Balance data if needed (disabled). Uncomment to cap the number of rows kept
# per class using the "targets" mapping in ../configs/data.yaml.
# NOTE(review): assumes the keys of "targets" are the class labels stored in
# the 'Label' column -- confirm against the config before enabling.
# with open("../configs/data.yaml", "r") as f:
#     cfg = yaml.safe_load(f)
# targets = cfg["targets"]
# dfs = []
# for c, target in targets.items():
#     group = df[df['Label'] == c]
#     take_n = min(len(group), target)
#     dfs.append(group.sample(n=take_n, random_state=42))
# df = pandas.concat(dfs, ignore_index=True)

# Save processed data to file
os.makedirs("../data/processed", exist_ok=True)
# Drop unlabeled rows. The explicit copy gives the cast below an independent
# frame to assign into rather than the direct result of a filtering call.
df = df.dropna(subset=['Label']).copy()
# Missing labels force a float column; with them gone, store plain integers.
df['Label'] = df['Label'].astype(int)
df[['Sequence', 'Label']].to_csv("../data/processed/uniprot_processed.csv", index=False)
# Notebook preview of the first rows that were written.
df[['Sequence', 'Label']].head(3)