# Benchmarking aptamer evaluation algorithms
Step-by-step guide for using `AptaNet` for benchmarking.

## Overview
This notebook introduces the Benchmarking class, a utility for systematically comparing machine learning estimators on a given dataset using cross-validation. It is designed to streamline model evaluation across multiple metrics and provide results in a unified, interpretable format.

The output is a summary table that makes it easy to compare different models and metrics at a glance.

## Data preparation
To train the `AptaNetPipeline` and `AptaTransPipeline` the notebook uses the dataset used to train the `AptaTrans` algorithm, this dataset can be found in `pyaptamer/datasets/data/train_li2014`.

In [1]:
# Data imports
import numpy as np

from pyaptamer.datasets import load_csv_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load full dataset
df = load_csv_dataset("train_li2014")

# Separate features and label column
X_raw = df.drop(columns=["label"])
y_raw = df["label"]

# Build combinations (aptamer, protein)
# assuming the first two columns are aptamer and protein
X = list(zip(X_raw.iloc[:, 0], X_raw.iloc[:, 1], strict=False))[:100]

# Binary labels
y = np.where(y_raw == "positive", 1, 0)[:100]

## Different workflows
Benchmarking offers 2 main workflows, both depending on how you want to use `cv` (cross validation) in your benchmarking experiment:
1. Using normal k-fold cross-validation
2. Using `PredefinedSplit` to create a fixed train/test split

### 1. Using normal k-fold cross validation for benchmarking

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

from pyaptamer.aptanet import AptaNetPipeline
from pyaptamer.benchmarking import Benchmarking

In [4]:
import torch

from pyaptamer.aptatrans import (
    AptaTrans,
    AptaTransPipeline,
    EncoderPredictorConfig,
)
from pyaptamer.datasets import (
    load_csv_dataset,
)
from pyaptamer.utils._base import filter_words

# setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# auto-reloading external modules
%load_ext autoreload
%autoreload 2

In [5]:
BATCH_SIZE = 16
TEST_SIZE = 0.05  # size of the test set for pretraining
RAMDOM_STATE = 42  # for reproducibility

# embeddings for pretraining
# aptamers
N_APTA_VOCABS = 127
N_APTA_TARGET_VOCABS = 344
APTA_MAX_LEN = 275
# proteins
N_PROT_VOCABS = 715
N_PROT_TARGET_VOCABS = 585
PROT_MAX_LEN = 867

In [6]:
prot_words = load_csv_dataset(
    name="protein_word_freq", keep_default_na=False, na_values=["_"]
)  # dict for each protein word in ds gives freq
prot_words = prot_words.set_index("seq")["freq"].to_dict()

filtered_prot_words = filter_words(prot_words)

# (1.) load the api dataset for fine-tuning
# train_dataset = load_csv_dataset(name="train_li2014")
# test_dataset = load_csv_dataset(name="test_li2014")

# # (2.) create the API dataset
# train_dataset = APIDataset(
#     x_apta=train_dataset["aptamer"].to_numpy(),
#     x_prot=train_dataset["protein"].to_numpy(),
#     y=train_dataset["label"].to_numpy(),
#     apta_max_len=APTA_MAX_LEN,
#     prot_max_len=PROT_MAX_LEN,
#     prot_words=filtered_prot_words,
# )
# test_dataset = APIDataset(
#     x_apta=test_dataset["aptamer"].to_numpy(),
#     x_prot=test_dataset["protein"].to_numpy(),
#     y=test_dataset["label"].to_numpy(),
#     apta_max_len=APTA_MAX_LEN,
#     prot_max_len=PROT_MAX_LEN,
#     prot_words=filtered_prot_words,
#     split="test",
# )

# # (3.) create dataloaders
# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=True,
# )
# test_dataloader = DataLoader(
#     test_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=True,
# )

In [7]:
apta_embedding = EncoderPredictorConfig(
    num_embeddings=N_APTA_VOCABS,
    target_dim=N_APTA_TARGET_VOCABS,
    max_len=APTA_MAX_LEN,
)
prot_embedding = EncoderPredictorConfig(
    num_embeddings=N_PROT_VOCABS,
    target_dim=N_PROT_TARGET_VOCABS,
    max_len=PROT_MAX_LEN,
)
model = AptaTrans(
    apta_embedding=apta_embedding,
    prot_embedding=prot_embedding,
    in_dim=128,
    n_encoder_layers=1,
    n_heads=1,
    dropout=0.1,
).to(device)



In [8]:
# specify the target protein sequence here
target_protein = (
    "STEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAM"
    "RDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTR"
    "QGVDDAFYTLVREIRKHKEKMSK"
)

pipeline = AptaTransPipeline(
    device=device,
    model=model,
    prot_words=prot_words,
    depth=1,  # depth of the search (i.e., length of generated candidates)
    n_iterations=1,  # higher is better but slower, suggested: 1000
)
candidates = pipeline.recommend(
    target=target_protein,
    n_candidates=1,  # number of candidates to generate
    verbose=True,
)


 ----- Round: 1 -----


RuntimeError: The size of tensor a (0) must match the size of tensor b (128) at non-singleton dimension 1

In [None]:
# Example estimator
aptanet_estimator = AptaNetPipeline(k=4)
aptatrans_estimator = AptaTransPipeline(
    device=device,
    model=model,
    prot_words=prot_words,
    depth=1,
    n_iterations=1,
)

# Define a 5-fold CV strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Run benchmarking with CV
bench = Benchmarking(
    estimators=[aptanet_estimator, aptatrans_estimator],
    metrics=[accuracy_score],
    X=X,
    y=y,
    cv=cv,
)
results_cv = bench.run()
print(results_cv)

  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)
  seq = clean_protein_seq(protein_sequence)


                                train  test
estimator       metric                     
AptaNetPipeline accuracy_score    1.0   1.0


### 2. Using PredefinedSplit for benchmarking with a fixed train/test split

In [None]:
from sklearn.model_selection import PredefinedSplit

# Define a custom train/test split
# Here, last 10 samples are used as test set
test_fold = np.ones(len(y)) * -1
test_fold[-10:] = 0
cv = PredefinedSplit(test_fold)

# Run benchmarking with fixed split
bench_fixed = Benchmarking(
    estimators=[aptanet_estimator, aptatrans_estimator],
    metrics=[accuracy_score],
    X=X,
    y=y,
    cv=cv,
)
results_fixed = bench_fixed.run()
print(results_fixed)

  seq = clean_protein_seq(protein_sequence)
  y = column_or_1d(y, warn=True)
  seq = clean_protein_seq(protein_sequence)


                                train  test
estimator       metric                     
AptaNetPipeline accuracy_score    1.0   1.0
