6 changes: 6 additions & 0 deletions pyaptamer/deepatamer/__init__.py
@@ -0,0 +1,6 @@
"""The deepatamer algorithm for binary binding affinity prediction."""

from pyaptamer.deepatamer._deepaptamer_nn import DeepAptamerNN
from pyaptamer.deepatamer._pipeline import DeepAptamerPipeline

__all__ = ["DeepAptamerNN", "DeepAptamerPipeline"]
215 changes: 215 additions & 0 deletions pyaptamer/deepatamer/_deepaptamer_nn.py
@@ -0,0 +1,215 @@
__author__ = "satvshr"
__all__ = ["DeepAptamerNN"]

import torch
import torch.nn as nn


class DeepAptamerNN(nn.Module):
"""
DeepAptamer neural network model for aptamer–protein interaction prediction.

This architecture integrates:

- A sequence branch using convolutional and fully-connected layers to
process one-hot encoded aptamer sequences.
- A structural (DNA shape) branch using convolution + pooling + dense layers to
process DNA shape features derived from the aptamer sequence via `deepDNAshape`.
- A BiLSTM for capturing sequential dependencies.
- Multi-head self-attention for contextual feature refinement.
- A final classification head for binary binding prediction.

Parameters
----------
seq_conv_in : int, optional, default=4
Number of input channels for the sequence convolution branch. Typically 4
for one-hot DNA encoding.

seq_conv_out : int, optional, default=12
Number of output channels (filters) for the sequence convolution.

seq_conv_kernel_size : int, optional, default=1
Kernel size for the sequence convolution.

seq_pool_kernel_size : int, optional, default=1
Kernel size for max-pooling after sequence convolution.

seq_pool_stride : int, optional, default=1
Stride for max-pooling after sequence convolution.

seq_linear_hidden_dim : int, optional, default=32
Hidden layer size for fully connected layers in the sequence branch.

seq_conv_linear_out : int, optional, default=4
Dimensionality of the output feature vector from the sequence branch.

shape_conv_kernel_size : int, optional, default=100
Kernel size for convolution in the shape branch.

shape_pool_kernel_size : int, optional, default=20
Kernel size for pooling in the shape branch.

shape_pool_stride : int, optional, default=20
Stride for pooling in the shape branch.

bilstm_hidden_size : int, optional, default=100
Number of hidden units in each LSTM direction.

bilstm_num_layers : int, optional, default=2
Number of BiLSTM layers.

dropout : float, optional, default=0.1
Dropout probability applied after the BiLSTM.

optimizer : torch.optim.Optimizer or None, optional, default=None
Optimizer for training. If None, defaults to Adam with lr=0.001.

Attributes
----------
seq_conv : nn.Conv1d
1D convolution layer for sequence branch.

seq_fc : nn.Sequential
Fully connected projection for sequence features.

shape_conv_pool : nn.Sequential
Convolution + pooling for DNA shape features.

shape_fc : nn.Sequential
Fully connected projection for shape features.

bilstm : nn.LSTM
Bidirectional LSTM for sequential modeling.

dropout : nn.Dropout
Dropout layer applied after BiLSTM.

attn : nn.MultiheadAttention
Attention layer for contextual refinement.

head : nn.Linear
Final classification layer (logits for 2 classes).
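
Examples
--------
A minimal smoke test with assumed input shapes (batch of 2, sequence
length 48, trimmed 126-length shape vector); the weights are untrained,
so the logits are arbitrary:

>>> import torch
>>> model = DeepAptamerNN()
>>> x_ohe = torch.randn(2, 48, 4)
>>> x_shape = torch.randn(2, 1, 126)
>>> logits = model(x_ohe, x_shape)
>>> logits.shape
torch.Size([2, 2])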
"""

def __init__(
self,
seq_conv_in=4,
seq_conv_out=12,
seq_conv_kernel_size=1,
seq_pool_kernel_size=1,
seq_pool_stride=1,
seq_linear_hidden_dim=32,
# Shared output feature size of both branches; they must match so the
# branch outputs can be concatenated along the sequence (time) dimension
seq_conv_linear_out=4,
shape_conv_kernel_size=100,
shape_pool_kernel_size=20,
shape_pool_stride=20,
bilstm_hidden_size=100,
bilstm_num_layers=2,
dropout=0.1,
optimizer=None,
):
super().__init__()
self.seq_conv_in = seq_conv_in
self.seq_conv_out = seq_conv_out
self.seq_conv_kernel_size = seq_conv_kernel_size
self.seq_pool_kernel_size = seq_pool_kernel_size
self.seq_pool_stride = seq_pool_stride
self.seq_linear_hidden_dim = seq_linear_hidden_dim
self.seq_conv_linear_out = seq_conv_linear_out
self.shape_conv_kernel_size = shape_conv_kernel_size
self.shape_pool_kernel_size = shape_pool_kernel_size
self.shape_pool_stride = shape_pool_stride
self.bilstm_hidden_size = bilstm_hidden_size
self.bilstm_num_layers = bilstm_num_layers
self.dropout_val = dropout
self.optimizer = optimizer

# Sequence branch (B, seq_len, 4)
self.seq_conv = nn.Conv1d(
in_channels=self.seq_conv_in,
out_channels=self.seq_conv_out,
kernel_size=self.seq_conv_kernel_size,
)
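# With the default kernel size of 1 the max-pool is an identity; the linear
# layers then project each position's conv features down to
# seq_conv_linear_out, softmax-normalized over the feature dimension.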
self.seq_fc = nn.Sequential(
nn.MaxPool1d(
kernel_size=self.seq_pool_kernel_size, stride=self.seq_pool_stride
),
nn.Linear(self.seq_conv_out, self.seq_linear_hidden_dim),
nn.ReLU(),
nn.Linear(self.seq_linear_hidden_dim, self.seq_conv_linear_out),
nn.Softmax(dim=-1),
)

# Shape branch (B, 1, 126/138)
self.shape_conv_pool = nn.Sequential(
nn.Conv1d(
in_channels=1, out_channels=1, kernel_size=self.shape_conv_kernel_size
),
nn.MaxPool1d(
kernel_size=self.shape_pool_kernel_size, stride=self.shape_pool_stride
),
)
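# Project each pooled shape value up to seq_conv_linear_out so the two
# branches agree on feature size before concatenation.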
self.shape_fc = nn.Sequential(nn.Linear(1, self.seq_conv_linear_out), nn.ReLU())

# Rest of the model
self.bilstm = nn.LSTM(
input_size=self.seq_conv_linear_out,
hidden_size=self.bilstm_hidden_size,
num_layers=self.bilstm_num_layers,
batch_first=True,
bidirectional=True,
dropout=self.dropout_val,
)
self.dropout = nn.Dropout(self.dropout_val)

self.attn = nn.MultiheadAttention(
embed_dim=2 * self.bilstm_hidden_size, num_heads=1, batch_first=True
)

self.head = nn.Linear(2 * self.bilstm_hidden_size, 2)

self.optimizer = self.optimizer or torch.optim.Adam(self.parameters(), lr=0.001)

def forward(self, x_ohe, x_shape):
"""
Forward pass of the DeepAptamerNN model.

Parameters
----------
x_ohe : torch.Tensor
One-hot encoded aptamer sequence tensor of shape (batch_size, seq_len, 4).
Example: (B, seq_len, 4) for batch size B and sequence length seq_len.

x_shape : torch.Tensor
DNA shape feature tensor of shape (batch_size, 1, shape_len).
Example: (B, 1, 126) for batch size B and shape feature length 126.

Returns
-------
torch.Tensor
Logits tensor of shape (batch_size, 2), representing the predicted
class scores for binary classification.
"""
out_ohe = x_ohe.permute(0, 2, 1)
out_ohe = self.seq_conv(out_ohe)
out_ohe = out_ohe.permute(0, 2, 1)
out_ohe = self.seq_fc(out_ohe)

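# Shape branch: convolve and pool the shape profile, then move the
# feature axis last for the linear projection.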
out_shape = self.shape_conv_pool(x_shape)
out_shape = out_shape.transpose(1, 2)
out_shape = self.shape_fc(out_shape)

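# Concatenate both branches along the sequence (time) dimension.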
x = torch.cat([out_ohe, out_shape], dim=1)

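# BiLSTM output has shape (B, T, 2 * bilstm_hidden_size).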
x, _ = self.bilstm(x)

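# Keep only the features of the final timestep.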
x = x[:, -1, :]
x = self.dropout(x)

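# Self-attention over a single-token sequence; add and remove a
# length-1 time axis around the attention call.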
x = x.unsqueeze(1)
x, _ = self.attn(x, x, x)

x = x.squeeze(1)

return self.head(x)
117 changes: 117 additions & 0 deletions pyaptamer/deepatamer/_pipeline.py
@@ -0,0 +1,117 @@
__author__ = "satvshr"
__all__ = ["DeepAptamerPipeline"]

import numpy as np
import torch

from pyaptamer.deepatamer._preprocessing import preprocess_seq_ohe, preprocess_seq_shape


class DeepAptamerPipeline:
"""
DeepAptamer algorithm for aptamer–protein interaction prediction [1]_.

This class encapsulates preprocessing (sequence one-hot encoding and DNAshape
feature extraction) together with inference on a trained `DeepAptamerNN` model.
It provides a `predict` method that accepts one or more DNA sequences and returns
ranked binding affinity scores.

Parameters
----------
model : DeepAptamerNN
A trained DeepAptamer neural network model.
full_dna_shape : bool, optional, default=True
If True, use the trimmed 126-length DNAshape representation
(MGW=31, HelT=32, ProT=31, Roll=32). If False, keep the full
138-length DeepDNAshape representation
(MGW=35, HelT=34, ProT=35, Roll=34).
device : {"cpu", "cuda"}, default="cpu"
Device to run inference on.

Methods
-------
predict(seqs)
Compute ranked binding affinity scores for one or more DNA
sequences. Returns a list of dictionaries with each sequence
and its predicted binding probability.

References
----------
.. [1] Yang X, Chan CH, Yao S, Chu HY, Lyu M, Chen Z, Xiao H, Ma Y, Yu S, Li F,
Liu J, Wang L, Zhang Z, Zhang BT, Zhang L, Lu A, Wang Y, Zhang G, Yu Y.
DeepAptamer: Advancing high-affinity aptamer discovery with a hybrid deep learning
model. Mol Ther Nucleic Acids. 2024 Dec 21;36(1):102436.
doi: 10.1016/j.omtn.2024.102436. PMID: 39897584; PMCID: PMC11787022.
https://www.cell.com/molecular-therapy-family/nucleic-acids/pdf/S2162-2531(24)00323-8.pdf
.. [2] deepDNAshape: a deep learning predictor for DNA shape features.
https://github.com/JinsenLi/deepDNAshape/blob/main/LICENSE
.. [3] DeepAptamer: a deep learning framework for aptamer design and binding
prediction.
https://github.com/YangX-BIDD/DeepAptamer

Examples
--------
>>> from pyaptamer.deepatamer import DeepAptamerPipeline, DeepAptamerNN
>>> model = DeepAptamerNN()
>>> pipeline = DeepAptamerPipeline(model)
>>> pipeline.predict("ACGTAGCTCGTAGCTAGCTAGCTAGCTAGCTCGTAGCTAGCTAGCTAG")  # doctest: +SKIP

"""

def __init__(self, model, full_dna_shape=True, device="cpu"):
self.model = model
self.full_dna_shape = full_dna_shape
self.device = device

def predict(self, seqs):
"""
Predict binding affinity scores for one or more sequences.

Parameters
----------
seqs : str or list of str
DNA sequence(s). One-hot encodings of shorter sequences are padded
to the length of the longest sequence in `seqs`.

Returns
-------
list of dict
Ranked list of dictionaries, each of the form
{"seq": <sequence string>, "score": <float>}, where "score" is the
predicted binding probability (the first column of the softmax
output). Sorted from highest to lowest score.
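
Examples
--------
An illustrative call; the exact scores depend on the trained weights
(the values below are placeholders):

>>> pipeline.predict(["ACGT" * 12, "TTGA" * 12])  # doctest: +SKIP
[{'seq': 'ACGTACGT...', 'score': 0.97}, {'seq': 'TTGATTGA...', 'score': 0.12}]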
"""
if isinstance(seqs, str):
seqs = [seqs]

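# Pad one-hot encodings to the longest sequence; shape features are
# computed per sequence.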
ohe_list, shape_list = [], []
max_len = max(len(seq) for seq in seqs)
for seq in seqs:
ohe_list.append(preprocess_seq_ohe(seq, seq_len=max_len))
shape_list.append(preprocess_seq_shape(seq))

X_ohe = torch.tensor(
np.array(ohe_list), dtype=torch.float32, device=self.device
)
X_shape = torch.tensor(
np.array(shape_list), dtype=torch.float32, device=self.device
)
self.model.eval()
with torch.no_grad():
outputs = self.model(X_ohe, X_shape)
# convert to probabilities
probs = torch.softmax(outputs, dim=1).cpu().numpy()

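# Column 0 of the softmax output is taken as the binding-class probability.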
bind_scores = probs[:, 0]

# Create ranked output
ranked = sorted(
[
{"seq": s, "score": float(sc)}
for s, sc in zip(seqs, bind_scores, strict=False)
],
key=lambda x: x["score"],
reverse=True,
)

return ranked