diff --git a/pyaptamer/deepatamer/__init__.py b/pyaptamer/deepatamer/__init__.py new file mode 100644 index 00000000..234601dd --- /dev/null +++ b/pyaptamer/deepatamer/__init__.py @@ -0,0 +1,6 @@ +"""The deepatamer algorithm for binary binding affinity prediction.""" + +from pyaptamer.deepatamer._deepaptamer_nn import DeepAptamerNN +from pyaptamer.deepatamer._pipeline import DeepAptamerPipeline + +__all__ = ["DeepAptamerNN", "DeepAptamerPipeline"] diff --git a/pyaptamer/deepatamer/_deepaptamer_nn.py b/pyaptamer/deepatamer/_deepaptamer_nn.py new file mode 100644 index 00000000..664ca079 --- /dev/null +++ b/pyaptamer/deepatamer/_deepaptamer_nn.py @@ -0,0 +1,215 @@ +__author__ = "satvshr" +__all__ = ["DeepAptamerNN"] + +import torch +import torch.nn as nn + + +class DeepAptamerNN(nn.Module): + """ + DeepAptamer neural network model for aptamer–protein interaction prediction. + + This architecture integrates: + + - A sequence branch using convolutional and fully-connected layers to + process one-hot encoded aptamer sequences. + - A structural (DNA shape) branch using convolution + pooling + dense layers to + extract shape features using `deepDNAshape` from the aptamer sequence. + - A BiLSTM for capturing sequential dependencies. + - Multi-head self-attention for contextual feature refinement. + - A final classification head for binary binding prediction. + + Parameters + ---------- + seq_conv_in : int, optional, default=4 + Number of input channels for the sequence convolution branch. Typically 4 + for one-hot DNA encoding. + + seq_conv_out : int, optional, default=12 + Number of output channels (filters) for the sequence convolution. + + seq_conv_kernel_size : int, optional, default=1 + Kernel size for the sequence convolution. + + seq_pool_kernel_size : int, optional, default=1 + Kernel size for max-pooling after sequence convolution. + + seq_pool_stride : int, optional, default=1 + Stride for max-pooling after sequence convolution. + + seq_linear_hidden_dim : int, optional, default=32 + Hidden layer size for fully connected layers in the sequence branch. + + seq_conv_linear_out : int, optional, default=4 + Dimensionality of the output feature vector from the sequence branch. + + shape_conv_kernel_size : int, optional, default=100 + Kernel size for convolution in the shape branch. + + shape_pool_kernel_size : int, optional, default=20 + Kernel size for pooling in the shape branch. + + shape_pool_stride : int, optional, default=20 + Stride for pooling in the shape branch. + + bilstm_hidden_size : int, optional, default=100 + Number of hidden units in each LSTM direction. + + bilstm_num_layers : int, optional, default=2 + Number of BiLSTM layers. + + dropout : float, optional, default=0.1 + Dropout probability applied after the BiLSTM. + + optimizer : torch.optim.Optimizer or None, optional, default=None + Optimizer for training. If None, defaults to Adam with lr=0.001. + + Attributes + ---------- + seq_conv : nn.Conv1d + 1D convolution layer for sequence branch. + + seq_fc : nn.Sequential + Fully connected projection for sequence features. + + shape_conv_pool : nn.Sequential + Convolution + pooling for DNA shape features. + + shape_fc : nn.Sequential + Fully connected projection for shape features. + + bilstm : nn.LSTM + Bidirectional LSTM for sequential modeling. + + dropout : nn.Dropout + Dropout layer applied after BiLSTM. + + attn : nn.MultiheadAttention + Attention layer for contextual refinement. + + head : nn.Linear + Final classification layer (logits for 2 classes). + """ + + def __init__( + self, + seq_conv_in=4, + seq_conv_out=12, + seq_conv_kernel_size=1, + seq_pool_kernel_size=1, + seq_pool_stride=1, + seq_linear_hidden_dim=32, + # Defines the size of the 1st dimension(time/seq) used for branch concatenation + seq_conv_linear_out=4, + shape_conv_kernel_size=100, + shape_pool_kernel_size=20, + shape_pool_stride=20, + bilstm_hidden_size=100, + bilstm_num_layers=2, + dropout=0.1, + optimizer=None, + ): + super().__init__() + self.seq_conv_in = seq_conv_in + self.seq_conv_out = seq_conv_out + self.seq_conv_kernel_size = seq_conv_kernel_size + self.seq_pool_kernel_size = seq_pool_kernel_size + self.seq_pool_stride = seq_pool_stride + self.seq_linear_hidden_dim = seq_linear_hidden_dim + self.seq_conv_linear_out = seq_conv_linear_out + self.shape_conv_kernel_size = shape_conv_kernel_size + self.shape_pool_kernel_size = shape_pool_kernel_size + self.shape_pool_stride = shape_pool_stride + self.bilstm_hidden_size = bilstm_hidden_size + self.bilstm_num_layers = bilstm_num_layers + self.dropout_val = dropout + self.optimizer = optimizer + + # Sequence branch (B, seq_len, 4) + self.seq_conv = nn.Conv1d( + in_channels=self.seq_conv_in, + out_channels=self.seq_conv_out, + kernel_size=self.seq_conv_kernel_size, + ) + self.seq_fc = nn.Sequential( + nn.MaxPool1d( + kernel_size=self.seq_pool_kernel_size, stride=self.seq_pool_stride + ), + nn.Linear(self.seq_conv_out, self.seq_linear_hidden_dim), + nn.ReLU(), + nn.Linear(self.seq_linear_hidden_dim, self.seq_conv_linear_out), + nn.Softmax(dim=-1), + ) + + # Shape branch (B, 1, 126/138) + self.shape_conv_pool = nn.Sequential( + nn.Conv1d( + in_channels=1, out_channels=1, kernel_size=self.shape_conv_kernel_size + ), + nn.MaxPool1d( + kernel_size=self.shape_pool_kernel_size, stride=self.shape_pool_stride + ), + ) + self.shape_fc = nn.Sequential(nn.Linear(1, self.seq_conv_linear_out), nn.ReLU()) + + # Rest of the model + self.bilstm = nn.LSTM( + input_size=self.seq_conv_linear_out, + hidden_size=self.bilstm_hidden_size, + num_layers=self.bilstm_num_layers, + batch_first=True, + bidirectional=True, + dropout=self.dropout_val, + ) + self.dropout = nn.Dropout(self.dropout_val) + + self.attn = nn.MultiheadAttention( + embed_dim=2 * self.bilstm_hidden_size, num_heads=1, batch_first=True + ) + + self.head = nn.Linear(2 * bilstm_hidden_size, 2) + + self.optimizer = self.optimizer or torch.optim.Adam(self.parameters(), lr=0.001) + + def forward(self, x_ohe, x_shape): + """ + Forward pass of the DeepAptamerNN model. + + Parameters + ---------- + x_ohe : torch.Tensor + One-hot encoded aptamer sequence tensor of shape (batch_size, seq_len, 4). + Example: (B, seq_len, 4) for batch size B and sequence length seq_len. + + x_shape : torch.Tensor + DNA shape feature tensor of shape (batch_size, 1, shape_len). + Example: (B, 1, 126) for batch size B and shape feature length 126. + + Returns + ------- + torch.Tensor + Logits tensor of shape (batch_size, 2), representing the predicted + class scores for binary classification. + """ + out_ohe = x_ohe.permute(0, 2, 1) + out_ohe = self.seq_conv(out_ohe) + out_ohe = out_ohe.permute(0, 2, 1) + out_ohe = self.seq_fc(out_ohe) + + out_shape = self.shape_conv_pool(x_shape) + out_shape = out_shape.transpose(1, 2) + out_shape = self.shape_fc(out_shape) + + x = torch.cat([out_ohe, out_shape], dim=1) + + x, _ = self.bilstm(x) + + x = x[:, -1, :] + x = self.dropout(x) + + x = x.unsqueeze(1) + x, _ = self.attn(x, x, x) + + x = x.squeeze(1) + + return self.head(x) diff --git a/pyaptamer/deepatamer/_pipeline.py b/pyaptamer/deepatamer/_pipeline.py new file mode 100644 index 00000000..16b337f8 --- /dev/null +++ b/pyaptamer/deepatamer/_pipeline.py @@ -0,0 +1,117 @@ +__author__ = "satvshr" +__all__ = ["DeepAptamerPipeline"] + +import numpy as np +import torch + +from pyaptamer.deepatamer._preprocessing import preprocess_seq_ohe, preprocess_seq_shape + + +class DeepAptamerPipeline: + """ + DeepAptamer algorithm for aptamer–protein interaction prediction [1]_ + + This class encapsulates preprocessing (sequence one-hot encoding and DNAshape + feature extraction) together with inference on a trained `DeepAptamerNN` model. + It provides a `predict` method that accepts one or more DNA sequences and returns + ranked binding affinity scores. + + Parameters + ---------- + model : DeepAptamerNN + A trained DeepAptamer neural network model. + full_dna_shape : bool, optional, default=True + If True, use the trimmed 126-length DNAshape representation + (MGW=31, HelT=32, ProT=31, Roll=32). + If False, keep the full 138-length DeepDNAshape representation. + (MGW=35, HelT=34, ProT=35, Roll=34). + device : {"cpu", "cuda"}, default="cpu" + Device to run inference on. + + Methods + ------- + predict(seqs) + Compute ranked binding affinity scores for one or more DNA + sequences. Returns a list of dictionaries with each sequence + and its predicted binding probability. + + References + ---------- + .. [1] Yang X, Chan CH, Yao S, Chu HY, Lyu M, Chen Z, Xiao H, Ma Y, Yu S, Li F, + Liu J, Wang L, Zhang Z, Zhang BT, Zhang L, Lu A, Wang Y, Zhang G, Yu Y. + DeepAptamer: Advancing high-affinity aptamer discovery with a hybrid deep learning + model. Mol Ther Nucleic Acids. 2024 Dec 21;36(1):102436. + doi: 10.1016/j.omtn.2024.102436. PMID: 39897584; PMCID: PMC11787022. + https://www.cell.com/molecular-therapy-family/nucleic-acids/pdf/S2162-2531(24)00323-8.pdf + .. [2] deepDNAshape: a deep learning predictor for DNA shape features. + https://github.com/JinsenLi/deepDNAshape/blob/main/LICENSE + .. [3] DeepAptamer: a deep learning framework for aptamer design and binding + prediction. + https://github.com/YangX-BIDD/DeepAptamer + + Examples + -------- + >>> from pyaptamer.deepatamer import DeepAptamerPipeline, DeepAptamerNN + >>> model = DeepAptamerNN() + >>> model.predict("ACGTAGCTCGTAGCTAGCTAGCTAGCTAGCTCGTAGCTAGCTAGCTAG") + + """ + + def __init__(self, model, full_dna_shape=True, device="cpu"): + self.model = model + self.full_dna_shape = full_dna_shape + self.device = device + + def predict(self, seqs): + """ + Predict binding affinity scores for one or more sequences. + + Parameters + ---------- + seqs : str or list of str + DNA sequence(s), each length ≤ max sequence length in `seqs`. + + Returns + ------- + list of dict + Ranked list of dictionaries, each with: + { + "seq": sequence string, + "score": float (probability of binding, from [p_bind, p_not_bind]) + } + Sorted from high to low by score. + """ + if isinstance(seqs, str): + seqs = [seqs] + + ohe_list, shape_list = [], [] + max_len = max(len(seq) for seq in seqs) + for seq in seqs: + ohe_list.append(preprocess_seq_ohe(seq, seq_len=max_len)) + shape_list.append(preprocess_seq_shape(seq)) + + X_ohe = torch.tensor( + np.array(ohe_list), dtype=torch.float32, device=self.device + ) + X_shape = torch.tensor( + np.array(shape_list), dtype=torch.float32, device=self.device + ) + self.model.eval() + with torch.no_grad(): + outputs = self.model(X_ohe, X_shape) + # convert to probabilities + probs = torch.softmax(outputs, dim=1).cpu().numpy() + + bind_scores = probs[:, 0] + + # Create ranked output + ranked = sorted( + [ + {"seq": s, "score": float(sc)} + for s, sc in zip(seqs, bind_scores, strict=False) + ], + key=lambda x: x["score"], + reverse=True, + ) + + return ranked diff --git a/pyaptamer/deepatamer/_preprocessing.py b/pyaptamer/deepatamer/_preprocessing.py new file mode 100644 index 00000000..17d8cb8f --- /dev/null +++ b/pyaptamer/deepatamer/_preprocessing.py @@ -0,0 +1,112 @@ +__author__ = "satvshr" +__all__ = ["preprocess_seq_ohe", "preprocess_seq_shape", "preprocess_y"] + +import numpy as np + +from pyaptamer.utils._deepaptamer_utils import ( + ohe, + pad_sequence, + remove_na, + run_deepdna_prediction, +) + + +def preprocess_seq_ohe(seq, seq_len=35): + """ + Preprocesses a single DNA sequence for DeepAptamer. + + The function pads the sequence to length `seq_len` using 'N' and one-hot encodes + it. The resulting array has shape (`seq_len`, 4), where each base is encoded as: + - A → [1, 0, 0, 0] + - T → [0, 1, 0, 0] + - C → [0, 0, 1, 0] + - G → [0, 0, 0, 1] + - N or unknown → [0, 0, 0, 0] + + Parameters + ---------- + seq : str + A DNA sequence of length ≤ `seq_len`. + seq_len : int, optional, default=35 + The length to which the sequence will be padded or truncated. + + Returns + ------- + np.ndarray + A NumPy array of shape (`seq_len`, 4) representing the one-hot + encoded sequence. + """ + seq_pad = pad_sequence(seq, seq_len) # pads to `seq_len` + seq_ohe = ohe(seq_pad) # one-hot encode (shape `seq_len` × 4) + return seq_ohe + + +# input will be of size (n_shapes(4), shape_vector_size) +def preprocess_seq_shape(seq, full_dna_shape=True): + """ + Preprocesses a single DNA sequence into a normalized shape vector. + + The function runs DeepDNA prediction on the input sequence, normalizes + the resulting feature matrix column-wise, flattens it into a single row + vector, and removes any "NA" values. + + Parameters + ---------- + seq : str + A DNA sequence to be processed. + full_dna_shape : bool, optional, default=True + If True, uses the 138-length long `deepDNAshape` vector. + If False, uses the 126-length long `DNAshapeR` like vector. + + Returns + ------- + np.ndarray + A 2D NumPy array of shape (1, new_length), where `new_length` + depends on the DeepDNA prediction output after flattening and + removing "NA" values. + """ + + # Step 1: Get raw predictions + seq_shape = run_deepdna_prediction(seq) + if full_dna_shape: + seq_shape = remove_na(seq_shape) + + norm_features = [] + for feat in seq_shape: # each feat is a list of floats + arr = np.array(feat, dtype=np.float32) + + # Normalize per feature + mean = arr.mean() + std = arr.std() if arr.std() > 0 else 1.0 + arr_norm = (arr - mean) / std + + norm_features.append(arr_norm) + + # Step 2: Concatenate all features into one flat vector + seq_flat = np.concatenate(norm_features).reshape(1, -1) + + return seq_flat + + +def preprocess_y(y): + """ + Preprocess labels into one-hot vectors. + + Converts: + 1 -> [1, 0] (binder) + 0 -> [0, 1] (non-binder) + + Parameters + ---------- + y : np.ndarray + A 1D NumPy array of binary labels (0 or 1). + + Returns + ------- + np.ndarray + A 2D NumPy array of shape (len(y), 2) with one-hot encoded labels. + """ + one_hot = np.zeros((len(y), 2), dtype=int) + one_hot[y == 1] = [1, 0] + one_hot[y == 0] = [0, 1] + return one_hot diff --git a/pyaptamer/deepatamer/tests/__init__.py b/pyaptamer/deepatamer/tests/__init__.py new file mode 100644 index 00000000..233d772e --- /dev/null +++ b/pyaptamer/deepatamer/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the deepatamer algorithm.""" diff --git a/pyaptamer/deepatamer/tests/test_deepaptamer.py b/pyaptamer/deepatamer/tests/test_deepaptamer.py new file mode 100644 index 00000000..3b564ce5 --- /dev/null +++ b/pyaptamer/deepatamer/tests/test_deepaptamer.py @@ -0,0 +1,35 @@ +__author__ = "satvshr" + +import pytest + +from pyaptamer.deepatamer._deepaptamer_nn import DeepAptamerNN +from pyaptamer.deepatamer._pipeline import DeepAptamerPipeline + + +@pytest.mark.parametrize( + "seqs", + [ + "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTCCG", + [ + "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCC", + "TGCATGCTAGCTAGCTAGCTAGCTAGCTAGCGCTA", + ], + ], +) +def test_pipeline_predict_shapes(seqs): + """ + Test if DeepAptamerPipeline outputs valid ranked predictions. + + Raises + ------ + AssertionError + If prediction scores are not sorted in descending order. + """ + model = DeepAptamerNN() + pipe = DeepAptamerPipeline(model=model, device="cpu") + + ranked = pipe.predict(seqs) + + # Ensure sorted in descending order + scores = [item["score"] for item in ranked] + assert all(scores[i] >= scores[i + 1] for i in range(len(scores) - 1)) diff --git a/pyaptamer/utils/_deepaptamer_utils.py b/pyaptamer/utils/_deepaptamer_utils.py new file mode 100644 index 00000000..6c2eb9bc --- /dev/null +++ b/pyaptamer/utils/_deepaptamer_utils.py @@ -0,0 +1,136 @@ +__author__ = "satvshr" +__all__ = ["ohe", "pad_sequence", "run_deepdna_prediction", "remove_na"] + + +import numpy as np +from deepDNAshape import predictor + + +def ohe(seq): + """ + One-hot encodes a single DNA sequence. + + Each character is converted into a one-hot vector. Unknown characters are encoded + as [0, 0, 0, 0]. Column order is [A, T, C, G]. + + Parameters + ---------- + seq : str + A DNA sequence. + + Returns + ------- + np.ndarray + A 2D NumPy array of shape (seq_len, 4), where the sequence is one-hot encoded. + """ + alphabet = "ATCG" + mapping = {base: i for i, base in enumerate(alphabet)} + + seq_len = len(seq) + ohe_matrix = np.zeros((seq_len, 4), dtype=int) + + for j, base in enumerate(seq): + idx = mapping.get(base) + if idx is not None: + ohe_matrix[j, idx] = 1 + + return ohe_matrix + + +def pad_sequence(seq, seq_len=35): + """ + Pads a single DNA sequence to length `seq_len` using 'N'. Raises an error if the + sequence is longer than `seq_len`. + + Parameters + ---------- + seq : str + DNA sequence of length ≤ `seq_len`. + seq_len : int, optional, default=35 + The length to which the sequence will be padded or truncated. + + Returns + ------- + str + The padded sequence of exactly `seq_len` characters. + + Raises + ------- + ValueError + If the input sequence length exceeds `seq_len`. + + """ + if len(seq) > seq_len: + raise ValueError(f"Sequence length {len(seq)} exceeds {seq_len}: '{seq}'") + + return seq.ljust(seq_len, "N") + + +def run_deepdna_prediction(seq, mode="cpu"): + """ + Run deepDNAshape prediction for all DNA structural features (MGW, HelT, ProT, Roll) + on a single DNA sequence. + + The four DNA shape features are: + - MGW: Minor Groove Width + - HelT: Helical Twist + - ProT: Propeller Twist + - Roll: Roll angle + + Parameters + ---------- + seq : str + DNA sequence (e.g., "AAGGTTCC") to predict structural features for. + mode : {"cpu", "gpu"}, optional + Compute mode for the predictor. Default is "cpu". + + Returns + ------- + list of list of float + A list of length 4, where each element is a list of floats containing + predictions for one structural feature. The order is [MGW, HelT, ProT, Roll]. + Lengths differ depending on the feature. + """ + # Always use layer 2 (sliding window of 5) + layer = 2 + + model = predictor.predictor(mode=mode) + features = ["MGW", "HelT", "ProT", "Roll"] + + results = [model.predict(feat, seq, layer).tolist() for feat in features] + return results + + +def remove_na(shape_vectors): + """ + Trim deepDNAShape predictions to match DeepAptamer's convention + (remove edge positions that correspond to NA in original DNAshape). + + The four DNA shape features used are: + - MGW: Minor Groove Width + - HelT: Helical Twist + - ProT: Propeller Twist + - Roll: Roll angle + + Parameters + ---------- + shape_vectors : list of list of float + A list of 4 lists in order [MGW, HelT, ProT, Roll], + + Returns + ------- + list of lists of float + A list of 4 lists after trimming: + - MGW (drop first 2 and last 2 -> len=31) + - HelT (drop first and last -> len=32) + - ProT (drop first 2 and last 2 -> len=31) + - Roll (drop first and last -> len=32) + """ + mgw, helt, prot, roll = shape_vectors + + mgw = mgw[2:-2] + prot = prot[2:-2] + helt = helt[1:-1] + roll = roll[1:-1] + + return [mgw, helt, prot, roll] diff --git a/pyproject.toml b/pyproject.toml index e5e803fc..d2ec65ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,10 @@ dev = [ "pre-commit", ] +test = [ + "deepDNAshape @ git+https://github.com/JinsenLi/deepDNAshape.git@main" +] + [tool.ruff] line-length = 88 target-version = "py310" @@ -37,7 +41,7 @@ exclude = ["__pycache__", ".venv", "build", "dist", ".git"] [tool.ruff.lint] select = ["E", "F", "B", "I", "UP", "N", "C4"] -ignore = ["N803", "N806"] +ignore = ["N803", "N806", "N812"] [tool.ruff.format] quote-style = "double"