Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/aptanet_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "3737da88",
"metadata": {},
"outputs": [],
Expand All @@ -49,7 +49,7 @@
"import torch\n",
"\n",
"from pyaptamer.datasets import load_1gnh_structure\n",
"from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq"
"from pyaptamer.utils import struct_to_aaseq"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure

__all__ = [
"load_pfoa_structure",
Expand Down
2 changes: 1 addition & 1 deletion pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Loaders for different data structures."""

from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure

__all__ = ["load_pfoa_structure", "load_1gnh_structure"]
2 changes: 1 addition & 1 deletion pyaptamer/datasets/_loaders/_online_databank.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from Bio.PDB import PDBList

from pyaptamer.utils.pdb_to_struct import pdb_to_struct
from pyaptamer.utils import pdb_to_struct


def load_from_rcsb(pdb_id, overwrite=False):
Expand Down
6 changes: 4 additions & 2 deletions pyaptamer/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
"rna2vec",
"pdb_to_struct",
"struct_to_aaseq",
"pdb_to_aaseq",
]

from pyaptamer.utils._pdb_to_aaseq import pdb_to_aaseq
from pyaptamer.utils._pdb_to_struct import pdb_to_struct
from pyaptamer.utils._rna import (
dna2rna,
encode_rna,
generate_all_aptamer_triplets,
rna2vec,
)
from pyaptamer.utils.pdb_to_struct import pdb_to_struct
from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq
from pyaptamer.utils._struct_to_aaseq import struct_to_aaseq
106 changes: 106 additions & 0 deletions pyaptamer/utils/_pdb_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
__author__ = "satvshr"
__all__ = ["pdb_to_aaseq"]

import io
import os

import pandas as pd
import requests
from Bio import SeqIO

from ._pdb_to_struct import pdb_to_struct
from ._struct_to_aaseq import struct_to_aaseq


def pdb_to_aaseq(pdb_file_path, return_type="list", use_uniprot=False, pdb_id=None):
    """
    Extract amino-acid sequences from a PDB file.

    Tries SEQRES records first (full deposited sequence).
    Falls back to using the package's pdb -> Structure -> sequences converters
    if SEQRES records are not present. Optionally, retrieves canonical UniProt
    sequence for the PDB ID.

    Parameters
    ----------
    pdb_file_path : str or os.PathLike
        Path to a PDB file.
    return_type : {'list', 'pd.df'}, optional, default='list'
        Format of returned value:
        - ``'list'`` : list of amino acid strings (one per chain / polypeptide)
        - ``'pd.df'`` : pandas.DataFrame with a single column ``'sequence'``.
          Rows are indexed 0..n-1 (no chain identifiers).
    use_uniprot : bool, optional, default=False
        If True, fetches the UniProt sequence using the PDB ID.
        Requires the ``pdb_id`` argument to be set. The local file is still
        parsed and must yield at least one sequence; the returned value then
        holds only the single UniProt sequence.
    pdb_id : str, optional
        PDB ID (e.g., ``'1a3n'``) required if ``use_uniprot=True``.

    Returns
    -------
    list of str or pandas.DataFrame
        Depending on ``return_type``. If ``'list'``, returns a Python list of
        sequence strings (one element per chain / polypeptide). If ``'pd.df'``,
        returns a DataFrame with a single column ``'sequence'`` and a default
        integer index (no chain IDs).

    Raises
    ------
    FileNotFoundError
        If the given ``pdb_file_path`` does not exist.
    ValueError
        If ``return_type`` is not one of the supported values, if
        ``use_uniprot=True`` without a ``pdb_id``, if no sequence could be
        extracted from the file, or if ``use_uniprot=True`` but no mapping /
        fasta record could be retrieved.
    requests.exceptions.HTTPError
        If one of the web requests made for ``use_uniprot=True`` returns an
        error status (raised by ``raise_for_status``).
    """
    pdb_path = os.fspath(pdb_file_path)
    if not os.path.exists(pdb_path):
        raise FileNotFoundError(f"PDB file not found: {pdb_path}")

    # Validate the arguments up front so that bad input fails before any file
    # parsing or network traffic takes place.
    if return_type not in ("list", "pd.df"):
        raise ValueError("`return_type` must be either 'list' or 'pd.df'")
    if use_uniprot and not pdb_id:
        raise ValueError("`pdb_id` must be provided when use_uniprot=True")

    # Try SEQRES records first
    with open(pdb_path) as handle:
        sequences = [str(record.seq) for record in SeqIO.parse(handle, "pdb-seqres")]

    if not sequences:
        # Fall back to using pdb_to_struct + struct_to_aaseq helpers
        sequences = struct_to_aaseq(pdb_to_struct(pdb_path))

    if len(sequences) == 0:
        raise ValueError(f"No sequences could be extracted from PDB file: {pdb_path}")

    if use_uniprot:
        # The UniProt sequence replaces whatever was read from the file.
        sequences = [_fetch_uniprot_sequence(pdb_id)]

    if return_type == "list":
        return sequences
    return pd.DataFrame({"sequence": sequences})


def _fetch_uniprot_sequence(pdb_id):
    """Return the sequence of the first UniProt accession mapped to ``pdb_id``.

    Queries the PDBe mapping endpoint for the UniProt accessions of the entry,
    then downloads the FASTA of the first accession listed from UniProt.

    Raises
    ------
    ValueError
        If the mapping lists no UniProt accession, or the FASTA response
        contains no record.
    requests.exceptions.HTTPError
        If either request returns an error status.
    """
    pdb_id = pdb_id.lower()
    mapping_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    mapping_resp = requests.get(mapping_url, timeout=10)
    mapping_resp.raise_for_status()
    mapping_data = mapping_resp.json()
    uniprot_ids = list(mapping_data.get(pdb_id, {}).get("UniProt", {}))

    if not uniprot_ids:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")

    uniprot_id = uniprot_ids[0]

    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    fasta_resp = requests.get(fasta_url, timeout=10)
    fasta_resp.raise_for_status()

    # An empty body would make a bare next() leak StopIteration to the caller;
    # report it as the documented ValueError instead.
    record = next(SeqIO.parse(io.StringIO(fasta_resp.text), "fasta"), None)
    if record is None:
        raise ValueError(f"No FASTA record returned for UniProt ID '{uniprot_id}'")

    return str(record.seq)
File renamed without changes.
File renamed without changes.
75 changes: 75 additions & 0 deletions pyaptamer/utils/tests/test_pdb_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
__author__ = "satvshr"

import os

import pytest

from pyaptamer.utils import pdb_to_aaseq


@pytest.fixture
def pdb_path_1gnh():
    """Path to ``1gnh.pdb`` in the ``datasets/data`` directory, relative to this file."""
    tests_dir = os.path.dirname(__file__)
    return os.path.join(tests_dir, "..", "..", "datasets", "data", "1gnh.pdb")


@pytest.fixture
def pdb_path_pfoa():
    """Path to ``pfoa.pdb`` in the ``datasets/data`` directory, relative to this file."""
    tests_dir = os.path.dirname(__file__)
    return os.path.join(tests_dir, "..", "..", "datasets", "data", "pfoa.pdb")


def test_pdb_to_aaseq_seqres(pdb_path_1gnh):
    """
    Check both return formats of pdb_to_aaseq on the 1gnh PDB file.

    The default call must give a non-empty list of non-empty alphabetic
    strings; ``return_type="pd.df"`` must give a non-empty DataFrame whose
    ``'sequence'`` column holds non-empty strings.
    """
    # List output (default return type).
    seq_list = pdb_to_aaseq(pdb_path_1gnh)

    assert isinstance(seq_list, list), "Expected a list return type"
    assert len(seq_list) > 0, "Returned list should not be empty"

    for entry in seq_list:
        assert isinstance(entry, str), "Each entry should be a string"
        assert entry.isalpha(), "Sequence should contain only alphabetic characters"
        assert len(entry) > 0, "Sequence should not be empty"

    # DataFrame output.
    frame = pdb_to_aaseq(pdb_path_1gnh, return_type="pd.df")

    assert not frame.empty, "Returned DataFrame should not be empty"
    assert "sequence" in frame.columns, "DataFrame should have a 'sequence' column"
    invalid = [s for s in frame["sequence"] if not (isinstance(s, str) and len(s) > 0)]
    assert not invalid, "Each sequence entry in DataFrame should be a non-empty string"


def test_pdb_to_aaseq_atom_fallback(pdb_path_pfoa):
    """
    Use the packaged 'pfoa.pdb' (ATOM-only) to exercise the ATOM fallback.

    Only the container type and non-emptiness are checked here.
    """
    sequences = pdb_to_aaseq(pdb_path_pfoa)

    assert isinstance(sequences, list), "Should return a list"
    assert len(sequences) > 0, "ATOM fallback should produce at least one sequence"


@pytest.mark.internet
def test_pdb_to_aaseq_uniprot_fetch(pdb_path_1gnh):
    """
    Test UniProt fetch mode using PDB ID 1gnh.
    (Requires internet connection)

    The test is skipped, not failed, when the remote services cannot be
    reached or answer with an error status.
    """
    # Imported locally because only this test needs the exception types.
    import requests

    try:
        sequences = pdb_to_aaseq(pdb_path_1gnh, use_uniprot=True, pdb_id="1gnh")
    except (RuntimeError, requests.exceptions.RequestException) as e:
        # Network failures surface as requests exceptions, not RuntimeError.
        # pytest.skip raises, so nothing after this call runs.
        pytest.skip(f"Skipped UniProt fetch test (API unavailable): {e}")

    assert isinstance(sequences, list), "Expected list return type from UniProt mode"
    assert len(sequences) == 1, "Expected one canonical sequence"
    seq = sequences[0]
    assert isinstance(seq, str) and len(seq) > 50, (
        "Fetched UniProt sequence should be a non-trivial string"
    )
2 changes: 1 addition & 1 deletion pyaptamer/utils/tests/test_pdb_to_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os

from pyaptamer.utils.pdb_to_struct import pdb_to_struct
from pyaptamer.utils import pdb_to_struct


def test_pdb_to_struct():
Expand Down
2 changes: 1 addition & 1 deletion pyaptamer/utils/tests/test_struct_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
__author__ = "satvshr"

from pyaptamer.datasets import load_1gnh_structure
from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq
from pyaptamer.utils import struct_to_aaseq


def test_struct_to_aaseq():
Expand Down
Loading