gc-os-ai · satvshr · Sep 15, 2025 · Sep 17, 2025 · Sep 22, 2025 · Sep 26, 2025
diff --git a/examples/aptanet_tutorial.ipynb b/examples/aptanet_tutorial.ipynb
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "3737da88",
    "metadata": {},
    "outputs": [],
@@ -49,7 +49,7 @@
     "import torch\n",
     "\n",
     "from pyaptamer.datasets import load_1gnh_structure\n",
-    "from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq"
+    "from pyaptamer.utils import struct_to_aaseq"
    ]
   },
   {

diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py
@@ -2,7 +2,7 @@
 
 from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
 from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
-from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
+from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure
 
 __all__ = [
     "load_pfoa_structure",

diff --git a/pyaptamer/datasets/_loaders/__init__.py b/pyaptamer/datasets/_loaders/__init__.py
@@ -1,6 +1,6 @@
 """Loaders for different data structures."""
 
 from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
-from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
+from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure
 
 __all__ = ["load_pfoa_structure", "load_1gnh_structure"]
diff --git a/pyaptamer/datasets/_loaders/_online_databank.py b/pyaptamer/datasets/_loaders/_online_databank.py
@@ -3,7 +3,7 @@
 
 from Bio.PDB import PDBList
 
-from pyaptamer.utils.pdb_to_struct import pdb_to_struct
+from pyaptamer.utils import pdb_to_struct
 
 
 def load_from_rcsb(pdb_id, overwrite=False):

diff --git a/pyaptamer/datasets/_loaders/_pfoa_loader.py → pyaptamer/datasets/_loaders/_pfoa.py b/pyaptamer/datasets/_loaders/_pfoa_loader.py → pyaptamer/datasets/_loaders/_pfoa.py
diff --git a/pyaptamer/utils/__init__.py b/pyaptamer/utils/__init__.py
@@ -7,13 +7,15 @@
     "rna2vec",
     "pdb_to_struct",
     "struct_to_aaseq",
+    "pdb_to_aaseq",
 ]
 
+from pyaptamer.utils._pdb_to_aaseq import pdb_to_aaseq
+from pyaptamer.utils._pdb_to_struct import pdb_to_struct
 from pyaptamer.utils._rna import (
     dna2rna,
     encode_rna,
     generate_all_aptamer_triplets,
     rna2vec,
 )
-from pyaptamer.utils.pdb_to_struct import pdb_to_struct
-from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq
+from pyaptamer.utils._struct_to_aaseq import struct_to_aaseq
diff --git a/pyaptamer/utils/_pdb_to_aaseq.py b/pyaptamer/utils/_pdb_to_aaseq.py
@@ -0,0 +1,106 @@
+__author__ = "satvshr"
+__all__ = ["pdb_to_aaseq"]
+
+import io
+import os
+
+import pandas as pd
+import requests
+from Bio import SeqIO
+
+from ._pdb_to_struct import pdb_to_struct
+from ._struct_to_aaseq import struct_to_aaseq
+
+
+def _fetch_uniprot_sequence(pdb_id):
+    """
+    Fetch the sequence of the first UniProt entry mapped to a PDB ID.
+
+    Parameters
+    ----------
+    pdb_id : str
+        PDB ID (e.g., ``'1a3n'``). It is lower-cased before being used in the
+        mapping request and as the key into the mapping response.
+
+    Returns
+    -------
+    str
+        Sequence of the first record in the FASTA returned for the first
+        UniProt accession listed in the mapping.
+
+    Raises
+    ------
+    ValueError
+        If the mapping lists no UniProt accession for ``pdb_id``, or if the
+        FASTA response contains no record.
+    requests.exceptions.RequestException
+        If either HTTP request fails or returns an error status.
+    """
+    pdb_id = pdb_id.lower()
+    mapping_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
+    mapping_resp = requests.get(mapping_url, timeout=10)
+    mapping_resp.raise_for_status()
+    uniprot_ids = list(mapping_resp.json().get(pdb_id, {}).get("UniProt", {}))
+
+    if not uniprot_ids:
+        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")
+
+    # Only the first mapped accession is queried; any further ones are ignored.
+    uniprot_id = uniprot_ids[0]
+
+    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
+    fasta_resp = requests.get(fasta_url, timeout=10)
+    fasta_resp.raise_for_status()
+
+    # Use a default so an empty FASTA body becomes a ValueError instead of a
+    # bare StopIteration escaping from ``next``.
+    record = next(SeqIO.parse(io.StringIO(fasta_resp.text), "fasta"), None)
+    if record is None:
+        raise ValueError(
+            f"No FASTA record returned for UniProt ID '{uniprot_id}' "
+            f"(PDB ID '{pdb_id}')"
+        )
+    return str(record.seq)
+
+
+def pdb_to_aaseq(pdb_file_path, return_type="list", use_uniprot=False, pdb_id=None):
+    """
+    Extract amino-acid sequences from a PDB file.
+
+    Tries SEQRES records first (full deposited sequence).
+    Falls back to using the package's pdb -> Structure -> sequences converters
+    if SEQRES records are not present. Optionally, the sequences read from the
+    file are replaced by the UniProt sequence retrieved for the PDB ID.
+
+    Parameters
+    ----------
+    pdb_file_path : str or os.PathLike
+        Path to a PDB file.
+    return_type : {'list', 'pd.df'}, optional, default='list'
+        Format of returned value:
+          - ``'list'`` : list of amino acid strings (one per chain / polypeptide)
+          - ``'pd.df'`` : pandas.DataFrame with a single column ``'sequence'``.
+            Rows are indexed 0..n-1 (no chain identifiers).
+    use_uniprot : bool, optional, default=False
+        If True, fetches the UniProt sequence using the PDB ID.
+        Requires the ``pdb_id`` argument to be set. The PDB file is still read
+        and must yield at least one sequence, but the result then holds only
+        the single sequence of the first UniProt entry mapped to ``pdb_id``.
+    pdb_id : str, optional
+        PDB ID (e.g., ``'1a3n'``) required if ``use_uniprot=True``.
+
+    Returns
+    -------
+    list of str or pandas.DataFrame
+        Depending on ``return_type``. If ``'list'``, returns a Python list of
+        sequence strings (one element per chain / polypeptide). If ``'pd.df'``,
+        returns a DataFrame with a single column ``'sequence'`` and a default
+        integer index (no chain IDs).
+
+    Raises
+    ------
+    FileNotFoundError
+        If the given ``pdb_file_path`` does not exist.
+    ValueError
+        If ``return_type`` is not one of the supported values, if
+        ``use_uniprot=True`` without a ``pdb_id``, if no sequence could be
+        extracted from the file, or if ``use_uniprot=True`` but no mapping /
+        fasta could be retrieved.
+    requests.exceptions.RequestException
+        If ``use_uniprot=True`` and a request to the mapping or UniProt
+        service fails or returns an error status.
+    """
+    pdb_path = os.fspath(pdb_file_path)
+    if not os.path.exists(pdb_path):
+        raise FileNotFoundError(f"PDB file not found: {pdb_path}")
+
+    # Reject bad arguments up front so that a typo does not cost a file parse
+    # and two HTTP round-trips before it is reported.
+    if return_type not in ("list", "pd.df"):
+        raise ValueError("`return_type` must be either 'list' or 'pd.df'")
+    if use_uniprot and not pdb_id:
+        raise ValueError("`pdb_id` must be provided when use_uniprot=True")
+
+    # Try SEQRES records first
+    with open(pdb_path) as handle:
+        sequences = [str(record.seq) for record in SeqIO.parse(handle, "pdb-seqres")]
+
+    if not sequences:
+        # Fall back to using pdb_to_struct + struct_to_aaseq helpers
+        sequences = struct_to_aaseq(pdb_to_struct(pdb_path))
+
+    if len(sequences) == 0:
+        raise ValueError(f"No sequences could be extracted from PDB file: {pdb_path}")
+
+    if use_uniprot:
+        sequences = [_fetch_uniprot_sequence(pdb_id)]
+
+    if return_type == "pd.df":
+        return pd.DataFrame({"sequence": sequences})
+    return sequences
diff --git a/pyaptamer/utils/pdb_to_struct.py → pyaptamer/utils/_pdb_to_struct.py b/pyaptamer/utils/pdb_to_struct.py → pyaptamer/utils/_pdb_to_struct.py
diff --git a/pyaptamer/utils/struct_to_aaseq.py → pyaptamer/utils/_struct_to_aaseq.py b/pyaptamer/utils/struct_to_aaseq.py → pyaptamer/utils/_struct_to_aaseq.py
diff --git a/pyaptamer/utils/tests/test_pdb_to_aaseq.py b/pyaptamer/utils/tests/test_pdb_to_aaseq.py
@@ -0,0 +1,75 @@
+__author__ = "satvshr"
+
+import os
+
+import pytest
+
+from pyaptamer.utils import pdb_to_aaseq
+
+
+@pytest.fixture
+def pdb_path_1gnh():
+    """Path to ``1gnh.pdb`` in ``datasets/data``, built relative to this module."""
+    tests_dir = os.path.dirname(__file__)
+    return os.path.join(tests_dir, "..", "..", "datasets", "data", "1gnh.pdb")
+
+
+@pytest.fixture
+def pdb_path_pfoa():
+    """Path to ``pfoa.pdb`` in ``datasets/data``, built relative to this module."""
+    tests_dir = os.path.dirname(__file__)
+    return os.path.join(tests_dir, "..", "..", "datasets", "data", "pfoa.pdb")
+
+
+def test_pdb_to_aaseq_seqres(pdb_path_1gnh):
+    """
+    Check both return formats of pdb_to_aaseq on the packaged 1gnh.pdb file.
+    """
+    # Default return type: a plain list of sequence strings.
+    as_list = pdb_to_aaseq(pdb_path_1gnh)
+
+    assert isinstance(as_list, list), "Expected a list return type"
+    assert len(as_list) > 0, "Returned list should not be empty"
+
+    for entry in as_list:
+        assert isinstance(entry, str), "Each entry should be a string"
+        assert entry.isalpha(), "Sequence should contain only alphabetic characters"
+        assert len(entry) > 0, "Sequence should not be empty"
+
+    # DataFrame return type: one 'sequence' column of non-empty strings.
+    frame = pdb_to_aaseq(pdb_path_1gnh, return_type="pd.df")
+
+    assert not frame.empty, "Returned DataFrame should not be empty"
+    assert "sequence" in frame.columns, "DataFrame should have a 'sequence' column"
+    column_ok = all(isinstance(s, str) and len(s) > 0 for s in frame["sequence"])
+    assert column_ok, "Each sequence entry in DataFrame should be a non-empty string"
+
+
+def test_pdb_to_aaseq_atom_fallback(pdb_path_pfoa):
+    """
+    Use the packaged 'pfoa.pdb' (ATOM-only) to exercise the ATOM fallback.
+    """
+    sequences = pdb_to_aaseq(pdb_path_pfoa)
+
+    assert isinstance(sequences, list), "Should return a list"
+    assert len(sequences) > 0, "ATOM fallback should produce at least one sequence"
+
+
+@pytest.mark.internet
+def test_pdb_to_aaseq_uniprot_fetch(pdb_path_1gnh):
+    """
+    Test UniProt fetch mode using PDB ID 1gnh.
+    (Requires internet connection)
+
+    The test is skipped rather than failed when the HTTP requests themselves
+    fail; any other error still fails the test.
+    """
+    # Local import: only this test needs the exception type, and ``requests``
+    # is already a dependency of the function under test.
+    import requests
+
+    try:
+        sequences = pdb_to_aaseq(pdb_path_1gnh, use_uniprot=True, pdb_id="1gnh")
+    except requests.exceptions.RequestException as e:
+        # pdb_to_aaseq talks to the remote services through ``requests.get``
+        # and ``raise_for_status``, so outages surface as RequestException.
+        # ``pytest.skip`` raises, so nothing below runs in that case.
+        pytest.skip(f"Skipped UniProt fetch test (API unavailable): {e}")
+
+    assert isinstance(sequences, list), "Expected list return type from UniProt mode"
+    assert len(sequences) == 1, "Expected one canonical sequence"
+    seq = sequences[0]
+    assert isinstance(seq, str) and len(seq) > 50, (
+        "Fetched UniProt sequence should be a non-trivial string"
+    )
diff --git a/pyaptamer/utils/tests/test_pdb_to_struct.py b/pyaptamer/utils/tests/test_pdb_to_struct.py
@@ -2,7 +2,7 @@
 
 import os
 
-from pyaptamer.utils.pdb_to_struct import pdb_to_struct
+from pyaptamer.utils import pdb_to_struct
 
 
 def test_pdb_to_struct():

diff --git a/pyaptamer/utils/tests/test_struct_to_aaseq.py b/pyaptamer/utils/tests/test_struct_to_aaseq.py
@@ -1,7 +1,7 @@
 __author__ = "satvshr"
 
 from pyaptamer.datasets import load_1gnh_structure
-from pyaptamer.utils.struct_to_aaseq import struct_to_aaseq
+from pyaptamer.utils import struct_to_aaseq
 
 
 def test_struct_to_aaseq():