Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
"""Contains datasets along with their loaders."""

from pyaptamer.datasets._loaders._aptacom_loader import (
load_aptacom_full,
load_aptacom_xy,
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure

__all__ = [
"load_aptacom_full",
"load_aptacom_xy",
"load_csv_dataset",
"load_hf_dataset",
"load_pfoa_structure",
Expand Down
8 changes: 8 additions & 0 deletions pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
"""Loaders for different data structures."""

from pyaptamer.datasets._loaders._aptacom_loader import (
load_aptacom_full,
load_aptacom_xy,
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure

__all__ = [
"load_pfoa_structure",
"load_1gnh_structure",
"load_aptacom_full",
"load_aptacom_xy",
"load_csv_dataset",
"load_hf_dataset",
"load_pfoa_structure",
Expand Down
138 changes: 138 additions & 0 deletions pyaptamer/datasets/_loaders/_aptacom_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
__author__ = "rpgv"
__all__ = ["load_aptacom_full", "load_aptacom_xy"]

from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset

# Lookup table of named row filters for the AptaCom data.
# Each key maps to a ``(column name, list of values)`` pair; presumably a row
# matches the filter when its value in that column is one of the listed
# values -- the table is not referenced by the loader functions in this
# module, so confirm the intended use against its consumers.
filter_map = {
    # Filters on the chemistry of the target molecule.
    "protein_target": (
        "target_chemistry",
        ["Protein", "peptide"],
    ),
    "small_target": (
        "target_chemistry",
        ["Small Organic", "Small Molecule", "Molecule"],
    ),
    # Filters on the chemistry of the aptamer itself.
    "dna_apt": (
        "aptamer_chemistry",
        [
            "DNA",
            "L-DNA",
            "ssDNA",
            "2',4'-BNA/LNA-DNA",
            "5-uracil-modified-DNA",
            "dsDNA",
        ],
    ),
    "rna_apt": (
        "aptamer_chemistry",
        [
            "RNA",
            "2'-F-RNA",
            "2'-NH2-RNA",
            "L-RNA",
            "2'-O-Me-RNA",
            "ssRNA",
            "2'-fluoro/amino-RNA",
            "2'-fluoro-RNA",
            "2'-amino-RNA",
            "2'-fluoro/O-Me-RNA",
            "5-uracil-modified-RNA",
            "4'-thio-RNA",
        ],
    ),
}


def filter_columns(ds, columns=None):
    """Select the columns to keep from a dataset.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe to filter.
    columns : list, optional, default=None
        Names of the columns to keep. If None, ``ds`` is returned
        unchanged, i.e. with all of its columns. An empty list is not
        treated like None: it selects zero columns.

    Returns
    -------
    pandas.DataFrame
        ``ds`` itself when ``columns`` is None, otherwise ``ds[columns]``,
        which contains only the selected columns.
    """
    # NOTE(review): during review a collaborator asked whether this helper
    # should be private and whether ``ds`` should be renamed to ``df``
    # (``ds`` being the name intended for the HuggingFace ``Dataset``
    # object); the review thread points to #192 as the follow-up. The name
    # and signature are kept here so existing callers keep working.
    if columns is None:
        return ds
    return ds[columns]


def prepare_xy(ds):
    """Prepare a dataset for use as training data.

    Keeps only the columns "aptamer_sequence", "target_sequence" and
    "new_affinity", and drops every row that has a missing value in any
    of them. The input dataframe is not modified; a new dataframe is
    returned.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe containing at least the three columns named above.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the columns "aptamer_sequence", "target_sequence"
        and "new_affinity" and no missing values (the original author
        reports 709 rows for the AptaCom dataset -- not verified here).
    """
    # NOTE(review): open question raised in review (addressed to @fkiraly):
    # should all functions with x and y in their name be named alike, which
    # would make this one ``prepare_x_y``? The name is kept unchanged here.
    xy_columns = ["aptamer_sequence", "target_sequence", "new_affinity"]
    # Select first, then drop NaNs on the resulting new frame. The previous
    # ``dropna(..., inplace=True)`` silently removed rows from the caller's
    # dataframe as a side effect.
    return ds[xy_columns].dropna()


def load_aptacom_full(select_columns=None):
    """Load the AptaCom dataset from Hugging Face, optionally column-filtered.

    The dataset is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and then passed through ``filter_columns``.

    Parameters
    ----------
    select_columns : list, optional, default=None
        Names of the dataset columns to keep. With the default of None the
        complete dataset is returned. Column names listed by the original
        author:
        ['reference',
        'aptamer_chemistry',
        'aptamer_name',
        'target_name',
        'aptamer_sequence',
        'origin',
        'target_chemistry',
        'external_id',
        'target_sequence',
        'new_affinity']

    Returns
    -------
    pandas.DataFrame
        The AptaCom dataset (reported by the original author to hold 5556
        rows in total), restricted to ``select_columns`` when given.
    """
    return filter_columns(
        load_hf_dataset("AptaCom", store=False),
        columns=select_columns,
    )


def load_aptacom_xy(return_X_y=False):
    """Load the AptaCom dataset in a form suited for training.

    The dataset is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and reduced by ``prepare_xy`` before being returned.

    Parameters
    ----------
    return_X_y : bool, optional, default=False
        If True, return the pair ``(X, y)`` where ``X`` holds the columns
        "aptamer_sequence" and "target_sequence" and ``y`` holds the column
        "new_affinity". If False, return a single dataframe with all three
        columns.

    Returns
    -------
    pandas.DataFrame or tuple of (pandas.DataFrame, pandas.DataFrame)
        Either one dataframe with three columns, or two dataframes with two
        columns and one column respectively.
    """
    prepared = prepare_xy(load_hf_dataset("AptaCom", store=False))
    if not return_X_y:
        return prepared

    # Double brackets keep both parts as dataframes rather than series.
    features = prepared[["aptamer_sequence", "target_sequence"]]
    target = prepared[["new_affinity"]]
    return features, target
42 changes: 42 additions & 0 deletions pyaptamer/datasets/tests/test_aptacom_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
__author__ = "rpgv"

import pytest
from pandas import DataFrame

from pyaptamer.datasets import load_aptacom_full, load_aptacom_xy


@pytest.mark.parametrize(
    "select_columns",
    [
        ["reference"],
        ["aptamer_chemistry"],
        ["aptamer_name"],
        ["target_name"],
        ["aptamer_sequence"],
        ["origin"],
        ["target_chemistry"],
        ["external_id"],
        ["target_sequence"],
        ["new_affinity"],
    ],
)
def test_load_aptacom_full(select_columns):
    """Check that ``load_aptacom_full`` honours ``select_columns``.

    For each single-column selection the loader must return a pandas
    DataFrame whose columns are exactly the requested ones.
    """
    dataset = load_aptacom_full(select_columns)
    # Plain asserts let pytest report a proper test failure with
    # introspection instead of an unrelated ValueError.
    assert isinstance(dataset, DataFrame), (
        f"Dataset format {type(dataset)} is not DataFrame"
    )
    # The type check alone would also pass if the filter were ignored.
    assert list(dataset.columns) == select_columns


@pytest.mark.parametrize("return_X_y", [True, False])
def test_download_aptacom_x_y(return_X_y):
    """Check the return shape of ``load_aptacom_xy`` for both flag values.

    With ``return_X_y=True`` the loader must return an ``(X, y)`` tuple of
    DataFrames; with ``return_X_y=False`` it must return one DataFrame with
    the three training columns.
    """
    feature_columns = ["aptamer_sequence", "target_sequence"]
    target_columns = ["new_affinity"]

    result = load_aptacom_xy(return_X_y)

    # Check each flag value against its own contract; accepting
    # "tuple or DataFrame" for both would hide a swapped return type.
    if return_X_y:
        assert isinstance(result, tuple), (
            f"Dataset format {type(result)} is not X, y tuple"
        )
        assert len(result) == 2
        X, y = result
        assert isinstance(X, DataFrame)
        assert isinstance(y, DataFrame)
        assert list(X.columns) == feature_columns
        assert list(y.columns) == target_columns
        assert len(X) == len(y)
    else:
        assert isinstance(result, DataFrame), (
            f"Dataset format {type(result)} is not DataFrame"
        )
        assert list(result.columns) == feature_columns + target_columns
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"scikit-learn>=1.3.0",
"skorch",
"imblearn",
"datasets",
]

[project.optional-dependencies]
Expand Down