-
Notifications
You must be signed in to change notification settings - Fork 5
[ENH] aptacom database loader #158
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d716bc1
00761a2
0f86c18
a175f9c
8a7a4a1
4fed192
6ac85f7
7ebdb1c
15cd9b3
86cd6b5
443624e
dbd1912
b57655e
7eeafdf
e2d2b72
056a644
b0b6716
497dcbf
aee137f
bca188e
19dd6c8
ad31e46
36c2406
e81499b
74586a8
4e828cc
53e9360
0555663
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| __author__ = "rpgv" | ||
| __all__ = ["load_aptacom_full", "load_aptacom_xy"] | ||
|
|
||
| from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset | ||
|
|
||
# Named value groups for the AptaCom table. Each entry maps a filter name to a
# ``(column_name, accepted_values)`` pair: the column to inspect and the raw
# labels that belong to the group.
# NOTE(review): nothing in the visible code reads this mapping yet; presumably
# it is meant for row filtering -- confirm against callers.
filter_map = {
    # Targets labelled as proteins or peptides.
    "protein_target": ("target_chemistry", ["Protein", "peptide"]),
    # Targets labelled as small molecules.
    "small_target": (
        "target_chemistry",
        ["Small Organic", "Small Molecule", "Molecule"],
    ),
    # DNA-labelled aptamer chemistries.
    "dna_apt": (
        "aptamer_chemistry",
        [
            "DNA", "L-DNA", "ssDNA",
            "2',4'-BNA/LNA-DNA", "5-uracil-modified-DNA", "dsDNA",
        ],
    ),
    # RNA-labelled aptamer chemistries.
    "rna_apt": (
        "aptamer_chemistry",
        [
            "RNA", "2'-F-RNA", "2'-NH2-RNA", "L-RNA",
            "2'-O-Me-RNA", "ssRNA", "2'-fluoro/amino-RNA", "2'-fluoro-RNA",
            "2'-amino-RNA", "2'-fluoro/O-Me-RNA", "5-uracil-modified-RNA",
            "4'-thio-RNA",
        ],
    ),
}
|
|
||
|
|
||
def filter_columns(ds, columns=None):
    """Select a subset of columns from a dataframe.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe to select columns from.
    columns : list, optional, default=None
        Column labels to keep. When ``None``, no selection is applied and
        ``ds`` is returned as-is.

    Returns
    -------
    pandas.DataFrame
        ``ds`` itself when ``columns`` is ``None``; otherwise the result of
        ``ds[columns]``, i.e. only the requested columns.
    """
    # NOTE(review): collaborators flagged the original docstring opener and
    # its parameter formatting as unclear, and pointed to #192 as follow-up.
    if columns is None:
        return ds
    return ds[columns]
|
|
||
|
|
||
def prepare_xy(ds):
    """Reduce a dataframe to the complete rows of the training columns.

    Rows with a missing value in any of ``"aptamer_sequence"``,
    ``"target_sequence"`` or ``"new_affinity"`` are dropped, and only those
    three columns are kept. The input dataframe is left untouched.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe containing at least the three columns named above.

    Returns
    -------
    pandas.DataFrame
        New dataframe with columns ``"aptamer_sequence"``,
        ``"target_sequence"`` and ``"new_affinity"``; the original row index
        labels are preserved. The original author's docstring reports 709
        rows for the AptaCom dataset; that count is not verified here.
    """
    # NOTE(review): a collaborator asked @fkiraly whether every function with
    # "x and y" in its name should follow one naming scheme (the review
    # comment is truncated in the source).
    xy_columns = ["aptamer_sequence", "target_sequence", "new_affinity"]
    # Non-inplace dropna: the previous ``inplace=True`` silently removed rows
    # from the caller's dataframe as a side effect.
    return ds.dropna(subset=xy_columns)[xy_columns]
|
|
||
|
|
||
def load_aptacom_full(select_columns=None):
    """Load the AptaCom dataset, optionally restricted to some columns.

    The dataset is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and then passed through ``filter_columns``.

    Parameters
    ----------
    select_columns : list, optional, default=None
        Column names to keep. With ``None`` the complete dataset is returned.
        Column names listed by the original author:
        ``'reference'``, ``'aptamer_chemistry'``, ``'aptamer_name'``,
        ``'target_name'``, ``'aptamer_sequence'``, ``'origin'``,
        ``'target_chemistry'``, ``'external_id'``, ``'target_sequence'``,
        ``'new_affinity'``.

    Returns
    -------
    pandas.DataFrame
        The dataset, reduced to ``select_columns`` when given. The original
        author's docstring reports 5556 rows in total; that count is not
        verified here.
    """
    return filter_columns(
        load_hf_dataset("AptaCom", store=False),
        columns=select_columns,
    )
|
|
||
|
|
||
def load_aptacom_xy(return_X_y=False):
    """Load the AptaCom dataset in a form suited for training.

    The dataset is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and reduced by ``prepare_xy``.

    Parameters
    ----------
    return_X_y : bool, optional, default=False
        If true, return the features and the target separately as
        ``(X, y)``; otherwise return one dataframe holding all three columns.

    Returns
    -------
    pandas.DataFrame or tuple of (pandas.DataFrame, pandas.DataFrame)
        With ``return_X_y`` false: the prepared dataframe. With
        ``return_X_y`` true: ``X`` with columns ``"aptamer_sequence"`` and
        ``"target_sequence"``, and ``y`` with the single column
        ``"new_affinity"``.
    """
    xy_frame = prepare_xy(load_hf_dataset("AptaCom", store=False))
    if not return_X_y:
        return xy_frame

    features = xy_frame[["aptamer_sequence", "target_sequence"]]
    target = xy_frame[["new_affinity"]]
    return features, target
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| __author__ = "rpgv" | ||
|
|
||
| import pytest | ||
| from pandas import DataFrame | ||
|
|
||
| from pyaptamer.datasets import load_aptacom_full, load_aptacom_xy | ||
|
|
||
|
|
||
@pytest.mark.parametrize(
    "select_columns",
    [
        ["reference"],
        ["aptamer_chemistry"],
        ["aptamer_name"],
        ["target_name"],
        ["aptamer_sequence"],
        ["origin"],
        ["target_chemistry"],
        ["external_id"],
        ["target_sequence"],
        ["new_affinity"],
    ],
)
def test_load_aptacom_full(select_columns):
    """Check ``load_aptacom_full`` returns a DataFrame with only the requested columns."""
    dataset = load_aptacom_full(select_columns)

    # Plain asserts let pytest report a failure (with introspection) rather
    # than an error from a hand-raised ValueError.
    assert isinstance(dataset, DataFrame), (
        f"Dataset format {type(dataset)} is not DataFrame"
    )
    # The column selection must actually have been applied.
    assert list(dataset.columns) == select_columns
|
|
||
|
|
||
@pytest.mark.parametrize("return_X_y", [True, False])
def test_download_aptacom_x_y(return_X_y):
    """Check the return shape of ``load_aptacom_xy`` for both ``return_X_y`` values."""
    result = load_aptacom_xy(return_X_y)

    if return_X_y:
        # Expect an (X, y) pair of dataframes, not merely "tuple or DataFrame".
        assert isinstance(result, tuple), (
            f"Dataset format {type(result)} is not X, y tuple"
        )
        assert len(result) == 2
        X, y = result
        assert isinstance(X, DataFrame)
        assert isinstance(y, DataFrame)
        assert list(X.columns) == ["aptamer_sequence", "target_sequence"]
        assert list(y.columns) == ["new_affinity"]
        assert len(X) == len(y)
    else:
        assert isinstance(result, DataFrame), (
            f"Dataset format {type(result)} is not DataFrame"
        )
        assert list(result.columns) == [
            "aptamer_sequence",
            "target_sequence",
            "new_affinity",
        ]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this function not be private? Also, do you mind changing
`ds` to `df` (for dataframe), given `ds` is something I will be using for the `Dataset` object from HuggingFace?