/
lipo_datasets.py
78 lines (66 loc) · 3.08 KB
/
lipo_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Lipophilicity dataset loader.
"""
import os
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union
LIPO_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
LIPO_TASKS = ['exp']
class _LipoLoader(_MolnetLoader):
def create_dataset(self) -> Dataset:
dataset_file = os.path.join(self.data_dir, "Lipophilicity.csv")
if not os.path.exists(dataset_file):
dc.utils.data_utils.download_url(url=LIPO_URL, dest_dir=self.data_dir)
loader = dc.data.CSVLoader(
tasks=self.tasks, feature_field="smiles", featurizer=self.featurizer)
return loader.create_dataset(dataset_file, shard_size=8192)
def load_lipo(
featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
reload: bool = True,
data_dir: Optional[str] = None,
save_dir: Optional[str] = None,
**kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
"""Load Lipophilicity dataset
Lipophilicity is an important feature of drug molecules that affects both
membrane permeability and solubility. The lipophilicity dataset, curated
from ChEMBL database, provides experimental results of octanol/water
distribution coefficient (logD at pH 7.4) of 4200 compounds.
Scaffold splitting is recommended for this dataset.
The raw data csv file contains columns below:
- "smiles" - SMILES representation of the molecular structure
- "exp" - Measured octanol/water distribution coefficient (logD) of the
compound, used as label
Parameters
----------
featurizer: Featurizer or str
the featurizer to use for processing the data. Alternatively you can pass
one of the names from dc.molnet.featurizers as a shortcut.
splitter: Splitter or str
the splitter to use for splitting the data into training, validation, and
test sets. Alternatively you can pass one of the names from
dc.molnet.splitters as a shortcut. If this is None, all the data
will be included in a single dataset.
transformers: list of TransformerGenerators or strings
the Transformers to apply to the data. Each one is specified by a
TransformerGenerator or, as a shortcut, one of the names from
dc.molnet.transformers.
reload: bool
if True, the first call for a particular featurizer and splitter will cache
the datasets to disk, and subsequent calls will reload the cached datasets.
data_dir: str
a directory to save the raw data in
save_dir: str
a directory to save the dataset in
References
----------
.. [1] Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015.
https://doi.org/10.6019/chembl3301361
"""
loader = _LipoLoader(featurizer, splitter, transformers, LIPO_TASKS, data_dir,
save_dir, **kwargs)
return loader.load_dataset('lipo', reload)