Skip to content

Commit

Permalink
docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Bharath Ramsundar authored and Bharath Ramsundar committed May 29, 2020
1 parent 43b48b0 commit 98c9a66
Show file tree
Hide file tree
Showing 13 changed files with 326 additions and 17 deletions.
28 changes: 27 additions & 1 deletion deepchem/molnet/load_function/clintox_datasets.py
Expand Up @@ -18,7 +18,33 @@ def load_clintox(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load clintox datasets."""
"""Load clintox datasets.
The ClinTox dataset compares drugs approved by the FDA and
drugs that have failed clinical trials for toxicity reasons.
The dataset includes two classification tasks for 1491 drug
compounds with known chemical structures: (1) clinical trial
toxicity (or absence of toxicity) and (2) FDA approval status.
The list of FDA-approved drugs is compiled from the SWEETLEAD
database, and the list of drugs that failed clinical trials for
toxicity reasons is compiled from the Aggregate Analysis of
ClinicalTrials.gov (AACT) database.
The data file contains a csv table, in which columns below are
used:
"smiles" - SMILES representation of the molecular structure
"FDA_APPROVED" - FDA approval status
"CT_TOX" - Clinical trial results
References:
Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento. "A data-driven approach to predicting successes and failures of clinical trials." Cell chemical biology 23.10 (2016): 1294-1301.
Artemov, Artem V., et al. "Integrated deep learned transcriptomic and structure-based predictor of clinical trials outcomes." bioRxiv (2016): 095653.
Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PloS one 8.11 (2013): e79568.
Aggregate Analysis of ClinicalTrials.gov (AACT) Database. https://www.ctti-clinicaltrials.org/aact-database
"""
if data_dir is None:
data_dir = DEFAULT_DIR
if save_dir is None:
Expand Down
14 changes: 12 additions & 2 deletions deepchem/molnet/load_function/delaney_datasets.py
Expand Up @@ -18,7 +18,16 @@ def load_delaney(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load delaney datasets."""
"""Load delaney datasets.
The Delaney datasets are extracted from the following paper
Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.
This dataset contains 2874 measured aqueous solubility
values. The source dataset is available in the supplemental
material of the original paper.
"""
# Featurize Delaney dataset
logger.info("About to featurize Delaney dataset.")
if data_dir is None:
Expand Down Expand Up @@ -60,8 +69,9 @@ def load_delaney(featurizer='ECFP',
elif featurizer == "smiles2img":
img_spec = kwargs.get("img_spec", "std")
img_size = kwargs.get("img_size", 80)
res = kwargs.get("res", 0.5)
featurizer = deepchem.feat.SmilesToImage(
img_size=img_size, img_spec=img_spec)
img_size=img_size, img_spec=img_spec, res=res)

loader = deepchem.data.CSVLoader(
tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
Expand Down
33 changes: 32 additions & 1 deletion deepchem/molnet/load_function/factors_datasets.py
Expand Up @@ -135,7 +135,38 @@ def gen_factors(FACTORS_tasks,


def load_factors(shard_size=2000, featurizer=None, split=None, reload=True):
"""Loads FACTOR dataset; does not do train/test split"""
"""Loads FACTOR dataset; does not do train/test split
The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:
Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.
It contains 1500 Merck in-house compounds that were measured
for IC50 of inhibition on 12 serine proteases. Unlike most of
the other datasets featured in MoleculeNet, the Factors
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.
Note that the original train/valid/test split from the source
data was preserved here, so this function doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.
Parameters
----------
shard_size: int, optional
Size of the DiskDataset shards to write on disk
featurizer: optional
Ignored since featurization pre-computed
split: optional
Ignored since split pre-computed
reload: bool, optional
Whether to automatically re-load from disk
"""

FACTORS_tasks = [
'T_00001', 'T_00002', 'T_00003', 'T_00004', 'T_00005', 'T_00006',
Expand Down
21 changes: 20 additions & 1 deletion deepchem/molnet/load_function/hiv_datasets.py
Expand Up @@ -17,7 +17,26 @@ def load_hiv(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load hiv datasets. Does not do train/test split"""
"""Load hiv datasets. Does not do train/test split
The HIV dataset was introduced by the Drug Therapeutics
Program (DTP) AIDS Antiviral Screen, which tested the ability
to inhibit HIV replication for over 40,000 compounds.
Screening results were evaluated and placed into three
categories: confirmed inactive (CI), confirmed active (CA) and
confirmed moderately active (CM). We further combine the
latter two labels, making it a classification task between
inactive (CI) and active (CA and CM).
The data file contains a csv table, in which columns below
are used:
- "smiles": SMILES representation of the molecular structure
- "activity": Three-class labels for screening results: CI/CM/CA
- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)
References:
AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
"""
# Featurize hiv dataset
logger.info("About to featurize hiv dataset.")
if data_dir is None:
Expand Down
15 changes: 14 additions & 1 deletion deepchem/molnet/load_function/hopv_datasets.py
Expand Up @@ -17,7 +17,20 @@ def load_hopv(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load HOPV datasets. Does not do train/test split"""
"""Load HOPV datasets. Does not do train/test split
The HOPV datasets consist of the "Harvard Organic
Photovoltaic Dataset". This dataset includes 350 small
molecules and polymers that were utilized as p-type materials
in OPVs. Experimental properties include: HOMO [a.u.], LUMO
[a.u.], Electrochemical gap [a.u.], Optical gap [a.u.], Power
conversion efficiency [%], Open circuit potential [V], Short
circuit current density [mA/cm^2], and fill factor [%].
Theoretical calculations in the original dataset have been
removed (for now).
Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.
"""
# Featurize HOPV dataset
logger.info("About to featurize HOPV dataset.")
if data_dir is None:
Expand Down
33 changes: 32 additions & 1 deletion deepchem/molnet/load_function/kaggle_datasets.py
Expand Up @@ -117,7 +117,38 @@ def gen_kaggle(KAGGLE_tasks,


def load_kaggle(shard_size=2000, featurizer=None, split=None, reload=True):
"""Loads kaggle datasets. Generates if not stored already."""
"""Loads kaggle datasets. Generates if not stored already.
The Kaggle dataset is an in-house dataset from Merck that was first introduced in the following paper:
Ma, Junshui, et al. "Deep neural nets as a method for quantitative structure–activity relationships." Journal of chemical information and modeling 55.2 (2015): 263-274.
It contains 100,000 unique Merck in-house compounds that were
measured on 15 enzyme inhibition and ADME/TOX datasets.
Unlike most of the other datasets featured in MoleculeNet,
the Kaggle collection does not have structures for the
compounds tested since they were proprietary Merck compounds.
However, the collection does feature pre-computed descriptors
for these compounds.
Note that the original train/valid/test split from the source
data was preserved here, so this function doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.
Parameters
----------
shard_size: int, optional
Size of the DiskDataset shards to write on disk
featurizer: optional
Ignored since featurization pre-computed
split: optional
Ignored since split pre-computed
reload: bool, optional
Whether to automatically re-load from disk
"""
KAGGLE_tasks = [
'3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB', 'NK1', 'OX1',
'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN'
Expand Down
33 changes: 31 additions & 2 deletions deepchem/molnet/load_function/kinase_datasets.py
Expand Up @@ -141,8 +141,37 @@ def gen_kinase(KINASE_tasks,


def load_kinase(shard_size=2000, featurizer=None, split=None, reload=True):

"Loads kinase datasets, does not do train/test split"
"""Loads Kinase datasets, does not do train/test split
The Kinase dataset is an in-house dataset from Merck that was first introduced in the following paper:
Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.
It contains 2500 Merck in-house compounds that were measured
for IC50 of inhibition on 99 protein kinases. Unlike most of
the other datasets featured in MoleculeNet, the Kinase
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.
Note that the original train/valid/test split from the source
data was preserved here, so this function doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.
Parameters
----------
shard_size: int, optional
Size of the DiskDataset shards to write on disk
featurizer: optional
Ignored since featurization pre-computed
split: optional
Ignored since split pre-computed
reload: bool, optional
Whether to automatically re-load from disk
"""

KINASE_tasks = [
'T_00013', 'T_00014', 'T_00015', 'T_00016', 'T_00017', 'T_00018',
Expand Down
54 changes: 53 additions & 1 deletion deepchem/molnet/load_function/qm7_datasets.py
Expand Up @@ -134,6 +134,32 @@ def load_qm7b_from_mat(featurizer='CoulombMatrix',
data_dir=None,
save_dir=None,
**kwargs):
"""Load QM7B dataset
QM7b is an extension for the QM7 dataset with additional properties predicted at different levels (ZINDO, SCS, PBE0, GW). In total 14 tasks are included for 7211 molecules with up to 7 heavy atoms.
The dataset in .mat format (for Python users, we recommend using `scipy.io.loadmat`) includes two arrays:
"X" - (7211 x 23 x 23), Coulomb matrices
"T" - (7211 x 14), properties
Atomization energies E (PBE0, unit: kcal/mol)
Excitation of maximal optimal absorption E_max (ZINDO, unit: eV)
Absorption Intensity at maximal absorption I_max (ZINDO)
Highest occupied molecular orbital HOMO (ZINDO, unit: eV)
Lowest unoccupied molecular orbital LUMO (ZINDO, unit: eV)
First excitation energy E_1st (ZINDO, unit: eV)
Ionization potential IP (ZINDO, unit: eV)
Electron affinity EA (ZINDO, unit: eV)
Highest occupied molecular orbital HOMO (PBE0, unit: eV)
Lowest unoccupied molecular orbital LUMO (PBE0, unit: eV)
Highest occupied molecular orbital HOMO (GW, unit: eV)
Lowest unoccupied molecular orbital LUMO (GW, unit: eV)
Polarizabilities α (PBE0, unit: Å^3)
Polarizabilities α (SCS, unit: Å^3)
Reference:
Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small molecules for virtual screening in the chemical universe database GDB-13." Journal of the American Chemical Society 131.25 (2009): 8732-8733.
Montavon, Grégoire, et al. "Machine learning of molecular electronic properties in chemical compound space." New Journal of Physics 15.9 (2013): 095003.
"""
if data_dir is None:
data_dir = DEFAULT_DIR
if save_dir is None:
Expand Down Expand Up @@ -190,7 +216,33 @@ def load_qm7(featurizer='CoulombMatrix',
data_dir=None,
save_dir=None,
**kwargs):
"""Load qm7 datasets."""
"""Load qm7 datasets.
QM7 is a subset of GDB-13 (a database of nearly 1 billion
stable and synthetically accessible organic molecules)
containing up to 7 heavy atoms C, N, O, and S. The 3D
Cartesian coordinates of the most stable conformations and
their atomization energies were determined using ab-initio
density functional theory (PBE0/tier2 basis set). This dataset
also provided Coulomb matrices as calculated in [Rupp et al.
PRL, 2012]:
C_ii = 0.5 * Z_i^2.4
C_ij = Z_i * Z_j/abs(R_i − R_j)
Z_i - nuclear charge of atom i
R_i - cartesian coordinates of atom i
The data file (.mat format, we recommend using `scipy.io.loadmat` for python users to load this original data) contains five arrays:
"X" - (7165 x 23 x 23), Coulomb matrices
"T" - (7165), atomization energies (unit: kcal/mol)
"P" - (5 x 1433), cross-validation splits as used in [Montavon et al. NIPS, 2012]
"Z" - (7165 x 23), atomic charges
"R" - (7165 x 23 x 3), cartesian coordinate (unit: Bohr) of each atom in the molecules
Reference:
Rupp, Matthias, et al. "Fast and accurate modeling of molecular atomization energies with machine learning." Physical review letters 108.5 (2012): 058301.
Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction." Advances in Neural Information Processing Systems. 2012.
"""
# Featurize qm7 dataset
logger.info("About to featurize qm7 dataset.")
if data_dir is None:
Expand Down
34 changes: 29 additions & 5 deletions deepchem/molnet/load_function/qm8_datasets.py
Expand Up @@ -19,6 +19,34 @@ def load_qm8(featurizer='CoulombMatrix',
data_dir=None,
save_dir=None,
**kwargs):
"""Load QM8 Datasets
The QM8 is the dataset used in a study on modeling quantum
mechanical calculations of electronic spectra and excited
state energy of small molecules. Multiple methods, including
time-dependent density functional theories (TDDFT) and
second-order approximate coupled-cluster (CC2), are applied to
a collection of molecules that include up to eight heavy atoms
(also a subset of the GDB-17 database). In our collection,
there are four excited state properties calculated by four
different methods on 22 thousand samples:
S_0 -> S_1 transition energy E_1 and the corresponding oscillator strength f_1
S_0 -> S_2 transition energy E_2 and the corresponding oscillator strength f_2
The source data files (downloadable from moleculenet.ai):
qm8.sdf: molecular structures
qm8.sdf.csv: tables for molecular properties
Column 1: Molecule ID (gdb9 index) mapping to the .sdf file
Columns 2-5: RI-CC2/def2TZVP; E1, E2, f1, f2 in atomic units. f1, f2 in length representation
Columns 6-9: LR-TDPBE0/def2SVP; E1, E2, f1, f2 in atomic units. f1, f2 in length representation
Columns 10-13: LR-TDPBE0/def2TZVP; E1, E2, f1, f2 in atomic units. f1, f2 in length representation
Columns 14-17: LR-TDCAM-B3LYP/def2TZVP; E1, E2, f1, f2 in atomic units. f1, f2 in length representation
Reference:
Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small molecules for virtual screening in the chemical universe database GDB-13." Journal of the American Chemical Society 131.25 (2009): 8732-8733.
Ramakrishnan, Raghunathan, et al. "Electronic spectra from TDDFT and machine learning in chemical space." The Journal of chemical physics 143.8 (2015): 084111.
"""
qm8_tasks = [
"E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
"f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM",
Expand Down Expand Up @@ -68,11 +96,7 @@ def load_qm8(featurizer='CoulombMatrix',
elif featurizer == 'MP':
featurizer = deepchem.feat.WeaveFeaturizer(
graph_distance=False, explicit_H=True)
loader = deepchem.data.SDFLoader(
tasks=qm8_tasks,
smiles_field="smiles",
mol_field="mol",
featurizer=featurizer)
loader = deepchem.data.SDFLoader(tasks=qm8_tasks, featurizer=featurizer)
else:
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
Expand Down
22 changes: 22 additions & 0 deletions deepchem/molnet/load_function/sider_datasets.py
Expand Up @@ -18,6 +18,28 @@ def load_sider(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load SIDER datasets
The Side Effect Resource (SIDER) is a database of marketed
drugs and adverse drug reactions (ADR). The version of the
SIDER dataset in DeepChem has grouped drug side effects into
27 system organ classes following MedDRA classifications
measured for 1427 approved drugs.
The data file contains a csv table, in which columns below
are used:
- "smiles": SMILES representation of the molecular structure
- "Hepatobiliary disorders" ~ "Injury, poisoning and procedural complications": Recorded side effects for the drug
Please refer to http://sideeffects.embl.de/se/?page=98 for details on ADRs.
References:
Kuhn, Michael, et al. "The SIDER database of drugs and side effects." Nucleic acids research 44.D1 (2015): D1075-D1079.
Altae-Tran, Han, et al. "Low data drug discovery with one-shot learning." ACS central science 3.4 (2017): 283-293.
Medical Dictionary for Regulatory Activities. http://www.meddra.org/
"""

logger.info("About to load SIDER dataset.")
if data_dir is None:
data_dir = DEFAULT_DIR
Expand Down
7 changes: 6 additions & 1 deletion deepchem/molnet/load_function/sweetlead_datasets.py
Expand Up @@ -20,7 +20,12 @@ def load_sweet(featurizer='ECFP',
data_dir=None,
save_dir=None,
**kwargs):
"""Load sweet datasets."""
"""Load sweet datasets.
Sweetlead is a dataset of chemical structures for approved drugs, chemical isolates from traditional medicinal herbs, and regulated chemicals. The resulting structures were filtered for the active pharmaceutical ingredient and standardized, and differing formulations of the same drug were combined in the final database.
Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PLoS One 8.11 (2013).
"""
# Load Sweetlead dataset
logger.info("About to load Sweetlead dataset.")
SWEET_tasks = ["task"]
Expand Down

0 comments on commit 98c9a66

Please sign in to comment.