docs

deepchem · May 29, 2020 · 98c9a66 · 98c9a66
1 parent 43b48b0
commit 98c9a66
Show file tree

Hide file tree

Showing 13 changed files with 326 additions and 17 deletions.
diff --git a/deepchem/molnet/load_function/clintox_datasets.py b/deepchem/molnet/load_function/clintox_datasets.py
@@ -18,7 +18,33 @@ def load_clintox(featurizer='ECFP',
                  data_dir=None,
                  save_dir=None,
                  **kwargs):
-  """Load clintox datasets."""
+  """Load clintox datasets.
+
+  The ClinTox dataset compares drugs approved by the FDA and
+  drugs that have failed clinical trials for toxicity reasons.
+  The dataset includes two classification tasks for 1491 drug
+  compounds with known chemical structures: (1) clinical trial
+  toxicity (or absence of toxicity) and (2) FDA approval status.
+  The list of FDA-approved drugs is compiled from the SWEETLEAD
+  database, and the list of drugs that failed clinical trials for
+  toxicity reasons is compiled from the Aggregate Analysis of
+  ClinicalTrials.gov (AACT) database.
+
+  The data file contains a csv table, in which columns below are
+  used:
+     "smiles" - SMILES representation of the molecular structure
+     "FDA_APPROVED" - FDA approval status
+     "CT_TOX" - Clinical trial results
+
+  References:
+  Gayvert, Kaitlyn M., Neel S. Madhukar, and Olivier Elemento. "A data-driven approach to predicting successes and failures of clinical trials." Cell chemical biology 23.10 (2016): 1294-1301.
+
+  Artemov, Artem V., et al. "Integrated deep learned transcriptomic and structure-based predictor of clinical trials outcomes." bioRxiv (2016): 095653.
+
+  Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PloS one 8.11 (2013): e79568.
+
+  Aggregate Analysis of ClinicalTrials.gov (AACT) Database. https://www.ctti-clinicaltrials.org/aact-database
+  """
   if data_dir is None:
     data_dir = DEFAULT_DIR
   if save_dir is None:

diff --git a/deepchem/molnet/load_function/delaney_datasets.py b/deepchem/molnet/load_function/delaney_datasets.py
@@ -18,7 +18,16 @@ def load_delaney(featurizer='ECFP',
                  data_dir=None,
                  save_dir=None,
                  **kwargs):
-  """Load delaney datasets."""
+  """Load delaney datasets.
+
+  The Delaney datasets are extracted from the following paper
+
+  Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.
+
+  This dataset contains 2874 measured aqueous solubility
+  values. The source dataset is available in the supplemental
+  material of the original paper.
+  """
   # Featurize Delaney dataset
   logger.info("About to featurize Delaney dataset.")
   if data_dir is None:
@@ -60,8 +69,9 @@ def load_delaney(featurizer='ECFP',
   elif featurizer == "smiles2img":
     img_spec = kwargs.get("img_spec", "std")
     img_size = kwargs.get("img_size", 80)
+    res = kwargs.get("res", 0.5)
     featurizer = deepchem.feat.SmilesToImage(
-        img_size=img_size, img_spec=img_spec)
+        img_size=img_size, img_spec=img_spec, res=res)
 
   loader = deepchem.data.CSVLoader(
       tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)

diff --git a/deepchem/molnet/load_function/factors_datasets.py b/deepchem/molnet/load_function/factors_datasets.py
@@ -135,7 +135,38 @@ def gen_factors(FACTORS_tasks,
 
 
 def load_factors(shard_size=2000, featurizer=None, split=None, reload=True):
-  """Loads FACTOR dataset; does not do train/test split"""
+  """Loads FACTOR dataset; does not do train/test split
+
+  The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:
+
+  Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.
+
+  It contains 1500 Merck in-house compounds that were measured
+  for IC50 of inhibition on 12 serine proteases. Unlike most of
+  the other datasets featured in MoleculeNet, the Factors 
+  collection does not have structures for the compounds tested
+  since they were proprietary Merck compounds. However, the
+  collection does feature pre-computed descriptors for these
+  compounds.
+
+  Note that the original train/valid/test split from the source
+  data was preserved here, so this function doesn't allow for
+  alternate modes of splitting. Similarly, since the source data
+  came pre-featurized, it is not possible to apply alternative
+  featurizations.
+
+  Parameters
+  ----------
+  shard_size: int, optional
+    Size of the DiskDataset shards to write on disk
+  featurizer: optional
+    Ignored since featurization pre-computed
+  split: optional
+    Ignored since split pre-computed
+  reload: bool, optional
+    Whether to automatically re-load from disk
+
+  """
 
   FACTORS_tasks = [
       'T_00001', 'T_00002', 'T_00003', 'T_00004', 'T_00005', 'T_00006',

diff --git a/deepchem/molnet/load_function/hiv_datasets.py b/deepchem/molnet/load_function/hiv_datasets.py
@@ -17,7 +17,26 @@ def load_hiv(featurizer='ECFP',
              data_dir=None,
              save_dir=None,
              **kwargs):
-  """Load hiv datasets. Does not do train/test split"""
+  """Load hiv datasets. Does not do train/test split
+
+  The HIV dataset was introduced by the Drug Therapeutics
+  Program (DTP) AIDS Antiviral Screen, which tested the ability
+  to inhibit HIV replication for over 40,000 compounds.
+  Screening results were evaluated and placed into three
+  categories: confirmed inactive (CI), confirmed active (CA) and
+  confirmed moderately active (CM). We further combine the
+  latter two labels, making it a classification task between
+  inactive (CI) and active (CA and CM).
+
+  The data file contains a csv table, in which columns below
+  are used:
+     - "smiles": SMILES representation of the molecular structure
+     - "activity": Three-class labels for screening results: CI/CM/CA
+     - "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)
+
+  References:
+  AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
+  """
   # Featurize hiv dataset
   logger.info("About to featurize hiv dataset.")
   if data_dir is None:

diff --git a/deepchem/molnet/load_function/hopv_datasets.py b/deepchem/molnet/load_function/hopv_datasets.py
@@ -17,7 +17,20 @@ def load_hopv(featurizer='ECFP',
               data_dir=None,
               save_dir=None,
               **kwargs):
-  """Load HOPV datasets. Does not do train/test split"""
+  """Load HOPV datasets. Does not do train/test split
+
+  The HOPV datasets consist of the "Harvard Organic
+  Photovoltaic Dataset". This dataset includes 350 small
+  molecules and polymers that were utilized as p-type materials
+  in OPVs. Experimental properties include: HOMO [a.u.], LUMO
+  [a.u.], Electrochemical gap [a.u.], Optical gap [a.u.], Power
+  conversion efficiency [%], Open circuit potential [V], Short
+  circuit current density [mA/cm^2], and fill factor [%].
+  Theoretical calculations in the original dataset have been
+  removed (for now).
+
+  Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.
+  """
   # Featurize HOPV dataset
   logger.info("About to featurize HOPV dataset.")
   if data_dir is None:

diff --git a/deepchem/molnet/load_function/kaggle_datasets.py b/deepchem/molnet/load_function/kaggle_datasets.py
@@ -117,7 +117,38 @@ def gen_kaggle(KAGGLE_tasks,
 
 
 def load_kaggle(shard_size=2000, featurizer=None, split=None, reload=True):
-  """Loads kaggle datasets. Generates if not stored already."""
+  """Loads kaggle datasets. Generates if not stored already.
+
+  The Kaggle dataset is an in-house dataset from Merck that was first introduced in the following paper:
+
+  Ma, Junshui, et al. "Deep neural nets as a method for quantitative structure–activity relationships." Journal of chemical information and modeling 55.2 (2015): 263-274.
+
+  It contains 100,000 unique Merck in-house compounds that were
+  measured on 15 enzyme inhibition and ADME/TOX datasets.
+  Unlike most of the other datasets featured in MoleculeNet,
+  the Kaggle collection does not have structures for the
+  compounds tested since they were proprietary Merck compounds.
+  However, the collection does feature pre-computed descriptors
+  for these compounds.
+
+  Note that the original train/valid/test split from the source
+  data was preserved here, so this function doesn't allow for
+  alternate modes of splitting. Similarly, since the source data
+  came pre-featurized, it is not possible to apply alternative
+  featurizations.
+
+  Parameters
+  ----------
+  shard_size: int, optional
+    Size of the DiskDataset shards to write on disk
+  featurizer: optional
+    Ignored since featurization pre-computed
+  split: optional
+    Ignored since split pre-computed
+  reload: bool, optional
+    Whether to automatically re-load from disk
+
+  """
   KAGGLE_tasks = [
       '3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB', 'NK1', 'OX1',
       'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN'

diff --git a/deepchem/molnet/load_function/kinase_datasets.py b/deepchem/molnet/load_function/kinase_datasets.py
@@ -141,8 +141,37 @@ def gen_kinase(KINASE_tasks,
 
 
 def load_kinase(shard_size=2000, featurizer=None, split=None, reload=True):
-
-  "Loads kinase datasets, does not do train/test split"
+  """Loads Kinase datasets, does not do train/test split
+
+  The Kinase dataset is an in-house dataset from Merck that was first introduced in the following paper:
+
+  Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.
+
+  It contains 2500 Merck in-house compounds that were measured
+  for IC50 of inhibition on 99 protein kinases. Unlike most of
+  the other datasets featured in MoleculeNet, the Kinase
+  collection does not have structures for the compounds tested
+  since they were proprietary Merck compounds. However, the
+  collection does feature pre-computed descriptors for these
+  compounds.
+
+  Note that the original train/valid/test split from the source
+  data was preserved here, so this function doesn't allow for
+  alternate modes of splitting. Similarly, since the source data
+  came pre-featurized, it is not possible to apply alternative
+  featurizations.
+
+  Parameters
+  ----------
+  shard_size: int, optional
+    Size of the DiskDataset shards to write on disk
+  featurizer: optional
+    Ignored since featurization pre-computed
+  split: optional
+    Ignored since split pre-computed
+  reload: bool, optional
+    Whether to automatically re-load from disk
+  """
 
   KINASE_tasks = [
       'T_00013', 'T_00014', 'T_00015', 'T_00016', 'T_00017', 'T_00018',

diff --git a/deepchem/molnet/load_function/qm7_datasets.py b/deepchem/molnet/load_function/qm7_datasets.py
@@ -134,6 +134,32 @@ def load_qm7b_from_mat(featurizer='CoulombMatrix',
                        data_dir=None,
                        save_dir=None,
                        **kwargs):
+  """Load QM7B dataset
+
+  QM7b is an extension for the QM7 dataset with additional properties predicted at different levels (ZINDO, SCS, PBE0, GW). In total 14 tasks are included for 7211 molecules with up to 7 heavy atoms.
+
+  The dataset in .mat format (for python users, we recommend using `scipy.io.loadmat`) includes two arrays:
+	"X" - (7211 x 23 x 23), Coulomb matrices
+	"T" - (7211 x 14), properties
+		Atomization energies E (PBE0, unit: kcal/mol)
+		Excitation of maximal optimal absorption E_max (ZINDO, unit: eV)
+		Absorption Intensity at maximal absorption I_max (ZINDO)
+		Highest occupied molecular orbital HOMO (ZINDO, unit: eV)
+		Lowest unoccupied molecular orbital LUMO (ZINDO, unit: eV)
+		First excitation energy E_1st (ZINDO, unit: eV)
+		Ionization potential IP (ZINDO, unit: eV)
+		Electron affinity EA (ZINDO, unit: eV)
+		Highest occupied molecular orbital HOMO (PBE0, unit: eV)
+		Lowest unoccupied molecular orbital LUMO (PBE0, unit: eV)
+		Highest occupied molecular orbital HOMO (GW, unit: eV)
+		Lowest unoccupied molecular orbital LUMO (GW, unit: eV)
+		Polarizabilities α (PBE0, unit: Å^3)
+		Polarizabilities α (SCS, unit: Å^3)
+
+  Reference:
+  Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small molecules for virtual screening in the chemical universe database GDB-13." Journal of the American Chemical Society 131.25 (2009): 8732-8733.
+  Montavon, Grégoire, et al. "Machine learning of molecular electronic properties in chemical compound space." New Journal of Physics 15.9 (2013): 095003.
+  """
   if data_dir is None:
     data_dir = DEFAULT_DIR
   if save_dir is None:
@@ -190,7 +216,33 @@ def load_qm7(featurizer='CoulombMatrix',
              data_dir=None,
              save_dir=None,
              **kwargs):
-  """Load qm7 datasets."""
+  """Load qm7 datasets.
+
+  QM7 is a subset of GDB-13 (a database of nearly 1 billion
+  stable and synthetically accessible organic molecules)
+  containing up to 7 heavy atoms C, N, O, and S. The 3D
+  Cartesian coordinates of the most stable conformations and
+  their atomization energies were determined using ab-initio
+  density functional theory (PBE0/tier2 basis set). This dataset
+  also provided Coulomb matrices as calculated in [Rupp et al.
+  PRL, 2012]:
+
+    C_ii = 0.5 * Z_i^2.4
+    C_ij = Z_i * Z_j / abs(R_i - R_j)
+    Z_i - nuclear charge of atom i
+    R_i - cartesian coordinates of atom i
+
+  The data file (.mat format, we recommend using `scipy.io.loadmat` for python users to load this original data) contains five arrays:
+	"X" - (7165 x 23 x 23), Coulomb matrices
+	"T" - (7165), atomization energies (unit: kcal/mol)
+	"P" - (5 x 1433), cross-validation splits as used in [Montavon et al. NIPS, 2012]
+	"Z" - (7165 x 23), atomic charges
+	"R" - (7165 x 23 x 3), cartesian coordinate (unit: Bohr) of each atom in the molecules
+
+  Reference:
+  Rupp, Matthias, et al. "Fast and accurate modeling of molecular atomization energies with machine learning." Physical review letters 108.5 (2012): 058301.
+  Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction." Advances in Neural Information Processing Systems. 2012.
+  """
   # Featurize qm7 dataset
   logger.info("About to featurize qm7 dataset.")
   if data_dir is None:

diff --git a/deepchem/molnet/load_function/qm8_datasets.py b/deepchem/molnet/load_function/qm8_datasets.py
@@ -19,6 +19,34 @@ def load_qm8(featurizer='CoulombMatrix',
              data_dir=None,
              save_dir=None,
              **kwargs):
+  """Load QM8 Datasets
+
+  The QM8 is the dataset used in a study on modeling quantum
+  mechanical calculations of electronic spectra and excited
+  state energy of small molecules. Multiple methods, including
+  time-dependent density functional theories (TDDFT) and
+  second-order approximate coupled-cluster (CC2), are applied to
+  a collection of molecules that include up to eight heavy atoms
+  (also a subset of the GDB-17 database). In our collection,
+  there are four excited state properties calculated by four
+  different methods on 22 thousand samples:
+
+	S_0 -> S_1 transition energy E_1 and the corresponding oscillator strength f_1
+	S_0 -> S_2 transition energy E_2 and the corresponding oscillator strength f_2
+
+  The source data files (downloadable from moleculenet.ai):
+  qm8.sdf: molecular structures
+  qm8.sdf.csv: tables for molecular properties
+	Column 1:      Molecule ID (gdb9 index) mapping to the .sdf file
+	Columns 2-5:   RI-CC2/def2TZVP; 	E1, E2, f1, f2 in atomic units. f1, f2 in length representation
+	Columns 6-9:   LR-TDPBE0/def2SVP;	E1, E2, f1, f2 in atomic units. f1, f2 in length representation
+	Columns 10-13: LR-TDPBE0/def2TZVP;	E1, E2, f1, f2 in atomic units. f1, f2 in length representation
+	Columns 14-17: LR-TDCAM-B3LYP/def2TZVP;	E1, E2, f1, f2 in atomic units. f1, f2 in length representation
+
+  Reference:
+  Blum, Lorenz C., and Jean-Louis Reymond. "970 million druglike small molecules for virtual screening in the chemical universe database GDB-13." Journal of the American Chemical Society 131.25 (2009): 8732-8733.
+  Ramakrishnan, Raghunathan, et al. "Electronic spectra from TDDFT and machine learning in chemical space." The Journal of chemical physics 143.8 (2015): 084111.
+  """
   qm8_tasks = [
       "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
       "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM",
@@ -68,11 +96,7 @@ def load_qm8(featurizer='CoulombMatrix',
     elif featurizer == 'MP':
       featurizer = deepchem.feat.WeaveFeaturizer(
           graph_distance=False, explicit_H=True)
-    loader = deepchem.data.SDFLoader(
-        tasks=qm8_tasks,
-        smiles_field="smiles",
-        mol_field="mol",
-        featurizer=featurizer)
+    loader = deepchem.data.SDFLoader(tasks=qm8_tasks, featurizer=featurizer)
   else:
     if featurizer == 'ECFP':
       featurizer = deepchem.feat.CircularFingerprint(size=1024)

diff --git a/deepchem/molnet/load_function/sider_datasets.py b/deepchem/molnet/load_function/sider_datasets.py
@@ -18,6 +18,28 @@ def load_sider(featurizer='ECFP',
                data_dir=None,
                save_dir=None,
                **kwargs):
+  """Load SIDER datasets
+
+  The Side Effect Resource (SIDER) is a database of marketed
+  drugs and adverse drug reactions (ADR). The version of the
+  SIDER dataset in DeepChem has grouped drug side effects into
+  27 system organ classes following MedDRA classifications
+  measured for 1427 approved drugs.
+
+  The data file contains a csv table, in which columns below
+  are used:
+
+  - "smiles": SMILES representation of the molecular structure
+  - "Hepatobiliary disorders" ~ "Injury, poisoning and procedural complications": Recorded side effects for the drug
+
+  Please refer to http://sideeffects.embl.de/se/?page=98 for details on ADRs.
+
+  References:
+  Kuhn, Michael, et al. "The SIDER database of drugs and side effects." Nucleic acids research 44.D1 (2015): D1075-D1079.
+  Altae-Tran, Han, et al. "Low data drug discovery with one-shot learning." ACS central science 3.4 (2017): 283-293.
+  Medical Dictionary for Regulatory Activities. http://www.meddra.org/
+  """
+
   logger.info("About to load SIDER dataset.")
   if data_dir is None:
     data_dir = DEFAULT_DIR

diff --git a/deepchem/molnet/load_function/sweetlead_datasets.py b/deepchem/molnet/load_function/sweetlead_datasets.py
@@ -20,7 +20,12 @@ def load_sweet(featurizer='ECFP',
                data_dir=None,
                save_dir=None,
                **kwargs):
-  """Load sweet datasets."""
+  """Load sweet datasets.
+  
+  Sweetlead is a dataset of chemical structures for approved drugs, chemical isolates from traditional medicinal herbs, and regulated chemicals. Resulting structures are filtered for the active pharmaceutical ingredient, standardized, and differing formulations of the same drug were combined in the final database.
+
+  Novick, Paul A., et al. "SWEETLEAD: an in silico database of approved drugs, regulated chemicals, and herbal isolates for computer-aided drug discovery." PLoS One 8.11 (2013).
+  """
   # Load Sweetlead dataset
   logger.info("About to load Sweetlead dataset.")
   SWEET_tasks = ["task"]