
Commit d4ad5a4: Merge 72e7010 into 4cbaac0

rbharath committed Jun 1, 2020
2 parents: 4cbaac0 + 72e7010
Showing 45 changed files with 1,731 additions and 39 deletions.
7 changes: 7 additions & 0 deletions examples/data_loading/README.md
@@ -0,0 +1,7 @@
# Data Loading Examples

The examples in this directory highlight a number of ways to
load datasets into DeepChem for downstream analysis:

- `pandas_csv.py` shows how to directly load a dataset from a CSV file without using a `DataLoader`.
- `sdf_load.py` shows how to load a dataset from an SDF file using `SDFLoader`.
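
For comparison, here is a minimal sketch of the `DataLoader` route that `pandas_csv.py` bypasses, using `CSVLoader` with the column names from `example.csv` below (treat this as illustrative rather than canonical):

```python
import deepchem as dc

# Featurize the "smiles" column and attach the "log-solubility" task
# and "Compound ID" ids, all in one CSVLoader call.
featurizer = dc.feat.CircularFingerprint(size=16)
loader = dc.data.CSVLoader(
    tasks=["log-solubility"],
    smiles_field="smiles",
    id_field="Compound ID",
    featurizer=featurizer)
dataset = loader.featurize("example.csv")
```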
11 changes: 11 additions & 0 deletions examples/data_loading/example.csv
@@ -0,0 +1,11 @@
Compound ID,log-solubility,smiles
Amigdalin,0.9740000000000001,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O
Fenfuram,2.885,Cc1occc1C(=O)Nc2ccccc2
citral,2.5789999999999997,CC(C)=CCCC(C)=CC(=O)
Picene,6.617999999999999,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
Thiophene,2.2319999999999998,c1ccsc1
benzothiazole,2.733,c2ccc1scnc1c2
"2,2,4,6,6'-PCB",6.545,Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl
Estradiol,4.138,CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O
Dieldrin,4.533,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl
Rotenone,5.246,COc5cc4OCC3Oc2c1CC(Oc1ccc2C(=O)C3c4cc5OC)C(C)=C
1,165 changes: 1,165 additions & 0 deletions examples/data_loading/membrane_permeability.sdf

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions examples/data_loading/pandas_csv.py
@@ -0,0 +1,25 @@
# This example shows how to use Pandas to load data directly
# without using a CSVLoader object. This may be useful if you
# want the flexibility of processing your data with Pandas
# directly.
import pandas as pd
import deepchem as dc
from rdkit import Chem

df = pd.read_csv("example.csv")
print("Original data loaded as DataFrame:")
print(df)

featurizer = dc.feat.CircularFingerprint(size=16)
mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]]
features = featurizer.featurize(mols)
dataset = dc.data.NumpyDataset(
    X=features, y=df["log-solubility"], ids=df["Compound ID"])

print("Data converted into DeepChem Dataset")
print(dataset)

# Now let's convert from a dataset back to a pandas dataframe
converted_df = dataset.to_dataframe()
print("Data converted back into DataFrame:")
print(converted_df)
6 changes: 6 additions & 0 deletions examples/data_loading/sdf_load.py
@@ -0,0 +1,6 @@
# This example shows how to load data from an SDF file into DeepChem.
# The data in this SDF file is stored in the field "LogP(RRCK)".
import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=16)
loader = dc.data.SDFLoader(["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
dataset = loader.featurize("membrane_permeability.sdf")
3 changes: 3 additions & 0 deletions examples/datasets/README.md
@@ -0,0 +1,3 @@
# Dataset Examples

This folder contains examples of working with DeepChem `Dataset` objects:

- `pretty_print.py` prints a `NumpyDataset` to show its string representation.
- `scaffold_split_print.py` splits a small set of molecules with `ScaffoldSplitter` and prints the resulting splits.
5 changes: 5 additions & 0 deletions examples/datasets/pretty_print.py
@@ -0,0 +1,5 @@
import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(np.random.rand(500, 5))
print(dataset)
23 changes: 23 additions & 0 deletions examples/datasets/scaffold_split_print.py
@@ -0,0 +1,23 @@
import numpy as np
import deepchem as dc

mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O',
    'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1'
]
print("Original set of molecules")
print(mols)

splitter = dc.splits.ScaffoldSplitter()
# TODO: This should be swapped for simpler splitter API once that's merged in.
dataset = dc.data.NumpyDataset(X=np.array(mols), ids=mols)
train, valid, test = splitter.train_valid_test_split(dataset)
# The return values are dc.data.Dataset objects, so we extract the
# ids to see which molecules landed in each split.
print("Training set")
print(train.ids)
print("Valid set")
print(valid.ids)
print("Test set")
print(test.ids)
15 changes: 15 additions & 0 deletions examples/delaney/README.md
@@ -0,0 +1,15 @@
# Delaney Examples

The Delaney dataset is a collection of 2874 aqueous solubility measurements from this paper:

Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

This dataset is commonly used since it's a small molecular
regression dataset that's convenient for benchmarking various
techniques. In this example, we train a series of different
DeepChem models against this task (a sketch of the shared loading pattern follows the list):

- `DAGModel`: In `delaney_DAG.py`. This model will train and
converge very slowly.
- `TextCNNModel`: In `delaney_textcnn.py`. This model featurizes compounds as SMILES strings directly and trains a convolutional network directly on the text.
- `WeaveModel`: In `delaney_weave.py`. This model trains a weave style convolution on Delaney.
- `ChemCeption`: In `delaney_chemception.py`. This model trains a variant of an Inception convolutional network on images generated from molecules.
- `MPNNModel`: In `delaney_MPNN.py`. This model also trains slowly, though faster than `DAGModel`.
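
All of these scripts pull the data through the same MoleculeNet entry point; a minimal sketch (the `featurizer` argument varies by model — `'smiles2img'` shown here is what the ChemCeption script uses):

```python
import deepchem as dc

# Returns task names, a (train, valid, test) tuple of datasets,
# and the transformers that were applied to the data.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets
```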
Empty file removed examples/delaney/__init__.py
Empty file.
File renamed without changes.
35 changes: 35 additions & 0 deletions examples/delaney/delaney_chemception.py
@@ -0,0 +1,35 @@
"""
Script that trains a ChemCeption model on the Delaney dataset.
"""
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Get Metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

model = dc.models.ChemCeption(
img_spec="engd",
n_tasks=len(delaney_tasks),
model_dir=None,
mode="regression")

# Fit the model
model.fit(train_dataset, nb_epoch=50)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
File renamed without changes.
21 changes: 21 additions & 0 deletions examples/factors/README.md
@@ -0,0 +1,21 @@
# Factors Examples

The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

It contains 1500 Merck in-house compounds whose IC50 values for
inhibition of 12 serine proteases were measured. Unlike most of
the other datasets featured in MoleculeNet, the Factors
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

In this example, we train various models on the Factors dataset.
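
A minimal loading sketch (this assumes the MoleculeNet loader `dc.molnet.load_factors` and the standard loader return convention; since the split and featurization are fixed upstream, no `split` or `featurizer` arguments are passed):

```python
import numpy as np
import deepchem as dc

# Split and featurization are fixed by the source data.
factors_tasks, factors_datasets, transformers = dc.molnet.load_factors()
train_dataset, valid_dataset, test_dataset = factors_datasets

# Regression metric commonly used in these examples.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
```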
21 changes: 21 additions & 0 deletions examples/hiv/README.md
@@ -0,0 +1,21 @@
# HIV Dataset Examples

The HIV dataset was introduced by the Drug Therapeutics
Program (DTP) AIDS Antiviral Screen, which tested the ability
to inhibit HIV replication for over 40,000 compounds.
Screening results were evaluated and placed into three
categories: confirmed inactive (CI), confirmed active (CA), and
confirmed moderately active (CM). We further combine the
latter two labels, making this a binary classification task
between inactive (CI) and active (CA and CM).

The data file is a CSV table in which the following columns
are used:
- "smiles": SMILES representation of the molecular structure
- "activity": Three-class labels for screening results: CI/CM/CA
- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)

References:
AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data

In this example we train models on the HIV collection.
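
A minimal loading sketch (assuming the standard MoleculeNet loader; the ECFP featurizer and scaffold split shown here are illustrative choices, not requirements):

```python
import numpy as np
import deepchem as dc

# "HIV_active" is the single binary classification task.
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(
    featurizer='ECFP', split='scaffold')
train_dataset, valid_dataset, test_dataset = hiv_datasets

# ROC-AUC is the usual metric for this imbalanced binary task.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
```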
Empty file removed examples/hiv/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions examples/hopv/README.md
@@ -0,0 +1,15 @@
# Harvard Organic Photovoltaic Dataset

The HOPV dataset is the "Harvard Organic Photovoltaic
Dataset". This dataset includes 350 small molecules and
polymers that were utilized as p-type materials in OPVs.
Experimental properties include: HOMO [a.u.], LUMO [a.u.],
electrochemical gap [a.u.], optical gap [a.u.], power
conversion efficiency [%], open circuit potential [V], short
circuit current density [mA/cm^2], and fill factor [%].
Theoretical calculations in the original dataset have been
removed (for now).

Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.

In this example, we train models on the HOPV dataset to predict these properties.
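
A minimal loading sketch (assuming the MoleculeNet loader `dc.molnet.load_hopv`; the featurizer and split choices here are illustrative):

```python
import deepchem as dc

# Each experimental property above becomes one regression task.
hopv_tasks, hopv_datasets, transformers = dc.molnet.load_hopv(
    featurizer='ECFP', split='random')
train_dataset, valid_dataset, test_dataset = hopv_datasets
```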
21 changes: 21 additions & 0 deletions examples/kaggle/README.md
@@ -0,0 +1,21 @@
# Kaggle Dataset Examples

The Kaggle dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ma, Junshui, et al. "Deep neural nets as a method for quantitative structure–activity relationships." Journal of chemical information and modeling 55.2 (2015): 263-274.

It contains 100,000 unique Merck in-house compounds that were
measured on 15 enzyme inhibition and ADME/TOX datasets.
Unlike most of the other datasets featured in MoleculeNet,
the Kaggle collection does not have structures for the
compounds tested since they were proprietary Merck compounds.
However, the collection does feature pre-computed descriptors
for these compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

This folder contains examples of training models on the Kaggle dataset.
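
As with the other pre-featurized Merck collections, loading is a single call with no split or featurizer arguments (a sketch, assuming the `dc.molnet.load_kaggle` loader):

```python
import deepchem as dc

# The fixed split and pre-computed descriptors come straight from the source.
kaggle_tasks, kaggle_datasets, transformers = dc.molnet.load_kaggle()
train_dataset, valid_dataset, test_dataset = kaggle_datasets
```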
Empty file removed examples/kaggle/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions examples/kinase/README.md
@@ -0,0 +1,24 @@
# Kinase Examples

The Kinase dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

It contains 2500 Merck in-house compounds whose IC50 values for
inhibition of 99 protein kinases were measured. Unlike most of
the other datasets featured in MoleculeNet, the Kinase
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

This example features a few different models trained on this
dataset collection. In particular:

- `kinase_rf.py` trains a random forest model (a rough sketch follows)
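
A rough sketch of the random-forest route (hypothetical — the real `kinase_rf.py` may differ; this assumes the `dc.molnet.load_kinase` loader and uses plain scikit-learn on the pre-computed descriptors):

```python
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor

# Pre-featurized data with the fixed source split (assumed loader).
kinase_tasks, (train, valid, test), transformers = dc.molnet.load_kinase()

# A plain scikit-learn forest on the descriptor matrix; sklearn handles
# the multi-task target matrix natively (this assumes dense labels --
# missing values would need masking). The actual script likely wraps
# this in dc.models.SklearnModel.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf.fit(train.X, train.y)
print("Validation R^2:", rf.score(valid.X, valid.y))
```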
1 change: 1 addition & 0 deletions examples/model_restore/.gitignore
@@ -0,0 +1 @@
model/
26 changes: 26 additions & 0 deletions examples/model_restore/README.md
@@ -0,0 +1,26 @@
# Model Saving/Restoration

In this example, we'll work through an example of using the
DeepChem API to save and restore a model from disk. We're going
to be training a ChemCeption model for this purpose on the
Delaney dataset.

Here are the files we'll use:

- `chemception_model.py`: The file with the model to train
- `chemception_restore.py`: The file that restores the trained model

To train the model, first run:

```
python chemception_model.py
```

This will train a model and save it to the subdirectory `./model`. Let's now
restore the model and make a prediction with it.

```
python chemception_restore.py
```

The scripts are pretty simple, so go ahead and peek inside to see how they work.
35 changes: 35 additions & 0 deletions examples/model_restore/chemception_model.py
@@ -0,0 +1,35 @@
"""
Script that trains a ChemCeption model on the Delaney dataset and saves it to ./model.
"""
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Get Metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

model = dc.models.ChemCeption(
img_spec="engd",
n_tasks=len(delaney_tasks),
model_dir="./model",
mode="regression")

# Fit the model (only one epoch; this is a save/restore demo)
model.fit(train_dataset, nb_epoch=1)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
14 changes: 14 additions & 0 deletions examples/model_restore/chemception_restore.py
@@ -0,0 +1,14 @@
import deepchem as dc
import rdkit.Chem as Chem

model = dc.models.ChemCeption(
img_spec="engd", n_tasks=1, model_dir="./model", mode="regression")
model.restore()

smiles = "CCCCC"
featurizer = dc.feat.SmilesToImage(img_spec="engd", img_size=80, res=0.5)
dataset = dc.data.NumpyDataset(
    featurizer.featurize([Chem.MolFromSmiles(smiles)]))
prediction = model.predict(dataset)
print("smiles: %s" % smiles)
print("prediction: %s" % str(prediction))
Empty file removed examples/muv/__init__.py
Empty file.
Empty file removed examples/nci/__init__.py
Empty file.
Empty file removed examples/pcba/__init__.py
Empty file.
Empty file removed examples/pdbbind/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions examples/pretraining/README.md
@@ -0,0 +1,12 @@
# Pretraining Example

In this example we will walk you through the use of pretraining
to transfer learned weights from a trained model to a new model.

The code for transferring pretrained weights for a
fully-connected network is in `fcnet_pretraining.py`. To run this
example, execute the following command in your shell:

```
python fcnet_pretraining.py
```
54 changes: 54 additions & 0 deletions examples/pretraining/fcnet_pretraining.py
@@ -0,0 +1,54 @@
import deepchem as dc
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense


class MLP(dc.models.KerasModel):

  def __init__(self, n_tasks=1, feature_dim=100, hidden_layer_size=64,
               **kwargs):
    self.feature_dim = feature_dim
    self.hidden_layer_size = hidden_layer_size
    self.n_tasks = n_tasks

    model, loss, output_types = self._build_graph()
    super(MLP, self).__init__(
        model=model, loss=loss, output_types=output_types, **kwargs)

  def _build_graph(self):
    inputs = Input(dtype=tf.float32, shape=(self.feature_dim,), name="Input")
    out1 = Dense(units=self.hidden_layer_size, activation='relu')(inputs)

    final = Dense(units=self.n_tasks, activation='sigmoid')(out1)
    outputs = [final]
    output_types = ['prediction']
    loss = dc.models.losses.BinaryCrossEntropy()

    model = tf.keras.Model(inputs=[inputs], outputs=outputs)
    return model, loss, output_types


# Synthetic source task: 32 features, 100 output tasks.
X_1 = np.random.randn(100, 32)
y_1 = np.random.randn(100, 100)

dataset_1 = dc.data.NumpyDataset(X_1, y_1)

# Synthetic destination task: same features, 10 output tasks.
X_2 = np.random.randn(100, 32)
y_2 = np.random.randn(100, 10)

dataset_2 = dc.data.NumpyDataset(X_2, y_2)

# Train the source model on the 100-task dataset.
source_model = MLP(feature_dim=32, hidden_layer_size=100, n_tasks=100)
source_model.fit(dataset_1, nb_epoch=100)

# Transfer the shared hidden-layer weights into a fresh 10-task model;
# include_top=False leaves the destination model's output layer untouched.
dest_model = MLP(feature_dim=32, hidden_layer_size=100, n_tasks=10)
dest_model.load_from_pretrained(
    source_model=source_model,
    assignment_map=None,
    value_map=None,
    model_dir=None,
    include_top=False)

# Fine-tune on the destination task.
dest_model.fit(dataset_2, nb_epoch=100)
