
Commit d4ad5a4: Merge 72e7010 into 4cbaac0

rbharath committed Jun 1, 2020
2 parents: 4cbaac0 + 72e7010
Showing 45 changed files with 1,731 additions and 39 deletions.
7 changes: 7 additions & 0 deletions examples/data_loading/README.md
@@ -0,0 +1,7 @@
# Data Loading Examples

The examples in this directory highlight a number of ways to
load datasets into DeepChem for downstream analysis:

- `pandas_csv.py` shows how to directly load a dataset from a CSV file without using a `DataLoader`.
- `sdf_load.py` shows how to load a dataset from an SDF file using `SDFLoader`.
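
For comparison, here is a minimal sketch of the `DataLoader` route that `pandas_csv.py` bypasses, using `CSVLoader` with the column names from `example.csv` below (treat this as illustrative rather than canonical):

```python
import deepchem as dc

# Featurize the "smiles" column and attach the "log-solubility" task
# and "Compound ID" ids, all in one CSVLoader call.
featurizer = dc.feat.CircularFingerprint(size=16)
loader = dc.data.CSVLoader(
    tasks=["log-solubility"],
    smiles_field="smiles",
    id_field="Compound ID",
    featurizer=featurizer)
dataset = loader.featurize("example.csv")
```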
11 changes: 11 additions & 0 deletions examples/data_loading/example.csv
@@ -0,0 +1,11 @@
Compound ID,log-solubility,smiles
Amigdalin,0.9740000000000001,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O
Fenfuram,2.885,Cc1occc1C(=O)Nc2ccccc2
citral,2.5789999999999997,CC(C)=CCCC(C)=CC(=O)
Picene,6.617999999999999,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
Thiophene,2.2319999999999998,c1ccsc1
benzothiazole,2.733,c2ccc1scnc1c2
"2,2,4,6,6'-PCB",6.545,Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl
Estradiol,4.138,CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O
Dieldrin,4.533,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl
Rotenone,5.246,COc5cc4OCC3Oc2c1CC(Oc1ccc2C(=O)C3c4cc5OC)C(C)=C
1,165 changes: 1,165 additions & 0 deletions examples/data_loading/membrane_permeability.sdf

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions examples/data_loading/pandas_csv.py
@@ -0,0 +1,25 @@
# This example shows how to use Pandas to load data directly
# without using a CSVLoader object. This may be useful if you
# want the flexibility of processing your data with Pandas
# directly.
import pandas as pd
import deepchem as dc
from rdkit import Chem

df = pd.read_csv("example.csv")
print("Original data loaded as DataFrame:")
print(df)

featurizer = dc.feat.CircularFingerprint(size=16)
mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]]
features = featurizer.featurize(mols)
dataset = dc.data.NumpyDataset(
    X=features, y=df["log-solubility"], ids=df["Compound ID"])

print("Data converted into DeepChem Dataset")
print(dataset)

# Now let's convert from a dataset back to a pandas dataframe
converted_df = dataset.to_dataframe()
print("Data converted back into DataFrame:")
print(converted_df)
6 changes: 6 additions & 0 deletions examples/data_loading/sdf_load.py
@@ -0,0 +1,6 @@
# This example shows how to load data from an SDF file into DeepChem.
# The data in this SDF file is stored in the field "LogP(RRCK)".
import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=16)
loader = dc.data.SDFLoader(["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
dataset = loader.featurize("membrane_permeability.sdf")
3 changes: 3 additions & 0 deletions examples/datasets/README.md
@@ -0,0 +1,3 @@
# Dataset Examples

This folder contains examples of working with DeepChem `Dataset` objects:

- `pretty_print.py` prints a `NumpyDataset` to show its string representation.
- `scaffold_split_print.py` splits a small set of molecules with `ScaffoldSplitter` and prints the resulting splits.
5 changes: 5 additions & 0 deletions examples/datasets/pretty_print.py
@@ -0,0 +1,5 @@
import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(np.random.rand(500, 5))
print(dataset)
23 changes: 23 additions & 0 deletions examples/datasets/scaffold_split_print.py
@@ -0,0 +1,23 @@
import numpy as np
import deepchem as dc

mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O',
    'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1'
]
print("Original set of molecules")
print(mols)

splitter = dc.splits.ScaffoldSplitter()
# TODO: This should be swapped for simpler splitter API once that's merged in.
dataset = dc.data.NumpyDataset(X=np.array(mols), ids=mols)
train, valid, test = splitter.train_valid_test_split(dataset)
# The return values are dc.data.Dataset objects, so we extract the
# ids to see which molecules landed in each split.
print("Training set")
print(train.ids)
print("Valid set")
print(valid.ids)
print("Test set")
print(test.ids)
15 changes: 15 additions & 0 deletions examples/delaney/README.md
@@ -0,0 +1,15 @@
# Delaney Examples

The Delaney dataset is a collection of 2874 aqueous solubility measurements from this paper:

Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

This dataset is commonly used since it's a small molecular
regression dataset that's convenient for benchmarking various
techniques. In this example, we train a series of different
DeepChem models against this task (a sketch of the shared loading pattern follows the list):

- `DAGModel`: In `delaney_DAG.py`. This model will train and
converge very slowly.
- `TextCNNModel`: In `delaney_textcnn.py`. This model featurizes compounds as SMILES strings directly and trains a convolutional network directly on the text.
- `WeaveModel`: In `delaney_weave.py`. This model trains a weave style convolution on Delaney.
- `ChemCeption`: In `delaney_chemception.py`. This model trains a variant of an Inception convolutional network on images generated from molecules.
- `MPNNModel`: In `delaney_MPNN.py`. This model also trains slowly, though faster than `DAGModel`.
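
All of these scripts pull the data through the same MoleculeNet entry point; a minimal sketch (the `featurizer` argument varies by model — `'smiles2img'` shown here is what the ChemCeption script uses):

```python
import deepchem as dc

# Returns task names, a (train, valid, test) tuple of datasets,
# and the transformers that were applied to the data.
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets
```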
Empty file removed examples/delaney/__init__.py
Empty file.
File renamed without changes.
35 changes: 35 additions & 0 deletions examples/delaney/delaney_chemception.py
@@ -0,0 +1,35 @@
"""
Script that trains a ChemCeption model on the Delaney dataset.
"""
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Get Metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

model = dc.models.ChemCeption(
img_spec="engd",
n_tasks=len(delaney_tasks),
model_dir=None,
mode="regression")

# Fit the model
model.fit(train_dataset, nb_epoch=50)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
File renamed without changes.
21 changes: 21 additions & 0 deletions examples/factors/README.md
@@ -0,0 +1,21 @@
# Factors Examples

The Factors dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

It contains 1500 Merck in-house compounds whose IC50 values for
inhibition of 12 serine proteases were measured. Unlike most of
the other datasets featured in MoleculeNet, the Factors
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

In this example, we train various models on the Factors dataset.
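
A minimal loading sketch (this assumes the MoleculeNet loader `dc.molnet.load_factors` and the standard loader return convention; since the split and featurization are fixed upstream, no `split` or `featurizer` arguments are passed):

```python
import numpy as np
import deepchem as dc

# Split and featurization are fixed by the source data.
factors_tasks, factors_datasets, transformers = dc.molnet.load_factors()
train_dataset, valid_dataset, test_dataset = factors_datasets

# Regression metric commonly used in these examples.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
```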
21 changes: 21 additions & 0 deletions examples/hiv/README.md
@@ -0,0 +1,21 @@
# HIV Dataset Examples

The HIV dataset was introduced by the Drug Therapeutics
Program (DTP) AIDS Antiviral Screen, which tested the ability
to inhibit HIV replication for over 40,000 compounds.
Screening results were evaluated and placed into three
categories: confirmed inactive (CI), confirmed active (CA), and
confirmed moderately active (CM). We further combine the
latter two labels, making this a binary classification task
between inactive (CI) and active (CA and CM).

The data file is a CSV table in which the following columns
are used:
- "smiles": SMILES representation of the molecular structure
- "activity": Three-class labels for screening results: CI/CM/CA
- "HIV_active": Binary labels for screening results: 1 (CA/CM) and 0 (CI)

References:
AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data

In this example we train models on the HIV collection.
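
A minimal loading sketch (assuming the standard MoleculeNet loader; the ECFP featurizer and scaffold split shown here are illustrative choices, not requirements):

```python
import numpy as np
import deepchem as dc

# "HIV_active" is the single binary classification task.
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(
    featurizer='ECFP', split='scaffold')
train_dataset, valid_dataset, test_dataset = hiv_datasets

# ROC-AUC is the usual metric for this imbalanced binary task.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
```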
Empty file removed examples/hiv/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions examples/hopv/README.md
@@ -0,0 +1,15 @@
# Harvard Organic Photovoltaic Dataset

The HOPV dataset is the "Harvard Organic Photovoltaic
Dataset". This dataset includes 350 small molecules and
polymers that were utilized as p-type materials in OPVs.
Experimental properties include: HOMO [a.u.], LUMO [a.u.],
electrochemical gap [a.u.], optical gap [a.u.], power
conversion efficiency [%], open circuit potential [V], short
circuit current density [mA/cm^2], and fill factor [%].
Theoretical calculations in the original dataset have been
removed (for now).

Lopez, Steven A., et al. "The Harvard organic photovoltaic dataset." Scientific data 3.1 (2016): 1-7.

In this example, we train models on the HOPV dataset to predict these properties.
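
A minimal loading sketch (assuming the MoleculeNet loader `dc.molnet.load_hopv`; the featurizer and split choices here are illustrative):

```python
import deepchem as dc

# Each experimental property above becomes one regression task.
hopv_tasks, hopv_datasets, transformers = dc.molnet.load_hopv(
    featurizer='ECFP', split='random')
train_dataset, valid_dataset, test_dataset = hopv_datasets
```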
21 changes: 21 additions & 0 deletions examples/kaggle/README.md
@@ -0,0 +1,21 @@
# Kaggle Dataset Examples

The Kaggle dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ma, Junshui, et al. "Deep neural nets as a method for quantitative structure–activity relationships." Journal of chemical information and modeling 55.2 (2015): 263-274.

It contains 100,000 unique Merck in-house compounds that were
measured on 15 enzyme inhibition and ADME/TOX datasets.
Unlike most of the other datasets featured in MoleculeNet,
the Kaggle collection does not have structures for the
compounds tested since they were proprietary Merck compounds.
However, the collection does feature pre-computed descriptors
for these compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

This folder contains examples of training models on the Kaggle dataset.
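
As with the other pre-featurized Merck collections, loading is a single call with no split or featurizer arguments (a sketch, assuming the `dc.molnet.load_kaggle` loader):

```python
import deepchem as dc

# The fixed split and pre-computed descriptors come straight from the source.
kaggle_tasks, kaggle_datasets, transformers = dc.molnet.load_kaggle()
train_dataset, valid_dataset, test_dataset = kaggle_datasets
```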
Empty file removed examples/kaggle/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions examples/kinase/README.md
@@ -0,0 +1,24 @@
# Kinase Examples

The Kinase dataset is an in-house dataset from Merck that was first introduced in the following paper:

Ramsundar, Bharath, et al. "Is multitask deep learning practical for pharma?." Journal of chemical information and modeling 57.8 (2017): 2068-2076.

It contains 2500 Merck in-house compounds whose IC50 values for
inhibition of 99 protein kinases were measured. Unlike most of
the other datasets featured in MoleculeNet, the Kinase
collection does not have structures for the compounds tested
since they were proprietary Merck compounds. However, the
collection does feature pre-computed descriptors for these
compounds.

Note that the original train/valid/test split from the source
data was preserved here, so the loader doesn't allow for
alternate modes of splitting. Similarly, since the source data
came pre-featurized, it is not possible to apply alternative
featurizations.

This example features a few different models trained on this
dataset collection. In particular:

- `kinase_rf.py` trains a random forest model (a rough sketch follows)
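
A rough sketch of the random-forest route (hypothetical — the real `kinase_rf.py` may differ; this assumes the `dc.molnet.load_kinase` loader and uses plain scikit-learn on the pre-computed descriptors):

```python
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor

# Pre-featurized data with the fixed source split (assumed loader).
kinase_tasks, (train, valid, test), transformers = dc.molnet.load_kinase()

# A plain scikit-learn forest on the descriptor matrix; sklearn handles
# the multi-task target matrix natively (this assumes dense labels --
# missing values would need masking). The actual script likely wraps
# this in dc.models.SklearnModel.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf.fit(train.X, train.y)
print("Validation R^2:", rf.score(valid.X, valid.y))
```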
1 change: 1 addition & 0 deletions examples/model_restore/.gitignore
@@ -0,0 +1 @@
model/
26 changes: 26 additions & 0 deletions examples/model_restore/README.md
@@ -0,0 +1,26 @@
# Model Saving/Restoration

In this example, we'll work through an example of using the
DeepChem API to save and restore a model from disk. We're going
to be training a ChemCeption model for this purpose on the
Delaney dataset.

Here are the files we'll use:

- `chemception_model.py`: The file with the model to train
- `chemception_restore.py`: The file that restores the trained model

To train the model, first run:

```
python chemception_model.py
```

This will train a model and save it to the subdirectory `./model`. Let's now
restore the model and make a prediction with it.

```
python chemception_restore.py
```

The scripts are pretty simple, so go ahead and peek inside to see how they work.
35 changes: 35 additions & 0 deletions examples/model_restore/chemception_model.py
@@ -0,0 +1,35 @@
"""
Script that trains a ChemCeption model on the Delaney dataset and saves it to ./model.
"""
import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.random.set_seed(123)
import deepchem as dc

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='smiles2img', split='index', img_spec="engd")
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Get Metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

model = dc.models.ChemCeption(
img_spec="engd",
n_tasks=len(delaney_tasks),
model_dir="./model",
mode="regression")

# Fit the model (only one epoch; this is a save/restore demo)
model.fit(train_dataset, nb_epoch=1)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
14 changes: 14 additions & 0 deletions examples/model_restore/chemception_restore.py
@@ -0,0 +1,14 @@
import deepchem as dc
import rdkit.Chem as Chem

model = dc.models.ChemCeption(
img_spec="engd", n_tasks=1, model_dir="./model", mode="regression")
model.restore()

smiles = "CCCCC"
featurizer = dc.feat.SmilesToImage(img_spec="engd", img_size=80, res=0.5)
dataset = dc.data.NumpyDataset(
    featurizer.featurize([Chem.MolFromSmiles(smiles)]))
prediction = model.predict(dataset)
print("smiles: %s" % smiles)
print("prediction: %s" % str(prediction))
Empty file removed examples/muv/__init__.py
Empty file.
Empty file removed examples/nci/__init__.py
Empty file.
Empty file removed examples/pcba/__init__.py
Empty file.
Empty file removed examples/pdbbind/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions examples/pretraining/README.md
@@ -0,0 +1,12 @@
# Pretraining Example

In this example we will walk you through the use of pretraining
to transfer learned weights from a trained model to a new model.

The code for transferring pretrained weights for a
fully-connected network is in `fcnet_pretraining.py`. To run this
example, execute the following command in your shell:

```
python fcnet_pretraining.py
```
54 changes: 54 additions & 0 deletions examples/pretraining/fcnet_pretraining.py
@@ -0,0 +1,54 @@
import deepchem as dc
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense


class MLP(dc.models.KerasModel):

  def __init__(self, n_tasks=1, feature_dim=100, hidden_layer_size=64,
               **kwargs):
    self.feature_dim = feature_dim
    self.hidden_layer_size = hidden_layer_size
    self.n_tasks = n_tasks

    model, loss, output_types = self._build_graph()
    super(MLP, self).__init__(
        model=model, loss=loss, output_types=output_types, **kwargs)

  def _build_graph(self):
    inputs = Input(dtype=tf.float32, shape=(self.feature_dim,), name="Input")
    out1 = Dense(units=self.hidden_layer_size, activation='relu')(inputs)

    final = Dense(units=self.n_tasks, activation='sigmoid')(out1)
    outputs = [final]
    output_types = ['prediction']
    loss = dc.models.losses.BinaryCrossEntropy()

    model = tf.keras.Model(inputs=[inputs], outputs=outputs)
    return model, loss, output_types


# Synthetic source task: 32 features, 100 output tasks.
X_1 = np.random.randn(100, 32)
y_1 = np.random.randn(100, 100)

dataset_1 = dc.data.NumpyDataset(X_1, y_1)

# Synthetic destination task: same features, 10 output tasks.
X_2 = np.random.randn(100, 32)
y_2 = np.random.randn(100, 10)

dataset_2 = dc.data.NumpyDataset(X_2, y_2)

# Train the source model on the 100-task dataset.
source_model = MLP(feature_dim=32, hidden_layer_size=100, n_tasks=100)
source_model.fit(dataset_1, nb_epoch=100)

# Transfer the shared hidden-layer weights into a fresh 10-task model;
# include_top=False leaves the destination model's output layer untouched.
dest_model = MLP(feature_dim=32, hidden_layer_size=100, n_tasks=10)
dest_model.load_from_pretrained(
    source_model=source_model,
    assignment_map=None,
    value_map=None,
    model_dir=None,
    include_top=False)

# Fine-tune on the destination task.
dest_model.fit(dataset_2, nb_epoch=100)
