Setting up imports

In [1]:
%load_ext autoreload
%autoreload 2
%pdb off
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Joseph Gomes"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"

import os
import unittest
import tempfile
import shutil

import numpy as np
import numpy.random

from deepchem.utils.evaluate import Evaluator
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models import Model
from deepchem.models.deep import SingleTaskDNN
from deepchem.models.standard import SklearnModel
from deepchem.transformers import NormalizationTransformer
from deepchem.utils.dataset import Dataset
from deepchem.utils.evaluate import Evaluator

from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

Automatic pdb calling has been turned OFF


Using Theano backend.


Creating temporary directories

In [2]:
# One fresh scratch directory per pipeline stage. The generator is consumed
# left to right, so directories are created in the same order as the names.
(feature_dir, samples_dir, train_dir,
 valid_dir, test_dir, model_dir) = (tempfile.mkdtemp() for _ in range(6))

Setting up model variables

In [3]:
# --- Input data ---
input_file = "../datasets/gdb7k.pkl.gz"
smiles_field = "smiles"
# Both PDB field names are left unset.
protein_pdb_field = ligand_pdb_field = None

# --- Featurization: no compound/complex featurizers, only the
# user-specified feature column listed below ---
compound_featurizers, complex_featurizers = [], []
feature_types = ["user_specified_features"]
user_specified_features = ["evals"]

# --- Learning task and split strategy ---
task_types = {"u0_atom": "regression"}
splittype = "random"

Load featurized data

In [4]:
# Combined featurizer list (compound first, then complex); reused later when
# the Dataset objects are built.
featurizers = compound_featurizers + complex_featurizers

# The task names are the keys of the task-type mapping.
task_names = task_types.keys()

featurizer = DataFeaturizer(
    tasks=task_names,
    smiles_field=smiles_field,
    protein_pdb_field=protein_pdb_field,
    ligand_pdb_field=ligand_pdb_field,
    compound_featurizers=compound_featurizers,
    complex_featurizers=complex_featurizers,
    user_specified_features=user_specified_features,
    verbose=False,
)

# Featurize the input file, handing the two scratch directories to the
# featurizer.
featurized_samples = featurizer.featurize(input_file, feature_dir, samples_dir)

Perform Train, Validation, and Testing Split

In [5]:
# Split the featurized samples using the configured split type, giving each
# split its own directory, then unpack the three resulting sample sets.
split_samples = featurized_samples.train_valid_test_split(
    splittype, train_dir, valid_dir, test_dir)
train_samples, valid_samples, test_samples = split_samples

Creating datasets

In [6]:
def _make_dataset(data_dir, samples):
    """Build a Dataset for one split.

    Only the directory and the samples differ between splits; the
    featurizers, the tasks and the user-specified-features flag are shared.
    """
    return Dataset(data_dir=data_dir,
                   samples=samples,
                   featurizers=featurizers,
                   tasks=task_types.keys(),
                   use_user_specified_features=True)


train_dataset = _make_dataset(train_dir, train_samples)
valid_dataset = _make_dataset(valid_dir, valid_samples)
test_dataset = _make_dataset(test_dir, test_samples)

  if features[feature_ind] == "":
  if y[ind, task] == "":


Transforming datasets

In [7]:
# Both normalizers are constructed from the training set only; the same
# transformer objects are then applied to every split.
input_transformers = [
    NormalizationTransformer(transform_X=True, dataset=train_dataset),
]
output_transformers = [
    NormalizationTransformer(transform_y=True, dataset=train_dataset),
]
transformers = input_transformers + output_transformers

# Apply every transformer to train, then valid, then test (input
# transformers before output transformers within each split).
for dataset in (train_dataset, valid_dataset, test_dataset):
    for transformer in transformers:
        transformer.transform(dataset)

Fit Random Forest with hyperparameter search

In [8]:
def model_builder(task_types, params_dict, verbosity):
    """Wrap a random-forest regressor in a SklearnModel for one grid point.

    Args:
        task_types: task-name -> task-type mapping, passed through to
            SklearnModel.
        params_dict: hyperparameters for this grid point; must contain
            "n_estimators" and "max_features".
        verbosity: accepted so the signature matches what the optimizer
            passes in; not used here.

    Returns:
        A SklearnModel whose model_instance is the configured
        RandomForestRegressor.

    Raises:
        KeyError: if either required hyperparameter is missing.
    """
    forest = RandomForestRegressor(
        n_estimators=params_dict["n_estimators"],
        max_features=params_dict["max_features"])
    return SklearnModel(task_types, params_dict, model_instance=forest)
# Random-forest search grid: 2 n_estimators values x 4 max_features values.
# "data_shape" is a single-element list, so it adds no extra grid points.
rf_data_shape = train_dataset.get_data_shape()
params_dict = {
    "n_estimators": [10, 100],
    "data_shape": [rf_data_shape],
    "max_features": ["auto", "sqrt", "log2", None],
}

optimizer = HyperparamOpt(model_builder, task_types)

# metric="mae" with use_max=False: presumably the lowest validation MAE
# wins -- confirm against HyperparamOpt.
rf_search_result = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers,
    metric="mae", use_max=False)
best_rf, best_rf_hyperparams, all_rf_results = rf_search_result

Model 0/8, Metric mae, Validation set 0: 14.192981
	best_validation_score so  far: 14.192981
Model 1/8, Metric mae, Validation set 1: 17.662740
	best_validation_score so  far: 14.192981
Model 2/8, Metric mae, Validation set 2: 18.310513
	best_validation_score so  far: 14.192981
Model 3/8, Metric mae, Validation set 3: 14.027211
	best_validation_score so  far: 14.027211
Model 4/8, Metric mae, Validation set 4: 13.244072
	best_validation_score so  far: 13.244072
Model 5/8, Metric mae, Validation set 5: 16.299018
	best_validation_score so  far: 13.244072
Model 6/8, Metric mae, Validation set 6: 15.994767
	best_validation_score so  far: 13.244072
Model 7/8, Metric mae, Validation set 7: 13.336670
	best_validation_score so  far: 13.244072
Best hyperparameters: [(u'n_estimators', 100), (u'data_shape', (23,)), (u'max_features', u'auto')]
train_score: 5.061809
validation_score: 13.244072


Compute train/valid/test set mean absolute error for best RF hyperparameters

In [9]:
def _score_best_rf(dataset, csv_out, stats_out):
    """Evaluate best_rf on one split.

    Returns the Evaluator, the first value returned by
    compute_model_performance, and the "mae" entry from the first row of
    the second returned value.
    """
    evaluator = Evaluator(best_rf, dataset, output_transformers)
    performance_df, stats_df = evaluator.compute_model_performance(
        csv_out, stats_out)
    return evaluator, performance_df, stats_df.iloc[0]["mae"]


# Training split
rf_train_csv_out = "rf_train_regressor.csv"
rf_train_stats_out = "rf_train_stats_regressor.txt"
rf_train_evaluator, rf_train_df, rf_train_mae = _score_best_rf(
    train_dataset, rf_train_csv_out, rf_train_stats_out)
print("RF Train set MAE %f" % rf_train_mae)

# Validation split
rf_valid_csv_out = "rf_valid_regressor.csv"
rf_valid_stats_out = "rf_valid_stats_regressor.txt"
rf_valid_evaluator, rf_valid_df, rf_valid_mae = _score_best_rf(
    valid_dataset, rf_valid_csv_out, rf_valid_stats_out)
print("RF Valid set MAE %f" % rf_valid_mae)

# Test split
rf_test_csv_out = "rf_test_regressor.csv"
rf_test_stats_out = "rf_test_stats_regressor.txt"
rf_test_evaluator, rf_test_df, rf_test_mae = _score_best_rf(
    test_dataset, rf_test_csv_out, rf_test_stats_out)
print("RF Test set MAE error %f" % rf_test_mae)

RF Train set MAE 5.061809
RF Valid set MAE 13.244072
RF Test set MAE error 12.989110


Fit Kernel Ridge Regression with hyperparameter search

In [10]:
def model_builder(task_types, params_dict, verbosity):
    """Wrap a kernel ridge regressor in a SklearnModel for one grid point.

    Args:
        task_types: task-name -> task-type mapping, passed through to
            SklearnModel.
        params_dict: hyperparameters for this grid point; must contain
            "kernel", "alpha" and "gamma".
        verbosity: accepted so the signature matches what the optimizer
            passes in; not used here.

    Returns:
        A SklearnModel whose model_instance is the configured KernelRidge.

    Raises:
        KeyError: if any required hyperparameter is missing.
    """
    # Read in the same order as before so a missing key reports the same name.
    kernel, alpha, gamma = (params_dict[key]
                            for key in ("kernel", "alpha", "gamma"))
    ridge = KernelRidge(alpha=alpha, kernel=kernel, gamma=gamma)
    return SklearnModel(task_types, params_dict, model_instance=ridge)
# Kernel ridge search grid: 2 kernels x 4 alpha values x 4 gamma values.
params_dict = {
    "kernel": ["rbf", "laplacian"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "gamma": [0.0001, 0.001, 0.01, 0.1],
}

optimizer = HyperparamOpt(model_builder, task_types)

# metric="mae" with use_max=False: presumably the lowest validation MAE
# wins -- confirm against HyperparamOpt.
krr_search_result = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers,
    metric="mae", use_max=False)
best_krr, best_krr_hyperparams, all_krr_results = krr_search_result

Model 0/32, Metric mae, Validation set 0: 17.163126
	best_validation_score so  far: 17.163126
Model 1/32, Metric mae, Validation set 1: 20.604338
	best_validation_score so  far: 17.163126
Model 2/32, Metric mae, Validation set 2: 25.012712
	best_validation_score so  far: 17.163126
Model 3/32, Metric mae, Validation set 3: 28.863665
	best_validation_score so  far: 17.163126
Model 4/32, Metric mae, Validation set 4: 11.357195
	best_validation_score so  far: 11.357195
Model 5/32, Metric mae, Validation set 5: 13.396137
	best_validation_score so  far: 11.357195
Model 6/32, Metric mae, Validation set 6: 16.528441
	best_validation_score so  far: 11.357195
Model 7/32, Metric mae, Validation set 7: 20.507368
	best_validation_score so  far: 11.357195
Model 8/32, Metric mae, Validation set 8: 9.390308
	best_validation_score so  far: 9.390308
Model 9/32, Metric mae, Validation set 9: 9.652509
	best_validation_score so  far: 9.390308
Model 10/32, Metric mae, Validation set 10: 10.695947
	best_vali

Compute train/valid/test set mean absolute error for best KRR hyperparameters

In [None]:
def _score_best_krr(dataset, csv_out, stats_out):
    """Evaluate best_krr on one split.

    Returns the Evaluator, the first value returned by
    compute_model_performance, and the "mae" entry from the first row of
    the second returned value.
    """
    evaluator = Evaluator(best_krr, dataset, output_transformers)
    performance_df, stats_df = evaluator.compute_model_performance(
        csv_out, stats_out)
    return evaluator, performance_df, stats_df.iloc[0]["mae"]


# Training split
krr_train_csv_out = "krr_train_regressor.csv"
krr_train_stats_out = "krr_train_stats_regressor.txt"
krr_train_evaluator, krr_train_df, krr_train_mae = _score_best_krr(
    train_dataset, krr_train_csv_out, krr_train_stats_out)
print("KRR Train set MAE %f" % krr_train_mae)

# Validation split
krr_valid_csv_out = "krr_valid_regressor.csv"
krr_valid_stats_out = "krr_valid_stats_regressor.txt"
krr_valid_evaluator, krr_valid_df, krr_valid_mae = _score_best_krr(
    valid_dataset, krr_valid_csv_out, krr_valid_stats_out)
print("KRR Valid set MAE %f" % krr_valid_mae)

# Test split
krr_test_csv_out = "krr_test_regressor.csv"
krr_test_stats_out = "krr_test_stats_regressor.txt"
krr_test_evaluator, krr_test_df, krr_test_mae = _score_best_krr(
    test_dataset, krr_test_csv_out, krr_test_stats_out)
print("KRR Test set MAE error %f" % krr_test_mae)

KRR Train set MAE 6.467041
KRR Valid set MAE 8.637119
KRR Test set MAE error 8.990674


Fit Single-task DNN with hyperparameter search

In [None]:
# Fixed seed so the randomly sampled learning rates and decays -- and hence
# the whole hyperparameter search -- are the same on every Restart & Run All.
# The original called np.random.seed() with no argument, which reseeds from
# OS entropy and makes every run draw a different grid. Any recorded output
# produced by an unseeded run will not match the numbers from this seed.
DNN_SEARCH_SEED = 123
np.random.seed(DNN_SEARCH_SEED)

# Single-task DNN search grid. Only three entries have more than one value:
# 5 learning rates x 5 decays x 2 layer counts.
params_dict = {
    "activation": ["relu"],
    "momentum": [.9],
    "batch_size": [50],
    "init": ["glorot_uniform"],
    "data_shape": [train_dataset.get_data_shape()],
    # Log-uniform samples: learning rate in [1e-5, 1e-2).
    "learning_rate": np.power(10., np.random.uniform(-5, -2, size=5)),
    # Log-uniform samples: decay in [1e-6, 1e-4).
    "decay": np.power(10., np.random.uniform(-6, -4, size=5)),
    "nb_hidden": [1000],
    "nb_epoch": [50],
    "nesterov": [True],
    "dropout": [.1],
    "nb_layers": [1, 2],
    "batchnorm": [False],
}

# Here the model class itself is handed to the optimizer as the builder.
optimizer = HyperparamOpt(SingleTaskDNN, task_types)

# metric="mae" with use_max=False: presumably the lowest validation MAE
# wins -- confirm against HyperparamOpt.
dnn_search_result = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers,
    metric="mae", use_max=False, verbosity=None)
best_dnn, best_hyperparams, all_results = dnn_search_result

Model 0/50, Metric mae, Validation set 0: nan
	best_validation_score so  far: inf
Model 1/50, Metric mae, Validation set 1: 19.064085
	best_validation_score so  far: 19.064085
Model 2/50, Metric mae, Validation set 2: 18.373328
	best_validation_score so  far: 18.373328
Model 3/50, Metric mae, Validation set 3: 21.892876
	best_validation_score so  far: 18.373328
Model 4/50, Metric mae, Validation set 4: 18.118478
	best_validation_score so  far: 18.118478
Model 5/50, Metric mae, Validation set 5: nan
	best_validation_score so  far: 18.118478
Model 6/50, Metric mae, Validation set 6: 19.239610
	best_validation_score so  far: 18.118478
Model 7/50, Metric mae, Validation set 7: 18.062948
	best_validation_score so  far: 18.062948
Model 8/50, Metric mae, Validation set 8: 20.640809
	best_validation_score so  far: 18.062948
Model 9/50, Metric mae, Validation set 9: 21.097265
	best_validation_score so  far: 18.062948
Model 10/50, Metric mae, Validation set 10: nan
	best_validation_score so  far

Compute train/valid/test set mean absolute error for best DNN hyperparameters

In [None]:
def _score_best_dnn(dataset, csv_out, stats_out):
    """Evaluate best_dnn on one split.

    Returns the Evaluator, the first value returned by
    compute_model_performance, and the "mae" entry from the first row of
    the second returned value.
    """
    evaluator = Evaluator(best_dnn, dataset, output_transformers)
    performance_df, stats_df = evaluator.compute_model_performance(
        csv_out, stats_out)
    return evaluator, performance_df, stats_df.iloc[0]["mae"]


# Training split
dnn_train_csv_out = "dnn_train_regressor.csv"
dnn_train_stats_out = "dnn_train_regressor_stats.txt"
dnn_train_evaluator, dnn_train_df, dnn_train_mae = _score_best_dnn(
    train_dataset, dnn_train_csv_out, dnn_train_stats_out)
print("DNN Train set MAE error %f" % dnn_train_mae)

# Validation split
dnn_valid_csv_out = "dnn_valid_regressor.csv"
dnn_valid_stats_out = "dnn_valid_regressor_stats.txt"
dnn_valid_evaluator, dnn_valid_df, dnn_valid_mae = _score_best_dnn(
    valid_dataset, dnn_valid_csv_out, dnn_valid_stats_out)
print("DNN Valid set MAE error %f" % dnn_valid_mae)

# Test split
dnn_test_csv_out = "dnn_test_regressor.csv"
dnn_test_stats_out = "dnn_test_regressor_stats.txt"
dnn_test_evaluator, dnn_test_df, dnn_test_mae = _score_best_dnn(
    test_dataset, dnn_test_csv_out, dnn_test_stats_out)
print("DNN Test set MAE error %f" % dnn_test_mae)