# Quantum Machinery with gdb1k

In [9]:
%load_ext autoreload
%autoreload 2
%pdb off
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"

import os
import unittest

import numpy as np
import deepchem as dc
import numpy.random
from deepchem.utils.evaluate import Evaluator
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Automatic pdb calling has been turned OFF


Setting up model variables

In [10]:
featurizer = dc.feat.CoulombMatrixEig(23, remove_hydrogens=False)
tasks = ["atomization_energy"]
dataset_file = "../../datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"

Load featurized data

In [14]:
loader = dc.data.SDFLoader(
      tasks=["atomization_energy"], smiles_field="smiles",
      featurizer=featurizer,
      mol_field="mol")
dataset = loader.featurize(dataset_file)

Loading raw samples now.
shard_size: 8192
Reading structures from ../../datasets/gdb1k.sdf.
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
TIMING: featurizing shard 0 took 0.338 s
TIMING: dataset construction took 0.479 s
Loading dataset from disk.
type of diskData <class 'deepchem.data.datasets.DiskDataset'>


Perform Train, Validation, and Testing Split

In [15]:
random_splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = random_splitter.train_valid_test_split(dataset)

TIMING: dataset construction took 0.011 s
Loading dataset from disk.
TIMING: dataset construction took 0.005 s
Loading dataset from disk.
TIMING: dataset construction took 0.005 s
Loading dataset from disk.


Transforming datasets

In [16]:
transformers = [
    dc.trans.NormalizationTransformer(transform_X=True, dataset=train_dataset),
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]

for dataset in [train_dataset, valid_dataset, test_dataset]:
  for transformer in transformers:
      dataset = transformer.transform(dataset)

TIMING: dataset construction took 0.010 s
Loading dataset from disk.
TIMING: dataset construction took 0.010 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.


Fit Random Forest with hyperparameter search

In [17]:
def rf_model_builder(model_params, model_dir):
  sklearn_model = RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)
params_dict = {
    "n_estimators": [10, 100],
    "max_features": ["auto", "sqrt", "log2", None],
}

metric = dc.metrics.Metric(dc.metrics.mean_absolute_error)
optimizer = dc.hyper.HyperparamOpt(rf_model_builder)
best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers,
    metric=metric)

Fitting model 1/8
hyperparameters: {'n_estimators': 10, 'max_features': 'auto'}
computed_metrics: [82238.16595651126]
Model 1/8, Metric mean_absolute_error, Validation set 0: 82238.165957
	best_validation_score so far: 82238.165957
Fitting model 2/8
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt'}
computed_metrics: [83245.07611794364]
Model 2/8, Metric mean_absolute_error, Validation set 1: 83245.076118
	best_validation_score so far: 83245.076118
Fitting model 3/8
hyperparameters: {'n_estimators': 10, 'max_features': 'log2'}
computed_metrics: [85169.93092738534]
Model 3/8, Metric mean_absolute_error, Validation set 2: 85169.930927
	best_validation_score so far: 85169.930927
Fitting model 4/8
hyperparameters: {'n_estimators': 10, 'max_features': None}
computed_metrics: [85954.32998255146]
Model 4/8, Metric mean_absolute_error, Validation set 3: 85954.329983
	best_validation_score so far: 85954.329983
Fitting model 5/8
hyperparameters: {'n_estimators': 100, 'max_features': '

In [19]:
def krr_model_builder(model_params, model_dir):
  sklearn_model = KernelRidge(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)

params_dict = {
    "kernel": ["laplacian"],
    "alpha": [0.0001],
    "gamma": [0.0001]
}

metric = dc.metrics.Metric(dc.metrics.mean_absolute_error)
optimizer = dc.hyper.HyperparamOpt(krr_model_builder)
best_krr, best_krr_hyperparams, all_krr_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers,
    metric=metric)

Fitting model 1/1
hyperparameters: {'kernel': 'laplacian', 'alpha': 0.0001, 'gamma': 0.0001}
computed_metrics: [93703.78574695572]
Model 1/1, Metric mean_absolute_error, Validation set 0: 93703.785747
	best_validation_score so far: 93703.785747
computed_metrics: [36729.00616224196]
Best hyperparameters: ('laplacian', 0.0001, 0.0001)
train_score: 36729.006162
validation_score: 93703.785747
