# Quantum Machinery with gdb1k

In [1]:
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"

import os
import unittest

import numpy as np
import deepchem as dc
import numpy.random
from deepchem.utils.evaluate import Evaluator
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

  from ._conv import register_converters as _register_converters


Setting up model variables

In [2]:
featurizer = dc.feat.CoulombMatrixEig(23, remove_hydrogens=False)
tasks = ["atomization_energy"]
dataset_file = "../../datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"

Load featurized data

In [3]:
loader = dc.data.SDFLoader(
      tasks=["atomization_energy"], smiles_field="smiles",
      featurizer=featurizer,
      mol_field="mol")
dataset = loader.featurize(dataset_file)

Loading raw samples now.
shard_size: 8192
Reading structures from ../../datasets/gdb1k.sdf.
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
TIMING: featurizing shard 0 took 0.528 s
TIMING: dataset construction took 0.715 s
Loading dataset from disk.


Perform Train, Validation, and Testing Split

In [4]:
random_splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = random_splitter.train_valid_test_split(dataset)

TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.007 s
Loading dataset from disk.
TIMING: dataset construction took 0.007 s
Loading dataset from disk.


Transforming datasets

In [5]:
transformers = [
    dc.trans.NormalizationTransformer(transform_X=True, dataset=train_dataset),
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]

for dataset in [train_dataset, valid_dataset, test_dataset]:
  for transformer in transformers:
      dataset = transformer.transform(dataset)

TIMING: dataset construction took 0.015 s
Loading dataset from disk.
TIMING: dataset construction took 0.013 s
Loading dataset from disk.
TIMING: dataset construction took 0.004 s
Loading dataset from disk.
TIMING: dataset construction took 0.005 s
Loading dataset from disk.
TIMING: dataset construction took 0.006 s
Loading dataset from disk.
TIMING: dataset construction took 0.006 s
Loading dataset from disk.


Fit Random Forest with hyperparameter search

In [6]:
def rf_model_builder(model_params, model_dir):
  sklearn_model = RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)
params_dict = {
    "n_estimators": [10, 100],
    "max_features": ["auto", "sqrt", "log2", None],
}

metric = dc.metrics.Metric(dc.metrics.mean_absolute_error)
optimizer = dc.hyper.HyperparamOpt(rf_model_builder)
best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers,
    metric=metric)

Fitting model 1/8
hyperparameters: {'max_features': 'auto', 'n_estimators': 10}
computed_metrics: [86127.09575562611]
Model 1/8, Metric mean_absolute_error, Validation set 0: 86127.095756
	best_validation_score so far: 86127.095756
Fitting model 2/8
hyperparameters: {'max_features': 'auto', 'n_estimators': 100}
computed_metrics: [83531.16789342878]
Model 2/8, Metric mean_absolute_error, Validation set 1: 83531.167893
	best_validation_score so far: 86127.095756
Fitting model 3/8
hyperparameters: {'max_features': 'sqrt', 'n_estimators': 10}
computed_metrics: [83769.00864289024]
Model 3/8, Metric mean_absolute_error, Validation set 2: 83769.008643
	best_validation_score so far: 86127.095756
Fitting model 4/8
hyperparameters: {'max_features': 'sqrt', 'n_estimators': 100}
computed_metrics: [83750.48498142553]
Model 4/8, Metric mean_absolute_error, Validation set 3: 83750.484981
	best_validation_score so far: 86127.095756
Fitting model 5/8
hyperparameters: {'max_features': 'log2', 'n_estimat

In [7]:
def krr_model_builder(model_params, model_dir):
  sklearn_model = KernelRidge(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)

params_dict = {
    "kernel": ["laplacian"],
    "alpha": [0.0001],
    "gamma": [0.0001]
}

metric = dc.metrics.Metric(dc.metrics.mean_absolute_error)
optimizer = dc.hyper.HyperparamOpt(krr_model_builder)
best_krr, best_krr_hyperparams, all_krr_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers,
    metric=metric)

Fitting model 1/1
hyperparameters: {'alpha': 0.0001, 'kernel': 'laplacian', 'gamma': 0.0001}
computed_metrics: [92814.92810152171]
Model 1/1, Metric mean_absolute_error, Validation set 0: 92814.928102
	best_validation_score so far: 92814.928102
computed_metrics: [35135.73181665828]
Best hyperparameters: (0.0001, 'laplacian', 0.0001)
train_score: 35135.731817
validation_score: 92814.928102
