diff --git a/deepchem/hyper/__init__.py b/deepchem/hyper/__init__.py
index c383293143..29adcf5605 100644
--- a/deepchem/hyper/__init__.py
+++ b/deepchem/hyper/__init__.py
@@ -1,2 +1,3 @@
-from deepchem.hyper.grid_search import HyperparamOpt
+from deepchem.hyper.base_classes import HyperparamOpt
+from deepchem.hyper.grid_search import GridHyperparamOpt
 from deepchem.hyper.gaussian_process import GaussianProcessHyperparamOpt
diff --git a/deepchem/hyper/base_classes.py b/deepchem/hyper/base_classes.py
new file mode 100644
index 0000000000..5b9060b28c
--- /dev/null
+++ b/deepchem/hyper/base_classes.py
@@ -0,0 +1,119 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _convert_hyperparam_dict_to_filename(hyper_params):
+  """Helper function that converts a dictionary of hyperparameters to a string that can be a filename.
+
+  Parameters
+  ----------
+  hyper_params: dict
+    Maps string of hyperparameter name to int/float.
+
+  Returns
+  -------
+  filename: str
+    A filename of form "_key1_value1_value2_..._key2..."
+  """
+  filename = ""
+  keys = sorted(hyper_params.keys())
+  for key in keys:
+    filename += "_%s" % str(key)
+    value = hyper_params[key]
+    if isinstance(value, int):
+      filename += "_%s" % str(value)
+    elif isinstance(value, float):
+      filename += "_%.2f" % value
+    else:
+      filename += "%s" % str(value)
+  return filename
+
+
+class HyperparamOpt(object):
+  """Abstract superclass for hyperparameter search classes.
+
+  This class is an abstract base class for hyperparameter search
+  classes in DeepChem. Hyperparameter search is performed on
+  `dc.models.Model` classes. Each hyperparameter optimizer accepts a
+  `dc.models.Model` constructor upon construction. When the
+  `hyperparam_search` method is invoked, this constructor is used to
+  build many different concrete models which are trained on the
+  specified training set and evaluated on a given validation set.
+
+  Different subclasses of `HyperparamOpt` differ in the choice of
+  strategy for searching the hyperparameter evaluation space. This
+  class itself is an abstract superclass and should never be directly
+  instantiated.
+  """
+
+  def __init__(self, model_builder):
+    """Initialize Hyperparameter Optimizer.
+
+    Note this is an abstract constructor which should only be used by
+    subclasses.
+
+    Parameters
+    ----------
+    model_builder: constructor function
+      This parameter must be a constructor function which returns an
+      object which is an instance of `dc.models.Model`. This function
+      must accept the model hyperparameters as keyword arguments,
+      along with `model_dir`, a string specifying a path to a model
+      directory. See the example.
+    """
+    if self.__class__.__name__ == "HyperparamOpt":
+      raise ValueError(
+          "HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
+      )
+    self.model_builder = model_builder
+
+  def hyperparam_search(self,
+                        params_dict,
+                        train_dataset,
+                        valid_dataset,
+                        transformers,
+                        metric,
+                        use_max=True,
+                        logdir=None):
+    """Conduct Hyperparameter search.
+
+    This method defines the common API shared by all hyperparameter
+    optimization subclasses. Different classes will implement
+    different search methods but they must all follow this common API.
+
+    Parameters
+    ----------
+    params_dict: dict
+      Dictionary mapping strings to values. Note that the
+      precise semantics of `params_dict` will change depending on the
+      optimizer that you're using.
Depending on the type of + hyperparameter optimization, these values can be + ints/floats/strings/lists/etc. Read the documentation for the + concrete hyperparameter optimization subclass you're using to + learn more about what's expected. + train_dataset: `dc.data.Dataset` + dataset used for training + valid_dataset: `dc.data.Dataset` + dataset used for validation(optimization on valid scores) + output_transformers: list[dc.trans.Transformer] + Transformers for evaluation. This argument is needed since + `train_dataset` and `valid_dataset` may have been transformed + for learning and need the transform to be inverted before + the metric can be evaluated on a model. + use_max: bool, optional + If True, return the model with the highest score. Else return + model with the minimum score. + logdir: str, optional + The directory in which to store created models. If not set, will + use a temporary directory. + + Returns + ------- + `(best_model, best_hyperparams, all_scores)` where `best_model` is + an instance of `dc.models.Models`, `best_hyperparams` is a + dictionary of parameters, and `all_scores` is a dictionary mapping + string representations of hyperparameter sets to validation + scores. + """ + raise NotImplementedError diff --git a/deepchem/hyper/gaussian_process.py b/deepchem/hyper/gaussian_process.py index 251869fb37..9d3724cfc3 100644 --- a/deepchem/hyper/gaussian_process.py +++ b/deepchem/hyper/gaussian_process.py @@ -5,238 +5,287 @@ import numpy as np import tempfile import os -from deepchem.hyper.grid_search import HyperparamOpt +import deepchem +from deepchem.hyper.base_classes import HyperparamOpt from deepchem.utils.evaluate import Evaluator -from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression +from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename logger = logging.getLogger(__name__) +def compute_parameter_range(params_dict, search_range): + """Convenience Function to compute parameter search space. + + Parameters + ---------- + params_dict: dict + Dictionary mapping strings to Ints/Floats. An explicit list of + parameters is computed with `search_range`. The optimization range + computed is specified in the documentation for `search_range` + below. + search_range: int(float)/dict (default 4) + The `search_range` specifies the range of parameter values to + search for. If `search_range` is an int/float, it is used as the + global search range for parameters. This creates a search + problem on the following space: + + optimization on [initial value / search_range, + initial value * search_range] + + If `search_range` is a dict, it must contain the same keys as + for `params_dict`. In this case, `search_range` specifies a + per-parameter search range. This is useful in case some + parameters have a larger natural range than others. For a given + hyperparameter `hp` this would create the following search + range: + + optimization on hp on [initial value[hp] / search_range[hp], + initial value[hp] * search_range[hp]] + + Returns + ------- + param_range: dict + Dictionary mapping hyperparameter names to tuples. Each tuple is + of form `(value_type, value_range)` where `value_type` is a string + that is either "int" or "cont" and `value_range` is a list of two + elements of the form `[low, hi]`. This format is expected by + pyGPGO which `GaussianProcessHyperparamOpt` uses to perform + optimization. 
+ """ + # Range of optimization + param_range = {} + if isinstance(search_range, dict): + if sorted(params_dict.keys()) != sorted(search_range.keys()): + raise ValueError( + "If search_range is provided as a dictionary, it must have the same keys as params_dict." + ) + elif (not isinstance(search_range, int)) and (not isinstance( + search_range, float)): + raise ValueError("search_range must be a dict or int or float.") + for hp, value in params_dict.items(): + if isinstance(search_range, dict): + hp_search_range = search_range[hp] + else: + # We know from guard above that this is an int/float + hp_search_range = search_range + if isinstance(value, int): + value_range = [value // hp_search_range, value * hp_search_range] + param_range[hp] = ("int", value_range) + elif isinstance(value, float): + value_range = [value / hp_search_range, value * hp_search_range] + param_range[hp] = ("cont", value_range) + return param_range + + class GaussianProcessHyperparamOpt(HyperparamOpt): """ Gaussian Process Global Optimization(GPGO) + + This class uses Gaussian Process optimization to select + hyperparameters. Underneath the hood it uses pyGPGO to optimize + models. If you don't have pyGPGO installed, you won't be able to use + this class. + + Note that `params_dict` has a different semantics than for + `GridHyperparamOpt`. `param_dict[hp]` must be an int/float and is + used as the center of a search range. + + Example + ------- + This example shows the type of constructor function expected. + + >>> import sklearn + >>> import deepchem as dc + >>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(lambda **p: dc.models.GraphConvModel(n_tasks=1, **p)) + + Here's a more sophisticated example that shows how to optimize only + some parameters of a model. In this case, we have some parameters we + want to optimize, and others which we don't. To handle this type of + search, we create a `model_builder` which hard codes some arguments + (in this case, `n_tasks` and `n_features` which are properties of a + dataset and not hyperparameters to search over.) + + >>> def model_builder(**model_params): + ... n_layers = model_params['layers'] + ... layer_width = model_params['width'] + ... dropout = model_params['dropout'] + ... return dc.models.MultitaskClassifier( + ... n_tasks=5, + ... n_features=100, + ... layer_sizes=[layer_width]*n_layers, + ... dropouts=dropout + ... ) + >>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(model_builder) """ - def hyperparam_search( - self, - params_dict, - train_dataset, - valid_dataset, - output_transformers, - metric, - direction=True, - n_features=1024, - n_tasks=1, - max_iter=20, - search_range=4, - hp_invalid_list=[ - 'seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', - 'n_pair_feat', 'fit_transformers', 'min_child_weight', - 'max_delta_step', 'subsample', 'colsample_bylevel', - 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', - 'base_score' - ], - log_file='GPhypersearch.log'): - """Perform hyperparams search using a gaussian process assumption - - params_dict include single-valued parameters being optimized, - which should only contain int, float and list of int(float) - - parameters with names in hp_invalid_list will not be changed. 
- - For Molnet models, self.model_class is model name in string, - params_dict = dc.molnet.preset_hyper_parameters.hps[self.model_class] + def hyperparam_search(self, + params_dict, + train_dataset, + valid_dataset, + transformers, + metric, + use_max=True, + logdir=None, + max_iter=20, + search_range=4, + logfile=None): + """Perform hyperparameter search using a gaussian process. Parameters ---------- params_dict: dict - dict including parameters and their initial values - parameters not suitable for optimization can be added to hp_invalid_list - train_dataset: dc.data.Dataset struct + Maps hyperparameter names (strings) to possible parameter + values. The semantics of this list are different than for + `GridHyperparamOpt`. `params_dict[hp]` must map to an int/float, + which is used as the center of a search with radius + `search_range` since pyGPGO can only optimize numerical + hyperparameters. + train_dataset: `dc.data.Dataset` dataset used for training - valid_dataset: dc.data.Dataset struct + valid_dataset: `dc.data.Dataset` dataset used for validation(optimization on valid scores) - output_transformers: list of dc.trans.Transformer + transformers: list[dc.trans.Transformer] transformers for evaluation - metric: list of dc.metrics.Metric + metric: `dc.metrics.Metric` metric used for evaluation - direction: bool + use_max: bool, (default True) + Specifies whether to maximize or minimize `metric`. maximization(True) or minimization(False) - n_features: int - number of input features - n_tasks: int - number of tasks - max_iter: int + logdir: str, optional + The directory in which to store created models. If not set, will + use a temporary directory. + max_iter: int, (default 20) number of optimization trials - search_range: int(float) - optimization on [initial values / search_range, - initial values * search_range] - hp_invalid_list: list - names of parameters that should not be optimized - logfile: string - name of log file, hyperparameters and results for each trial will be recorded + search_range: int(float)/dict (default 4) + The `search_range` specifies the range of parameter values to + search for. If `search_range` is an int/float, it is used as the + global search range for parameters. This creates a search + problem on the following space: + + optimization on [initial value / search_range, + initial value * search_range] + + If `search_range` is a dict, it must contain the same keys as + for `params_dict`. In this case, `search_range` specifies a + per-parameter search range. This is useful in case some + parameters have a larger natural range than others. For a given + hyperparameter `hp` this would create the following search + range: + + optimization on hp on [initial value[hp] / search_range[hp], + initial value[hp] * search_range[hp]] + logfile: str + Name of logfile to write results to. If specified, this is must + be a valid file. If not specified, results of hyperparameter + search will be written to `logdir/.txt`. + Returns ------- - hyper_parameters: dict - params_dict with all optimized values - valid_performance_opt: float - best performance on valid dataset - + `(best_model, best_hyperparams, all_scores)` where `best_model` is + an instance of `dc.model.Models`, `best_hyperparams` is a + dictionary of parameters, and `all_scores` is a dictionary mapping + string representations of hyperparameter sets to validation + scores. """ + # Specify logfile + if logfile: + log_file = logfile + elif logdir is not None: + # Make logdir if it doesn't exist. 
+ if not os.path.exists(logdir): + os.makedirs(logdir, exist_ok=True) + log_file = os.path.join(logdir, "results.txt") + else: + log_file = None + + # setup range + param_range = compute_parameter_range(params_dict, search_range) + param_keys = list(param_range.keys()) + + # Stores all results + all_results = {} + # Store all model references so we don't have to reload + all_models = {} + # Stores all model locations + model_locations = {} + + # Demarcating internal function for readability + ######################## + def optimizing_function(**placeholders): + """Private Optimizing function - assert len(metric) == 1, 'Only use one metric' - hyper_parameters = params_dict - hp_list = list(hyper_parameters.keys()) - for hp in hp_invalid_list: - if hp in hp_list: - hp_list.remove(hp) - - hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list] - assert set(hp_list_class) <= set([list, int, float]) - # Float or int hyper parameters(ex. batch_size, learning_rate) - hp_list_single = [ - hp_list[i] for i in range(len(hp_list)) if not hp_list_class[i] is list - ] - # List of float or int hyper parameters(ex. layer_sizes) - hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]])) - for i in range(len(hp_list)) - if hp_list_class[i] is list] - - # Number of parameters - n_param = len(hp_list_single) - if len(hp_list_multiple) > 0: - n_param = n_param + sum([hp[1] for hp in hp_list_multiple]) - # Range of optimization - param_range = [] - for hp in hp_list_single: - if hyper_parameters[hp].__class__ is int: - param_range.append((('int'), [ - hyper_parameters[hp] // search_range, - hyper_parameters[hp] * search_range - ])) - else: - param_range.append((('cont'), [ - hyper_parameters[hp] / search_range, - hyper_parameters[hp] * search_range - ])) - for hp in hp_list_multiple: - if hyper_parameters[hp[0]][0].__class__ is int: - param_range.extend([(('int'), [ - hyper_parameters[hp[0]][i] // search_range, - hyper_parameters[hp[0]][i] * search_range - ]) for i in range(hp[1])]) - else: - param_range.extend([(('cont'), [ - hyper_parameters[hp[0]][i] / search_range, - hyper_parameters[hp[0]][i] * search_range - ]) for i in range(hp[1])]) - - # Dummy names - param_name = ['l' + format(i, '02d') for i in range(20)] - param = dict(zip(param_name[:n_param], param_range)) - - data_dir = os.environ['DEEPCHEM_DATA_DIR'] - log_file = os.path.join(data_dir, log_file) - - def f(l00=0, - l01=0, - l02=0, - l03=0, - l04=0, - l05=0, - l06=0, - l07=0, - l08=0, - l09=0, - l10=0, - l11=0, - l12=0, - l13=0, - l14=0, - l15=0, - l16=0, - l17=0, - l18=0, - l19=0): - """ Optimizing function Take in hyper parameter values and return valid set performances Parameters ---------- - l00~l19: int or float - placeholders for hyperparameters being optimized, - hyper_parameters dict is rebuilt based on input values of placeholders + placeholders: keyword arguments + Should be various hyperparameters as specified in `param_keys` above. 
Returns: -------- valid_scores: float valid set performances """ - args = locals() - # Input hyper parameters - i = 0 - for hp in hp_list_single: - hyper_parameters[hp] = float(args[param_name[i]]) - if param_range[i][0] == 'int': - hyper_parameters[hp] = int(hyper_parameters[hp]) - i = i + 1 - for hp in hp_list_multiple: - hyper_parameters[hp[0]] = [ - float(args[param_name[j]]) for j in range(i, i + hp[1]) - ] - if param_range[i][0] == 'int': - hyper_parameters[hp[0]] = map(int, hyper_parameters[hp[0]]) - i = i + hp[1] - - logger.info(hyper_parameters) - # Run benchmark - with open(log_file, 'a') as f: - # Record hyperparameters - f.write(str(hyper_parameters)) - f.write('\n') - if isinstance(self.model_class, str) or isinstance( - self.model_class, unicode): + hyper_parameters = {} + for hp in param_keys: + if param_range[hp][0] == "int": + # param values are always float in BO, so this line converts float to int + # see : https://github.com/josejimenezluna/pyGPGO/issues/10 + hyper_parameters[hp] = int(placeholders[hp]) + else: + hyper_parameters[hp] = float(placeholders[hp]) + logger.info("Running hyperparameter set: %s" % str(hyper_parameters)) + if log_file: + with open(log_file, 'w+') as f: + # Record hyperparameters + f.write("Parameters: %s" % str(hyper_parameters)) + f.write('\n') + + hp_str = _convert_hyperparam_dict_to_filename(hyper_parameters) + if logdir is not None: + filename = "model%s" % hp_str + model_dir = os.path.join(logdir, filename) + logger.info("model_dir is %s" % model_dir) try: - train_scores, valid_scores, _ = benchmark_classification( - train_dataset, - valid_dataset, - valid_dataset, ['task_placeholder'] * n_tasks, - output_transformers, - n_features, - metric, - self.model_class, - hyper_parameters=hyper_parameters) - except AssertionError: - train_scores, valid_scores, _ = benchmark_regression( - train_dataset, - valid_dataset, - valid_dataset, ['task_placeholder'] * n_tasks, - output_transformers, - n_features, - metric, - self.model_class, - hyper_parameters=hyper_parameters) - score = valid_scores[self.model_class][metric[0].name] + os.makedirs(model_dir) + except OSError: + if not os.path.isdir(model_dir): + logger.info("Error creating model_dir, using tempfile directory") + model_dir = tempfile.mkdtemp() else: model_dir = tempfile.mkdtemp() - model = self.model_class(hyper_parameters, model_dir) - model.fit(train_dataset, **hyper_parameters) + # Add it on to the information needed for the constructor + hyper_parameters["model_dir"] = model_dir + model = self.model_builder(**hyper_parameters) + model.fit(train_dataset) + try: model.save() - evaluator = Evaluator(model, valid_dataset, output_transformers) - multitask_scores = evaluator.compute_model_performance(metric) - score = multitask_scores[metric[0].name] + # Some models autosave + except NotImplementedError: + pass - with open(log_file, 'a') as f: - # Record performances - f.write(str(score)) - f.write('\n') + multitask_scores = model.evaluate(valid_dataset, [metric]) + score = multitask_scores[metric.name] + + if log_file: + with open(log_file, 'a') as f: + # Record performances + f.write("Score: %s" % str(score)) + f.write('\n') + # Store all results + all_results[hp_str] = score + # Store reference to model + all_models[hp_str] = model + model_locations[hp_str] = model_dir # GPGO maximize performance by default, set performance to its negative value for minimization - if direction: + if use_max: return score else: return -score + ######################## + import pyGPGO from pyGPGO.covfunc 
import matern32 from pyGPGO.acquisition import Acquisition @@ -245,63 +294,29 @@ def f(l00=0, cov = matern32() gp = GaussianProcess(cov) acq = Acquisition(mode='ExpectedImprovement') - gpgo = GPGO(gp, acq, f, param) + gpgo = GPGO(gp, acq, optimizing_function, param_range) logger.info("Max number of iteration: %i" % max_iter) gpgo.run(max_iter=max_iter) hp_opt, valid_performance_opt = gpgo.getResult() - # Readout best hyper parameters - i = 0 - for hp in hp_list_single: - hyper_parameters[hp] = float(hp_opt[param_name[i]]) - if param_range[i][0] == 'int': - hyper_parameters[hp] = int(hyper_parameters[hp]) - i = i + 1 - for hp in hp_list_multiple: - hyper_parameters[hp[0]] = [ - float(hp_opt[param_name[j]]) for j in range(i, i + hp[1]) - ] - if param_range[i][0] == 'int': - hyper_parameters[hp[0]] = map(int, hyper_parameters[hp[0]]) - i = i + hp[1] + hyper_parameters = {} + for hp in param_keys: + if param_range[hp][0] == "int": + hyper_parameters[hp] = int(hp_opt[hp]) + else: + hyper_parameters[hp] = float(hp_opt[hp]) + hp_str = _convert_hyperparam_dict_to_filename(hyper_parameters) + + # Let's fetch the model with the best parameters + best_model = all_models[hp_str] # Compare best model to default hyperparameters - with open(log_file, 'a') as f: - # Record hyperparameters - f.write(str(params_dict)) - f.write('\n') - if isinstance(self.model_class, str) or isinstance(self.model_class, - unicode): - try: - train_scores, valid_scores, _ = benchmark_classification( - train_dataset, - valid_dataset, - valid_dataset, ['task_placeholder'] * n_tasks, - output_transformers, - n_features, - metric, - self.model_class, - hyper_parameters=params_dict) - except AssertionError: - train_scores, valid_scores, _ = benchmark_regression( - train_dataset, - valid_dataset, - valid_dataset, ['task_placeholder'] * n_tasks, - output_transformers, - n_features, - metric, - self.model_class, - hyper_parameters=params_dict) - score = valid_scores[self.model_class][metric[0].name] + if log_file: with open(log_file, 'a') as f: - # Record performances - f.write(str(score)) + # Record hyperparameters + f.write("params_dict:") + f.write(str(params_dict)) f.write('\n') - if not direction: - score = -score - if score > valid_performance_opt: - # Optimized model is better, return hyperparameters - return params_dict, score # Return default hyperparameters - return hyper_parameters, valid_performance_opt + return best_model, hyper_parameters, all_results diff --git a/deepchem/hyper/grid_search.py b/deepchem/hyper/grid_search.py index 59a0d1ffab..151d944aec 100644 --- a/deepchem/hyper/grid_search.py +++ b/deepchem/hyper/grid_search.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- """ Contains basic hyperparameter optimizations. """ @@ -9,23 +7,54 @@ import tempfile import shutil import collections +import logging from functools import reduce from operator import mul from deepchem.utils.evaluate import Evaluator -from deepchem.utils.save import log +from deepchem.hyper.base_classes import HyperparamOpt +from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename +logger = logging.getLogger(__name__) -class HyperparamOpt(object): - """ - Provides simple hyperparameter search capabilities. + +class GridHyperparamOpt(HyperparamOpt): """ + Provides simple grid hyperparameter search capabilities. + + This class performs a grid hyperparameter search over the specified + hyperparameter space. 
This implementation is simple and simply does + a direct iteration over all possible hyperparameters and doesn't use + parallelization to speed up the search. + + Example + ------- + This example shows the type of constructor function expected. + + >>> import sklearn + >>> import deepchem as dc + >>> optimizer = dc.hyper.GridHyperparamOpt(lambda **p: dc.models.GraphConvModel(**p)) - def __init__(self, model_class, verbose=True): - self.model_class = model_class - self.verbose = verbose + Here's a more sophisticated example that shows how to optimize only + some parameters of a model. In this case, we have some parameters we + want to optimize, and others which we don't. To handle this type of + search, we create a `model_builder` which hard codes some arguments + (in this case, `n_tasks` and `n_features` which are properties of a + dataset and not hyperparameters to search over.) + + >>> def model_builder(**model_params): + ... n_layers = model_params['layers'] + ... layer_width = model_params['width'] + ... dropout = model_params['dropout'] + ... return dc.models.MultitaskClassifier( + ... n_tasks=5, + ... n_features=100, + ... layer_sizes=[layer_width]*n_layers, + ... dropouts=dropout + ... ) + >>> optimizer = dc.hyper.GridHyperparamOpt(model_builder) + + """ - # TODO(rbharath): This function is complicated and monolithic. Is there a nice - # way to refactor this? def hyperparam_search(self, params_dict, train_dataset, @@ -36,10 +65,36 @@ def hyperparam_search(self, logdir=None): """Perform hyperparams search according to params_dict. - Each key to hyperparams_dict is a model_param. The values should be a list - of potential values for that hyperparam. + Each key to hyperparams_dict is a model_param. The values should + be a list of potential values for that hyperparam. - TODO(rbharath): This shouldn't be stored in a temporary directory. + Parameters + ---------- + params_dict: Dict[str, list] + Maps hyperparameter names (strings) to lists of possible + parameter values. + train_dataset: `dc.data.Dataset` + dataset used for training + valid_dataset: `dc.data.Dataset` + dataset used for validation(optimization on valid scores) + output_transformers: list[dc.trans.Transformer] + transformers for evaluation + metric: dc.metrics.Metric + metric used for evaluation + use_max: bool, optional + If True, return the model with the highest score. Else return + model with the minimum score. + logdir: str, optional + The directory in which to store created models. If not set, will + use a temporary directory. + + Returns + ------- + `(best_model, best_hyperparams, all_scores)` where `best_model` is + an instance of `dc.model.Models`, `best_hyperparams` is a + dictionary of parameters, and `all_scores` is a dictionary mapping + string representations of hyperparameter sets to validation + scores. 
""" hyperparams = params_dict.keys() hyperparam_vals = params_dict.values() @@ -58,31 +113,38 @@ def hyperparam_search(self, for ind, hyperparameter_tuple in enumerate( itertools.product(*hyperparam_vals)): model_params = {} - log("Fitting model %d/%d" % (ind + 1, number_combinations), self.verbose) + logger.info("Fitting model %d/%d" % (ind + 1, number_combinations)) + # Construction dictionary mapping hyperparameter names to values + hyper_params = dict(zip(hyperparams, hyperparameter_tuple)) for hyperparam, hyperparam_val in zip(hyperparams, hyperparameter_tuple): model_params[hyperparam] = hyperparam_val - log("hyperparameters: %s" % str(model_params), self.verbose) + logger.info("hyperparameters: %s" % str(model_params)) if logdir is not None: model_dir = os.path.join(logdir, str(ind)) - log("model_dir is %s" % model_dir, self.verbose) + logger.info("model_dir is %s" % model_dir) try: os.makedirs(model_dir) except OSError: if not os.path.isdir(model_dir): - log("Error creating model_dir, using tempfile directory", - self.verbose) + logger.info("Error creating model_dir, using tempfile directory") model_dir = tempfile.mkdtemp() else: model_dir = tempfile.mkdtemp() - - model = self.model_class(model_params, model_dir) + model_params['model_dir'] = model_dir + model = self.model_builder(**model_params) model.fit(train_dataset) + try: + model.save() + # Some models autosave + except NotImplementedError: + pass evaluator = Evaluator(model, valid_dataset, output_transformers) multitask_scores = evaluator.compute_model_performance([metric]) valid_score = multitask_scores[metric.name] - all_scores[str(hyperparameter_tuple)] = valid_score + hp_str = _convert_hyperparam_dict_to_filename(hyper_params) + all_scores[hp_str] = valid_score if (use_max and valid_score >= best_validation_score) or ( not use_max and valid_score <= best_validation_score): @@ -95,21 +157,18 @@ def hyperparam_search(self, else: shutil.rmtree(model_dir) - log( - "Model %d/%d, Metric %s, Validation set %s: %f" % - (ind + 1, number_combinations, metric.name, ind, valid_score), - self.verbose) - log("\tbest_validation_score so far: %f" % best_validation_score, - self.verbose) + logger.info("Model %d/%d, Metric %s, Validation set %s: %f" % + (ind + 1, number_combinations, metric.name, ind, valid_score)) + logger.info("\tbest_validation_score so far: %f" % best_validation_score) if best_model is None: - log("No models trained correctly.", self.verbose) + logger.info("No models trained correctly.") # arbitrarily return last model best_model, best_hyperparams = model, hyperparameter_tuple return best_model, best_hyperparams, all_scores train_evaluator = Evaluator(best_model, train_dataset, output_transformers) multitask_scores = train_evaluator.compute_model_performance([metric]) train_score = multitask_scores[metric.name] - log("Best hyperparameters: %s" % str(best_hyperparams), self.verbose) - log("train_score: %f" % train_score, self.verbose) - log("validation_score: %f" % best_validation_score, self.verbose) + logger.info("Best hyperparameters: %s" % str(best_hyperparams)) + logger.info("train_score: %f" % train_score) + logger.info("validation_score: %f" % best_validation_score) return best_model, best_hyperparams, all_scores diff --git a/deepchem/hyper/tests/test_gaussian_hyperparam_opt.py b/deepchem/hyper/tests/test_gaussian_hyperparam_opt.py new file mode 100644 index 0000000000..f1390a01b2 --- /dev/null +++ b/deepchem/hyper/tests/test_gaussian_hyperparam_opt.py @@ -0,0 +1,180 @@ +""" +Tests for Gaussian Process 
Hyperparameter Optimization. + +These tests fails every so often. I think it's when the Gaussian +process optimizer doesn't find an optimal point. This is still a +valuable test suite so leaving it in despite the flakiness. +""" +import os +import numpy as np +import sklearn +import deepchem as dc +import unittest +import tempfile +from flaky import flaky + + +class TestGaussianHyperparamOpt(unittest.TestCase): + """ + Test Gaussian Hyperparameter Optimization. + """ + + def setUp(self): + """Set up common resources.""" + + def rf_model_builder(**model_params): + rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'} + model_dir = model_params['model_dir'] + sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params) + return dc.models.SklearnModel(sklearn_model, model_dir) + + self.rf_model_builder = rf_model_builder + self.train_dataset = dc.data.NumpyDataset( + X=np.random.rand(50, 5), y=np.random.rand(50, 1)) + self.valid_dataset = dc.data.NumpyDataset( + X=np.random.rand(20, 5), y=np.random.rand(20, 1)) + + def test_rf_example(self): + """Test a simple example of optimizing a RF model with a gaussian process.""" + + optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": 10} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + self.train_dataset, + self.valid_dataset, + transformers, + metric, + max_iter=2) + + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + assert valid_score["pearson_r2_score"] == max(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + def test_rf_example_min(self): + """Test a simple example of optimizing a RF model with a gaussian process looking for minimum score.""" + + optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": 10} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + self.train_dataset, + self.valid_dataset, + transformers, + metric, + use_max=False, + max_iter=2) + + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + assert valid_score["pearson_r2_score"] == min(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + def test_rf_with_logdir(self): + """Test that using a logdir can work correctly.""" + optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": 10} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + with tempfile.TemporaryDirectory() as tmpdirname: + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + self.train_dataset, + self.valid_dataset, + transformers, + metric, + logdir=tmpdirname, + max_iter=2) + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + assert valid_score["pearson_r2_score"] == max(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + @flaky + def test_multitask_example(self): + """Test a simple example of optimizing a multitask model with a gaussian process search.""" + # Generate dummy dataset + np.random.seed(123) + train_dataset = dc.data.NumpyDataset( + np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)), + np.arange(10)) + valid_dataset = dc.data.NumpyDataset( + np.random.rand(5, 3), 
np.zeros((5, 2)), np.ones((5, 2)), np.arange(5)) + + optimizer = dc.hyper.GaussianProcessHyperparamOpt( + lambda **p: dc.models.MultitaskRegressor(n_tasks=2, + n_features=3, dropouts=[0.], + weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)], + learning_rate=0.003, **p)) + + params_dict = {"batch_size": 10} + transformers = [] + metric = dc.metrics.Metric( + dc.metrics.mean_squared_error, task_averager=np.mean) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + train_dataset, + valid_dataset, + transformers, + metric, + max_iter=1, + use_max=False) + + valid_score = best_model.evaluate(valid_dataset, [metric]) + assert valid_score["mean-mean_squared_error"] == min(all_results.values()) + assert valid_score["mean-mean_squared_error"] > 0 + + @flaky + def test_multitask_example_different_search_range(self): + """Test a simple example of optimizing a multitask model with a gaussian process search with per-parameter search range.""" + # Generate dummy dataset + np.random.seed(123) + train_dataset = dc.data.NumpyDataset( + np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)), + np.arange(10)) + valid_dataset = dc.data.NumpyDataset( + np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5)) + + optimizer = dc.hyper.GaussianProcessHyperparamOpt( + lambda **p: dc.models.MultitaskRegressor( + n_tasks=2, + n_features=3, + dropouts=[0.], + weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)], + #learning_rate=0.003, **p)) + **p)) + + params_dict = {"learning_rate": 0.003, "batch_size": 10} + # These are per-example multiplier + search_range = {"learning_rate": 10, "batch_size": 4} + transformers = [] + metric = dc.metrics.Metric( + dc.metrics.mean_squared_error, task_averager=np.mean) + + with tempfile.TemporaryDirectory() as tmpdirname: + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + train_dataset, + valid_dataset, + transformers, + metric, + max_iter=2, + logdir=tmpdirname, + search_range=search_range, + use_max=False) + valid_score = best_model.evaluate(valid_dataset, [metric]) + # Test that 2 parameters were optimized + for hp_str in all_results.keys(): + # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example + assert "batch_size" in hp_str + assert "learning_rate" in hp_str + assert valid_score["mean-mean_squared_error"] == min(all_results.values()) + assert valid_score["mean-mean_squared_error"] > 0 diff --git a/deepchem/hyper/tests/test_grid_hyperparam_opt.py b/deepchem/hyper/tests/test_grid_hyperparam_opt.py new file mode 100644 index 0000000000..3f0c5899f5 --- /dev/null +++ b/deepchem/hyper/tests/test_grid_hyperparam_opt.py @@ -0,0 +1,166 @@ +""" +Tests for hyperparam optimization. +""" +import os +import unittest +import tempfile +import shutil +import numpy as np +import tensorflow as tf +import deepchem as dc +import sklearn +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor + + +class TestGridHyperparamOpt(unittest.TestCase): + """ + Test grid hyperparameter optimization API. 
+ """ + + def setUp(self): + """Set up common resources.""" + + def rf_model_builder(**model_params): + rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'} + model_dir = model_params['model_dir'] + sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params) + return dc.models.SklearnModel(sklearn_model, model_dir) + + self.rf_model_builder = rf_model_builder + self.train_dataset = dc.data.NumpyDataset( + X=np.random.rand(50, 5), y=np.random.rand(50, 1)) + self.valid_dataset = dc.data.NumpyDataset( + X=np.random.rand(20, 5), y=np.random.rand(20, 1)) + + def test_rf_hyperparam(self): + """Test of hyperparam_opt with singletask RF ECFP regression API.""" + optimizer = dc.hyper.GridHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": [10, 100]} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, self.train_dataset, self.valid_dataset, transformers, + metric) + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + + assert valid_score["pearson_r2_score"] == max(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + def test_rf_hyperparam_min(self): + """Test of hyperparam_opt with singletask RF ECFP regression API.""" + optimizer = dc.hyper.GridHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": [10, 100]} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + self.train_dataset, + self.valid_dataset, + transformers, + metric, + use_max=False) + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + + assert valid_score["pearson_r2_score"] == min(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + def test_rf_with_logdir(self): + """Test that using a logdir can work correctly.""" + optimizer = dc.hyper.GridHyperparamOpt(self.rf_model_builder) + params_dict = {"n_estimators": [10, 5]} + transformers = [] + metric = dc.metrics.Metric(dc.metrics.pearson_r2_score) + with tempfile.TemporaryDirectory() as tmpdirname: + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + self.train_dataset, + self.valid_dataset, + transformers, + metric, + logdir=tmpdirname) + valid_score = best_model.evaluate(self.valid_dataset, [metric], + transformers) + assert valid_score["pearson_r2_score"] == max(all_results.values()) + assert valid_score["pearson_r2_score"] > 0 + + def test_multitask_example(self): + """Test a simple example of optimizing a multitask model with a grid search.""" + # Generate dummy dataset + np.random.seed(123) + train_dataset = dc.data.NumpyDataset( + np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)), + np.arange(10)) + valid_dataset = dc.data.NumpyDataset( + np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5)) + + optimizer = dc.hyper.GridHyperparamOpt( + lambda **p: dc.models.MultitaskRegressor(n_tasks=2, + n_features=3, dropouts=[0.], + weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)], + learning_rate=0.003, **p)) + + params_dict = {"batch_size": [10, 20]} + transformers = [] + metric = dc.metrics.Metric( + dc.metrics.mean_squared_error, task_averager=np.mean) + + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + train_dataset, + valid_dataset, + transformers, + metric, + use_max=False) + + valid_score = 
best_model.evaluate(valid_dataset, [metric]) + assert valid_score["mean-mean_squared_error"] == min(all_results.values()) + assert valid_score["mean-mean_squared_error"] > 0 + + def test_multitask_example_multiple_params(self): + """Test a simple example of optimizing a multitask model with a grid search with multiple parameters to optimize.""" + # Generate dummy dataset + np.random.seed(123) + train_dataset = dc.data.NumpyDataset( + np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)), + np.arange(10)) + valid_dataset = dc.data.NumpyDataset( + np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5)) + + optimizer = dc.hyper.GridHyperparamOpt( + lambda **p: dc.models.MultitaskRegressor( + n_tasks=2, + n_features=3, + dropouts=[0.], + weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)], + #learning_rate=0.003, **p)) + **p)) + + params_dict = {"learning_rate": [0.003, 0.03], "batch_size": [10, 50]} + # These are per-example multiplier + transformers = [] + metric = dc.metrics.Metric( + dc.metrics.mean_squared_error, task_averager=np.mean) + + with tempfile.TemporaryDirectory() as tmpdirname: + best_model, best_hyperparams, all_results = optimizer.hyperparam_search( + params_dict, + train_dataset, + valid_dataset, + transformers, + metric, + logdir=tmpdirname, + use_max=False) + valid_score = best_model.evaluate(valid_dataset, [metric]) + # Test that 2 parameters were optimized + for hp_str in all_results.keys(): + # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example + assert "batch_size" in hp_str + assert "learning_rate" in hp_str + + assert valid_score["mean-mean_squared_error"] == min(all_results.values()) + assert valid_score["mean-mean_squared_error"] > 0 diff --git a/deepchem/hyper/tests/test_hyperparam_opt.py b/deepchem/hyper/tests/test_hyperparam_opt.py index 41b4e2ac76..92ce092148 100644 --- a/deepchem/hyper/tests/test_hyperparam_opt.py +++ b/deepchem/hyper/tests/test_hyperparam_opt.py @@ -1,154 +1,27 @@ """ -Integration tests for hyperparam optimization. +Tests for hyperparam optimization. """ -__author__ = "Bharath Ramsundar" -__copyright__ = "Copyright 2016, Stanford University" -__license__ = "MIT" -import os import unittest -import tempfile -import shutil -import numpy as np -import tensorflow as tf +import sklearn import deepchem as dc -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import RandomForestRegressor class TestHyperparamOpt(unittest.TestCase): """ - Test hyperparameter optimization API. + Test abstract superclass behavior. 
""" - def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self): - """Test of hyperparam_opt with singletask RF ECFP regression API.""" - featurizer = dc.feat.CircularFingerprint(size=1024) - tasks = ["log-solubility"] - current_dir = os.path.dirname(os.path.abspath(__file__)) - input_file = os.path.join(current_dir, "../../models/tests/example.csv") - loader = dc.data.CSVLoader( - tasks=tasks, smiles_field="smiles", featurizer=featurizer) - dataset = loader.featurize(input_file) - - splitter = dc.splits.ScaffoldSplitter() - train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( - dataset) - - transformers = [ - dc.trans.NormalizationTransformer( - transform_y=True, dataset=train_dataset) - ] - for dataset in [train_dataset, test_dataset]: - for transformer in transformers: - dataset = transformer.transform(dataset) - - params_dict = {"n_estimators": [10, 100]} - metric = dc.metrics.Metric(dc.metrics.r2_score) + def test_cant_be_initialized(self): + """Test HyperparamOpt can't be initialized.""" + initialized = True def rf_model_builder(model_params, model_dir): - sklearn_model = RandomForestRegressor(**model_params) - return dc.models.SklearnModel(sklearn_model, model_dir) - - optimizer = dc.hyper.HyperparamOpt(rf_model_builder) - best_model, best_hyperparams, all_results = optimizer.hyperparam_search( - params_dict, - train_dataset, - valid_dataset, - transformers, - metric, - logdir=None) - - def test_singletask_to_multitask_sklearn_hyperparam_opt(self): - """Test of hyperparam_opt with singletask_to_multitask.""" - tasks = [ - "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7", - "task8", "task9", "task10", "task11", "task12", "task13", "task14", - "task15", "task16" - ] - input_file = "multitask_example.csv" - - n_features = 10 - n_tasks = len(tasks) - # Define train dataset - n_train = 100 - X_train = np.random.rand(n_train, n_features) - y_train = np.random.randint(2, size=(n_train, n_tasks)) - w_train = np.ones_like(y_train) - ids_train = ["C"] * n_train - - train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train, w_train, - ids_train, tasks) - - # Define validation dataset - n_valid = 10 - X_valid = np.random.rand(n_valid, n_features) - y_valid = np.random.randint(2, size=(n_valid, n_tasks)) - w_valid = np.ones_like(y_valid) - ids_valid = ["C"] * n_valid - valid_dataset = dc.data.DiskDataset.from_numpy(X_valid, y_valid, w_valid, - ids_valid, tasks) - - transformers = [] - classification_metric = dc.metrics.Metric( - dc.metrics.matthews_corrcoef, np.mean, mode="classification") - params_dict = {"n_estimators": [1, 10]} - - def multitask_model_builder(model_params, model_dir): - - def model_builder(model_dir): - sklearn_model = RandomForestClassifier(**model_params) - return dc.models.SklearnModel(sklearn_model, model_dir) - - return dc.models.SingletaskToMultitask(tasks, model_builder, model_dir) - - optimizer = dc.hyper.HyperparamOpt(multitask_model_builder) - best_model, best_hyperparams, all_results = optimizer.hyperparam_search( - params_dict, - train_dataset, - valid_dataset, - transformers, - classification_metric, - logdir=None) - - def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self): - """Straightforward test of Tensorflow multitask deepchem classification API.""" - task_type = "classification" - - current_dir = os.path.dirname(os.path.abspath(__file__)) - input_file = os.path.join(current_dir, - "../../models/tests/multitask_example.csv") - tasks = [ - "task0", "task1", "task2", "task3", "task4", 
"task5", "task6", "task7", - "task8", "task9", "task10", "task11", "task12", "task13", "task14", - "task15", "task16" - ] - - n_features = 1024 - featurizer = dc.feat.CircularFingerprint(size=n_features) - - loader = dc.data.CSVLoader( - tasks=tasks, smiles_field="smiles", featurizer=featurizer) - dataset = loader.featurize(input_file) - - splitter = dc.splits.ScaffoldSplitter() - train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( - dataset) - - transformers = [] - metric = dc.metrics.Metric( - dc.metrics.roc_auc_score, np.mean, mode="classification") - params_dict = {"layer_sizes": [(10,), (100,)]} - - def model_builder(model_params, model_dir): - return dc.models.MultitaskClassifier( - len(tasks), n_features, model_dir=model_dir, **model_params) - - optimizer = dc.hyper.HyperparamOpt(model_builder) - best_model, best_hyperparams, all_results = optimizer.hyperparam_search( - params_dict, - train_dataset, - valid_dataset, - transformers, - metric, - logdir=None) + sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params) + return dc.model.SklearnModel(sklearn_model, model_dir) + + try: + opt = dc.hyper.HyperparamOpt(rf_model_builder) + except: + initialized = False + assert not initialized diff --git a/docs/hyper.rst b/docs/hyper.rst index e21eb120dc..bc5e2fdc67 100644 --- a/docs/hyper.rst +++ b/docs/hyper.rst @@ -8,6 +8,20 @@ learning algorithm used for the rest of learning and have to be set in an alternate fashion. The :code:`dc.hyper` module contains utilities for hyperparameter tuning. +DeepChem's hyperparameter optimzation algorithms are simple and run in +single-threaded fashion. They are not intended to be production grade +hyperparameter utilities, but rather useful first tools as you start +exploring your parameter space. As the needs of your application grow, +we recommend swapping to a more heavy duty hyperparameter +optimization library. + +Hyperparameter Optimization API +------------------------------- + +.. autoclass:: deepchem.hyper.HyperparamOpt + :members: + :special-members: + Grid Hyperparameter Optimization -------------------------------- @@ -15,13 +29,15 @@ This is the simplest form of hyperparameter optimization that simply involves iterating over a fixed grid of possible values for hyperaparameters. -.. autoclass:: deepchem.hyper.HyperparamOpt +.. autoclass:: deepchem.hyper.GridHyperparamOpt :members: + :special-members: Gaussian Process Hyperparameter Optimization -------------------------------------------- .. autoclass:: deepchem.hyper.GaussianProcessHyperparamOpt :members: + :special-members: diff --git a/examples/hyperparam_opt/README.md b/examples/hyperparam_opt/README.md new file mode 100644 index 0000000000..c3a5b6b654 --- /dev/null +++ b/examples/hyperparam_opt/README.md @@ -0,0 +1,4 @@ +# Hyperparameter Optimization + +In this folder we provide examples of performing hyperparameter optimization +with DeepChem. 
diff --git a/examples/hyperparam_opt/gaussian_hyperparam_opt.py b/examples/hyperparam_opt/gaussian_hyperparam_opt.py
new file mode 100644
index 0000000000..0c47b62126
--- /dev/null
+++ b/examples/hyperparam_opt/gaussian_hyperparam_opt.py
@@ -0,0 +1,25 @@
+import numpy as np
+np.random.seed(123)
+import tensorflow as tf
+tf.random.set_seed(123)
+import deepchem as dc
+import sklearn
+
+# Load delaney dataset
+delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
+    featurizer="GraphConv")
+train, valid, test = delaney_datasets
+
+# Fit models
+metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
+optimizer = dc.hyper.GaussianProcessHyperparamOpt(
+    lambda **p: dc.models.GraphConvModel(
+        n_tasks=len(delaney_tasks), mode="regression", **p))
+
+params_dict = {"dropout": 0.5}
+best_model, best_params, all_results = optimizer.hyperparam_search(
+    params_dict, train, valid, transformers, metric, max_iter=1, search_range=2)
+
+valid_score = best_model.evaluate(valid, [metric], transformers)
+print("valid_score")
+print(valid_score)
diff --git a/examples/hyperparam_opt/gaussian_hyperparam_opt_with_logdir.py b/examples/hyperparam_opt/gaussian_hyperparam_opt_with_logdir.py
new file mode 100644
index 0000000000..c9579dfe61
--- /dev/null
+++ b/examples/hyperparam_opt/gaussian_hyperparam_opt_with_logdir.py
@@ -0,0 +1,27 @@
+import numpy as np
+np.random.seed(123)
+import tensorflow as tf
+tf.random.set_seed(123)
+import deepchem as dc
+import sklearn
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# Load delaney dataset
+delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
+    featurizer="GraphConv")
+train, valid, test = delaney_datasets
+
+# Fit models
+metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
+optimizer = dc.hyper.GaussianProcessHyperparamOpt(
+    lambda **p: dc.models.GraphConvModel(
+        n_tasks=len(delaney_tasks), mode="regression", **p))
+
+params_dict = {"dropout": 0.5}
+best_model, best_params, all_results = optimizer.hyperparam_search(
+    params_dict, train, valid, transformers, metric, max_iter=2, search_range=2)
+
+valid_score = best_model.evaluate(valid, [metric], transformers)
+print("valid_score")
+print(valid_score)
diff --git a/examples/hyperparam_opt/grid_hyperparam_opt.py b/examples/hyperparam_opt/grid_hyperparam_opt.py
new file mode 100644
index 0000000000..c427c81b6c
--- /dev/null
+++ b/examples/hyperparam_opt/grid_hyperparam_opt.py
@@ -0,0 +1,25 @@
+import numpy as np
+np.random.seed(123)
+import tensorflow as tf
+tf.random.set_seed(123)
+import deepchem as dc
+import sklearn
+
+# Load delaney dataset
+delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
+    featurizer="GraphConv")
+train, valid, test = delaney_datasets
+
+# Fit models
+metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
+optimizer = dc.hyper.GridHyperparamOpt(
+    lambda **p: dc.models.GraphConvModel(
+        n_tasks=len(delaney_tasks), mode="regression", **p))
+
+params_dict = {"dropout": [0.1, 0.5]}
+best_model, best_params, all_results = optimizer.hyperparam_search(
+    params_dict, train, valid, transformers, metric)
+
+valid_score = best_model.evaluate(valid, [metric], transformers)
+print("valid_score")
+print(valid_score)
diff --git a/scripts/install_deepchem_conda.ps1 b/scripts/install_deepchem_conda.ps1
index 8c3e6d57b2..7ae678fc6b 100644
--- a/scripts/install_deepchem_conda.ps1
+++ b/scripts/install_deepchem_conda.ps1
@@ -35,5 +35,5 @@ conda install -y -q -c deepchem -c rdkit -c conda-forge -c omnia `
     pytest-cov `
     flaky

-
+pip install pyGPGO
 pip install -U matminer tensorflow==2.2 tensorflow-probability==0.10
diff --git a/scripts/install_deepchem_conda.sh b/scripts/install_deepchem_conda.sh
index 2cdcf95577..5861e09ed8 100644
--- a/scripts/install_deepchem_conda.sh
+++ b/scripts/install_deepchem_conda.sh
@@ -38,5 +38,5 @@ conda install -y -q -c deepchem -c rdkit -c conda-forge -c omnia \
    pytest \
    pytest-cov \
    flaky
-
+yes | pip install pyGPGO
 yes | pip install -U matminer tensorflow==2.2 tensorflow-probability==0.10
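For reference, below is a minimal end-to-end sketch of the builder-based API this patch introduces, distilled from the tests in deepchem/hyper/tests/test_grid_hyperparam_opt.py above. It is not part of the diff itself; the random toy datasets and the `rf_model_builder` helper are illustrative assumptions rather than code from the patch.

import numpy as np
import sklearn.ensemble
import deepchem as dc


def rf_model_builder(**model_params):
  # Under the new API the optimizer passes every hyperparameter, plus
  # `model_dir`, to the builder as keyword arguments.
  model_dir = model_params.pop('model_dir')
  sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)


# Toy regression data standing in for a real featurized dataset.
train = dc.data.NumpyDataset(X=np.random.rand(50, 5), y=np.random.rand(50, 1))
valid = dc.data.NumpyDataset(X=np.random.rand(20, 5), y=np.random.rand(20, 1))
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
best_model, best_hyperparams, all_scores = optimizer.hyperparam_search(
    {"n_estimators": [10, 100]}, train, valid, [], metric)
print(best_hyperparams)
print(all_scores)

Swapping `GridHyperparamOpt` for `GaussianProcessHyperparamOpt` (with a single center value such as `{"n_estimators": 10}` and a `max_iter` budget) exercises the Gaussian-process path through the same interface, as in test_gaussian_hyperparam_opt.py.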