## Setup

In [42]:
import deepchem as dc
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import multiprocessing

## Load, split, and featurize the dataset

First, load the entire MUV dataset. This may take a few minutes.

In [10]:
tasks, (dataset,), transformers = dc.molnet.load_muv(featurizer=dc.feat.RawFeaturizer(smiles=True), splitter=None)
n_tasks = len(dataset.tasks)
n_folds = 5

Then, split the dataset into training and test datasets, and further split the training dataset into 5 folds.

In [12]:
splitter = dc.splits.RandomStratifiedSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset, seed=826)
train_datasets = splitter.k_fold_split(train_dataset, n_folds, seed=826)

Finally, featurize the dataset with circular fingerprint and graph convolution featurizers. This may take several minutes.

In [13]:
def featurize(featurizer, train_datasets, test_dataset):
    transformer = dc.trans.FeaturizationTransformer(featurizer=featurizer)
    
    test_dataset_featurized = transformer.transform(test_dataset)
    train_datasets_featurized = [
        (transformer.transform(train_dataset), transformer.transform(cv_dataset)) 
        for train_dataset, cv_dataset in train_datasets
    ]
    
    return train_datasets_featurized, test_dataset_featurized


ecfp_dataset = featurize(dc.feat.CircularFingerprint(), train_datasets, test_dataset)
graphconv_dataset = featurize(dc.feat.ConvMolFeaturizer(), train_datasets, test_dataset)
weave_dataset = featurize(dc.feat.WeaveFeaturizer(), train_datasets, test_dataset)

## Hyperparameter tuning

In [51]:
def extract_task(dataset, task):
    return dc.data.NumpyDataset(dataset.X, dataset.y[:, task], dataset.w[:, task], dataset.ids)


def evaluate(dataset, model_generator, model_args):
    scores = np.zeros((n_folds, n_tasks))
    metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    
    train_datasets, test_dataset = dataset
    
    for fold in range(n_folds):
        train_dataset, cv_dataset = train_datasets[fold]
        
        for task in range(n_tasks):
            train_dataset_task = extract_task(train_dataset, task)
            cv_dataset_task = extract_task(cv_dataset, task)
            
            m = model_generator(**model_args)
            m.fit(train_dataset_task)
            scores[fold, task] = m.evaluate(cv_dataset_task, [metric])['roc_auc_score']
    
    return scores

In [15]:
lr_scores = evaluate(ecfp_dataset, dc.models.SklearnModel, LogisticRegression())

In [53]:
rf_n_estimators = [1000, 4000, 16000]
rf_max_features = [None, 'sqrt', 'log2']
rf_class_weight = [None, 'balanced', 'balanced_subsample']

rf_model_args = [
    {
        'model': RandomForestClassifier(n_estimators=n_estimators, 
                                        max_features=max_features, 
                                        class_weight=class_weight)
    }
    for n_estimators in rf_n_estimators
    for max_features in rf_max_features
    for class_weight in rf_class_weight
]

rf_scores = [(model_args, evaluate(ecfp_dataset, dc.models.SklearnModel, model_args))
             for model_args in rf_model_args]

Process ForkPoolWorker-559:
Process ForkPoolWorker-552:
Process ForkPoolWorker-553:
Process ForkPoolWorker-549:
Process ForkPoolWorker-551:
Process ForkPoolWorker-556:
Process ForkPoolWorker-554:
Process ForkPoolWorker-548:
Process ForkPoolWorker-555:
Process ForkPoolWorker-557:
Process ForkPoolWorker-560:
Process ForkPoolWorker-550:
Process ForkPoolWorker-558:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    se

KeyboardInterrupt: 

KeyboardInterrupt
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self

KeyboardInterrupt
Process ForkPoolWorker-542:
Process ForkPoolWorker-532:
Process ForkPoolWorker-545:
Process ForkPoolWorker-525:
Process ForkPoolWorker-534:
Process ForkPoolWorker-523:
Process ForkPoolWorker-531:
Process ForkPoolWorker-521:
Process ForkPoolWorker-528:
Process ForkPoolWorker-539:
Process ForkPoolWorker-547:
Process ForkPoolWorker-535:
Process ForkPoolWorker-537:
Process ForkPoolWorker-533:
Process ForkPoolWorker-530:
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-536:
Process ForkPoolWorker-538:
Process ForkPoolWorker-544:
Process ForkPoolWorker-543:
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessin

  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/tmp/ipykernel_34644/574081408.py", line 2, in extract_task
    return dc.data.NumpyDataset(dataset.X, dataset.y[:, task], dataset.w[:, task], dataset.ids)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/tmp/ipykernel_34644/574081408.py", line 15, in evaluate
    train_dataset_task = extract_task(train_dataset, task)
  File "/tmp/ipykernel_34644/574081408.py", line 2, in extract_task
    return dc.data.NumpyDataset(dataset.X, dataset.y[:, task], dataset.w[:, task], dataset.ids)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/pytho

  File "/tmp/ipykernel_34644/574081408.py", line 2, in extract_task
    return dc.data.NumpyDataset(dataset.X, dataset.y[:, task], dataset.w[:, task], dataset.ids)
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 1528, in <genexpr>
    return (self.get_shard(i) for i in range(self.get_number_shards()))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 2102, in get_shard
    X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run

KeyboardInterrupt
  File "/tmp/ipykernel_34644/574081408.py", line 15, in evaluate
    train_dataset_task = extract_task(train_dataset, task)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/tmp/ipykernel_34644/574081408.py", line 15, in evaluate
    train_dataset_task = extract_task(train_dataset, task)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/numpy/lib/npyio.py", line 441, in load
    pickle_kwargs=pickle_kwargs)
  File "/tmp/ipykernel_34644/574081408.py", line 2, in extract_task
    return dc.data.NumpyDatase

  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 2102, in get_shard
    X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 2102, in get_shard
    X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
KeyboardInterrupt
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py

  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 1528, in <genexpr>
    return (self.get_shard(i) for i in range(self.get_number_shards()))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/site-packages/deepchem/data/datasets.py", line 2102, in get_shard
    X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))
KeyboardInterrupt
Traceback (most recent call last):
Process ForkPoolWorker-526:
Process ForkPoolWorker-522:
Process ForkPoolWorker-524:
Process ForkPoolWorker-540:
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    resul

KeyboardInterrupt
Traceback (most recent call last):
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File

In [55]:
gc_model_args = [
    {
        'n_tasks': 1,
        'graph_conv_layers': layers[:-1],
        'dense_layer_size': layers[-1],
        'dropout': dropout
    }
    for layers in [[64, 64, 128], [128, 128, 256], [256, 256, 512]]
    for dropout in [0.0, 0.1, 0.2]
]

gc_scores = [(model_args, evaluate(graphconv_dataset, dc.models.GraphConvModel, model_args))
             for model_args in gc_model_args]

  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
Process ForkPoolWorker-1508:
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  F

_pickle.UnpicklingError: invalid load key, '?'.
Process ForkPoolWorker-1619:
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/queues.py", line 354, in get
    return _ForkingPickler.loads(res)
_pickle.UnpicklingError: invalid load key, '?'.
Process ForkPoolWorker-1709:
Traceback (most recent call last):
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/users/gaffneyk/miniconda3/envs/bmi826-project/lib

KeyboardInterrupt: 