Multitask Evaluation on Different Magnitude Targets (#290)
* Create multitask mean utils function

* Use multitask_mean in place of np mean and remove nanmeans

* Change testing dtypes to floats

* Remove nan filters from testing

* Add informative error for new metrics

* Message in log file about nan results
cjmcgill committed Jun 13, 2022
1 parent c3250c8 commit 0409440
Showing 6 changed files with 115 additions and 61 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -207,6 +207,7 @@ Metrics are used to evaluate the success of the model against the test set as th
* **Multiclass.** cross_entropy (default), accuracy, f1, mcc.
* **Spectra.** sid (default), wasserstein.

When a multitask model is used, the metric score used for evaluation at each epoch, or for choosing the best set of hyperparameters during hyperparameter search, is obtained by taking the mean of the metric scores across tasks. Some metrics scale with the magnitude of the targets (most regression metrics), so in those cases a geometric mean is used instead of an arithmetic mean to keep the combined score from being dominated by changes in the larger-magnitude task.
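
For intuition, here is a minimal sketch (illustrative numbers, not part of the repository) of why the geometric mean is used for scale-dependent metrics such as RMSE:

import numpy as np
from scipy.stats.mstats import gmean

# Hypothetical per-task RMSEs for a two-task model, e.g. a temperature task
# with errors around 50 and a pressure task with errors around 0.5.
rmse_epoch_a = np.array([50.0, 0.50])
rmse_epoch_b = np.array([45.0, 0.80])  # better on the large task, much worse on the small one

print(np.mean(rmse_epoch_a), np.mean(rmse_epoch_b))  # 25.25 vs 22.90 -> arithmetic mean favors epoch b
print(gmean(rmse_epoch_a), gmean(rmse_epoch_b))      # 5.00  vs 6.00  -> geometric mean favors epoch a

The arithmetic mean is driven almost entirely by the larger-magnitude task, while the geometric mean weights relative changes in both tasks equally.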
### Cross validation and ensembling

k-fold cross-validation can be run by specifying `--num_folds <k>`. The default is `--num_folds 1`. Each trained model will have different data splits. The reported test score will be the average of the metrics from each fold.
35 changes: 24 additions & 11 deletions chemprop/train/cross_validate.py
@@ -6,14 +6,15 @@
import sys
from typing import Callable, Dict, List, Tuple
import subprocess

import numpy as np
import pandas as pd

from .run_training import run_training
from chemprop.args import TrainArgs
from chemprop.constants import TEST_SCORES_FILE_NAME, TRAIN_LOGGER_NAME
from chemprop.data import get_data, get_task_names, MoleculeDataset, validate_dataset_type
from chemprop.utils import create_logger, makedirs, timeit
from chemprop.utils import create_logger, makedirs, timeit, multitask_mean
from chemprop.features import set_extra_atom_fdim, set_extra_bond_fdim, set_explicit_h, set_adding_hs, set_reaction, reset_featurization_parameters


@@ -60,15 +61,15 @@ def cross_validate(args: TrainArgs,
debug('Could not write the reproducibility section of the arguments to file, thus omitting this section.')
args.save(os.path.join(args.save_dir, 'args.json'), with_reproducibility=False)

#set explicit H option and reaction option
# set explicit H option and reaction option
reset_featurization_parameters(logger=logger)
set_explicit_h(args.explicit_h)
set_adding_hs(args.adding_h)
if args.reaction:
set_reaction(args.reaction, args.reaction_mode)
elif args.reaction_solvent:
set_reaction(True, args.reaction_mode)

# Get data
debug('Loading data')
data = get_data(
@@ -127,24 +128,36 @@ def cross_validate(args: TrainArgs,
info(f'{args.num_folds}-fold cross validation')

# Report scores for each fold
contains_nan_scores = False
for fold_num in range(args.num_folds):
for metric, scores in all_scores.items():
info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {np.nanmean(scores[fold_num]):.6f}')
info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {multitask_mean(scores[fold_num], metric):.6f}')

if args.show_individual_scores:
for task_name, score in zip(args.task_names, scores[fold_num]):
info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')
if np.isnan(score):
contains_nan_scores = True

# Report scores across folds
for metric, scores in all_scores.items():
avg_scores = np.nanmean(scores, axis=1) # average score for each model across tasks
mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
avg_scores = multitask_mean(scores, axis=1, metric=metric) # average score for each model across tasks
mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)
info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

if args.show_individual_scores:
for task_num, task_name in enumerate(args.task_names):
info(f'\tOverall test {task_name} {metric} = '
f'{np.nanmean(scores[:, task_num]):.6f} +/- {np.nanstd(scores[:, task_num]):.6f}')
f'{np.mean(scores[:, task_num]):.6f} +/- {np.std(scores[:, task_num]):.6f}')

if contains_nan_scores:
info("The metric scores observed for some fold test splits contain 'nan' values. \
This can occur when the test set does not meet the requirements \
for a particular metric, such as having no valid instances of one \
task in the test set or not having positive examples for some classification metrics. \
Before v1.5.1, the default behavior was to ignore nan values in individual folds or tasks \
and still return an overall average for the remaining folds or tasks. The behavior now \
is to include them in the average, converting overall average metrics to 'nan' as well.")

# Save scores
with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
@@ -160,21 +173,21 @@ def cross_validate(args: TrainArgs,
row = ['spectra']
for metric, scores in all_scores.items():
task_scores = scores[:,0]
mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
mean, std = np.mean(task_scores), np.std(task_scores)
row += [mean, std] + task_scores.tolist()
writer.writerow(row)
else: # all other data types, separate scores by task
for task_num, task_name in enumerate(args.task_names):
row = [task_name]
for metric, scores in all_scores.items():
task_scores = scores[:, task_num]
mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
mean, std = np.mean(task_scores), np.std(task_scores)
row += [mean, std] + task_scores.tolist()
writer.writerow(row)

# Determine mean and std score of main metric
avg_scores = np.nanmean(all_scores[args.metric], axis=1)
mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
avg_scores = multitask_mean(all_scores[args.metric], metric=args.metric, axis=1)
mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)

# Optionally merge and save test preds
if args.save_preds:
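The log message added above explains that nan task scores are now carried into the overall average rather than silently dropped. A minimal sketch of the difference, using hypothetical fold scores:

import numpy as np

fold_scores = np.array([0.81, np.nan, 0.78])  # e.g. one fold's test split had no positives for a task

print(np.nanmean(fold_scores))  # pre-1.5.1 behavior: 0.795, the nan score is silently ignored
print(np.mean(fold_scores))     # current behavior: nan, so the problem is surfaced to the user
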
22 changes: 11 additions & 11 deletions chemprop/train/run_training.py
@@ -21,7 +21,7 @@
from chemprop.models import MoleculeModel
from chemprop.nn_utils import param_count, param_count_all
from chemprop.utils import build_optimizer, build_lr_scheduler, load_checkpoint, makedirs, \
save_checkpoint, save_smiles_splits, load_frzn_model
save_checkpoint, save_smiles_splits, load_frzn_model, multitask_mean


def run_training(args: TrainArgs,
@@ -297,10 +297,10 @@ def run_training(args: TrainArgs,
)

for metric, scores in val_scores.items():
# Average validation score
avg_val_score = np.nanmean(scores)
debug(f'Validation {metric} = {avg_val_score:.6f}')
writer.add_scalar(f'validation_{metric}', avg_val_score, n_iter)
# Average validation score
mean_val_score = multitask_mean(scores, metric=metric)
debug(f'Validation {metric} = {mean_val_score:.6f}')
writer.add_scalar(f'validation_{metric}', mean_val_score, n_iter)

if args.show_individual_scores:
# Individual validation scores
@@ -309,10 +309,10 @@ def run_training(args: TrainArgs,
writer.add_scalar(f'validation_{task_name}_{metric}', val_score, n_iter)

# Save model checkpoint if improved validation score
avg_val_score = np.nanmean(val_scores[args.metric])
if args.minimize_score and avg_val_score < best_score or \
not args.minimize_score and avg_val_score > best_score:
best_score, best_epoch = avg_val_score, epoch
mean_val_score = multitask_mean(val_scores[args.metric], metric=args.metric)
if args.minimize_score and mean_val_score < best_score or \
not args.minimize_score and mean_val_score > best_score:
best_score, best_epoch = mean_val_score, epoch
save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler,
atom_descriptor_scaler, bond_feature_scaler, args)

@@ -376,8 +376,8 @@ def run_training(args: TrainArgs,

for metric, scores in ensemble_scores.items():
# Average ensemble score
avg_ensemble_test_score = np.nanmean(scores)
info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')
mean_ensemble_test_score = multitask_mean(scores, metric=metric)
info(f'Ensemble test {metric} = {mean_ensemble_test_score:.6f}')

# Individual ensemble scores
if args.show_individual_scores:
39 changes: 39 additions & 0 deletions chemprop/utils.py
@@ -12,9 +12,11 @@

import torch
import torch.nn as nn
import numpy as np
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from tqdm import tqdm
from scipy.stats.mstats import gmean

from chemprop.args import PredictArgs, TrainArgs, FingerprintArgs
from chemprop.data import StandardScaler, MoleculeDataset, preprocess_smiles_columns, get_task_names
@@ -768,3 +770,40 @@ def update_prediction_args(
"(with either --features_generator or --features_path "
"and using --no_features_scaling if applicable)."
)


def multitask_mean(
scores: np.ndarray,
metric: str,
axis: int = None,
) -> float:
"""
A function for combining the metric scores across different
model tasks into a single score. When the metric being used
is one that varies with the magnitude of the task (such as RMSE),
a geometric mean is used, otherwise a more typical arithmetic mean
is used. This prevents a task with a larger magnitude from dominating
over one with a smaller magnitude (e.g., temperature and pressure).
:param scores: The scores from different tasks for a single metric.
:param metric: The metric used to generate the scores.
:axis: The axis along which to take the mean.
:return: The combined score across the tasks.
"""
scale_dependent_metrics = ["rmse", "mae", "mse", "bounded_rmse", "bounded_mae", "bounded_mse"]
nonscale_dependent_metrics = [
"auc", "prc-auc", "r2", "accuracy", "cross_entropy",
"binary_cross_entropy", "sid", "wasserstein", "f1", "mcc",
]

if metric in scale_dependent_metrics:
return gmean(scores, axis=axis)
elif metric in nonscale_dependent_metrics:
return np.mean(scores, axis=axis)
else:
raise NotImplementedError(
f"The metric used, {metric}, has not been added to the list of\
metrics that are scale-dependent or not scale-dependent.\
This metric must be added to the appropriate list in the multitask_mean\
function in `chemprop/utils.py` in order to be used."
)
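
A brief usage sketch of the new helper, with hypothetical scores shaped (num_folds, num_tasks) as in cross_validate:

import numpy as np
from chemprop.utils import multitask_mean

rmse_scores = np.array([[12.0, 0.30, 4.0],
                        [15.0, 0.25, 5.0]])  # hypothetical 2 folds x 3 tasks

# Scale-dependent metric -> geometric mean across tasks (axis=1), one value per fold.
per_fold = multitask_mean(rmse_scores, metric='rmse', axis=1)
mean_score, std_score = np.mean(per_fold), np.std(per_fold)  # then averaged across folds, as above

# Scale-independent metric -> ordinary arithmetic mean.
auc_scores = np.array([0.81, 0.74, 0.92])
overall_auc = multitask_mean(auc_scores, metric='auc')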
53 changes: 27 additions & 26 deletions tests/test_integration.py
@@ -270,32 +270,33 @@ def test_train_single_task_regression(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']
test_scores = np.array(test_scores_data[f'Mean {metric}'])
self.assertEqual(len(test_scores), 1)

mean_score = test_scores.mean()
mean_score = np.mean(test_scores)
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA*expected_score)

@parameterized.expand([
(
'chemprop',
'chemprop',
'auc',
0.63495735,
0.4908104,
['--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
),
(
'chemprop_morgan_features_generator',
'chemprop',
'auc',
0.5827042,
['--features_generator', 'morgan']
0.4733686,
['--features_generator', 'morgan', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
),
(
'chemprop_rdkit_features_path',
'chemprop',
'auc',
0.63613397,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling']
0.4573833,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
),
(
'chemprop_mcc_metric',
@@ -337,9 +338,8 @@ def test_train_multi_task_classification(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']

mean_score = test_scores.mean()
test_scores = np.array(test_scores_data[f'Mean {metric}'])
mean_score = np.mean(np.array(test_scores))
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA*expected_score)

@parameterized.expand([
@@ -408,7 +408,7 @@ def test_predict_single_task_regression(self,

pred, true = pred.drop(columns=['smiles']), true.drop(columns=['smiles'])
pred, true = pred.to_numpy(), true.to_numpy()
mse = float(np.nanmean((pred - true) ** 2))
mse = float(np.mean((pred - true) ** 2))
self.assertAlmostEqual(mse, expected_score, delta=DELTA*expected_score)

def test_predict_individual_ensemble(self):
@@ -440,20 +440,21 @@ def test_predict_individual_ensemble(self):
(
'chemprop',
'chemprop',
0.07072509
0.1804146,
['--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
),
(
'chemprop_morgan_features_generator',
'chemprop',
0.07685293,
['--features_generator', 'morgan'],
0.26773606,
['--features_generator', 'morgan', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
['--features_generator', 'morgan']
),
(
'chemprop_rdkit_features_path',
'chemprop',
0.072059973,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling'],
0.0778546,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
['--features_path', os.path.join(TEST_DATA_DIR, 'classification_test.npz'), '--no_features_scaling']
)
])
@@ -662,10 +663,10 @@ def test_train_spectra(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']
test_scores = np.array(test_scores_data[f'Mean {metric}'])
self.assertEqual(len(test_scores), 1)

mean_score = test_scores.mean()
mean_score = np.mean(test_scores)
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA*expected_score)

@parameterized.expand([
@@ -789,10 +790,10 @@ def test_train_single_task_regression_reaction(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']
test_scores = np.array(test_scores_data[f'Mean {metric}'])
self.assertEqual(len(test_scores), 1)

mean_score = test_scores.mean()
mean_score = np.mean(test_scores)
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA*expected_score)

@parameterized.expand([
@@ -822,22 +823,22 @@ def test_single_task_multimolecule_classification(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']
test_scores = np.array(test_scores_data[f'Mean {metric}'])

mean_score = test_scores.mean()
mean_score = np.mean(test_scores)
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA * expected_score)

@parameterized.expand([
(
'chemprop',
'chemprop',
3473.79893,
2805.4368779,
['--fingerprint_type', 'MPN'],
),
(
'chemprop',
'chemprop',
3504.50003,
2689.55376,
['--fingerprint_type', 'last_FFN'],
)
])
@@ -992,10 +993,10 @@ def test_train_single_task_regression_reaction_solvent(self,

# Check results
test_scores_data = pd.read_csv(os.path.join(save_dir, TEST_SCORES_FILE_NAME))
test_scores = test_scores_data[f'Mean {metric}']
test_scores = np.array(test_scores_data[f'Mean {metric}'])
self.assertEqual(len(test_scores), 1)

mean_score = test_scores.mean()
mean_score = np.mean(test_scores)
self.assertAlmostEqual(mean_score, expected_score, delta=DELTA*expected_score)

@parameterized.expand([(
