Skip to content

Commit

Permalink
Merge pull request #318 from shihchengli/load_features
Browse files Browse the repository at this point in the history
Allow providing both loaded features and a features generator
  • Loading branch information
oscarwumit committed Feb 3, 2023
2 parents 641fdf3 + 426c40a commit c371063
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 9 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ If you install from source, you can modify the code to load custom features as f

#### Molecule-Level RDKit 2D Features

As a starting point, we recommend using pre-normalized RDKit features by using the `--features_generator rdkit_2d_normalized --no_features_scaling` flags. In general, we recommend NOT using the `--no_features_scaling` flag (i.e. allow the code to automatically perform feature scaling), but in the case of `rdkit_2d_normalized`, those features have been pre-normalized and don't require further scaling.
As a starting point, we recommend using pre-normalized RDKit features by using the `--features_generator rdkit_2d_normalized --no_features_scaling` flags. In general, we recommend NOT using the `--no_features_scaling` flag (i.e. allow the code to automatically perform feature scaling), but in the case of `rdkit_2d_normalized`, those features have been pre-normalized and don't require further scaling. Avoid using `rdkit_2d_normalized` when you have also loaded custom molecule-level features that require additional scaling.

The full list of available features for `--features_generator` is as follows.

Expand Down
2 changes: 1 addition & 1 deletion chemprop/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ class SklearnTrainArgs(TrainArgs):
"""How to impute missing data (None means no imputation)."""


class SklearnPredictArgs(Tap):
class SklearnPredictArgs(CommonArgs):
""":class:`SklearnPredictArgs` contains arguments used for predicting with a trained scikit-learn model."""

test_path: str
Expand Down
8 changes: 4 additions & 4 deletions chemprop/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,6 @@ def __init__(self,
:param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features
"""
if features is not None and features_generator is not None:
raise ValueError('Cannot provide both loaded features and a features generator.')

self.smiles = smiles
self.targets = targets
self.row = row
Expand All @@ -113,7 +110,10 @@ def __init__(self,

# Generate additional features if given a generator
if self.features_generator is not None:
self.features = []
if self.features is None:
self.features = []
else:
self.features = list(self.features)

for fg in self.features_generator:
features_generator = get_features_generator(fg)
Expand Down
1 change: 1 addition & 0 deletions chemprop/sklearn_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def predict_sklearn(args: SklearnPredictArgs) -> None:
"""
print('Loading data')
data = get_data(path=args.test_path,
features_path=args.features_path,
smiles_columns=args.smiles_columns,
target_columns=[],
ignore_columns=[],
Expand Down
8 changes: 6 additions & 2 deletions chemprop/sklearn_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,10 @@ def run_sklearn(args: SklearnTrainArgs,

debug('Loading data')
data = get_data(path=args.data_path,
features_path=args.features_path,
smiles_columns=args.smiles_columns,
target_columns=args.target_columns)
target_columns=args.target_columns,
logger=logger)
args.task_names = get_task_names(path=args.data_path,
smiles_columns=args.smiles_columns,
target_columns=args.target_columns,
Expand All @@ -284,7 +286,8 @@ def run_sklearn(args: SklearnTrainArgs,
seed=args.seed,
sizes=args.split_sizes,
num_folds=args.num_folds,
args=args
args=args,
logger=logger
)

if args.save_smiles_splits:
Expand All @@ -296,6 +299,7 @@ def run_sklearn(args: SklearnTrainArgs,
train_data=train_data,
test_data=test_data,
smiles_columns=args.smiles_columns,
logger=logger
)

debug(f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}')
Expand Down
58 changes: 57 additions & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,34 @@ def fingerprint(self,
2.0438637,
['--features_generator', 'morgan']
),
(
'sklearn_random_forest_rdkit_features_path',
'random_forest',
'rmse',
0.691494,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
),
(
'sklearn_svm_rdkit_features_path',
'svm',
'rmse',
1.022634,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
),
(
'chemprop_rdkit_features_path',
'chemprop',
'rmse',
2.14015989,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
),
(
'chemprop_features_generator_features_path',
'chemprop',
'rmse',
1.59283050,
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
),
(
'chemprop_bounded_mse_loss',
'chemprop',
Expand Down Expand Up @@ -303,6 +324,13 @@ def test_train_single_task_regression(self,
0.466828424,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
),
(
'chemprop_features_generator_features_path',
'chemprop',
'auc',
0.499183589,
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
),
(
'chemprop_mcc_metric',
'chemprop',
Expand Down Expand Up @@ -370,12 +398,33 @@ def test_train_multi_task_classification(self,
['--features_generator', 'morgan'],
['--features_generator', 'morgan']
),
(
'sklearn_random_forest_rdkit_features_path',
'random_forest',
0.2954347,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
),
(
'sklearn_svm_rdkit_features_path',
'svm',
0.4112432,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
),
(
'chemprop_rdkit_features_path',
'chemprop',
1.51978455,
['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
),
(
'chemprop_features_generator_features_path',
'chemprop',
0.59545263,
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
)
])
def test_predict_single_task_regression(self,
Expand Down Expand Up @@ -458,9 +507,16 @@ def test_predict_individual_ensemble(self):
(
'chemprop_rdkit_features_path',
'chemprop',
0.3071592294,
0.307159229,
['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
['--features_path', os.path.join(TEST_DATA_DIR, 'classification_test.npz'), '--no_features_scaling']
),
(
'chemprop_features_generator_features_path',
'chemprop',
0.193924687,
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification_test.npz'), '--no_features_scaling']
)
])
def test_predict_multi_task_classification(self,
Expand Down

0 comments on commit c371063

Please sign in to comment.