Merge pull request #318 from shihchengli/load_features

Allow providing both loaded features and a features generator
chemprop · Feb 3, 2023 · c371063 · c371063
2 parents 641fdf3 + 426c40a
commit c371063
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -238,7 +238,7 @@ If you install from source, you can modify the code to load custom features as f
 
 #### Molecule-Level RDKit 2D Features
 
-As a starting point, we recommend using pre-normalized RDKit features by using the `--features_generator rdkit_2d_normalized --no_features_scaling` flags. In general, we recommend NOT using the `--no_features_scaling` flag (i.e. allow the code to automatically perform feature scaling), but in the case of `rdkit_2d_normalized`, those features have been pre-normalized and don't require further scaling.
+As a starting point, we recommend using pre-normalized RDKit features by using the `--features_generator rdkit_2d_normalized --no_features_scaling` flags. In general, we recommend NOT using the `--no_features_scaling` flag (i.e. allow the code to automatically perform feature scaling), but in the case of `rdkit_2d_normalized`, those features have been pre-normalized and don't require further scaling. Avoid the `rdkit_2d_normalized` generator when you have also loaded custom molecule-level features that require additional scaling.
 
 The full list of available features for `--features_generator` is as follows. 
 

diff --git a/chemprop/args.py b/chemprop/args.py
@@ -1007,7 +1007,7 @@ class SklearnTrainArgs(TrainArgs):
     """How to impute missing data (None means no imputation)."""
 
 
-class SklearnPredictArgs(Tap):
+class SklearnPredictArgs(CommonArgs):
     """:class:`SklearnPredictArgs` contains arguments used for predicting with a trained scikit-learn model."""
 
     test_path: str

diff --git a/chemprop/data/data.py b/chemprop/data/data.py
@@ -85,9 +85,6 @@ def __init__(self,
         :param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features
 
         """
-        if features is not None and features_generator is not None:
-            raise ValueError('Cannot provide both loaded features and a features generator.')
-
         self.smiles = smiles
         self.targets = targets
         self.row = row
@@ -113,7 +110,10 @@ def __init__(self,
 
         # Generate additional features if given a generator
         if self.features_generator is not None:
-            self.features = []
+            if self.features is None:
+                self.features = []
+            else:
+                self.features = list(self.features)
 
             for fg in self.features_generator:
                 features_generator = get_features_generator(fg)

diff --git a/chemprop/sklearn_predict.py b/chemprop/sklearn_predict.py
@@ -21,6 +21,7 @@ def predict_sklearn(args: SklearnPredictArgs) -> None:
     """
     print('Loading data')
     data = get_data(path=args.test_path,
+                    features_path=args.features_path,
                     smiles_columns=args.smiles_columns,
                     target_columns=[],
                     ignore_columns=[],

diff --git a/chemprop/sklearn_train.py b/chemprop/sklearn_train.py
@@ -266,8 +266,10 @@ def run_sklearn(args: SklearnTrainArgs,
 
     debug('Loading data')
     data = get_data(path=args.data_path,
+                    features_path=args.features_path,
                     smiles_columns=args.smiles_columns,
-                    target_columns=args.target_columns)
+                    target_columns=args.target_columns,
+                    logger=logger)
     args.task_names = get_task_names(path=args.data_path,
                                      smiles_columns=args.smiles_columns,
                                      target_columns=args.target_columns,
@@ -284,7 +286,8 @@ def run_sklearn(args: SklearnTrainArgs,
         seed=args.seed,
         sizes=args.split_sizes,
         num_folds=args.num_folds,
-        args=args
+        args=args,
+        logger=logger
     )
 
     if args.save_smiles_splits:
@@ -296,6 +299,7 @@ def run_sklearn(args: SklearnTrainArgs,
             train_data=train_data,
             test_data=test_data,
             smiles_columns=args.smiles_columns,
+            logger=logger
         )
 
     debug(f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}')

diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -239,13 +239,34 @@ def fingerprint(self,
                 2.0438637,
                 ['--features_generator', 'morgan']
         ),
+        (
+                'sklearn_random_forest_rdkit_features_path',
+                'random_forest',
+                'rmse',
+                0.691494,
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
+        ),
+        (
+                'sklearn_svm_rdkit_features_path',
+                'svm',
+                'rmse',
+                1.022634,
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
+        ),
         (
                 'chemprop_rdkit_features_path',
                 'chemprop',
                 'rmse',
                 2.14015989,
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
         ),
+        (
+                'chemprop_features_generator_features_path',
+                'chemprop',
+                'rmse',
+                1.59283050,
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling']
+        ),
         (
                 'chemprop_bounded_mse_loss',
                 'chemprop',
@@ -303,6 +324,13 @@ def test_train_single_task_regression(self,
                 0.466828424,
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
         ),
+        (
+                'chemprop_features_generator_features_path',
+                'chemprop',
+                'auc',
+                0.499183589,
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3']
+        ),
         (
                 'chemprop_mcc_metric',
                 'chemprop',
@@ -370,12 +398,33 @@ def test_train_multi_task_classification(self,
                 ['--features_generator', 'morgan'],
                 ['--features_generator', 'morgan']
         ),
+        (
+                'sklearn_random_forest_rdkit_features_path',
+                'random_forest',
+                0.2954347,
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
+        ),
+        (
+                'sklearn_svm_rdkit_features_path',
+                'svm',
+                0.4112432,
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
+                ['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
+        ),
         (
                 'chemprop_rdkit_features_path',
                 'chemprop',
                 1.51978455,
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
+        ),
+        (
+                'chemprop_features_generator_features_path',
+                'chemprop',
+                0.59545263,
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression.npz'), '--no_features_scaling'],
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'regression_test.npz'), '--no_features_scaling']
         )
     ])
     def test_predict_single_task_regression(self,
@@ -458,9 +507,16 @@ def test_predict_individual_ensemble(self):
         (
                 'chemprop_rdkit_features_path',
                 'chemprop',
-                0.3071592294,
+                0.307159229,
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
                 ['--features_path', os.path.join(TEST_DATA_DIR, 'classification_test.npz'), '--no_features_scaling']
+        ),
+        (
+                'chemprop_features_generator_features_path',
+                'chemprop',
+                0.193924687,
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification.npz'), '--no_features_scaling', '--class_balance', '--split_sizes', '0.4', '0.3', '0.3'],
+                ['--features_generator', 'morgan', '--features_path', os.path.join(TEST_DATA_DIR, 'classification_test.npz'), '--no_features_scaling']
         )
     ])
     def test_predict_multi_task_classification(self,