Merge pull request #449 from datamol-io/caching
Various updates to graphium
DomInvivo committed Sep 1, 2023
2 parents c211dac + 3cf2fb5 commit 0d24634
Showing 40 changed files with 945 additions and 36 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -26,6 +26,7 @@ datacache/
tests/temp_cache*
predictions/
draft/
scripts-expts/

# Data and predictions
graphium/data/ZINC_bench_gnn/
50 changes: 48 additions & 2 deletions README.md
@@ -65,9 +65,55 @@ The above step needs to be done once. After that, enable the SDK and the environ
source enable_ipu.sh .graphium_ipu
```

## The Graphium CLI
## Training a model

Installing `graphium` makes two CLI tools available: `graphium` and `graphium-train`. These CLI tools make it easy to access advanced functionality, such as _training a model_, _extracting fingerprints from a pre-trained model_ or _precomputing the dataset_. For more information, visit [the documentation](https://graphium-docs.datamol.io/stable/cli/reference.html).
To learn how to train a model, we invite you to look at the documentation or the Jupyter notebooks available [here](https://github.com/datamol-io/graphium/tree/master/docs/tutorials/model_training).

If you are not familiar with [PyTorch](https://pytorch.org/docs) or [PyTorch-Lightning](https://pytorch-lightning.readthedocs.io/en/latest/), we highly recommend going through their tutorials first.

## Running an experiment
We have set up Graphium with `hydra` for managing config files. To run an experiment, go to the `expts/` folder. For example, to benchmark a GCN on the ToyMix dataset, run
```bash
graphium-train dataset=toymix model=gcn
```
To change parameters specific to this experiment, such as switching from `fp16` to `fp32` precision, you can either override them directly via the CLI
```bash
graphium-train dataset=toymix model=gcn trainer.trainer.precision=32
```
or change them permanently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`.
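As a sketch of the permanent route, the CLI override above would translate into the experiment config roughly as follows; the surrounding contents of `toymix_gcn.yaml` are not shown in this diff, so only the `trainer.trainer.precision` nesting (taken from the override) should be relied upon:
```yaml
# expts/hydra-configs/toymix_gcn.yaml (sketch; nesting inferred from the
# `trainer.trainer.precision=32` CLI override above)
trainer:
  trainer:
    precision: 32  # use fp32 instead of fp16
```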
Integrating `hydra` also allows you to quickly switch between accelerators. For example, running
```bash
graphium-train dataset=toymix model=gcn accelerator=gpu
```
automatically selects the correct configs to run the experiment on GPU.
Finally, you can also run a fine-tuning loop:
```bash
graphium-train +finetuning=admet
```

To use a config file you built from scratch, you can run
```bash
graphium-train --config-path [PATH] --config-name [CONFIG]
```
Thanks to the modular nature of `hydra`, you can reuse many of our config settings for your own experiments with Graphium.
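For instance, a minimal custom config could pull in existing Graphium config groups through a hydra `defaults` list. The file name and the `constants` block below are hypothetical; the `dataset` and `model` group names mirror the overrides used earlier:
```yaml
# my_experiment.yaml (hypothetical file; run with
# `graphium-train --config-path [PATH] --config-name my_experiment`)
defaults:
  - dataset: toymix   # reuse Graphium's ToyMix dataset config group
  - model: gcn        # reuse Graphium's GCN model config group
  - _self_

constants:
  name: my_custom_run # hypothetical experiment-specific settings go here
```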

## Preparing the data in advance
The data preparation, including featurization (e.g., converting molecules from SMILES to a PyG-compatible format), is embedded in the pipeline and will be performed when executing `graphium-train [...]`.

However, when working with larger datasets, it is recommended to perform the data preparation in advance on a machine with sufficient allocated memory (e.g., ~400GB in the case of `LargeMix`). Preparing the data in advance is also beneficial when running many concurrent jobs with identical molecular featurization, so that resources aren't wasted and processes don't conflict when reading from and writing to the same directory.

The following commands prepare the data and cache it, then use it to train a model.
```bash
# First prepare the data and cache it in `path_to_cached_data`
graphium data prepare ++datamodule.args.processed_graph_data_path=[path_to_cached_data]

# Then train the model on the prepared data
graphium-train [...] datamodule.args.processed_graph_data_path=[path_to_cached_data]
```

**Note** that `datamodule.args.processed_graph_data_path` can also be specified in the configs under `expts/hydra-configs/`.
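As a sketch, such a config entry could look like the following; the nesting matches the CLI override above, and the cache path is the one used in the `largemix` architecture config added in this commit:
```yaml
# sketch of a config-file equivalent of the CLI override above
datamodule:
  args:
    processed_graph_data_path: "../datacache/large-dataset/" # where prepared graphs are cached
```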

**Note** that every time the `datamodule.args.featurization` config changes, you will need to run the data preparation again; the prepared data is automatically saved in a separate directory named with a hash unique to that config.
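The hashed configs are the ones under `datamodule.args.featurization`; as a sketch (keys and values taken from the `largemix` architecture config in this commit), changing any of these results in a fresh preparation stored under a new hash:
```yaml
datamodule:
  args:
    featurization: # editing anything below yields a new hash, hence a new cache directory
      atom_property_list_onehot: [atomic-number, group, period, total-valence]
      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
      edge_property_list: [bond-type-onehot, stereo, in-ring]
```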

## License

4 changes: 2 additions & 2 deletions expts/hydra-configs/README.md
@@ -33,7 +33,7 @@ constants:

trainer:
  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-gin/
+    dirpath: models_checkpoints/neurips2023-small-gin/${now:%Y-%m-%d_%H-%M-%S}/
```
We can now utilize `hydra` to, for example, run a sweep over our models on the ToyMix dataset via

@@ -43,7 +43,7 @@ graphium-train -m model=gcn,gin
where the ToyMix dataset is pre-configured in `main.yaml`. Read on to find out how to define new datasets and architectures for pre-training and fine-tuning.

## Pre-training / Fine-tuning
Say you trained a model with the following command:
```bash
graphium-train --config-name "main"
```
118 changes: 118 additions & 0 deletions expts/hydra-configs/architecture/largemix.yaml
@@ -0,0 +1,118 @@
# @package _global_

architecture:
  model_type: FullGraphMultiTaskNetwork
  mup_base_path: null
  pre_nn: # Set as null to avoid a pre-nn network
    out_dim: 64
    hidden_dims: 256
    depth: 2
    activation: relu
    last_activation: none
    dropout: &dropout 0.1
    normalization: &normalization layer_norm
    last_normalization: *normalization
    residual_type: none

  pre_nn_edges: null

  pe_encoders:
    out_dim: 32
    pool: "sum" #"mean" "max"
    last_norm: None #"batch_norm", "layer_norm"
    encoders: #la_pos | rw_pos
      la_pos: # Set as null to avoid a pre-nn network
        encoder_type: "laplacian_pe"
        input_keys: ["laplacian_eigvec", "laplacian_eigval"]
        output_keys: ["feat"]
        hidden_dim: 64
        out_dim: 32
        model_type: 'DeepSet' #'Transformer' or 'DeepSet'
        num_layers: 2
        num_layers_post: 1 # Num. layers to apply after pooling
        dropout: 0.1
        first_normalization: "none" #"batch_norm" or "layer_norm"
      rw_pos:
        encoder_type: "mlp"
        input_keys: ["rw_return_probs"]
        output_keys: ["feat"]
        hidden_dim: 64
        out_dim: 32
        num_layers: 2
        dropout: 0.1
        normalization: "layer_norm" #"batch_norm" or "layer_norm"
        first_normalization: "layer_norm" #"batch_norm" or "layer_norm"

  gnn: # Set as null to avoid a post-nn network
    in_dim: 64 # or otherwise the correct value
    out_dim: &gnn_dim 768
    hidden_dims: *gnn_dim
    depth: 4
    activation: gelu
    last_activation: none
    dropout: 0.1
    normalization: "layer_norm"
    last_normalization: *normalization
    residual_type: simple
    virtual_node: 'none'

  graph_output_nn:
    graph:
      pooling: [sum]
      out_dim: *gnn_dim
      hidden_dims: *gnn_dim
      depth: 1
      activation: relu
      last_activation: none
      dropout: *dropout
      normalization: *normalization
      last_normalization: "none"
      residual_type: none
    node:
      pooling: [sum]
      out_dim: *gnn_dim
      hidden_dims: *gnn_dim
      depth: 1
      activation: relu
      last_activation: none
      dropout: *dropout
      normalization: *normalization
      last_normalization: "none"
      residual_type: none

datamodule:
  module_type: "MultitaskFromSmilesDataModule"
  args:
    prepare_dict_or_graph: pyg:graph
    featurization_n_jobs: 20
    featurization_progress: True
    featurization_backend: "loky"
    processed_graph_data_path: "../datacache/large-dataset/"
    dataloading_from: "disk"
    num_workers: 20 # -1 to use all
    persistent_workers: True
    featurization:
      atom_property_list_onehot: [atomic-number, group, period, total-valence]
      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
      edge_property_list: [bond-type-onehot, stereo, in-ring]
      add_self_loop: False
      explicit_H: False # if H is included
      use_bonds_weights: False
      pos_encoding_as_features: # encoder dropout 0.18
        pos_types:
          lap_eigvec:
            pos_level: node
            pos_type: laplacian_eigvec
            num_pos: 8
            normalization: "none" # normalization already applied on the eigenvectors
            disconnected_comp: True # whether eigenvalues/eigenvectors for disconnected graphs are included
          lap_eigval:
            pos_level: node
            pos_type: laplacian_eigval
            num_pos: 8
            normalization: "none" # normalization already applied on the eigenvectors
            disconnected_comp: True # whether eigenvalues/eigenvectors for disconnected graphs are included
          rw_pos: # use same name as pe_encoder
            pos_level: node
            pos_type: rw_return_probs
            ksteps: 16
2 changes: 1 addition & 1 deletion expts/hydra-configs/experiment/toymix_mpnn.yaml
@@ -10,4 +10,4 @@ constants:

trainer:
  model_checkpoint:
-    dirpath: models_checkpoints/neurips2023-small-mpnn/
+    dirpath: models_checkpoints/neurips2023-small-mpnn/${now:%Y-%m-%d_%H-%M-%S}/
26 changes: 26 additions & 0 deletions expts/hydra-configs/model/gine.yaml
@@ -0,0 +1,26 @@
# @package _global_

architecture:
  pre_nn_edges: # Set as null to avoid a pre-nn network
    out_dim: 32
    hidden_dims: 128
    depth: 2
    activation: relu
    last_activation: none
    dropout: ${architecture.pre_nn.dropout}
    normalization: ${architecture.pre_nn.normalization}
    last_normalization: ${architecture.pre_nn.normalization}
    residual_type: none

  gnn:
    out_dim: &gnn_dim 704
    hidden_dims: *gnn_dim
    layer_type: 'pyg:gine'

  graph_output_nn:
    graph:
      out_dim: *gnn_dim
      hidden_dims: *gnn_dim
    node:
      out_dim: *gnn_dim
      hidden_dims: *gnn_dim
7 changes: 7 additions & 0 deletions expts/hydra-configs/tasks/l1000_mcf7.yaml
@@ -0,0 +1,7 @@
# NOTE: We cannot have a single config, since for fine-tuning we will
# only want to override the loss_metrics_datamodule, whereas for training we will
# want to override both.

defaults:
- task_heads: l1000_mcf7
- loss_metrics_datamodule: l1000_mcf7
7 changes: 7 additions & 0 deletions expts/hydra-configs/tasks/l1000_vcap.yaml
@@ -0,0 +1,7 @@
# NOTE: We cannot have a single config, since for fine-tuning we will
# only want to override the loss_metrics_datamodule, whereas for training we will
# want to override both.

defaults:
- task_heads: l1000_vcap
- loss_metrics_datamodule: l1000_vcap
7 changes: 7 additions & 0 deletions expts/hydra-configs/tasks/largemix.yaml
@@ -0,0 +1,7 @@
# NOTE: We cannot have a single config, since for fine-tuning we will
# only want to override the loss_metrics_datamodule, whereas for training we will
# want to override both.

defaults:
- task_heads: largemix
- loss_metrics_datamodule: largemix
49 changes: 49 additions & 0 deletions expts/hydra-configs/tasks/loss_metrics_datamodule/l1000_mcf7.yaml
@@ -0,0 +1,49 @@
# @package _global_

predictor:
  metrics_on_progress_bar:
    l1000_mcf7: []
  metrics_on_training_set:
    l1000_mcf7: []
  loss_fun:
    l1000_mcf7:
      name: hybrid_ce_ipu
      n_brackets: 3
      alpha: 0.5

metrics:
  l1000_mcf7:
    - name: auroc
      metric: auroc
      num_classes: 3
      task: multiclass
      target_to_int: True
      target_nan_mask: -1000
      ignore_index: -1000
      multitask_handling: mean-per-label
      threshold_kwargs: null
    - name: avpr
      metric: averageprecision
      num_classes: 3
      task: multiclass
      target_to_int: True
      target_nan_mask: -1000
      ignore_index: -1000
      multitask_handling: mean-per-label
      threshold_kwargs: null

datamodule:
  args: # Matches that in the test_multitask_datamodule.py case.
    task_specific_args: # To be replaced by a new class "DatasetParams"
      l1000_mcf7:
        df: null
        df_path: ../data/graphium/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz
        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
        # or set path as the URL directly
        smiles_col: "SMILES"
        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
        # sample_size: 2000 # use sample_size for test
        task_level: graph
        splits_path: ../data/graphium/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
        # split_names: [train, val, test_seen]
        epoch_sampling_fraction: 1.0
49 changes: 49 additions & 0 deletions expts/hydra-configs/tasks/loss_metrics_datamodule/l1000_vcap.yaml
@@ -0,0 +1,49 @@
# @package _global_

predictor:
  metrics_on_progress_bar:
    l1000_vcap: []
  metrics_on_training_set:
    l1000_vcap: []
  loss_fun:
    l1000_vcap:
      name: hybrid_ce_ipu
      n_brackets: 3
      alpha: 0.5

metrics:
  l1000_vcap:
    - name: auroc
      metric: auroc
      num_classes: 3
      task: multiclass
      target_to_int: True
      target_nan_mask: -1000
      ignore_index: -1000
      multitask_handling: mean-per-label
      threshold_kwargs: null
    - name: avpr
      metric: averageprecision
      num_classes: 3
      task: multiclass
      target_to_int: True
      target_nan_mask: -1000
      ignore_index: -1000
      multitask_handling: mean-per-label
      threshold_kwargs: null

datamodule:
  args: # Matches that in the test_multitask_datamodule.py case.
    task_specific_args: # To be replaced by a new class "DatasetParams"
      l1000_vcap:
        df: null
        df_path: ../data/graphium/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz
        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
        # or set path as the URL directly
        smiles_col: "SMILES"
        label_cols: geneID-* # geneID-* means all columns starting with "geneID-"
        # sample_size: 2000 # use sample_size for test
        task_level: graph
        splits_path: ../data/graphium/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
        # split_names: [train, val, test_seen]
        epoch_sampling_fraction: 1.0