In [1]:
import glob
import itertools
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [28]:
# Template for the per-dataset input .csv file; '{ds}' is filled in with a
# dataset key via str.format when the scripts are generated.
SD_DATASET_FILE_PATH = '.../AID{ds}/SD.csv'

# Python entry points that the generated shell scripts invoke
# (substituted for '{training_script_path}' in the command templates).
# NOTE(review): the '...' prefixes look like elided locations -- confirm and
# point them at the real files before running.
TRAIN_SD_PYTHON_SCRIPT_PATH = '.../train_sd.py'
SD_GET_EMBEDDINGS_SCRIPT_PATH = '.../generate_sd_embs_preds.py'

In [22]:
# Dataset identifier -> value passed to the scripts as --max-atomic-number.
# The keys are also substituted for '{ds}' in the dataset path template, the
# output directory layout, and the generated script file names, and their
# insertion order is the order in which datasets are processed.
atomic_num_dict = {
    '1259350-1224905': 53,
    '1259418-1259416': 80,
    '449756-435005': 80,
    '449762': 80,
    '1465': 53,
    '1259375-1259374': 80,
    '1949': 53,
    '1431-873': 53,
    '504329': 80,
    '1445': 53,
    '624273-588549': 80,
    '624326-602261': 80,
    '624330': 80,
    '504941-488895': 80,
    '720512-652162': 53,
    '624474-624304': 80,
    '493155-485273': 80,
    '435010-2221': 53,
    '463203-2650': 80,
    '1259420-1259416': 80,
    '2382-2098': 53,
    '687027-652154': 53,
    '504313-2732': 80,
}

In [25]:
# Command-line template for one SD training run. Adjacent string literals are
# concatenated by Python, so this is a single string with one flag per line.
# The '{...}' placeholders are filled in with str.format by the cell that
# generates the training scripts.
script_template = (
    'python {training_script_path}'
    ' --data-path {data_path}'
    ' --sd-label {sd_label}'
    ' --node-latent-dim 50'
    ' --graph-latent-dim 64'
    ' --gnn-intermediate-dim {gnn_interim_dim}'
    ' --out-dir {out_dir}'
    # The trailing space below reproduces the double space of the original command string.
    ' --smiles-column neut-smiles '
    ' --max-atomic-number {max_atomic_number}'
    ' --readout {readout}'
    ' --id-column {id_column}'
    ' --monitor-loss-name train_total_loss'
    # '{vgae}' expands to either '--use-vgae' or '--no-use-vgae'.
    ' {vgae}'
    ' --num-layers {num_layers}'
    ' --conv {conv}'
    ' --no-use-batch-norm'
    ' --num-epochs 200'
    ' --name {ds}'
)

# SD training scripts

In [26]:
USE_CUDA = True

# Must be columns in the .csv files
ID_COLUMN = 'CID'
SD_LABEL = 'SD'

MAIN_OUT_DIR = '.../out_SD/'
SD_TRAINING_SCRIPTS_PATH = '.../SD_training_scripts/'

### Hyperparameter grid: one training script is written for every valid combination.
### Type of graph layer/convolution.
CONV_OPTIONS = ['GCN', 'GIN', 'PNA']
### Number of graph layers (depth of GNN).
NUM_LAYERS_OPTIONS = [2, 3]
### Use VGAE (as in the paper) or a GNN alternative with no variational component. VGAE almost always works better for SD.
VGAE_OPTIONS = ['VGAE', 'GNN']
### This is the Dense/MLP neural readout. The standard alternatives are 'global_add_pool', 'global_mean_pool', and 'global_max_pool'. The Set Transformer and GRU did not perform well on SD so they are not included at the moment.
READOUT_OPTIONS = ['linear']
### Use edge features or not (edge features have a dimension of 13).
USE_EDGES_OPTIONS = [True, False]
EDGE_DIM = 13

### open(..., 'w') does not create missing parent directories, so make sure the
### directory that receives the .sh files exists before the first write.
Path(SD_TRAINING_SCRIPTS_PATH).mkdir(exist_ok=True, parents=True)

### itertools.product varies the right-most option fastest, i.e. it visits the
### combinations in the same order as the equivalent nested for-loops.
for ds, conv, num_layers, vgae_type, readout, use_edges in itertools.product(
        atomic_num_dict, CONV_OPTIONS, NUM_LAYERS_OPTIONS, VGAE_OPTIONS, READOUT_OPTIONS, USE_EDGES_OPTIONS):

    ### GCN does not work with edges
    if conv == 'GCN' and use_edges:
        continue

    vgae = '--use-vgae' if vgae_type == 'VGAE' else '--no-use-vgae'
    edges_path = 'EDGES' if use_edges else 'NO-EDGES'

    ### PNA gets an intermediate dimension of 515 instead of 512.
    ### NOTE(review): presumably a constraint of the PNA layer configuration -- confirm.
    gnn_interim_dim = 512 if conv != 'PNA' else 515

    ### Define a path for the checkpoints. This will be created if it does not exist.
    out_dir = os.path.join(MAIN_OUT_DIR, f'{ds}/{num_layers}/{vgae_type}/{conv}/{readout}/{edges_path}')
    Path(out_dir).mkdir(exist_ok=True, parents=True)

    ### Set input data path. This should be a single .csv file.
    data_path = SD_DATASET_FILE_PATH.format(ds=ds)

    ### Generate script with all the settings
    script = script_template.format(
        ds=ds,
        data_path=data_path,
        vgae=vgae,
        conv=conv,
        num_layers=num_layers,
        readout=readout,
        out_dir=out_dir,
        max_atomic_number=atomic_num_dict[ds],
        gnn_interim_dim=gnn_interim_dim,
        sd_label=SD_LABEL,
        id_column=ID_COLUMN,
        training_script_path=TRAIN_SD_PYTHON_SCRIPT_PATH,
    )

    ### Add edges if value is set
    if use_edges:
        script += f' --edge-dim {EDGE_DIM}'

    ### GPU or CPU
    script += ' --use-cuda' if USE_CUDA else ' --no-use-cuda'

    ### If we have checkpoints that we want to resume from. Disabled at the moment.
    # if (num_layers, conv, vgae_type, readout) in ckpt_dict:
    #     script += f' --ckpt-path {ckpt_dict[(num_layers, conv, vgae_type, readout)]}'

    ### Save training scripts in a dedicated directory.
    script_name = f'{ds}_{num_layers}_{conv}_{vgae_type}_{readout}_{edges_path}.sh'
    with open(os.path.join(SD_TRAINING_SCRIPTS_PATH, script_name), 'w') as f:
        f.write(script)

# Scripts for generating SD embeddings and predictions

In [27]:
# Command-line template for generating SD embeddings and predictions from a
# trained checkpoint. Adjacent string literals are concatenated by Python, so
# this is a single string with one flag per line.
# NOTE: this rebinds the name `script_template`, which the training-script
# cell also reads. Re-run the training template cell before regenerating
# training scripts, otherwise they are built from this template, which has no
# '--num-epochs' flag.
script_template = (
    'python {training_script_path}'
    ' --data-path {data_path}'
    ' --sd-label {sd_label}'
    ' --node-latent-dim 50'
    ' --graph-latent-dim 64'
    ' --gnn-intermediate-dim {gnn_interim_dim}'
    ' --out-dir {out_dir}'
    # The trailing space below reproduces the double space of the original command string.
    ' --smiles-column neut-smiles '
    ' --max-atomic-number {max_atomic_number}'
    ' --readout {readout}'
    ' --id-column {id_column}'
    ' --monitor-loss-name train_total_loss'
    # '{vgae}' expands to either '--use-vgae' or '--no-use-vgae'.
    ' {vgae}'
    ' --num-layers {num_layers}'
    ' --conv {conv}'
    ' --no-use-batch-norm'
    ' --name {ds}'
)

In [34]:
USE_CUDA = True

# Must be columns in the .csv files
ID_COLUMN = 'CID'
SD_LABEL = 'SD'

MAIN_OUT_DIR = '.../out_SD/'
SD_GET_EMBEDDINGS_SCRIPTS_PATH = '.../SD_get_embeddings_scripts/'

### Hyperparameter grid: one script is written for every valid combination that has a checkpoint.
### Type of graph layer/convolution.
CONV_OPTIONS = ['GCN', 'GIN', 'PNA']
### Number of graph layers (depth of GNN).
NUM_LAYERS_OPTIONS = [2, 3]
### Use VGAE (as in the paper) or a GNN alternative with no variational component. VGAE almost always works better for SD.
VGAE_OPTIONS = ['VGAE', 'GNN']
### This is the Dense/MLP neural readout. The standard alternatives are 'global_add_pool', 'global_mean_pool', and 'global_max_pool'. The Set Transformer and GRU did not perform well on SD so they are not included at the moment.
READOUT_OPTIONS = ['linear']
### Use edge features or not (the flag written to the scripts is '--edge-dim 13').
USE_EDGES_OPTIONS = [True, False]
EDGE_DIM = 13

### open(..., 'w') does not create missing parent directories, so make sure the
### directory that receives the .sh files exists before the first write.
Path(SD_GET_EMBEDDINGS_SCRIPTS_PATH).mkdir(exist_ok=True, parents=True)

### Checkpoint directories in which no .ckpt file was found. No script is
### written for those configurations; the list is kept for inspection.
missing_ckpt_dirs = []

### itertools.product varies the right-most option fastest, i.e. it visits the
### combinations in the same order as the equivalent nested for-loops.
for ds, conv, num_layers, vgae_type, readout, use_edges in itertools.product(
        atomic_num_dict, CONV_OPTIONS, NUM_LAYERS_OPTIONS, VGAE_OPTIONS, READOUT_OPTIONS, USE_EDGES_OPTIONS):

    ### GCN does not work with edges
    if conv == 'GCN' and use_edges:
        continue

    vgae = '--use-vgae' if vgae_type == 'VGAE' else '--no-use-vgae'
    edges_path = 'EDGES' if use_edges else 'NO-EDGES'

    ### PNA gets an intermediate dimension of 515 instead of 512.
    ### NOTE(review): presumably a constraint of the PNA layer configuration -- confirm.
    gnn_interim_dim = 512 if conv != 'PNA' else 515

    ### Out directory is the same as the checkpoint search directory
    ### (the directory layout defined previously for SD training).
    out_dir = os.path.join(MAIN_OUT_DIR, f'{ds}/{num_layers}/{vgae_type}/{conv}/{readout}/{edges_path}')

    ### Search for the latest (most recent) checkpoint in that directory.
    ckpt_files = glob.glob(f'{out_dir}/*.ckpt')
    if not ckpt_files:
        missing_ckpt_dirs.append(out_dir)
        continue

    ### Rank by modification time: on Unix, getctime is the time of the last
    ### metadata change (e.g. chmod, rename, copy), not the time the
    ### checkpoint was written.
    latest_ckpt = max(ckpt_files, key=os.path.getmtime)

    ### Set input data path. This should be a single .csv file.
    data_path = SD_DATASET_FILE_PATH.format(ds=ds)

    ### Generate script with all the values
    script = script_template.format(
        ds=ds,
        data_path=data_path,
        vgae=vgae,
        conv=conv,
        num_layers=num_layers,
        readout=readout,
        out_dir=out_dir,
        max_atomic_number=atomic_num_dict[ds],
        gnn_interim_dim=gnn_interim_dim,
        sd_label=SD_LABEL,
        training_script_path=SD_GET_EMBEDDINGS_SCRIPT_PATH,
        id_column=ID_COLUMN,
    )

    ### Add edges if value is set
    if use_edges:
        script += f' --edge-dim {EDGE_DIM}'

    ### GPU or CPU
    script += ' --use-cuda' if USE_CUDA else ' --no-use-cuda'

    ### Add checkpoint path
    script += f' --ckpt-path {latest_ckpt}'

    ### Save the generated scripts in a dedicated directory.
    script_name = f'{ds}_{num_layers}_{conv}_{vgae_type}_{readout}_{edges_path}.sh'
    with open(os.path.join(SD_GET_EMBEDDINGS_SCRIPTS_PATH, script_name), 'w') as f:
        f.write(script)

### Report skipped configurations instead of dropping them silently.
if missing_ckpt_dirs:
    print(f'Skipped {len(missing_ckpt_dirs)} configuration(s) with no .ckpt file; see missing_ckpt_dirs.')