In [14]:
import os
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from tqdm.auto import tqdm

In [2]:
### Collect the directories that hold the DR split data (the 5 random splits in our case).
### Every split .csv sits two levels below its dataset directory (<dataset>/<split>/<file>.csv),
### so the dataset directory is the second parent of each file. Notebook checkpoint copies are skipped.
### The result is sorted so the order is the same on every run, and rendered with forward slashes
### so that later '/'-based splitting of these strings works on any platform.
dr_dirs = sorted({
    p.parents[1].as_posix()
    for p in Path('.../train_val_test_splits').rglob('*.csv')
    if 'ipynb' not in str(p)
})

In [3]:
# Sanity check: show how many dataset directories are held in dr_dirs.
len(dr_dirs)

1

In [8]:
def get_df_for_unsplit_dataset(unsplit_dataset_path: str, is_dr_separate: bool):
    """Load the unsplit dataset .csv, optionally keeping only rows that have a DR value.

    Args:
        unsplit_dataset_path: Path of the .csv file to read.
        is_dr_separate: If True, the file is returned exactly as read. If False,
            rows whose 'DR' column is missing (NaN) are dropped.

    Returns:
        A pandas DataFrame. When rows are filtered, the index is reset and the
        previous index is retained as a leading 'index' column.
    """
    full_table = pd.read_csv(unsplit_dataset_path)
    if is_dr_separate:
        return full_table

    has_dr_value = full_table['DR'].notna()
    return full_table[has_dr_value].reset_index()

In [11]:
def compute_max_atomic_num(df):
    """Annotate ``df`` with dataset-wide atom statistics derived from its SMILES.

    The SMILES strings are read from the 'neut-smiles' column and parsed with RDKit.
    Two columns are then written, each holding one constant value for every row:

    * 'Largest atomic number' - the largest atomic number found in any molecule.
    * '# atoms' - the atom count of the largest molecule.

    Args:
        df: DataFrame with a 'neut-smiles' column.

    Returns:
        The same ``df`` object; the two columns are added in place.

    Raises:
        ValueError: If RDKit cannot parse one or more SMILES strings.
    """
    smiles = df['neut-smiles'].values
    mols = [Chem.MolFromSmiles(s) for s in smiles]

    # MolFromSmiles signals a parse failure by returning None; report it here
    # rather than failing later with an opaque AttributeError on None.
    unparsable = [s for s, m in zip(smiles, mols) if m is None]
    if unparsable:
        preview = ', '.join(repr(s) for s in unparsable[:5])
        raise ValueError(
            f'RDKit could not parse {len(unparsable)} SMILES string(s), e.g. {preview}'
        )

    # default=0 keeps zero-atom molecules and an empty table from raising.
    largest_atomic_nums = [
        max((a.GetAtomicNum() for a in m.GetAtoms()), default=0) for m in mols
    ]
    atom_counts = [m.GetNumAtoms() for m in mols]

    df['Largest atomic number'] = max(largest_atomic_nums, default=0)
    df['# atoms'] = max(atom_counts, default=0)

    return df

In [None]:
### Path of the training script; it is substituted into the generated command as the argument to `python`.
### NOTE(review): the '...' prefix looks like a placeholder - confirm the real location before running.
TRAIN_DR_PYTHON_SCRIPT_PATH = '.../train_dr.py'

In [5]:
### Command-line template for one training run. Each fragment below is one argument
### (or flag placeholder); the fragments are joined with single spaces.
template = ' '.join([
    'python {training_script_path}',
    '--data-path {data_path}',
    '--out-dir {out_dir}',
    '--target-label {target_label}',
    '--node-latent-dim {node_dim}',
    '--graph-latent-dim {graph_dim}',
    '--smiles-column {smiles_column}',
    '--max-atomic-number {max_atomic_number}',
    '--readout {readout}',
    '--id-column {id_column}',
    '{use_vgae}',
    '--num-layers {num_layers}',
    '--conv {conv}',
    '{use_batch_norm}',
    '--gnn-intermediate-dim {gnn_interim_dim}',
    '--name {name}',
    '--task-type {task_type}',
])

In [16]:
NODE_DIM = 50
EMBEDDINGS_AUX_DIM = 64
BATCH_NORM = '--use-batch-norm'

MAIN_OUT_DIR = '.../out/'
DR_TRAINING_SCRIPTS_PATH = '.../DR_training_scripts/'

# Must be columns in the .csv files
TARGET_LABEL = 'DR'
SMILES_COL = 'neut-smiles'
ID_COLUMN = 'CID'


### CUDA disabled by default as these models are trained quickly.
### PNA requires GPUs as it is very expensive, but GCN and GIN work well with CPUs.
USE_CUDA = False

### If using classification datasets.
# task_type = 'classification'

task_type = 'regression'

### The generated commands are written here, so the directory must exist first.
os.makedirs(DR_TRAINING_SCRIPTS_PATH, exist_ok=True)

for dr_dir in tqdm(dr_dirs):
    ### Extracts the dataset name from the path, e.g. 'DR=AID1431-SD=AID873'.
    dr_sd_name = dr_dir.split('/')[-1]

    ### The entire DR file, without any splits. This is used to extract some additional information.
    dr_df = get_df_for_unsplit_dataset(f'.../AID{dr_sd_name}/SD.csv', is_dr_separate=False)
    dr_df = compute_max_atomic_num(dr_df)

    ### Dataset-level values; identical for every configuration, so computed once per dataset.
    max_atomic_num = dr_df['Largest atomic number'].values[0]
    ### Not part of the generated command; kept available for inspection.
    max_mol_size = np.max(dr_df['# atoms'])

    ### Same arguments as in the SD notebook.
    ### product() walks the grid in the same order as nested loops would (rightmost varies fastest).
    for conv, vgae, num_layers, agg, sd_label, itr, edges in product(
        ['GCN', 'GIN', 'PNA'],
        ['--use-vgae', '--no-use-vgae'],
        [2, 3, 4],
        ['global_add_pool', 'global_mean_pool', 'global_max_pool', 'linear'],
        ['Z-Score', 'Embeddings', None],
        range(5),
        ['--edge-dim 13', None],
    ):
        ### No GCN configuration is generated with edge features.
        if conv == 'GCN' and edges:
            continue

        ### Auxiliary (SD) input: a single label value, an embedding vector, or nothing.
        if sd_label == 'Z-Score':
            aux_dim = 1
            lbl_or_emb = 'lbl'
        elif sd_label == 'Embeddings':
            aux_dim = EMBEDDINGS_AUX_DIM
            lbl_or_emb = 'emb'
        else:
            aux_dim = 0
            lbl_or_emb = None

        GRAPH_DIM = 65 if conv == 'PNA' else 64
        GNN_INTERIM_DIM = 130 if conv == 'PNA' else 128

        ### Input data path. Expect a directory path (not path to file) where 3 files are available: train.csv, validate.csv, test.csv.
        ### This is hard-coded in train_dr.py.
        data_path = f'{dr_dir}/{itr}'

        vgae_type = 'VGAE' if vgae == '--use-vgae' else 'GNN'
        edges_type = 'EDGES' if edges else 'NO-EDGES'

        ### Output directory. The DR code only saves a single checkpoint (best according to the validation loss) and the test set predictions, true values, and metrics.
        ### This will be created if it does not exist.
        out_dir = os.path.join(MAIN_OUT_DIR, f'{dr_sd_name}/{conv}/{vgae_type}/{num_layers}/{agg}/{sd_label}/{itr}/{edges_type}')

        script = template.format(
            training_script_path=TRAIN_DR_PYTHON_SCRIPT_PATH,
            data_path=data_path,
            out_dir=out_dir,
            target_label=TARGET_LABEL,
            node_dim=NODE_DIM,
            graph_dim=GRAPH_DIM,
            smiles_column=SMILES_COL,
            max_atomic_number=max_atomic_num,
            readout=agg,
            id_column=ID_COLUMN,
            use_vgae=vgae,
            num_layers=num_layers,
            conv=conv,
            use_batch_norm=BATCH_NORM,
            gnn_interim_dim=GNN_INTERIM_DIM,
            name=dr_sd_name,
            task_type=task_type,
        )

        ### Auxiliary-data arguments are only appended when an SD label is used.
        if sd_label:
            script += f' --lbl-or-emb {lbl_or_emb}  --auxiliary-data-column-name {sd_label}  --auxiliary-dim {aux_dim}'

        if edges:
            script += f' {edges}'

        ### GPU or CPU
        if USE_CUDA:
            script += ' --use-cuda'
        else:
            script += ' --no-use-cuda'

        ### Save training scripts in a dedicated directory.
        with open(os.path.join(DR_TRAINING_SCRIPTS_PATH, f'{dr_sd_name}+{conv}+{vgae_type}+{num_layers}+{agg}+{sd_label}+{itr}+{edges_type}.txt'), 'w') as f:
            f.write(script)


100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
