# Training Regression - Reaction

# Import packages

In [1]:
import pandas as pd
from lightning import pytorch as pl
from pathlib import Path
import os
# Run from the chemprop checkout: the data path in the configuration cell is
# derived from the current working directory (`Path.cwd().parent`).
# The directory can be overridden with the CHEMPROP_WORKDIR environment variable so
# the notebook is not tied to one machine; the original location stays the default.
os.chdir(os.environ.get("CHEMPROP_WORKDIR", '/home/labhhc2/Documents/workspace/D20/Tam/chemprop/'))

from chemprop import data, featurizers, models, nn

# Change data inputs here

In [2]:
# Location of the input table, resolved relative to the parent of the current working directory.
chemprop_dir = Path.cwd().parent
input_path = chemprop_dir / "chemprop" / "tests" / "data" / "reactiondatabase" / "data" / "phosphatase.csv"
num_workers = 0  # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'AAM'  # CSV column holding the atom-mapped reaction SMILES
target_columns = ['Conversion']  # CSV column(s) used as regression targets

# Seed Python, NumPy and PyTorch RNGs before any model weights are created or any
# batches are shuffled, so that Restart & Run All reproduces the reported metrics.
random_seed = 0
pl.seed_everything(random_seed, workers=True)

## Load data

In [3]:
# Read the CSV at `input_path` into a DataFrame.
df_input = pd.read_csv(input_path)
# Bare last expression: the notebook displays the frame as the cell output.
df_input

Unnamed: 0,AAM,Conversion
0,[CH2:1]=[C:2]([O:3][P:4](=[O:5])([OH:6])[OH:7]...,0.000
1,[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[CH:6]([OH:...,0.000
2,[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[C@@H:6]([O...,0.000
3,[CH3:1][C:2](=[O:3])[O:4][P:5](=[O:6])([OH:7])...,0.000
4,[CH3:1][C:2]([CH3:3])([CH2:4][O:5][P:6](=[O:7]...,0.000
...,...,...
33349,[O:1]=[c:2]1[nH:3][cH:4][n:5][c:6]2[c:7]1[n:8]...,0.000
33350,[O:1]=[c:2]1[cH:3][cH:4][n:5]([C@@H:6]2[O:7][C...,0.000
33351,[O:1]=[c:2]1[cH:3][cH:4][n:5]([C@@H:6]2[O:7][C...,0.000
33352,[O:1]=[c:2]1[cH:3][cH:4][n:5]([C@@H:6]2[O:7][C...,0.000


## Load smiles and targets

In [4]:
# Pull the reaction SMILES (1-D, selected by a single column label) and the
# targets (2-D, selected by a list of column labels) out of the frame as arrays.
smis = df_input[smiles_column].values
ys = df_input[target_columns].values

# Preview the first five entries of each array as the cell output.
smis[:5], ys[:5]

(array(['[CH2:1]=[C:2]([O:3][P:4](=[O:5])([OH:6])[OH:7])[C:8](=[O:9])[OH:10].[OH2:11]>>[CH2:1]=[C:2]([OH:3])[C:8](=[O:9])[OH:10].[P:4](=[O:5])([OH:6])([OH:7])[OH:11]',
        '[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[CH:6]([OH:7])[O:8][C@H:9]([CH2:10][O:11][P:12](=[O:13])([OH:14])[OH:15])[C@@H:16]([OH:17])[C@@H:18]1[OH:19].[OH2:20]>>[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[CH:6]([OH:7])[O:8][C@H:9]([CH2:10][OH:11])[C@@H:16]([OH:17])[C@@H:18]1[OH:19].[P:12](=[O:13])([OH:14])([OH:15])[OH:20]',
        '[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[C@@H:6]([OH:7])[CH2:8][C:9]([OH:10])([C:11](=[O:12])[OH:13])[O:14][C@H:15]1[C@H:16]([OH:17])[C@H:18]([OH:19])[CH2:20][O:21][P:22](=[O:23])([OH:24])[OH:25].[OH2:26]>>[CH3:1][C:2](=[O:3])[NH:4][C@@H:5]1[C@@H:6]([OH:7])[CH2:8][C:9]([OH:10])([C:11](=[O:12])[OH:13])[O:14][C@H:15]1[C@H:16]([OH:17])[C@H:18]([OH:19])[CH2:20][OH:21].[P:22](=[O:23])([OH:24])([OH:25])[OH:26]',
        '[CH3:1][C:2](=[O:3])[O:4][P:5](=[O:6])([OH:7])[OH:8].[OH2:9]>>[CH3:1][C:2](=[O:3])[OH:4].

## Get datapoints

In [5]:
# Pair every reaction SMILES with its target row and build one datapoint per pair.
all_data = [
    data.ReactionDatapoint.from_smi(rxn_smiles, target_row)
    for rxn_smiles, target_row in zip(smis, ys)
]

## Perform data splitting for training, validation, and testing

In [6]:
# The split is computed on one side of each reaction:
# either the reactants (.rct) or the products (.pdt) can be used.
mols = [datapoint.rct for datapoint in all_data]

# Random split with fractions (train, validation, test) = (0.8, 0.1, 0.1).
train_indices, val_indices, test_indices = data.make_split_indices(
    mols, "random", (0.8, 0.1, 0.1)
)

# Turn the three index sets into the corresponding lists of datapoints.
train_data, val_data, test_data = data.split_data_by_indices(all_data, train_indices, val_indices, test_indices)

# Defining the featurizer

Reactions can be featurized using the ```CondensedGraphOfReactionFeaturizer``` (also labeled ```CGRFeaturizer```).


Use the ```mode_``` keyword to set the mode by which a reaction should be featurized into a ```MolGraph```.

The available options can be listed with ```featurizers.RxnMode.keys()```

In [7]:
# List every available reaction mode, one per line.
print(*featurizers.RxnMode.keys(), sep="\n")

REAC_PROD
REAC_PROD_BALANCE
REAC_DIFF
REAC_DIFF_BALANCE
PROD_DIFF
PROD_DIFF_BALANCE


In [8]:
# Build the reaction featurizer; `mode_` takes one of the RxnMode keys listed by the previous cell.
featurizer = featurizers.CondensedGraphOfReactionFeaturizer(mode_="PROD_DIFF")

## Get ReactionDatasets

In [9]:
# Wrap the training and validation splits in datasets that share the same featurizer.
train_dset = data.ReactionDataset(train_data, featurizer)
val_dset = data.ReactionDataset(val_data, featurizer)

# Normalize the training targets and keep the returned scaler, then apply that
# same scaler to the validation targets.
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)

# The test dataset is built without calling normalize_targets on it.
test_dset = data.ReactionDataset(test_data, featurizer)

## Get dataloaders

In [10]:
# Training loader uses build_dataloader's default shuffling behaviour.
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)

# Validation and test loaders are built identically, with shuffling turned off.
val_loader, test_loader = (
    data.build_dataloader(eval_dset, num_workers=num_workers, shuffle=False)
    for eval_dset in (val_dset, test_dset)
)

# Change Message-Passing Neural Network (MPNN) inputs here

## Message passing

Message passing blocks must be given the shape of the featurizer's outputs.

Options are `mp = nn.BondMessagePassing()` or `mp = nn.AtomMessagePassing()`

In [11]:
# The message-passing block is sized from the featurizer's output shape,
# which is unpacked into its positional arguments.
fdims = featurizer.shape # the dimensions of the featurizer, given as (atom_dims, bond_dims).
mp = nn.BondMessagePassing(*fdims)

## Aggregation

In [12]:
# Show the registered aggregation classes, keyed by name.
print(nn.agg.AggregationRegistry)

ClassRegistry {
    'mean': <class 'chemprop.nn.agg.MeanAggregation'>,
    'sum': <class 'chemprop.nn.agg.SumAggregation'>,
    'norm': <class 'chemprop.nn.agg.NormAggregation'>
}


In [13]:
# Use the aggregation class registered under 'mean'.
agg = nn.MeanAggregation()

## Feed-Forward Network (FFN)

In [14]:
# Show the registered predictor (FFN) classes, keyed by task name.
print(nn.PredictorRegistry)

ClassRegistry {
    'regression': <class 'chemprop.nn.predictors.RegressionFFN'>,
    'regression-mve': <class 'chemprop.nn.predictors.MveFFN'>,
    'regression-evidential': <class 'chemprop.nn.predictors.EvidentialFFN'>,
    'classification': <class 'chemprop.nn.predictors.BinaryClassificationFFN'>,
    'classification-dirichlet': <class 'chemprop.nn.predictors.BinaryDirichletFFN'>,
    'multiclass': <class 'chemprop.nn.predictors.MulticlassClassificationFFN'>,
    'multiclass-dirichlet': <class 'chemprop.nn.predictors.MulticlassDirichletFFN'>,
    'spectral': <class 'chemprop.nn.predictors.SpectralFFN'>
}


In [15]:
# Build an output transform from the scaler returned by train_dset.normalize_targets().
# NOTE(review): presumably this maps the model's scaled outputs back to the original
# target units — confirm against the chemprop documentation.
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)

In [16]:
# Regression predictor (registered as 'regression') with the unscaling transform attached.
ffn = nn.RegressionFFN(output_transform=output_transform)

## Batch norm

In [17]:
# Flag handed to models.MPNN when the model is constructed below.
batch_norm = True

## Metrics

In [18]:
# Show the registered metric classes, keyed by name.
print(nn.metrics.MetricRegistry)

ClassRegistry {
    'mae': <class 'chemprop.nn.metrics.MAEMetric'>,
    'mse': <class 'chemprop.nn.metrics.MSEMetric'>,
    'rmse': <class 'chemprop.nn.metrics.RMSEMetric'>,
    'bounded-mae': <class 'chemprop.nn.metrics.BoundedMAEMetric'>,
    'bounded-mse': <class 'chemprop.nn.metrics.BoundedMSEMetric'>,
    'bounded-rmse': <class 'chemprop.nn.metrics.BoundedRMSEMetric'>,
    'r2': <class 'chemprop.nn.metrics.R2Metric'>,
    'roc': <class 'chemprop.nn.metrics.BinaryAUROCMetric'>,
    'prc': <class 'chemprop.nn.metrics.BinaryAUPRCMetric'>,
    'accuracy': <class 'chemprop.nn.metrics.BinaryAccuracyMetric'>,
    'f1': <class 'chemprop.nn.metrics.BinaryF1Metric'>,
    'bce': <class 'chemprop.nn.metrics.BCEMetric'>,
    'ce': <class 'chemprop.nn.metrics.CrossEntropyMetric'>,
    'binary-mcc': <class 'chemprop.nn.metrics.BinaryMCCMetric'>,
    'multiclass-mcc': <class 'chemprop.nn.metrics.MulticlassMCCMetric'>,
    'sid': <class 'chemprop.nn.metrics.SIDMetric'>,
    'wasserstein': <class '

In [19]:
# Metrics handed to the model: RMSE first, then MAE.
# Only the first metric is used for training and early stopping
metric_list = [
    nn.metrics.RMSEMetric(),
    nn.metrics.MAEMetric(),
]

## Construct MPNN

In [20]:
# Assemble the model from the pieces defined above: message passing, aggregation,
# predictor, the batch-norm flag and the metric list (all passed positionally).
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)
# Bare last expression: the notebook displays the module structure as the cell output.
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=134, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=406, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSELoss(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
)

# Training and testing

## Set up trainer

In [21]:
trainer = pl.Trainer(
    # Hardware: let Lightning pick the accelerator and use a single device.
    accelerator="auto",
    devices=1,
    # number of epochs to train for
    max_epochs=20,
    # No logger is attached; the progress bar stays on.
    logger=False,
    enable_progress_bar=True,
    # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    enable_checkpointing=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## Start training

In [22]:
# Train the model on the training loader and validate on the validation loader;
# the run ends when the trainer's `max_epochs` is reached.
trainer.fit(mpnn, train_loader, val_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti SUPER') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/labhhc2/anaconda3/envs/synprop/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/labhhc2/Documents/workspace/D20/Tam/chemprop/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/labhhc2/anaconda3/envs/synprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/labhhc2/anaconda3/envs/synprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 417/417 [00:15<00:00, 27.60it/s, train_loss=0.727, val_loss=0.953]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 417/417 [00:15<00:00, 27.58it/s, train_loss=0.727, val_loss=0.953]


## Test results

In [23]:
# Evaluate the trained model on the test loader. The returned value is a list
# holding one dict of metric name -> value.
results = trainer.test(mpnn, test_loader)
# Bare last expression: the notebook displays the metrics as the cell output.
results

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/labhhc2/anaconda3/envs/synprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 53/53 [00:01<00:00, 34.07it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 batch_averaged_test/mae    0.09829461574554443
batch_averaged_test/rmse    0.13805873692035675
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'batch_averaged_test/rmse': 0.13805873692035675,
  'batch_averaged_test/mae': 0.09829461574554443}]