# Import packages

In [1]:
import numpy as np
import pandas as pd
import torch
from lightning import pytorch as pl

from chemprop import data, featurizers
from chemprop.models import multi

# Change model input here

In [2]:
checkpoint_path = '../tests/data/example_model_v2_regression_multi.ckpt' # path to the checkpoint file. 
# If the checkpoint file is generated using the training notebook, it will be in the `checkpoints` folder with name similar to `checkpoints/epoch=19-step=180.ckpt`.

## Load model

In [4]:
mcmpnn = multi.MulticomponentMPNN.load_from_checkpoint(checkpoint_path)
mcmpnn

MulticomponentMPNN(
  (message_passing): MulticomponentMessagePassing(
    (blocks): ModuleList(
      (0-1): 2 x BondMessagePassing(
        (W_i): Linear(in_features=147, out_features=300, bias=False)
        (W_h): Linear(in_features=300, out_features=300, bias=False)
        (W_o): Linear(in_features=433, out_features=300, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (tau): ReLU()
      )
    )
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Linear(in_features=600, out_features=300, bias=True)
      (1): ReLU()
      (2): Dropout(p=0, inplace=False)
      (3): Linear(in_features=300, out_features=1, bias=True)
    )
  )
)

# Change predict input here

In [5]:
test_path = '../tests/data/regression_multimolecule.csv' # path to your .csv file containing SMILES strings to make predictions for
smiles_columns = ['smiles', 'solvent'] # name of the column containing SMILES strings

## Load test smiles

In [6]:
df_test = pd.read_csv(test_path)
df_test

Unnamed: 0,smiles,solvent,peakwavs_max
0,CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2C...,ClCCl,642.0
1,C(=C/c1cnccn1)\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3c...,ClCCl,420.0
2,CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+]...,O,544.0
3,c1ccc2[nH]ccc2c1,O,290.0
4,CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5c...,ClC(Cl)Cl,736.0
...,...,...,...
495,CCC(=O)c1ccc2cc(N(C)C)ccc2c1,Clc1ccccc1,254.0
496,O=Cc1cc(O)c(C=O)cc1O,CN(C)P(=O)(N(C)C)N(C)C,427.0
497,Cc1ccc(C2=C3C=c4c(oc5ccccc45)=[N+]3[B-](F)(F)n...,CC#N,578.0
498,Cc1ccc(-c2nc(-c3cc([N+](=O)[O-])ccc3O)[nH]c2-c...,CO,284.0


## Get smiles

In [7]:
smiss = df_test[smiles_columns].values
smiss[:5]

array([['CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2CCCC)C(=O)N(CCCC)C1=S',
        'ClCCl'],
       ['C(=C/c1cnccn1)\\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3cnccn3)cc2)cc1',
        'ClCCl'],
       ['CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+](C)C)cc-3oc2c1',
        'O'],
       ['c1ccc2[nH]ccc2c1', 'O'],
       ['CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5ccccc5c4C3(C)C)CCCC1=C2c1ccccc1C(=O)O',
        'ClC(Cl)Cl']], dtype=object)

## Get molecule datapoints

In [8]:
n_componenets = len(smiles_columns)
test_datapointss = [[data.MoleculeDatapoint.from_smi(smi) for smi in smiss[:, i]] for i in range(n_componenets)]

## Get molecule datasets

In [9]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
test_dsets = [data.MoleculeDataset(test_datapoints, featurizer) for test_datapoints in test_datapointss]

# Get multicomponent dataset and data loader

In [10]:
test_mcdset = data.MulticomponentDataset(test_dsets)
test_loader = data.MolGraphDataLoader(test_mcdset, shuffle=False)

# Set up trainer

In [11]:
with torch.inference_mode():
    trainer = pl.Trainer(
        logger=None,
        enable_progress_bar=True,
        accelerator="auto",
        devices=1
    )
    test_preds = trainer.predict(mcmpnn, test_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/hwpang/miniforge3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/Users/hwpang/miniforge3/envs/chemprop/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLo

Predicting DataLoader 0: 100%|██████████| 500/500 [00:02<00:00, 185.93it/s]


In [12]:
test_preds = np.concatenate(test_preds, axis=0)
df_test['pred'] = test_preds
df_test

Unnamed: 0,smiles,solvent,peakwavs_max,pred
0,CCCCN1C(=O)C(=C/C=C/C=C/C=C2N(CCCC)c3ccccc3N2C...,ClCCl,642.0,575.635183
1,C(=C/c1cnccn1)\c1ccc(N(c2ccccc2)c2ccc(/C=C/c3c...,ClCCl,420.0,385.680840
2,CN(C)c1ccc2c(-c3ccc(N)cc3C(=O)[O-])c3ccc(=[N+]...,O,544.0,494.264716
3,c1ccc2[nH]ccc2c1,O,290.0,259.968410
4,CCN(CC)c1ccc2c(c1)OC1=C(/C=C/C3=[N+](C)c4ccc5c...,ClC(Cl)Cl,736.0,654.980482
...,...,...,...,...
495,CCC(=O)c1ccc2cc(N(C)C)ccc2c1,Clc1ccccc1,254.0,370.858923
496,O=Cc1cc(O)c(C=O)cc1O,CN(C)P(=O)(N(C)C)N(C)C,427.0,405.490917
497,Cc1ccc(C2=C3C=c4c(oc5ccccc45)=[N+]3[B-](F)(F)n...,CC#N,578.0,576.756477
498,Cc1ccc(-c2nc(-c3cc([N+](=O)[O-])ccc3O)[nH]c2-c...,CO,284.0,376.093719
